#  **Install necessary packages and libraries**

In [None]:
!pip install vaderSentiment transformers gensim nltk sklearn
import requests
from bs4 import BeautifulSoup
import re
import json
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import pipeline
from gensim import corpora, models
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments


Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 126.0/126.0 kB 2.6 MB/s eta 0:00:00
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


# **Scraping Amazon reviews**

In [None]:
def scrape_amazon_reviews(url, headers, timeout=10):
    """Download a review page and return the text of every review body on it.

    Args:
        url: Address of the review page to fetch.
        headers: HTTP headers sent with the request (e.g. a User-Agent).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely.

    Returns:
        A list of whitespace-stripped review strings, one per
        ``<span data-hook="review-body">`` element found. Empty when the page
        contains no such elements.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on error/blocked responses instead of parsing an error page
    # and silently returning an empty list.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    review_spans = soup.find_all('span', {'data-hook': 'review-body'})
    return [span.text.strip() for span in review_spans]


# **Text cleaning functions**

In [None]:
def clean_text(text):
    """Normalise raw review text to lowercase letters and whitespace.

    Anything that looks like an HTML tag is dropped first, then every
    character that is neither an ASCII letter nor whitespace is removed.
    The result is lowercased and stripped of leading/trailing whitespace.
    """
    tag_pattern = re.compile(r'<.*?>')
    non_letter_pattern = re.compile(r'[^a-zA-Z\s]')

    without_tags = tag_pattern.sub('', text)
    letters_only = non_letter_pattern.sub('', without_tags)
    return letters_only.lower().strip()

# **Vader Sentiment Analysis**

In [None]:
# Lazily created, shared VADER analyzer (see _get_vader_analyzer).
_VADER_ANALYZER = None


def _get_vader_analyzer():
    """Return the shared SentimentIntensityAnalyzer, creating it on first use."""
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        _VADER_ANALYZER = SentimentIntensityAnalyzer()
    return _VADER_ANALYZER


def analyze_sentiment_vader(review):
    """Classify a review with VADER.

    Args:
        review: Text to score.

    Returns:
        A ``(label, compound)`` tuple where ``label`` is ``'positive'`` when the
        compound score is >= 0.05, ``'negative'`` when it is <= -0.05, and
        ``'neutral'`` otherwise; ``compound`` is the raw compound score.
    """
    # Reuse one analyzer across calls instead of constructing a new one
    # for every review.
    compound = _get_vader_analyzer().polarity_scores(review)['compound']

    if compound >= 0.05:
        label = 'positive'
    elif compound <= -0.05:
        label = 'negative'
    else:
        label = 'neutral'
    return label, compound

# **BERT-based Sentiment Analysis**

In [None]:
# Lazily created, shared sentiment pipeline (see _get_sentiment_pipeline).
_SENTIMENT_PIPELINE = None


def _get_sentiment_pipeline():
    """Return the shared DistilBERT sentiment pipeline, loading it on first use."""
    global _SENTIMENT_PIPELINE
    if _SENTIMENT_PIPELINE is None:
        _SENTIMENT_PIPELINE = pipeline(
            'sentiment-analysis',
            model='distilbert-base-uncased-finetuned-sst-2-english',
        )
    return _SENTIMENT_PIPELINE


def analyze_review_with_bert(review):
    """Classify a review with the DistilBERT SST-2 sentiment pipeline.

    Args:
        review: Text to classify.

    Returns:
        A ``(label, score)`` tuple taken from the first pipeline result.
    """
    # The pipeline is loaded once and reused; building it per call reloads
    # the model weights for every review.
    classifier = _get_sentiment_pipeline()
    # truncation=True is forwarded to the tokenizer so reviews longer than the
    # model's maximum input length are cut instead of raising an error.
    result = classifier(review, truncation=True)
    return result[0]['label'], result[0]['score']


# **Topic Modeling using LDA**

In [None]:
def perform_topic_modeling(reviews, num_topics=5, passes=15, num_words=4, random_state=42):
    """Fit an LDA topic model on the reviews and return its topics.

    Args:
        reviews: Iterable of review strings forming the corpus.
        num_topics: Number of topics to fit.
        passes: Number of training passes over the corpus.
        num_words: Number of top words reported per topic.
        random_state: Seed passed to the LDA model so repeated runs give the
            same topics.

    Returns:
        The list produced by ``LdaModel.print_topics`` for the fitted model,
        or an empty list when no tokens remain after preprocessing and
        stop-word removal (LDA cannot be fitted on an empty vocabulary).
    """
    try:
        stop_words = set(stopwords.words('english'))
    except LookupError:
        # The stopwords corpus is not available yet (fresh environment):
        # fetch it once and retry.
        import nltk
        nltk.download('stopwords', quiet=True)
        stop_words = set(stopwords.words('english'))

    texts = [
        [word for word in simple_preprocess(review) if word not in stop_words]
        for review in reviews
    ]
    # No reviews at all, or every review reduced to nothing: nothing to model.
    if not any(texts):
        return []

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lda_model = models.LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        passes=passes,
        random_state=random_state,
    )
    # Pass num_topics explicitly so every fitted topic is reported even when
    # the caller asks for more topics than print_topics shows by default.
    return lda_model.print_topics(num_topics=num_topics, num_words=num_words)


# **Convert the results to a structured JSON output**

In [None]:
def create_json_output(review, sentiment, score, themes):
    """Serialise one analysed review as a JSON string.

    The resulting object has the keys ``review_text``, ``sentiment``,
    ``sentiment_score`` and ``themes``, in that order, holding the
    corresponding arguments.
    """
    fields = (
        ("review_text", review),
        ("sentiment", sentiment),
        ("sentiment_score", score),
        ("themes", themes),
    )
    return json.dumps(dict(fields))

# **Model training function**

In [None]:
class _EncodedReviewDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with their labels."""

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # One tensor per encoding field, plus the label under the key "labels".
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


def train_bert_model(train_dataset, eval_dataset):
    """Fine-tune ``bert-base-uncased`` as a 3-class sequence classifier.

    Args:
        train_dataset: Iterable of examples, each indexable by ``"text"`` and
            ``"label"``. Labels are presumably integers in 0..2 to match
            ``num_labels=3`` -- verify against the caller.
        eval_dataset: Examples in the same format, evaluated once per epoch.

    Returns:
        The ``Trainer`` after training, so the fitted model can be used for
        evaluation or prediction (``trainer.model``, ``trainer.evaluate()``).
    """
    import inspect

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

    train_encodings = tokenizer([example["text"] for example in train_dataset], truncation=True, padding=True)
    eval_encodings = tokenizer([example["text"] for example in eval_dataset], truncation=True, padding=True)

    # Use new names so the raw example lists passed in are not shadowed.
    train_data = _EncodedReviewDataset(train_encodings, [example["label"] for example in train_dataset])
    eval_data = _EncodedReviewDataset(eval_encodings, [example["label"] for example in eval_dataset])

    # Newer transformers releases call this argument `eval_strategy`; older
    # ones only know `evaluation_strategy`. Pick whichever the installed
    # version accepts.
    init_params = inspect.signature(TrainingArguments.__init__).parameters
    strategy_kwarg = "eval_strategy" if "eval_strategy" in init_params else "evaluation_strategy"

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        **{strategy_kwarg: "epoch"},
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=eval_data,
    )

    trainer.train()
    return trainer

# **Metrics calculation**

In [None]:
def calculate_metrics(y_true, y_pred):
    """Compute, print and return classification metrics.

    Precision, recall and F1 use ``average='weighted'``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        A dict with the keys ``'accuracy'``, ``'precision'``, ``'recall'`` and
        ``'f1'`` so callers can reuse the values instead of recomputing them.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}")
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }


# **Example usage**

In [None]:
if __name__ == "__main__":
    # Replace this URL with the actual product page you want to scrape
    url = 'https://www.amazon.com/product-reviews/B07XJ8C8F5/'
    headers = {'User-Agent': 'Your user agent string'}
    reviews_list = scrape_amazon_reviews(url, headers)

    # Clean the reviews and drop any that end up empty: an empty review has
    # no tokens to model.
    cleaned_reviews = [cleaned for cleaned in map(clean_text, reviews_list) if cleaned]

    if not cleaned_reviews:
        # Nothing matched the review selector, or every review cleaned to an
        # empty string.
        print("No reviews found to analyze.")
    else:
        # Fit the topic model once on the whole corpus. LDA needs several
        # documents to find themes; fitting it on one review at a time is
        # slow and yields meaningless topics.
        topics = perform_topic_modeling(cleaned_reviews)

        for review in cleaned_reviews:
            sentiment, score = analyze_sentiment_vader(review)
            json_output = create_json_output(review, sentiment, score, topics)
            print(json_output)


# **Placeholder for testing metrics calculation**

In [None]:
# Toy ground-truth (y_true) and predicted (y_pred) sentiment labels used to
# exercise the metric functions.
y_true = ['positive', 'negative', 'neutral', 'positive', 'negative']
y_pred = ['positive', 'neutral', 'neutral', 'positive', 'negative']

# Encode the string labels as integers: 0 = negative, 1 = neutral, 2 = positive.
label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
y_true_numeric = list(map(label_mapping.__getitem__, y_true))
y_pred_numeric = list(map(label_mapping.__getitem__, y_pred))

# Accuracy, plus precision / recall / F1 with weighted averaging.
accuracy = accuracy_score(y_true_numeric, y_pred_numeric)
precision = precision_score(y_true_numeric, y_pred_numeric, average='weighted')
recall = recall_score(y_true_numeric, y_pred_numeric, average='weighted')
f1 = f1_score(y_true_numeric, y_pred_numeric, average='weighted')

# Report one metric per line.
print(
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
    sep="\n",
)

Accuracy: 0.8
Precision: 0.9
Recall: 0.8
F1 Score: 0.8
