In [None]:
!pip install transformers torch pandas numpy scikit-learn matplotlib seaborn nltk wordcloud
import pandas as pd
import numpy as np
import torch
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import re
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax

# Fetch the NLTK corpora this script relies on: the stop-word lists and WordNet
# (used by the lemmatizer).
for corpus_name in ("stopwords", "wordnet"):
    nltk.download(corpus_name)

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Load Dataset
df = pd.read_csv("product_reviews.csv")  # Update with actual dataset path

# Display dataset info
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called on its own line. Passing it to print() would emit the report first
# and then print the heading followed by a stray "None".
print("\nDataset Info:")
df.info()
print("\nFirst Few Rows:\n", df.head())

# -------------------------------
# ** NLP PREPROCESSING **
# -------------------------------
# English stop words held in a set so the per-token membership test in
# preprocess_text is a hash lookup rather than a list scan.
stop_words = set(stopwords.words("english"))
# Single shared lemmatizer instance reused for every token.
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalize a string for downstream text analysis.

    Steps, in order: lowercase the text, replace every character matched by
    the regex ``\\W`` with a space, collapse whitespace runs into one space,
    split into tokens, discard tokens found in the module-level ``stop_words``
    set, and lemmatize the rest with the module-level ``lemmatizer``.

    Args:
        text: The string to clean (must support ``.lower()``).

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    lowered = text.lower()
    without_symbols = re.sub(r'\W', ' ', lowered)
    collapsed = re.sub(r'\s+', ' ', without_symbols)

    kept_tokens = []
    for token in collapsed.split():
        if token in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))
    return " ".join(kept_tokens)

# Apply NLP preprocessing to 'Description' column.
# Missing descriptions are replaced with an empty string first: astype(str)
# alone would turn NaN into the literal text "nan", which would then show up
# as a real word in the cleaned corpus, the word cloud and the model input.
df["Cleaned_Description"] = df["Description"].fillna("").astype(str).apply(preprocess_text)

# -------------------------------
# ** EXPLORATORY DATA ANALYSIS (EDA) **
# -------------------------------
# 1. Word cloud built from all cleaned descriptions concatenated together
all_descriptions = " ".join(df["Cleaned_Description"])
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
).generate(all_descriptions)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("Most Common Words in Product Descriptions")
plt.show()

# 2. Bar chart of how many rows carry each value of the 'Sentiment' column
sns.countplot(x=df["Sentiment"])
plt.title("Sentiment Distribution")
plt.xlabel("Sentiment")
plt.ylabel("Count")
plt.show()

# 3. Corpus-wide counts for the 20 terms CountVectorizer keeps
vectorizer = CountVectorizer(stop_words="english", max_features=20)
word_freq = vectorizer.fit_transform(df["Cleaned_Description"])
# Summing over axis 0 adds up each term's count across all documents.
term_totals = np.ravel(word_freq.sum(axis=0))
word_counts = dict(zip(vectorizer.get_feature_names_out(), term_totals))
sns.barplot(x=list(word_counts), y=list(word_counts.values()))
plt.xticks(rotation=45)
plt.title("Top 20 Most Common Words")
plt.show()

# -------------------------------
# ** SENTIMENT ANALYSIS (BERT & RoBERTa) **
# -------------------------------
# Short model name -> Hugging Face checkpoint identifier.
models = dict(
    BERT="nlptown/bert-base-multilingual-uncased-sentiment",
    RoBERTa="cardiffnlp/twitter-roberta-base-sentiment",
)

# Create the output column up front with an empty string in every row.
df["Predicted_Sentiment"] = ""

# (tokenizer, model) pairs keyed by checkpoint id, so each checkpoint is loaded
# once per process instead of once per analysed text.
_SENTIMENT_MODEL_CACHE = {}

# Labels used when a model has no dedicated entry below (the original
# three-class assumption).
_DEFAULT_SENTIMENT_LABELS = ["Negative", "Neutral", "Positive"]

# Class index -> sentiment bucket for each known model name.
# NOTE(review): per the public model cards, the nlptown checkpoint predicts
# 1-5 stars (five classes) and the cardiffnlp checkpoint predicts
# negative/neutral/positive in that index order -- confirm if the checkpoints
# in `models` are changed.
_SENTIMENT_LABELS = {
    "BERT": ["Negative", "Negative", "Neutral", "Positive", "Positive"],
    "RoBERTa": ["Negative", "Neutral", "Positive"],
}


def _load_sentiment_model(checkpoint):
    """Return the (tokenizer, model) pair for *checkpoint*, loading it on first use."""
    if checkpoint not in _SENTIMENT_MODEL_CACHE:
        _SENTIMENT_MODEL_CACHE[checkpoint] = (
            AutoTokenizer.from_pretrained(checkpoint),
            AutoModelForSequenceClassification.from_pretrained(checkpoint),
        )
    return _SENTIMENT_MODEL_CACHE[checkpoint]


def analyze_sentiment(model_name, text):
    """Classify *text* as "Negative", "Neutral" or "Positive".

    Args:
        model_name: Key into the module-level ``models`` dict selecting the
            checkpoint to use (e.g. ``"BERT"`` or ``"RoBERTa"``).
        text: The text to classify; inputs longer than 512 tokens are
            truncated by the tokenizer.

    Returns:
        The sentiment label of the highest-probability class.

    Raises:
        KeyError: If *model_name* is not a key of ``models``.
        ValueError: If the model emits a number of classes that does not
            match the label list configured for it, so a label cannot be
            chosen reliably.
    """
    tokenizer, model = _load_sentiment_model(models[model_name])

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        logits = model(**inputs).logits
    scores = softmax(logits, dim=1).numpy()[0]

    labels = _SENTIMENT_LABELS.get(model_name, _DEFAULT_SENTIMENT_LABELS)
    if len(labels) != len(scores):
        raise ValueError(
            f"Model {model_name!r} produced {len(scores)} classes but "
            f"{len(labels)} labels are configured for it"
        )
    return labels[int(scores.argmax())]

# Classify every row with the RoBERTa model, feeding it the product name
# followed by the cleaned description.
for row_label, record in df.iterrows():
    model_input = f"{record['Product']} {record['Cleaned_Description']}"
    prediction = analyze_sentiment("RoBERTa", model_input)
    df.at[row_label, "Predicted_Sentiment"] = prediction

# Write the frame, including the new prediction column, to disk without the
# index column.
df.to_csv("product_sentiment_results.csv", index=False)
print("Sentiment analysis completed and results saved!")
