In [None]:

# TODO 1 : Import required libraries
# Step 1: Import Required Libraries

# Basic data and visualization libraries
import pandas as pd
import numpy as np
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from collections import Counter
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Natural Language Processing
from nltk.corpus import stopwords
from textblob import TextBlob

# Vectorization (optional for extended analysis)
from sklearn.feature_extraction.text import CountVectorizer

# Fetch the NLTK resources used further down: the English stop-word list
# and the VADER sentiment lexicon.
for resource in ("stopwords", "vader_lexicon"):
    nltk.download(resource)

# Read the review dataset directly from the hosted CSV file.
df = pd.read_csv("https://gitlab.crio.do/me_notebook/me_jupyter_amazonfeedbackanalysis/-/raw/master/amazon_product_reviews.csv")
df.head()

# Normalise the headers (lower-case, no surrounding whitespace) so the
# column lookup below does not depend on how the CSV spelled them.
df = df.rename(columns=lambda name: name.lower().strip())
if "reviews" not in df.columns:
    # Fall back to the 'reviews.text' header and expose it as 'reviews'.
    df = df.rename(columns={"reviews.text": "reviews"})

# Discard rows without review text and renumber the remaining rows from 0.
df = df.dropna(subset=["reviews"]).reset_index(drop=True)

# Text Cleaning Function
def clean_text(text):
    """Return a normalised, lower-case, letters-only version of *text*.

    The input is converted with ``str()`` first, so non-string values are
    accepted. The following substitutions are then applied in order:

    1. tokens starting with ``http`` / ``www`` are removed,
    2. every character that is not ``a-z`` or whitespace is removed
       (digits and punctuation are dropped, not replaced by a space),
    3. runs of whitespace collapse to a single space.

    Leading and trailing whitespace is stripped from the result.
    """
    cleaned = str(text).lower()
    # Order matters: URLs must go before punctuation is stripped, otherwise
    # the "://" and "." characters that delimit them would already be gone.
    substitutions = (
        (r"http\S+|www\S+|https\S+", ""),
        (r"[^a-z\s]", ""),
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Normalise every raw review once; the later steps read this column.
df["cleaned_review"] = df["reviews"].map(clean_text)

# Review length = number of whitespace-separated words in the cleaned text.
df["review_length"] = df["cleaned_review"].map(lambda review: len(review.split()))

# Histogram of words per review, with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df["review_length"], bins=40, kde=True, ax=ax)
ax.set_title("Distribution of Review Lengths")
ax.set_xlabel("Number of Words")
ax.set_ylabel("Frequency")
plt.show()

# Sentiment Analysis (VADER)
# A single analyzer instance is created here and reused for every review
# scored below. NOTE(review): presumably this needs the 'vader_lexicon'
# resource downloaded near the top of the file - confirm.
sia = SentimentIntensityAnalyzer()

def get_sentiment(score):
    """Translate a compound sentiment score into a label.

    Returns ``"Positive"`` when *score* is at least 0.05, ``"Negative"``
    when it is at most -0.05, and ``"Neutral"`` for anything in between.
    """
    if score >= 0.05:
        return "Positive"
    if score <= -0.05:
        return "Negative"
    # Neither threshold reached: treat the review as neutral.
    return "Neutral"

# Score every cleaned review with VADER and keep only the 'compound' value,
# then bucket that score into Positive / Neutral / Negative.
# NOTE(review): VADER is documented to use punctuation and capitalisation as
# intensity cues; scoring the cleaned text discards them. Consider scoring
# the raw 'reviews' column instead - confirm the intended behaviour.
df["compound_score"] = df["cleaned_review"].apply(lambda x: sia.polarity_scores(x)["compound"])
df["sentiment"] = df["compound_score"].apply(get_sentiment)

# Open an explicit figure, as the other charts in this file do, so the
# chart size does not depend on whatever figure happens to be current.
plt.figure(figsize=(8, 5))
sns.countplot(data=df, x="sentiment", order=["Positive", "Neutral", "Negative"])
# The x-axis holds sentiment labels, not star ratings, so the title says so.
plt.title("Sentiment Distribution (VADER)")
plt.xlabel("Sentiment")
plt.ylabel("Count")
plt.show()

# WordCloud for All Reviews
# Concatenate every cleaned review into one corpus string for the cloud.
all_reviews = df["cleaned_review"].astype(str)
text_all = " ".join(all_reviews)

# English stop words are excluded from the cloud; the same set is reused
# later when tokenizing negative reviews.
stop_words = set(stopwords.words("english"))

cloud_builder = WordCloud(
    width=1000,
    height=500,
    background_color="white",
    stopwords=stop_words,
)
wordcloud = cloud_builder.generate(text_all)

# Render the cloud as an image with the axes hidden.
fig, ax = plt.subplots(figsize=(15, 7))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis("off")
ax.set_title("Commonly Used Words")
plt.show()

# TextBlob Sentiment
def get_sentiment_textblob(text):
    """Label *text* using the sign of its TextBlob polarity.

    Returns ``"Positive"`` for a polarity above zero, ``"Negative"`` for a
    polarity below zero, and ``"Neutral"`` when the polarity is exactly zero.
    """
    score = TextBlob(text).sentiment.polarity
    if score > 0:
        return "Positive"
    if score < 0:
        return "Negative"
    return "Neutral"

# The assignment below replaces the VADER labels already stored in
# 'sentiment'. Keep a copy under 'vader_sentiment' so the two methods can
# still be compared afterwards instead of silently losing the first result.
if 'sentiment' in df.columns:
    df['vader_sentiment'] = df['sentiment']

# From here on 'sentiment' holds the TextBlob label for each cleaned review;
# the negative-review analysis further down reads this column.
df['sentiment'] = df['cleaned_review'].apply(get_sentiment_textblob)

# Bar chart of how many reviews fall into each TextBlob sentiment class.
plt.figure(figsize=(8, 5))
sns.countplot(data=df, x='sentiment', order=['Positive', 'Neutral', 'Negative'])
plt.title("Sentiment Distribution")
plt.xlabel("Sentiment")
plt.ylabel("Number of Reviews")
plt.show()

# Tokenization for Negative Reviews
# Work on an independent copy of the rows labelled "Negative" so that adding
# a column to it later does not write into a view of df.
negative_mask = df["sentiment"] == "Negative"
negative_reviews = df.loc[negative_mask].copy()

def tokenize(text):
    """Split *text* on whitespace and drop every word found in ``stop_words``.

    Relies on the module-level ``stop_words`` collection. Returns the
    remaining words as a list, in their original order.
    """
    kept = []
    for word in text.split():
        if word in stop_words:
            continue
        kept.append(word)
    return kept

# Tokenize each negative review (stop words removed by tokenize()).
negative_reviews["tokens"] = negative_reviews["cleaned_review"].apply(tokenize)

# Flatten the per-review token lists into a single list of words.
flat_words = [word for tokens in negative_reviews["tokens"] for word in tokens]

# The 15 most frequent words, as (word, count) pairs, most common first.
top_negative_words = Counter(flat_words).most_common(15)

if top_negative_words:
    # Transpose the (word, count) pairs into two parallel sequences.
    words, counts = zip(*top_negative_words)

    plt.figure(figsize=(10, 6))
    sns.barplot(x=list(counts), y=list(words))
    plt.title("Top Issues from Negative Reviews")
    plt.xlabel("Frequency")
    plt.ylabel("Words")
    plt.show()
else:
    # With no negative reviews (or only stop words in them) there is nothing
    # to unpack: zip(*[]) yields no items and the two-name unpacking would
    # raise ValueError. Skip the chart and say why instead of crashing.
    words, counts = (), ()
    print("No words found in negative reviews; skipping the top-issues chart.")
