# Product Review Analysis

This notebook analyzes product review data to provide insights into ratings, rating-based sentiment, and rating trends over time.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from wordcloud import WordCloud
import warnings
# NOTE: this silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Plot appearance: apply the matplotlib style sheet first, then the seaborn
# palette (order kept as-is so the palette is applied after the style sheet).
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Read the processed reviews export into the notebook-wide DataFrame
reviews_csv = 'scraped_data/processed_reviews_20250905_003849.csv'
df = pd.read_csv(reviews_csv)

# Quick orientation: shape, column names, then a preview of the first rows
print("Dataset loaded successfully!", f"Shape: {df.shape}", "\nColumns:", sep="\n")
for col in df.columns:
    print(f"  - {col}")
print("\nFirst few rows:")
df.head()

## 1. Average Rating Calculation

In [None]:
# Headline number: mean of the rating column (reused by later cells as avg_rating)
rating_col = df['rating']
avg_rating = rating_col.mean()

# Build the whole report first, then emit it in a single print call
summary_lines = [
    f"Average Rating: {avg_rating:.2f} out of 5.0",
    "\nRating Statistics:",
    f"  - Minimum: {rating_col.min()}",
    f"  - Maximum: {rating_col.max()}",
    f"  - Median: {rating_col.median()}",
    f"  - Standard Deviation: {rating_col.std():.2f}",
]
print("\n".join(summary_lines))

## 2. Rating Distribution Visualization

In [None]:
# Rating distribution: bar chart of review counts per star rating, followed by
# a plain-text percentage breakdown.

# Count reviews per rating once. value_counts() leaves out missing ratings and
# sort_index() orders the bars by rating value; the same counts drive both the
# chart and the printed table so the two cannot disagree.
rating_counts = df['rating'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(10, 6))

bars = ax.bar(rating_counts.index, rating_counts.values,
              color=['#ff9999', '#66b3ff', '#99ff99', '#ffcc99', '#ff99cc'])

# Write each bar's count just above its top edge
for bar in bars:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2., height + 0.1,
            f'{int(height)}', ha='center', va='bottom')

# Titles, axis labels and ticks (one tick per rating value that occurs)
ax.set_title('Distribution of Product Ratings', fontsize=16, pad=20)
ax.set_xlabel('Rating (Stars)', fontsize=12)
ax.set_ylabel('Number of Reviews', fontsize=12)
ax.set_xticks(rating_counts.index)
ax.grid(axis='y', alpha=0.3)

# Vertical marker at the overall mean; avg_rating comes from the
# "Average Rating Calculation" cell, which must have been run first.
ax.axvline(avg_rating, color='red', linestyle='--', linewidth=2,
           label=f'Average Rating: {avg_rating:.2f}')
ax.legend()

fig.tight_layout()
plt.show()

# Percentage breakdown, as a share of all rows in df. Iterating rating_counts
# (rather than df['rating'].unique()) avoids re-scanning the frame for every
# rating and avoids a meaningless "nan stars: 0 reviews" line when some
# ratings are missing.
print("Rating Distribution (%):")
total_reviews = len(df)
for rating, count in rating_counts.items():
    percentage = (count / total_reviews) * 100
    print(f"  {rating} stars: {count} reviews ({percentage:.1f}%)")

## 3. Word Cloud of Most Common Positive vs Negative Words

In [None]:
# Label each review's sentiment from its star rating, then split the frame.
def classify_sentiment(rating):
    """Map a star rating to 'positive', 'neutral' or 'negative'.

    Thresholds: rating >= 4 is positive, 3 <= rating < 4 is neutral, anything
    lower is negative. For whole-star ratings this is 4-5 / 3 / 1-2.
    A missing rating returns None, so unrated reviews are left unlabelled
    instead of being counted as negative.
    """
    if pd.isna(rating):
        return None
    if rating >= 4:
        return 'positive'
    if rating >= 3:
        # Covers fractional ratings such as 3.5, which an exact "== 3" test
        # would push into the negative group.
        return 'neutral'
    return 'negative'


df['sentiment'] = df['rating'].map(classify_sentiment)

# Separate positive and negative reviews (used by the word-cloud cells)
positive_reviews = df[df['sentiment'] == 'positive']
negative_reviews = df[df['sentiment'] == 'negative']
neutral_count = int((df['sentiment'] == 'neutral').sum())

print(f"Positive reviews (4-5 stars): {len(positive_reviews)}")
print(f"Negative reviews (1-2 stars): {len(negative_reviews)}")
print(f"Neutral reviews (3 stars): {neutral_count}")

# Only mention unrated reviews when there are any, so the usual output is unchanged
unrated_count = int(df['sentiment'].isna().sum())
if unrated_count:
    print(f"Unrated reviews (no rating, not labelled): {unrated_count}")

In [None]:
# Function to create word cloud
def create_wordcloud(text, title, color='black'):
    """Draw a word cloud of ``text`` with ``title`` as the figure title.

    Parameters
    ----------
    text : str
        Text to build the cloud from. If it is empty, None, or only
        whitespace, a "No text available" message is printed and nothing
        is drawn.
    title : str
        Figure title; also used in the "No text available" message.
    color : str, optional
        Not used by the function; kept in the signature so existing calls
        that pass it keep working.

    Returns
    -------
    None
        The figure is shown via ``plt.show()``.
    """
    if not text or not text.strip():
        print(f"No text available for {title}")
        return

    try:
        wordcloud = WordCloud(width=800, height=400, background_color='white',
                              colormap='viridis', max_words=100).generate(text)
    except ValueError:
        # WordCloud raises ValueError when no usable word is left (e.g. the
        # text is only stop words or punctuation). Treat that like empty
        # input instead of aborting the cell.
        print(f"No text available for {title}")
        return

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')  # a word cloud has no meaningful axes
    ax.set_title(title, fontsize=16, pad=20)
    fig.tight_layout()
    plt.show()

In [None]:
# Build one space-separated text blob per sentiment group from the
# 'cleaned_review' column, skipping rows where that column is missing.
positive_text, negative_text = (
    ' '.join(group['cleaned_review'].dropna().astype(str))
    for group in (positive_reviews, negative_reviews)
)

# Render the two clouds, positive first
create_wordcloud(text=positive_text,
                 title='Most Common Words in Positive Reviews (4-5 stars)')
create_wordcloud(text=negative_text,
                 title='Most Common Words in Negative Reviews (1-2 stars)')

## 4. Timeline Chart Showing How Average Rating Changed Over Time

In [None]:
# Parse the review dates and derive a monthly period column to group on
df['review_date'] = pd.to_datetime(df['review_date'])
df['month_year'] = df['review_date'].dt.to_period('M')

# Per-month mean rating and review count; the period key is turned into a
# string at the end so it can be used directly as an axis label.
monthly_avg = (
    df.groupby('month_year')['rating']
      .agg(['mean', 'count'])
      .reset_index()
      .assign(month_year=lambda table: table['month_year'].astype(str))
)

# Show the table behind the timeline chart
print("Monthly Average Ratings:")
print(monthly_avg)

In [None]:
# Timeline of the monthly mean rating, with the all-time mean as a reference line
fig, ax = plt.subplots(figsize=(12, 6))

months = monthly_avg['month_year']
monthly_means = monthly_avg['mean']
ax.plot(months, monthly_means, marker='o', linewidth=2, markersize=8)

# Horizontal reference: mean over every review, recomputed here from df
overall_avg = df['rating'].mean()
ax.axhline(y=overall_avg, color='red', linestyle='--', alpha=0.7,
           label=f'Overall Average: {overall_avg:.2f}')

# Titles, labels, fixed y-range and grid
ax.set_title('Average Rating Trend Over Time', fontsize=16, pad=20)
ax.set_xlabel('Review Date (Month-Year)', fontsize=12)
ax.set_ylabel('Average Rating (Stars)', fontsize=12)
ax.set_ylim(0, 5.5)
ax.grid(True, alpha=0.3)
ax.legend()

# Slant the month labels so neighbouring ticks do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

# Print each month's mean 10 points above its marker
for month, mean_rating in zip(months, monthly_means):
    ax.annotate(f'{mean_rating:.2f}',
                (month, mean_rating),
                textcoords="offset points",
                xytext=(0, 10),
                ha='center')

fig.tight_layout()
plt.show()

## Summary of Findings

This analysis provides insights into the product reviews dataset:

1. **Average Rating**: The overall average rating gives a quick snapshot of customer satisfaction.
2. **Rating Distribution**: The bar chart shows how reviews are distributed across different rating levels.
3. **Sentiment Word Clouds**: Reviews are labelled positive (4-5 stars) or negative (1-2 stars) from their star rating; a word cloud for each group reveals its most common words, helping identify key themes.
4. **Trend Analysis**: The timeline chart shows how product satisfaction has changed over time.

These visualizations can help businesses understand customer sentiment, identify product strengths and weaknesses, and track satisfaction trends.