# In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import os

# Global plot styling: seaborn "whitegrid" theme and a 10x6 inch default figure.
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 6)

# The plots produced by this script are written under data/results/. Create the
# directory up front: savefig does not create missing parent directories and
# would otherwise fail with FileNotFoundError on a fresh checkout.
os.makedirs("data/results", exist_ok=True)

# === Load cleaned data ===
# review_date is parsed into datetimes at load time.
df = pd.read_csv("data/processed/cleaned_reviews.csv", parse_dates=["review_date"])

# === Basic Info ===
print("\n=== Dataset Info ===")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
df.info()
print("\n=== Descriptive Statistics ===")
# include="all" summarises non-numeric columns as well as numeric ones.
print(df.describe(include="all"))

# === Rating Distribution ===
# Bar chart of how many reviews carry each Score value.
rating_ax = sns.countplot(x="Score", data=df, palette="viridis")
rating_ax.set_title("Review Rating Distribution")
rating_ax.set_xlabel("Score")
rating_ax.set_ylabel("Count")

rating_fig = rating_ax.figure
rating_fig.tight_layout()
rating_fig.savefig("data/results/eda_rating_distribution.png")
# Clear (not close) the figure so the next plot starts from an empty canvas.
rating_fig.clf()

# === Verified vs Unverified Rating Distribution ===
# Box plot: spread of Score within each value of the "verified" column.
box_ax = sns.boxplot(x="verified", y="Score", data=df, palette="Set2")
box_ax.set_title("Score Distribution by Verified Status")
box_ax.figure.savefig("data/results/eda_verified_score_boxplot.png")
box_ax.figure.clf()

# Grouped bars: number of reviews per Score, split by the "verified" column.
counts_ax = sns.countplot(x="verified", hue="Score", data=df, palette="pastel")
counts_ax.set_title("Score Counts by Verified Status")
counts_ax.figure.savefig("data/results/eda_verified_score_counts.png")
counts_ax.figure.clf()

# === Helpfulness Ratio ===
# 50-bin histogram of the helpfulness_ratio column with a KDE curve overlaid.
ratio_ax = sns.histplot(df["helpfulness_ratio"], bins=50, kde=True, color="green")
ratio_ax.set_title("Helpfulness Ratio Distribution")
ratio_ax.set_xlabel("Helpfulness Ratio")

ratio_fig = ratio_ax.figure
ratio_fig.savefig("data/results/eda_helpfulness_ratio.png")
ratio_fig.clf()

# === Text Length Distribution ===
# Word count per review (whitespace-separated tokens). Missing texts count as
# 0 words: str(NaN) is "nan", which a plain len(str(x).split()) would
# report as a one-word review and skew the low end of the histogram.
df["text_len"] = df["clean_text"].apply(
    lambda text: len(str(text).split()) if pd.notna(text) else 0
)
sns.histplot(df["text_len"], bins=50, kde=True, color="coral")
plt.title("Review Text Length Distribution (words)")
plt.xlabel("Word Count")
plt.savefig("data/results/eda_text_length_distribution.png")
plt.clf()

# === Reviews Over Time ===
# Per calendar year: number of reviews ("count") and average Score ("mean").
df["year"] = df["review_date"].dt.year
yearly = df.groupby("year")["Score"].agg(["count", "mean"]).reset_index()

fig, ax1 = plt.subplots()
# legend=False on both lines: each axis would otherwise draw its own
# auto-placed legend box, and the two boxes can land on top of each other.
# The labels stay attached to the lines and are merged into one legend below.
sns.lineplot(data=yearly, x="year", y="count", ax=ax1, color="blue",
             label="Review Count", legend=False)
ax1.set_ylabel("Number of Reviews", color="blue")

# Second y-axis sharing the same x-axis, for the average score.
ax2 = ax1.twinx()
sns.lineplot(data=yearly, x="year", y="mean", ax=ax2, color="red",
             label="Average Score", legend=False)
ax2.set_ylabel("Average Rating", color="red")

# Single combined legend, drawn on the top-most axis so nothing covers it.
count_handles, count_labels = ax1.get_legend_handles_labels()
mean_handles, mean_labels = ax2.get_legend_handles_labels()
ax2.legend(count_handles + mean_handles, count_labels + mean_labels, loc="upper left")

ax2.set_title("Review Volume and Average Score Over Time")
fig.tight_layout()
fig.savefig("data/results/eda_reviews_over_time.png")
# Close rather than clear: this figure is not reused, so release it.
plt.close(fig)

# === WordCloud ===
# Drop missing texts *before* sampling so that up to 1000 usable reviews are
# drawn (sampling first and dropping afterwards shrinks the sample). The fixed
# random_state keeps the generated image reproducible between runs.
texts = df["clean_text"].dropna().astype(str)
sampled_texts = texts.sample(min(1000, len(texts)), random_state=42)
text = " ".join(sampled_texts.tolist())

if text.strip():
    wordcloud = WordCloud(width=1000, height=500, background_color="white").generate(text)

    wordcloud_fig = plt.figure(figsize=(12, 6))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title("WordCloud of Review Text")
    plt.tight_layout()
    plt.savefig("data/results/eda_wordcloud.png")
    # Close rather than clear: the dedicated 12x6 figure is not reused.
    plt.close(wordcloud_fig)
else:
    # WordCloud.generate raises ValueError when there are no words to draw,
    # so skip the plot instead of aborting the whole script.
    print("\n No review text available; skipping word cloud.")

print("\n EDA plots saved to: data/results/")
