# In [None]:
# EDA for Recommendation Engine
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the processed interaction log plus the raw user and movie tables.
interactions = pd.read_csv("../data/processed/interactions_processed.csv")
users = pd.read_csv("../data/raw/users.csv")
items = pd.read_csv("../data/raw/movies.csv")

# Left-join user attributes, then movie attributes, onto every interaction row,
# so interactions without a matching user or movie are still kept.
df = pd.merge(
    pd.merge(interactions, users, on="userId", how="left"),
    items,
    on="movieId",
    how="left",
)

# --- 1. General Stats ---
# Headline counts are taken from the merged frame, so "users" and "items"
# here are the distinct ids that appear in at least one interaction row.
for label, value in (
    ("Number of users:", df["userId"].nunique()),
    ("Number of items:", df["movieId"].nunique()),
    ("Number of interactions:", df.shape[0]),
):
    print(label, value)

# Summary statistics (count, mean, std, quartiles, ...) of the rating column.
print("Rating distribution:")
rating_summary = df["rating"].describe()
print(rating_summary)

# --- 2. Rating distribution ---
# Histogram of all rating values, split into five equal-width bins.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df["rating"], bins=5, kde=False, ax=ax)
ax.set_title("Rating Distribution")
ax.set_xlabel("Rating")
ax.set_ylabel("Count")
plt.show()

# --- 3. User activity ---
# Number of interaction rows recorded for each user id.
user_counts = df.groupby("userId").size()

# Histogram of how many users fall into each activity range.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(user_counts, bins=20, kde=False, ax=ax)
ax.set_title("Number of Interactions per User")
ax.set_xlabel("Interactions")
ax.set_ylabel("Number of Users")
plt.show()

# --- 4. Item popularity ---
# Number of interaction rows recorded for each movie id.
item_counts = df.groupby("movieId").size()

# Histogram of how many items fall into each popularity range.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(item_counts, bins=20, kde=False, ax=ax)
ax.set_title("Item Popularity (Number of Ratings)")
ax.set_xlabel("Number of Ratings")
ax.set_ylabel("Number of Items")
plt.show()

# --- 5. Fairness-sensitive analysis ---
# df has one row per interaction, so counting its rows directly would weight
# each user by how active they are. Keep a single row per userId so the bars
# show the number of users of each gender, as the title states.
plt.figure(figsize=(6,4))
sns.countplot(x="gender", data=df.drop_duplicates(subset="userId"))
plt.title("User Gender Distribution")
plt.ylabel("Number of Users")
plt.show()

# --- 6. Category distribution ---
# Integer code for each distinct `genres` value (pandas assigns -1 to missing
# values). The column is kept on df; it is no longer used for the plot below.
df['category_encoded'] = df['genres'].astype('category').cat.codes

# The codes are arbitrary labels with no numeric meaning, so binning them in a
# histogram merges unrelated genres into one bar. Plot labelled counts of the
# most frequent genre values instead.
top_genres = df['genres'].value_counts().head(20)
plt.figure(figsize=(8,6))
sns.countplot(y="genres", data=df, order=top_genres.index)
plt.title(f"Item Categories Distribution (Top {len(top_genres)})")
plt.xlabel("Number of Interactions")
plt.ylabel("Genres")
plt.show()
