In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply Matplotlib's "seaborn-v0_8" style sheet to the figures drawn in this notebook.
plt.style.use("seaborn-v0_8")
# Then select seaborn's "talk" plotting context. It is set after the style sheet;
# NOTE(review): presumably both adjust font/label sizes, so keep this order — confirm.
sns.set_context("talk")

# Visible confirmation that the setup cell ran to completion.
print("ReelSense EDA environment ready")


In [None]:
# Directory holding the raw CSV files. The path is relative, so it resolves
# against the notebook's working directory; change it here to point every
# load below at a different copy of the data.
RAW_DATA_DIR = "../data/raw"

# Read each raw table into its own DataFrame.
ratings = pd.read_csv(f"{RAW_DATA_DIR}/ratings.csv")
movies = pd.read_csv(f"{RAW_DATA_DIR}/movies.csv")
tags = pd.read_csv(f"{RAW_DATA_DIR}/tags.csv")
links = pd.read_csv(f"{RAW_DATA_DIR}/links.csv")

# Show the first rows of the ratings table as the cell output.
ratings.head()


In [None]:
# Print the (rows, columns) shape of every loaded table -- `links` included --
# so an empty or mis-parsed file is noticed before any plotting happens.
for label, frame in (
    ("Ratings", ratings),
    ("Movies", movies),
    ("Tags", tags),
    ("Links", links),
):
    print(f"{label}:", frame.shape)

# pandas' default descriptive statistics for the ratings table (cell output).
ratings.describe()


In [None]:
# Count plot: one bar per distinct value in the `rating` column.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=ratings, x="rating", ax=ax)
ax.set_title("Distribution of Movie Ratings")
ax.set_xlabel("Rating")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()


In [None]:
# Number of rating rows per user, as a Series indexed by userId.
user_activity = ratings.groupby("userId").size()

# Histogram of those per-user counts, using 50 bins.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(user_activity, bins=50, ax=ax)
ax.set_title("User Activity Distribution")
ax.set_xlabel("Ratings per User")
ax.set_ylabel("Number of Users")
fig.tight_layout()
plt.show()


In [None]:
# Number of rating rows per movie, ordered from most- to least-rated.
movie_popularity = (
    ratings.groupby("movieId")
    .size()
    .sort_values(ascending=False)
)

# Only y-values are passed, so the x-axis is each movie's position in that
# descending order.
fig, ax = plt.subplots(figsize=(9, 5))
ax.plot(movie_popularity.to_numpy())
ax.set_title("Long-Tail Distribution of Movie Popularity")
ax.set_xlabel("Movies (sorted)")
ax.set_ylabel("Number of Ratings")
fig.tight_layout()
plt.show()


In [None]:
# Split each movie's "|"-separated `genres` string into a list, explode the
# lists to one row per (movie, genre) pair, and count occurrences of each genre.
# This yields the same counts as split(expand=True) + stack(), without building
# a wide None-padded frame or relying on the legacy DataFrame.stack behaviour.
# NOTE(review): if the data contains a placeholder such as "(no genres listed)",
# it is counted as a genre here -- confirm whether it should be filtered out.
genre_counts = movies["genres"].str.split("|").explode().value_counts()

# Horizontal bar chart: one bar per genre, most frequent first.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=genre_counts.values, y=genre_counts.index, ax=ax)
ax.set_title("Genre Frequency in Dataset")
ax.set_xlabel("Count")
ax.set_ylabel("Genre")
fig.tight_layout()
plt.show()


The long-tail distribution shows that a small number of movies receive the majority of ratings, while most movies receive very few. This motivates diversity-aware recommendation strategies that reduce popularity bias.