# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the processed ratings table. The path is relative, so it is resolved
# against the current working directory; the file must already exist there.
RATINGS_CSV = 'data/processed/ratings_data.csv'
data = pd.read_csv(RATINGS_CSV)

# Distribution of Movie Ratings
# Histogram of the 'rating' column (5 bins) with a KDE curve overlaid.
plt.figure(figsize=(10, 6))
rating_hist_ax = sns.histplot(data['rating'], bins=5, kde=True)
# Label through the Axes that seaborn drew on rather than the pyplot state.
rating_hist_ax.set(
    xlabel='Rating',
    ylabel='Number of Ratings',
    title='Distribution of Movie Ratings',
)
plt.show()

# Top 10 Most Rated Movies
# Count rows per title, order the counts from largest to smallest, and keep
# the first ten entries. `most_rated` is reused by the plots further down.
ratings_per_title = data.groupby('title').size()
most_rated = ratings_per_title.sort_values(ascending=False).iloc[:10]

plt.figure(figsize=(10, 6))
most_rated_ax = sns.barplot(x=most_rated.index, y=most_rated.values)
most_rated_ax.set(
    xlabel='Title',
    ylabel='Number of Ratings',
    title='Top 10 Most Rated Movies',
)
# Tilt the tick labels 45 degrees and right-align them
# (presumably so long titles stay readable).
plt.xticks(rotation=45, ha='right')
plt.show()

# Average Rating for Top 10 Most Rated Movies
# Restrict the ratings to the titles in `most_rated`, average the 'rating'
# column per title, and sort the means from highest to lowest.
top_10_movies = most_rated.index
in_top_10 = data['title'].isin(top_10_movies)
avg_ratings = (
    data.loc[in_top_10]
    .groupby('title')['rating']
    .mean()
    .sort_values(ascending=False)
)

plt.figure(figsize=(10, 6))
avg_ratings_ax = sns.barplot(x=avg_ratings.index, y=avg_ratings.values)
avg_ratings_ax.set(
    xlabel='Title',
    ylabel='Average Rating',
    title='Average Rating for Top 10 Most Rated Movies',
)
# Tilt the tick labels 45 degrees and right-align them
# (presumably so long titles stay readable).
plt.xticks(rotation=45, ha='right')
plt.show()

# Rating Distribution for Top 5 Most Rated Movies
# `most_rated` is sorted by rating count in descending order, so its first
# five labels are the five most-rated titles. The count is spelled out
# instead of relying on the default of `head()`.
top_5_movies = most_rated.head(5).index
top_5_ratings = data[data['title'].isin(top_5_movies)]

plt.figure(figsize=(12, 6))
# Without `order`, seaborn lays the boxes out in the order the titles first
# occur in the rows, which depends on how the CSV happens to be sorted.
# Passing the ranked titles keeps the boxes most-rated first, consistent
# with the bar charts built from `most_rated`.
sns.boxplot(
    x='title',
    y='rating',
    data=top_5_ratings,
    order=list(top_5_movies),
)
plt.xlabel('Title')
plt.ylabel('Rating')
plt.title('Rating Distribution for Top 5 Most Rated Movies')
# Tilt the tick labels 45 degrees and right-align them
# (presumably so long titles stay readable).
plt.xticks(rotation=45, ha='right')
plt.show()

# Correlation between Number of Ratings and Average Rating
# One row per title with two columns: how many ratings the title received
# and the mean of those ratings.
movie_stats = data.groupby('title').agg(
    rating_count=('rating', 'count'),
    rating_mean=('rating', 'mean'),
)

plt.figure(figsize=(10, 6))
stats_ax = sns.scatterplot(data=movie_stats, x='rating_count', y='rating_mean')
stats_ax.set(
    xlabel='Number of Ratings',
    ylabel='Average Rating',
    title='Correlation between Number of Ratings and Average Rating',
)
plt.show()