# Movie Recommendation System - Data Exploration

This notebook explores the TMDB and MovieLens datasets used for building the hybrid recommendation system.

In [None]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Apply seaborn's 'whitegrid' style to the plots in this notebook.
sns.set_style('whitegrid')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

## 1. Load Data

In [None]:
# Load datasets: the movies table and the ratings table are read from CSV
# files under the relative ../data directory.
movies = pd.read_csv('../data/movies.csv')
ratings = pd.read_csv('../data/ratings.csv')

# Report the (rows, columns) shape of each frame as a quick sanity check.
print(
    f"Movies: {movies.shape}",
    f"Ratings: {ratings.shape}",
    sep="\n",
)

## 2. Explore Movies Dataset

In [None]:
# Preview the first five rows of the movies table
movies.head(n=5)

In [None]:
# Dataset info: print the pandas summary of the movies table
# (columns, dtypes, non-null counts, memory usage).
movies.info()

In [None]:
# Count the missing values in each column of the movies table
movies.isna().sum()

## 3. Explore Ratings Dataset

In [None]:
# Preview the first five rows of the ratings table
ratings.head(n=5)

In [None]:
# Rating distribution: one bar per distinct rating value, ordered by value.
rating_counts = ratings['rating'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(10, 6))
rating_counts.plot(kind='bar', ax=ax)
ax.set_title('Rating Distribution')
ax.set_xlabel('Rating')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Statistics: headline numbers for the ratings table, gathered first and
# written out together, one per line.
summary_lines = [
    f"Total users: {ratings['userId'].nunique()}",
    f"Total movies rated: {ratings['movieId'].nunique()}",
    f"Average rating: {ratings['rating'].mean():.2f}",
    f"Median rating: {ratings['rating'].median():.2f}",
]
print("\n".join(summary_lines))

## 4. Genre Analysis

In [None]:
# Extract genres
# Each `genres` entry is a '|'-separated string, so a movie adds one count to
# every genre it lists. Rows with a missing `genres` value are skipped.
TOP_N_GENRES = 10

# Placeholder that MovieLens writes for untagged movies. It is not a genre,
# so it is left out of the counts.
# NOTE(review): confirm this placeholder matches the movies.csv export in use.
NO_GENRE_LABEL = '(no genres listed)'

all_genres = [
    genre
    for genre_string in movies['genres'].dropna()
    for genre in (part.strip() for part in genre_string.split('|'))
    # Drop empty tokens (e.g. from '' or a trailing '|') and the placeholder.
    if genre and genre != NO_GENRE_LABEL
]

genre_counts = Counter(all_genres)

# Plot top genres
# most_common() returns (genre, count) pairs in descending count order, which
# is also the left-to-right order of the bars.
genres_df = pd.DataFrame(
    genre_counts.most_common(TOP_N_GENRES),
    columns=['Genre', 'Count'],
)

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=genres_df, x='Genre', y='Count', ax=ax)
ax.set_title(f'Top {TOP_N_GENRES} Movie Genres')
ax.tick_params(axis='x', labelrotation=45)
plt.show()

## 5. User Activity Analysis

In [None]:
# Ratings per user: number of rows in `ratings` for each userId.
user_ratings = ratings.groupby('userId').size()

# Histogram of how many ratings each user submitted.
fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(user_ratings, bins=50, edgecolor='black')
ax.set(
    title='Distribution of Ratings per User',
    xlabel='Number of Ratings',
    ylabel='Number of Users',
)
plt.show()

print(f"Average ratings per user: {user_ratings.mean():.2f}")
print(f"Median ratings per user: {user_ratings.median():.2f}")

## 6. Movie Popularity Analysis

In [None]:
# Ratings per movie: how many ratings each movieId received and their mean.
movie_ratings = (
    ratings
    .groupby('movieId')
    .agg(rating_count=('rating', 'count'), avg_rating=('rating', 'mean'))
    .reset_index()
)

# Attach titles; the default inner join keeps only movieIds present in both
# the aggregated ratings and the movies table.
title_lookup = movies.loc[:, ['movieId', 'title']]
movie_stats = pd.merge(movie_ratings, title_lookup, on='movieId')

# Top rated movies (with minimum 50 ratings): filter out thinly rated titles,
# then keep the ten highest averages.
has_enough_ratings = movie_stats['rating_count'].ge(50)
popular_movies = movie_stats.loc[has_enough_ratings].nlargest(10, 'avg_rating')

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=popular_movies, y='title', x='avg_rating', ax=ax)
ax.set_title('Top 10 Highest Rated Movies (min 50 ratings)')
ax.set_xlabel('Average Rating')
plt.show()

## 7. Sparsity Analysis

In [None]:
# Calculate matrix sparsity: one minus the filled fraction of the user x movie
# matrix. Both dimensions are taken from `ratings`, so only users and movies
# with at least one rating row are counted.
# NOTE(review): treats every row of `ratings` as a distinct (user, movie)
# pair -- confirm there are no duplicate pairs.
n_users, n_movies = (ratings[key].nunique() for key in ('userId', 'movieId'))
n_ratings = ratings.shape[0]

filled_fraction = n_ratings / (n_users * n_movies)
sparsity = 1 - filled_fraction

print(f"Number of users: {n_users:,}")
print(f"Number of movies: {n_movies:,}")
print(f"Total ratings: {n_ratings:,}")
print(f"Matrix sparsity: {sparsity:.4f} ({sparsity*100:.2f}%)")

## 8. Conclusion

Key insights from the data exploration:
- The dataset contains diverse movies across multiple genres
- User rating behavior varies significantly
- The user-movie matrix is highly sparse, making collaborative filtering challenging
- A hybrid approach that combines content-based and collaborative filtering is appropriate