# Movie Recommendation System - EDA & Model Comparison

This notebook includes:
- 📊 Data Visualization (EDA)
- 📌 Feature Analysis
- 🤖 Model Building (SVD++, KNN)
- 🛠️ Hyperparameter Tuning
- 📈 Model Comparison


In [None]:
# 📦 Imports
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from surprise import SVD, Dataset, KNNBasic, Reader, SVDpp
from surprise.model_selection import GridSearchCV, KFold, cross_validate

# Load datasets
# Single place to change when the MovieLens files live somewhere else.
DATA_DIR = "../ml-latest-small"

ratings = pd.read_csv(f"{DATA_DIR}/ratings.csv")
movies = pd.read_csv(f"{DATA_DIR}/movies.csv")
tags = pd.read_csv(f"{DATA_DIR}/tags.csv")

## 📊 EDA - Ratings and Movies

In [None]:
# Ratings distribution
sns.countplot(x='rating', data=ratings)
plt.title('Ratings Distribution')
plt.show()

# Most rated movies
# value_counts() returns movieId -> number of ratings, sorted descending.
most_rated = ratings['movieId'].value_counts().head(10)

# Build the table with .assign() on the filtered frame instead of assigning a
# column into a boolean-mask slice of `movies` (which triggers
# SettingWithCopyWarning), then rank it so the most rated title comes first.
top_movies = (
    movies[movies['movieId'].isin(most_rated.index)]
    .assign(rating_count=lambda df: df['movieId'].map(most_rated))
    .sort_values('rating_count', ascending=False)
)
top_movies[['title', 'rating_count']]

## 🧪 Feature Analysis - Genre Frequency

In [None]:
# Split the pipe-delimited genre string into a list of genres per movie.
# The isinstance guard leaves rows that are already lists untouched, so this
# cell can be re-run safely; an unconditional .str.split('|') would replace
# the lists with NaN on a second run and break the counting step below.
movies['genres'] = movies['genres'].map(
    lambda g: g.split('|') if isinstance(g, str) else g
)

# One row per (movie, genre) pair, then count how often each genre occurs.
genre_counts = movies['genres'].explode().value_counts()

fig, ax = plt.subplots()
genre_counts.plot(kind='bar', title='Genre Frequency', ax=ax)
ax.set(xlabel='Genre', ylabel='Number of movies')
plt.show()

## 🛠️ Model Building & Comparison

In [None]:
# Prepare Surprise dataset
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Passing cv=3 makes every cross_validate call draw its own random folds, so
# the two models would be scored on different splits and the numbers would
# change on every run. A single seeded KFold gives both models the same folds.
CV_SEED = 42
folds = KFold(n_splits=3, random_state=CV_SEED)

# SVD++ starts from random factors, so it is seeded as well.
svdpp = SVDpp(random_state=CV_SEED)
knn = KNNBasic()

# Keep the per-fold results instead of discarding them, so the models can be
# compared side by side below.
cv_results = {}
for label, algo in [("SVD++", svdpp), ("KNNBasic", knn)]:
    print(label)
    cv_results[label] = cross_validate(
        algo, data, measures=['RMSE', 'MAE'], cv=folds, verbose=True
    )

# Mean test error per model across the shared folds (lower is better).
model_comparison = pd.DataFrame({
    label: {
        'RMSE': scores['test_rmse'].mean(),
        'MAE': scores['test_mae'].mean(),
    }
    for label, scores in cv_results.items()
}).T
model_comparison

## 🔧 Hyperparameter Tuning for SVD

In [None]:
# Seed both the fold split and SVD's random initialisation so the search
# selects the same "best" configuration after a kernel restart.
SEARCH_SEED = 42

param_grid = {
    'n_epochs': [5, 10],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.4, 0.6],
    # Single-value entry: GridSearchCV forwards it to every SVD it builds.
    'random_state': [SEARCH_SEED],
}
gs = GridSearchCV(
    SVD,
    param_grid,
    measures=['rmse', 'mae'],
    cv=KFold(n_splits=3, random_state=SEARCH_SEED),
)
gs.fit(data)

print("Best RMSE:", gs.best_score['rmse'])
print("Best Params:", gs.best_params['rmse'])
# MAE is computed by the search as well, so report it rather than drop it.
print("Best MAE:", gs.best_score['mae'])
print("Best Params (MAE):", gs.best_params['mae'])