# Implementing Recommendation Models using Surprise Library
This notebook demonstrates loading MovieLens, training SVD, KNNBasic and NMF models using Surprise, evaluating them, performing grid-search, and generating Top‑N recommendations.

**Dataset:** MovieLens 100K (Surprise built-in)

In [None]:
# Install (run once if needed)
# !pip install scikit-surprise pandas numpy matplotlib

In [None]:
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from surprise import Dataset, Reader, SVD, NMF, KNNBasic
from surprise import accuracy
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV, KFold


## 1) Load data and prepare train/test split

In [None]:
# Load MovieLens 100K. prompt=False lets Surprise download the dataset without asking
# when it is not already on disk, so "Restart & Run All" (or a headless run) never
# blocks on an interactive yes/no question.
data = Dataset.load_builtin('ml-100k', prompt=False)

# Hold out 20% of the ratings for testing; the fixed seed keeps the split reproducible.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)
print('Trainset users:', trainset.n_users, 'items:', trainset.n_items)

## 2) Train baseline models: SVD, NMF, KNNBasic

In [None]:
def _fit_and_score(algo):
    """Fit `algo` on `trainset`, predict `testset`, and report hold-out errors.

    Returns a tuple (predictions, rmse, mae). RMSE is computed (and printed)
    before MAE.
    """
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=True)
    mae = accuracy.mae(predictions, verbose=True)
    return predictions, rmse, mae


# Matrix factorisation via SVD
svd = SVD(n_factors=50, random_state=42)
pred_svd, rmse_svd, mae_svd = _fit_and_score(svd)

# Non-negative matrix factorisation
nmf = NMF(n_factors=15, random_state=42)
pred_nmf, rmse_nmf, mae_nmf = _fit_and_score(nmf)

# Neighbourhood model: item-item cosine similarity (user_based=False)
sim_options = {'name': 'cosine', 'user_based': False}
knn = KNNBasic(k=40, sim_options=sim_options)
pred_knn, rmse_knn, mae_knn = _fit_and_score(knn)

# One row per model with its hold-out errors; displayed as the cell output
results = pd.DataFrame(
    {
        'Model': ['SVD', 'NMF', 'KNNBasic'],
        'RMSE': [rmse_svd, rmse_nmf, rmse_knn],
        'MAE': [mae_svd, mae_nmf, mae_knn],
    }
)
results

## 3) Cross-validation (5-fold) comparison

In [None]:
# Cross-validate SVD and NMF for a more robust comparison.
# A single seeded KFold is shared by both models: the folds are reproducible across
# runs, and every model is scored on exactly the same splits (a paired comparison).
# Passing cv=5 instead would draw fresh, unseeded folds for each model.
folds = KFold(n_splits=5, random_state=42, shuffle=True)

models = {'SVD': SVD(n_factors=50, random_state=42),
          'NMF': NMF(n_factors=15, random_state=42)}
cv_results = {}
for name, model in models.items():
    cv = cross_validate(model, data, measures=['RMSE','MAE'], cv=folds, verbose=False)
    # Average the per-fold test errors into one summary row per model
    cv_results[name] = {'RMSE_mean': np.mean(cv['test_rmse']), 'MAE_mean': np.mean(cv['test_mae'])}

# Transpose so that models are rows and metrics are columns
pd.DataFrame(cv_results).T

## 4) Grid Search for SVD hyperparameters

In [None]:
# Grid search for SVD.
# Both sources of randomness are pinned so the selected hyper-parameters do not change
# from run to run: the 3-fold split (seeded KFold) and SVD's factor initialisation
# (random_state passed through the grid as a single-value entry).
param_grid = {
    'n_factors': [20, 50, 100],
    'lr_all': [0.005, 0.01],
    'reg_all': [0.02, 0.1],
    'random_state': [42],
}
gs = GridSearchCV(
    SVD,
    param_grid,
    measures=['rmse'],
    cv=KFold(n_splits=3, random_state=42, shuffle=True),
    n_jobs=-1,  # use all available CPU cores
)
gs.fit(data)
print('Best RMSE score:', gs.best_score['rmse'])
print('Best params:', gs.best_params['rmse'])

## 5) Generate Top-N recommendations for a user

In [None]:
# Helper: turn a flat list of predictions into per-user Top-N recommendation lists
def get_top_n(predictions, n=10, min_rating=4.0):
    """Group predictions by user and keep each user's best-scoring items.

    Args:
        predictions: iterable of 5-tuples ``(uid, iid, true_r, est, details)``.
        n: maximum number of items kept per user.
        min_rating: items whose estimate is below this value are dropped.

    Returns:
        A ``defaultdict(list)`` mapping each uid seen in ``predictions`` to a list
        of ``(iid, est)`` pairs ordered by ``est`` descending. A user with no item
        reaching ``min_rating`` maps to an empty list.
    """
    per_user = defaultdict(list)
    for user_id, item_id, _true_rating, estimate, _details in predictions:
        per_user[user_id].append((item_id, estimate))

    # Only existing keys are reassigned, so the dict size is stable while iterating.
    for user_id in per_user:
        ranked = sorted(per_user[user_id], key=lambda pair: pair[1], reverse=True)
        per_user[user_id] = [pair for pair in ranked if pair[1] >= min_rating][:n]
    return per_user

# Final recommender: fit on ALL available ratings with the tuned hyper-parameters.
# Every tuned value from the grid search is applied (n_factors, lr_all, reg_all, ...),
# not only n_factors; otherwise the fitted model is not the one the search selected.
# If `gs` has not been fitted yet, fall back to the baseline n_factors=50.
tuned_params = gs.best_params['rmse'] if hasattr(gs, 'best_params') else {'n_factors': 50}
# Merging as a dict pins the seed for reproducibility and avoids a duplicate-keyword
# error should the tuned parameters already contain 'random_state'.
algo = SVD(**{**tuned_params, 'random_state': 42})

trainset_full = data.build_full_trainset()
algo.fit(trainset_full)

# Predict every (user, item) pair that is absent from the training data (the
# "anti-testset"), then keep each user's 10 best items with an estimate >= 4.0.
anti_testset = trainset_full.build_anti_testset()
predictions_all = algo.test(anti_testset)
top_n = get_top_n(predictions_all, n=10, min_rating=4.0)

# Show the Top-10 for one example user (the first user in the predictions, not a random one)
sample_user = next(iter(top_n))
sample_user, top_n[sample_user]

## 6) Precision@K and Recall@K evaluation for Top-N

In [None]:
# Ground truth for Top-N evaluation: for each user, the set of held-out items rated >= 4.
# Surprise test sets are lists of (user, item, rating) 3-tuples, so naming four columns
# raises "ValueError: 4 columns passed, passed data had 3 columns". Keeping only the
# first three fields of each row works for 3-tuples and for any longer row format.
test_df = pd.DataFrame([row[:3] for row in testset], columns=['user', 'item', 'rating'])
test_relevant = test_df[test_df['rating'] >= 4.0].groupby('user')['item'].apply(set).to_dict()

def precision_recall_at_k(top_n, test_relevant, k=10):
    """Average Precision@k and Recall@k over users that have relevant test items.

    Args:
        top_n: mapping uid -> list of ``(iid, score)`` pairs, best first.
        test_relevant: mapping uid -> collection of relevant item ids.
        k: cut-off; only the first ``k`` recommendations are considered, and
            precision is always divided by ``k`` (even for shorter lists).

    Returns:
        ``(mean_precision, mean_recall)``. Users missing from ``test_relevant`` or
        with no relevant items are skipped; if no user qualifies, both are ``None``.
    """
    per_user_precision, per_user_recall = [], []

    for user_id, recommendations in top_n.items():
        relevant_items = test_relevant.get(user_id) if user_id in test_relevant else None
        if relevant_items is None or len(relevant_items) == 0:
            continue

        top_k_items = [item_id for item_id, _score in recommendations][:k]
        n_hits = sum(1 for item_id in top_k_items if item_id in relevant_items)

        per_user_precision.append(n_hits / k)
        per_user_recall.append(n_hits / len(relevant_items))

    mean_precision = np.mean(per_user_precision) if per_user_precision else None
    mean_recall = np.mean(per_user_recall) if per_user_recall else None
    return mean_precision, mean_recall

# Evaluate Top-N quality against the held-out test set.
# The `top_n` built in section 5 must NOT be used here: its model was fitted on the
# full dataset (test ratings included) and its anti-testset excludes every item a user
# has rated, test items among them. Those lists can never contain a relevant test item,
# so precision and recall would always be exactly 0.
# Instead, rank each user's unseen items with `svd`, the model fitted on `trainset`
# only; the held-out items are then legitimate recommendation candidates.
eval_predictions = svd.test(trainset.build_anti_testset())
top_n_eval = get_top_n(eval_predictions, n=10, min_rating=4.0)

prec, rec = precision_recall_at_k(top_n_eval, test_relevant, k=10)
prec, rec

## 7) Plot RMSE comparison

In [None]:
# Bar chart of the hold-out RMSE per model (values taken from the `results` DataFrame)
fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(results['Model'], results['RMSE'])
ax.set_title('RMSE Comparison')
ax.set_xlabel('Model')
ax.set_ylabel('RMSE')
ax.grid(axis='y')  # horizontal grid lines only, to make bar heights easier to compare
plt.show()

### Notes
- Use cross-validation and hold-out test sets for robust evaluation.
- Grid search can be extended for other algorithms (NMF, KNN).
- For large datasets, consider how the approach scales: subsample the ratings, train incrementally, or switch to a dedicated large-scale matrix factorization library.