In [5]:
import pandas as pd
import numpy as np


In [6]:
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [7]:
# Load the tab-separated ratings file. Column names are supplied here, so
# pandas treats the first line of the file as data rather than as a header.
rating_columns = ['userId', 'movieId', 'rating', 'timestamp']
ratings = pd.read_csv('u.data', sep='\t', names=rating_columns)

In [8]:
# Preview the first rows to confirm the four columns were parsed as expected.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [9]:
# Read only the first two pipe-delimited fields of the item file and name
# them movieId and title; the file is decoded as latin-1.
movie_columns = ['movieId', 'title']
movies = pd.read_csv(
    'u.item',
    sep="|",
    encoding='latin-1',
    usecols=[0, 1],
    names=movie_columns,
)

In [10]:
# Preview the first rows to confirm the id/title columns were parsed as expected.
movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [11]:
# Attach each movie's title to its ratings. An inner join (the pandas default)
# keeps only rows whose movieId appears in both frames.
df = ratings.merge(movies, on="movieId", how="inner")

In [26]:
# Only the last expression of a cell is rendered automatically, so the preview
# must go through display(); a bare df.head() here would be silently discarded.
display(df.head())

# Distribution of the number of rated movies per user.
df.groupby("userId")["movieId"].count().describe()

count    943.000000
mean     106.044539
std      100.931743
min       20.000000
25%       33.000000
50%       65.000000
75%      148.000000
max      737.000000
Name: movieId, dtype: float64

In [13]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy



# Load the dataset into Surprise.
# Surprise clips predictions to Reader's rating_scale, so the bounds are taken
# from the observed ratings instead of a hard-coded (0.5, 5.0) that may not
# match this file.
rating_bounds = (float(ratings['rating'].min()), float(ratings['rating'].max()))
reader = Reader(rating_scale=rating_bounds)
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

In [15]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import GridSearchCV
from surprise import accuracy

# Build the Surprise dataset.
# Surprise clips predictions to Reader's rating_scale, so the bounds are taken
# from the observed ratings instead of a hard-coded (0.5, 5.0) that may not
# match this file.
rating_bounds = (float(ratings['rating'].min()), float(ratings['rating'].max()))
reader = Reader(rating_scale=rating_bounds)
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Hyperparameter grid: 3 * 2 * 3 * 2 = 36 combinations, each scored on 3 folds.
# NOTE(review): in the recorded run every selected value sat at the upper edge
# of its list, so the true optimum may lie outside this grid; consider widening it.
param_grid = {
    'n_factors': [50, 100, 150],
    'n_epochs': [20, 30],
    'lr_all': [0.002, 0.005, 0.01],
    'reg_all': [0.02, 0.1]
}

# Seed NumPy's global RNG before the search. Per the Surprise FAQ, the CV folds
# and the algorithm's initial parameters are randomly generated, so without a
# seed the reported RMSE and best parameters change from run to run.
SEED = 42
np.random.seed(SEED)

# Exhaustive search over the grid, scored by cross-validated RMSE.
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
gs.fit(data)

# Report the best cross-validated score and the parameters that achieved it.
print(f"Best RMSE: {gs.best_score['rmse']:.4f}")
print(f"Best params: {gs.best_params['rmse']}")

# SVD instance configured with the winning parameters. With GridSearchCV's
# default refit=False it has not been trained, so call .fit() before predicting.
best_svd = gs.best_estimator['rmse']



Best RMSE: 0.9208
Best params: {'n_factors': 150, 'n_epochs': 30, 'lr_all': 0.01, 'reg_all': 0.1}


In [36]:
#using the best model parameters to predict
import random

# Size of the highest-scoring candidate pool, and how many titles are drawn
# from it at random so that repeated runs surface some variety.
CANDIDATE_POOL_SIZE = 50
N_RECOMMENDATIONS = 10

# Train the selected model on every available rating before predicting.
trainset = data.build_full_trainset()
best_svd.fit(trainset)

raw_user_input = input("Enter the user ID: ")

# Validate the typed value instead of letting int() crash the cell.
try:
    user_id = int(raw_user_input)
except ValueError:
    user_id = None
    print(f"'{raw_user_input.strip()}' is not a valid integer user ID.")

# Resolve the raw ID through the public API: to_inner_uid raises ValueError for
# users that are not part of the trainset. inner_uid may legitimately be 0, so
# "unknown" is represented by None rather than by falsiness.
inner_uid = None
if user_id is not None:
    try:
        inner_uid = trainset.to_inner_uid(user_id)
    except ValueError:
        print("This user ID does not exist in training data.")

if inner_uid is not None:
    # Inner item IDs the user has already rated.
    seen_iids = {iid for (iid, _) in trainset.ur[inner_uid]}

    # Every remaining item is a candidate.
    unseen_iids = set(trainset.all_items()) - seen_iids

    # predict() expects raw IDs, so convert each inner item ID back.
    predictions = [
        best_svd.predict(user_id, trainset.to_raw_iid(iid)) for iid in unseen_iids
    ]

    # Debug: print sample predictions
    for p in predictions[:5]:
        print(f"Movie {p.iid}, Predicted Rating: {round(p.est, 2)}")

    # Keep the highest-estimated candidates as the sampling pool.
    candidate_pool = sorted(
        predictions, key=lambda pred: pred.est, reverse=True
    )[:CANDIDATE_POOL_SIZE]

    # random.sample raises ValueError when asked for more items than exist, so
    # cap the draw for users with very few unseen movies. The draw is then
    # re-sorted so the list reads best-first.
    n_to_show = min(N_RECOMMENDATIONS, len(candidate_pool))
    recommendations = sorted(
        random.sample(candidate_pool, n_to_show),
        key=lambda pred: pred.est,
        reverse=True,
    )

    # Raw movie ID -> title lookup.
    movie_titles = dict(zip(movies['movieId'], movies['title']))

    # Display the recommendations with their estimated ratings.
    print(f"\nTop {n_to_show} recommendations for user {user_id}:\n")
    for prediction in recommendations:
        movie_title = movie_titles.get(int(prediction.iid), "unknown title")
        print(f"{movie_title}: {round(prediction.est, 2)}")


Enter the user ID:  66


Movie 242, Predicted Rating: 3.55
Movie 302, Predicted Rating: 4.07
Movie 377, Predicted Rating: 2.62
Movie 51, Predicted Rating: 3.48
Movie 346, Predicted Rating: 3.49
[Prediction(uid=66, iid=64, r_ui=None, est=4.406629670646146, details={'was_impossible': False}), Prediction(uid=66, iid=1449, r_ui=None, est=4.400619962188902, details={'was_impossible': False}), Prediction(uid=66, iid=174, r_ui=None, est=4.362243308453733, details={'was_impossible': False}), Prediction(uid=66, iid=172, r_ui=None, est=4.289553966045322, details={'was_impossible': False}), Prediction(uid=66, iid=12, r_ui=None, est=4.2865789704616395, details={'was_impossible': False}), Prediction(uid=66, iid=22, r_ui=None, est=4.259362799902046, details={'was_impossible': False}), Prediction(uid=66, iid=318, r_ui=None, est=4.253122182699945, details={'was_impossible': False}), Prediction(uid=66, iid=313, r_ui=None, est=4.243714394523337, details={'was_impossible': False}), Prediction(uid=66, iid=272, r_ui=None, est=4.18