In [1]:
!pip install scikit-surprise



In [2]:
from surprise import Dataset, Reader
from surprise.prediction_algorithms.matrix_factorization import SVD
from surprise import accuracy
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.model_selection import train_test_split

In [3]:
ratings_df = pd.read_csv("ratings.csv")
movies_df = pd.read_csv("movies.csv")

df = pd.merge(ratings_df, movies_df[['movieId', 'genres']], on='movieId', how='left')
df


Unnamed: 0,userId,movieId,rating,timestamp,genres
0,1,1,4.0,964982703,Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Comedy|Romance
2,1,6,4.0,964982224,Action|Crime|Thriller
3,1,47,5.0,964983815,Mystery|Thriller
4,1,50,5.0,964982931,Crime|Mystery|Thriller
...,...,...,...,...,...
100831,610,166534,4.0,1493848402,Drama|Horror|Thriller
100832,610,168248,5.0,1493850091,Action|Crime|Thriller
100833,610,168250,5.0,1494273047,Horror
100834,610,168252,5.0,1493846352,Action|Sci-Fi


In [4]:
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()
mlb = MultiLabelBinarizer()

df['userId'] = user_encoder.fit_transform(df['userId'])
df['movieId'] = movie_encoder.fit_transform(df['movieId'])

df = df.join(pd.DataFrame(mlb.fit_transform(df.pop('genres').str.split('|')), columns = mlb.classes_, index = df.index ))


In [5]:
df

Unnamed: 0,userId,movieId,rating,timestamp,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,4.0,964982703,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,2,4.0,964981247,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
2,0,5,4.0,964982224,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,43,5.0,964983815,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,0,46,5.0,964982931,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,609,9416,4.0,1493848402,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
100832,609,9443,5.0,1493850091,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
100833,609,9444,5.0,1494273047,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
100834,609,9445,5.0,1493846352,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [6]:
df.drop(columns = "(no genres listed)", inplace = True)


In [7]:
df

Unnamed: 0,userId,movieId,rating,timestamp,Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,4.0,964982703,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,2,4.0,964981247,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
2,0,5,4.0,964982224,1,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,0,43,5.0,964983815,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,0,46,5.0,964982931,0,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,609,9416,4.0,1493848402,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
100832,609,9443,5.0,1493850091,1,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
100833,609,9444,5.0,1494273047,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
100834,609,9445,5.0,1493846352,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [8]:
train_df, test_df = train_test_split(df, test_size = 0.2)
train_df

Unnamed: 0,userId,movieId,rating,timestamp,Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
72139,463,5880,5.0,1275549429,1,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
50656,326,2692,5.0,1234789704,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
70278,447,8524,3.0,1462122852,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
14172,90,254,4.0,1112711284,1,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
10704,67,1549,3.5,1158535415,0,0,1,1,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34766,232,277,5.0,1448815806,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
11405,67,6700,1.0,1240093080,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
97694,605,1468,2.5,1171229476,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
69463,447,3908,4.0,1028110163,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
reader = Reader(rating_scale = (0.5, 5))
data = Dataset.load_from_df(train_df[['userId', 'movieId', 'rating']], reader)
trainset = data.build_full_trainset()


In [10]:
trainset

<surprise.trainset.Trainset at 0x78252c209990>

In [11]:
model_svd = SVD()
model_svd.fit(trainset)

predictions_svd = model_svd.test(trainset.build_anti_testset())
accuracy.rmse(predictions_svd)

RMSE: 0.4752


0.47518885201825256

In [12]:
def get_top_n_recommendations(user_id, n=5):
  user_movies = df[df['userId'] == user_id]['movieId'].unique()
  all_movies = df['movieId'].unique()
  movies_to_predict = list(set(all_movies) - set(user_movies))

  user_movie_pairs = [(user_id, movie_id, 0) for movie_id in movies_to_predict]
  predictions_cf = model_svd.test(user_movie_pairs)

  top_n_recommendations = sorted(predictions_cf, key = lambda x: x.est)[:n]

  for pred in top_n_recommendations:
    predicted_rating = pred.est
    print(predicted_rating)


  top_n_movie_ids = [int(pred.iid) for pred in top_n_recommendations]

  top_n_movies = movie_encoder.inverse_transform(top_n_movie_ids)

  return top_n_movies

In [13]:
user_id = 221
recommendations = get_top_n_recommendations(user_id)
top_n_movies_titles = movies_df[movies_df['movieId'].isin(recommendations)]['title'].tolist()
print(f"Top 5 Recommendations for User {user_id}:")
for i, title in enumerate(top_n_movies_titles, 1):
  print(f"{i}.{title}")

1.7375658693043825
1.8865865839653015
1.9196639118867984
1.959635357548193
1.963200117808426
Top 5 Recommendations for User 221:
1.Anaconda (1997)
2.I Still Know What You Did Last Summer (1998)
3.Battlefield Earth (2000)
4.Jason X (2002)
5.Catwoman (2004)
