<a href="https://colab.research.google.com/github/Prometheus1211/Movie-Recommender-System/blob/main/Anirudh_Rajat_Movie_Recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from scipy.sparse.linalg import svds

In [None]:
# 1M dataset: movie catalogue, read from a semicolon-separated file.
# Only the three columns used later are loaded, with explicit dtypes.
df_movies = pd.read_csv(
    "/content/movies.csv",
    sep=';',
    usecols=['movieId', 'title', 'genre'],
    dtype={'movieId': 'int32', 'title': 'str', 'genre': 'str'},
)
df_movies.head()

Unnamed: 0,movieId,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
# 1M dataset: one row per (user, movie, rating), read from a
# semicolon-separated file with compact numeric dtypes.
df_ratings = pd.read_csv(
    "/content/ratings.csv",
    sep=';',
    usecols=['userId', 'movieId', 'rating'],
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'},
)
df_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1193,5.0
1,1,661,3.0
2,1,914,3.0
3,1,3408,4.0
4,1,2355,5.0


In [None]:
# Reshape the long ratings table into a user x movie grid; pairs with no
# rating are filled with 0.
df_movie_features = df_ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)
df_movie_features.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Lookup table from each rated movieId (ascending) to a dense zero-based
# position, stored in the 'movie_index' column.
item_indices = pd.DataFrame(
    sorted(set(df_ratings['movieId'])),
    columns=['movieId'],
).assign(movie_index=lambda frame: frame.index)

item_indices.head()

Unnamed: 0,movieId,movie_index
0,1,0
1,2,1
2,3,2
3,4,3
4,5,4


In [None]:
# Lookup table from each userId (ascending) to a dense zero-based
# position, stored in the 'user_index' column.
user_indices = pd.DataFrame(
    sorted(set(df_ratings['userId'])),
    columns=['userId'],
).assign(user_index=lambda frame: frame.index)

user_indices.head()

Unnamed: 0,userId,user_index
0,1,0
1,2,1
2,3,2
3,4,3
4,5,4


In [None]:
# Attach the dense movie and user positions to every rating row.
df_with_index = (
    df_ratings
    .merge(item_indices, on='movieId')
    .merge(user_indices, on='userId')
)

df_with_index.head()

Unnamed: 0,userId,movieId,rating,movie_index,user_index
0,1,1193,5.0,1104,0
1,1,661,3.0,639,0
2,1,914,3.0,853,0
3,1,3408,4.0,3177,0
4,1,2355,5.0,2162,0


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the ratings for evaluation. The seed is fixed so that the
# split (and every RMSE computed from it) is identical after a kernel restart.
df_train, df_test = train_test_split(df_with_index, test_size=0.2, random_state=42)
print(len(df_train))
print(len(df_test))

800167
200042


In [None]:
# Peek at the first rows of the training split.
df_train.head()

Unnamed: 0,userId,movieId,rating,movie_index,user_index
171856,2181,1982,5.0,1802,2180
491097,2116,88,3.0,86,2115
858031,4581,2791,5.0,2586,4580
921837,4437,1198,4.0,1108,4436
305613,4072,2294,4.0,2102,4071


In [None]:
# Peek at the first rows of the test split.
df_test.head()

Unnamed: 0,userId,movieId,rating,movie_index,user_index
984593,4520,3917,4.0,3670,4519
881556,1071,1254,5.0,1162,1070
293588,3934,1597,2.0,1466,3933
714788,1272,2174,4.0,1993,1271
355471,4682,2701,2.0,2496,4681


In [None]:
# Matrix dimensions: number of distinct users and distinct rated movies.
n_users = len(df_ratings['userId'].unique())
n_items = len(df_ratings['movieId'].unique())
print(n_users)
print(n_items)

6040
3706


In [None]:
# Dense user x movie matrix of training ratings; cells without a training
# rating stay 0. Columns are addressed by name (not by tuple position) and
# all cells are written in a single vectorised assignment instead of a
# Python-level loop over every row.
train_data_matrix = np.zeros((n_users, n_items))
train_data_matrix[
    df_train['user_index'].to_numpy(),
    df_train['movie_index'].to_numpy(),
] = df_train['rating'].to_numpy()
train_data_matrix.shape

(6040, 3706)

In [None]:
# Show the training matrix (NumPy abbreviates the repr of large arrays).
train_data_matrix

array([[5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [3., 0., 0., ..., 0., 0., 0.]])

In [None]:

# Dense user x movie matrix of held-out ratings, built the same way as the
# training matrix: name-based column access and one vectorised assignment.
# Cells without a test rating stay 0.
test_data_matrix = np.zeros((n_users, n_items))
test_data_matrix[
    df_test['user_index'].to_numpy(),
    df_test['movie_index'].to_numpy(),
] = df_test['rating'].to_numpy()
test_data_matrix.shape

(6040, 3706)

In [None]:
# Show the test matrix (NumPy abbreviates the repr of large arrays).
test_data_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# Render the first rows of the training matrix as a DataFrame for inspection.
pd.DataFrame(train_data_matrix).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3696,3697,3698,3699,3700,3701,3702,3703,3704,3705
0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Largest rating value present in the training split.
df_train['rating'].max()

5.0

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    """Root-mean-square error over the non-zero cells of ``ground_truth``.

    Zero cells in ``ground_truth`` are treated as "no rating" and ignored,
    so only positions that hold an actual rating contribute to the score.

    Parameters
    ----------
    prediction : array-like
        Predicted values, indexable with the positions of ``ground_truth``.
    ground_truth : array-like
        Observed values; zeros mark cells to skip.

    Returns
    -------
    float
        Square root of the mean squared difference on the rated cells.

    Raises
    ------
    ValueError
        If ``ground_truth`` has no non-zero cell (nothing to evaluate).
    """
    prediction = np.asarray(prediction)
    ground_truth = np.asarray(ground_truth)

    # Locate the rated cells once and reuse the mask for both arrays.
    rated = ground_truth.nonzero()
    if rated[0].size == 0:
        raise ValueError("ground_truth contains no non-zero entries to evaluate")

    errors = prediction[rated] - ground_truth[rated]
    return sqrt(float(np.mean(np.square(errors))))

In [None]:

# Sweep the number of latent factors: for each k, rebuild the training
# matrix from its rank-k SVD and score the reconstruction against the
# ratings stored in the test matrix.
rmse_list = []
for i in (1, 2, 5, 20, 40, 50, 60, 100, 200):
    u, s, vt = svds(train_data_matrix, k=i)
    s_diag_matrix = np.diag(s)
    X_pred = (u @ s_diag_matrix) @ vt
    rmse_score = rmse(X_pred, test_data_matrix)
    rmse_list.append(rmse_score)
    print(f"Matrix Factorisation with {i} latent features has a RMSE of {rmse_score}")

Matrix Factorisation with 1 latent features has a RMSE of 3.0805581551463144
Matrix Factorisation with 2 latent features has a RMSE of 3.003264057539651
Matrix Factorisation with 5 latent features has a RMSE of 2.869945578953964
Matrix Factorisation with 20 latent features has a RMSE of 2.703885709128179
Matrix Factorisation with 40 latent features has a RMSE of 2.709741063883562
Matrix Factorisation with 50 latent features has a RMSE of 2.7363117782825084
Matrix Factorisation with 60 latent features has a RMSE of 2.7689085300250853
Matrix Factorisation with 100 latent features has a RMSE of 2.8898886370287866
Matrix Factorisation with 200 latent features has a RMSE of 3.122817620885955


In [None]:
# Full user x movie rating grid as a NumPy array, plus the mean of each
# user's row (taken over every column of the grid).
R = df_movie_features.to_numpy()
user_ratings_mean = R.mean(axis=1)

In [None]:
# Factorise the mean-centred matrix. The per-user mean is added back to the
# reconstruction below, so it has to be removed before the SVD; otherwise
# every prediction is shifted by that user's mean.
R_demeaned = R - user_ratings_mean.reshape(-1, 1)
U, sigma, Vt = svds(R_demeaned, k = 20)
# svds returns the singular values as a 1-D array; expand to a diagonal matrix.
sigma = np.diag(sigma)
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)

In [None]:
# Wrap the predicted ratings in a DataFrame whose columns are the movieIds
# of the rating grid; rows keep the default positional index.
preds_df = pd.DataFrame(
    data=all_user_predicted_ratings,
    columns=df_movie_features.columns,
)
preds_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
0,3.296092,0.751343,0.055482,0.045996,0.185475,-0.223091,-0.004782,0.227162,-0.040384,0.010916,...,0.04616,0.063587,0.099208,0.053815,-0.013648,0.362027,-0.014751,0.040499,0.056695,0.12915
1,1.400668,0.554054,0.231635,0.187057,0.227535,0.867737,0.224022,0.174578,0.314799,1.532037,...,0.083954,0.126731,0.125738,0.170212,0.092777,0.286713,-0.047009,0.106894,0.127939,0.162797
2,1.333437,0.216279,0.156412,-0.015515,0.032154,0.217351,-0.07027,0.066912,0.07608,0.605373,...,0.025174,0.051069,0.065831,0.057821,0.024025,0.28448,-0.073693,0.01231,0.034155,-0.085134
3,0.281884,-0.088827,0.043531,0.084201,0.048487,0.318925,0.017737,0.023997,0.001312,0.104055,...,0.039579,0.023396,0.020582,-0.000513,-0.002436,0.01465,0.070732,0.004227,0.038865,-0.060646
4,1.241472,0.283944,-0.042118,0.31074,-0.060108,1.607038,-0.097398,0.165202,0.110061,0.433708,...,0.252153,0.166115,0.154353,0.134328,0.179793,0.156212,0.661747,0.190263,0.242495,0.336899


In [None]:
# Ratings enriched with each movie's title and genre.
df_names = df_ratings.merge(df_movies, on='movieId')
df_names.head()

Unnamed: 0,userId,movieId,rating,title,genre
0,1,1193,5.0,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5.0,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4.0,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4.0,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5.0,One Flew Over the Cuckoo's Nest (1975),Drama


In [None]:
# Pick one user and list every movie that user has rated.
user_id = 837

rated_by_user = df_names["userId"] == user_id
users_movies = df_names.loc[rated_by_user]

print(f"User ID : {user_id} has already rated {len(users_movies)} movies")

users_movies

User ID : 837 has already rated 69 movies


Unnamed: 0,userId,movieId,rating,title,genre
17018,837,2791,4.0,Airplane! (1980),Comedy
43891,837,1961,4.0,Rain Man (1988),Drama
52623,837,2028,5.0,Saving Private Ryan (1998),Action|Drama|War
60792,837,3068,4.0,Verdict The (1982),Drama
74604,837,1213,4.0,GoodFellas (1990),Crime|Drama
...,...,...,...,...,...
812638,837,3341,3.0,Born Yesterday (1950),Comedy
815704,837,3504,3.0,Network (1976),Comedy|Drama
886655,837,1177,2.0,Enchanted April (1991),Drama
891275,837,3307,4.0,City Lights (1931),Comedy|Drama|Romance


In [None]:
def recommend_movies(preds_df, userID, movies_df, original_ratings_df, num_recommendations=5):
    """Recommend the highest-predicted movies that a user has not rated yet.

    Parameters
    ----------
    preds_df : pd.DataFrame
        Predicted ratings, one row per user and one column per movieId.
        Rows are read by position: user ``userID`` is taken from row
        ``userID - 1``.
    userID : int
        1-based user id; also matched against ``original_ratings_df.userId``.
    movies_df : pd.DataFrame
        Movie catalogue with a ``movieId`` column.
    original_ratings_df : pd.DataFrame
        Observed ratings with ``userId``, ``movieId`` and ``rating`` columns.
    num_recommendations : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    user_full : pd.DataFrame
        The user's existing ratings joined with the movie catalogue, sorted
        by rating (highest first).
    predictions : pd.DataFrame
        Up to ``num_recommendations`` unrated movies ordered by predicted
        rating (highest first). The predicted score itself is not included.

    Raises
    ------
    IndexError
        If ``userID - 1`` is not a valid row position of ``preds_df``.
    """
    user_row_number = userID - 1
    # Reject ids <= 0 explicitly: a negative position would otherwise wrap
    # around and silently return another user's predictions.
    if not 0 <= user_row_number < len(preds_df):
        raise IndexError(
            "userID " + str(userID) + " is outside the range of preds_df rows (1.."
            + str(len(preds_df)) + ")"
        )

    # Name the score column and the key column explicitly so the merge below
    # does not depend on the row label or on the columns axis being named.
    user_predictions = (
        preds_df.iloc[user_row_number]
        .rename('Predictions')
        .rename_axis('movieId')
        .reset_index()
    )

    user_data = original_ratings_df[original_ratings_df['userId'] == userID]
    user_full = (
        user_data
        .merge(movies_df, how='left', on='movieId')
        .sort_values(['rating'], ascending=False)
    )

    # Candidates are the movies the user has no rating for.
    unseen_movies = movies_df[~movies_df['movieId'].isin(user_full['movieId'])]
    predictions = (
        unseen_movies
        .merge(user_predictions, how='left', on='movieId')
        .sort_values('Predictions', ascending=False)
        # Keep the top rows and drop the trailing 'Predictions' column.
        .iloc[:num_recommendations, :-1]
    )

    return user_full, predictions

In [None]:
# Build the recommendations for the user selected earlier before displaying
# them; without this call `predictions` is undefined on a fresh kernel.
already_rated, predictions = recommend_movies(preds_df, user_id, df_movies, df_ratings, num_recommendations=10)
predictions.head(10)

Unnamed: 0,movieId,title,genre
516,527,Schindler's List (1993),Drama|War
1848,1953,French Connection The (1971),Action|Crime|Drama|Thriller
596,608,Fargo (1996),Crime|Drama|Thriller
1235,1284,Big Sleep The (1946),Film-Noir|Mystery
1188,1230,Annie Hall (1977),Comedy|Romance
2085,2194,Untouchables The (1987),Action|Crime|Drama
897,922,Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),Film-Noir
1198,1242,Glory (1989),Action|Drama|War
581,593,Silence of the Lambs The (1991),Drama|Thriller
1849,1954,Rocky (1976),Action|Drama
