In [102]:
import pandas as pd
import numpy as np
from surprise import SVD, KNNBasic, NMF, SVDpp
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate, GridSearchCV

from surprise.model_selection import train_test_split
from surprise import accuracy

from surprise.prediction_algorithms import SVD, NormalPredictor, BaselineOnly

### Reading data into python and exploring data info

In [29]:
ratings_df = pd.read_csv('../data/ratings.csv')
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [30]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [31]:
movies_df = pd.read_csv('../data/movies.csv')
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [32]:
tags_df = pd.read_csv('../data/tags.csv')
tags_df

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
...,...,...,...,...
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984
3681,610,3265,heroic bloodshed,1493843978


In [33]:
links_df = pd.read_csv('../data/links.csv')
links_df

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
...,...,...,...
9737,193581,5476944,432131.0
9738,193583,5914996,445030.0
9739,193585,6397426,479308.0
9740,193587,8391976,483455.0


In [34]:
#Instantiate algorithm from Surprise
algo = SVD()

In [35]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [36]:
len(movies_df)

9742

In [37]:
len(ratings_df)

100836

In [38]:
movies_df.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

In [39]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [40]:
ratings_df.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

### Merging dataframes

#### We need to merge the ratings and movies dataframes so we can have the combined data to use for the recommendation system. We then created a .csv version of the dataframe so we can reload it in the correct format for Surprise.

In [41]:
## merge ratings and movies df's
merged_df = pd.merge(ratings_df, movies_df, on='movieId', how='right')

In [42]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100854 entries, 0 to 100853
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  float64
 1   movieId    100854 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  float64
 4   title      100854 non-null  object 
 5   genres     100854 non-null  object 
dtypes: float64(3), int64(1), object(2)
memory usage: 5.4+ MB


In [43]:
merged_df.isna().sum()

userId       18
movieId       0
rating       18
timestamp    18
title         0
genres        0
dtype: int64

In [44]:
merged_df = merged_df.dropna()

In [45]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100853
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  float64
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  float64
 4   title      100836 non-null  object 
 5   genres     100836 non-null  object 
dtypes: float64(3), int64(1), object(2)
memory usage: 5.4+ MB


In [46]:
merged_df['rating'].value_counts()

4.0    26818
3.0    20047
5.0    13211
3.5    13136
4.5     8551
2.0     7551
2.5     5550
1.0     2811
1.5     1791
0.5     1370
Name: rating, dtype: int64

In [20]:
merged_df.to_csv('../data/ratings_and_movies.csv', index=False)

In [47]:
user_item_rating = merged_df[['userId', 'movieId', 'rating']]

In [22]:
user_item_rating.to_csv('../data/user_item_rating.csv', index=False)

In [48]:
merged_df

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1.0,1,4.0,9.649827e+08,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5.0,1,4.0,8.474350e+08,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7.0,1,4.5,1.106636e+09,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15.0,1,2.5,1.510578e+09,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17.0,1,4.5,1.305696e+09,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...,...
100849,184.0,193581,4.0,1.537109e+09,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
100850,184.0,193583,3.5,1.537110e+09,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
100851,184.0,193585,3.5,1.537110e+09,Flint (2017),Drama
100852,184.0,193587,3.5,1.537110e+09,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


### Starting with Surprise

In [62]:
reader = Reader(line_format='user item rating', sep=',', skip_lines=1, rating_scale=(1, 5))

ratings_surprise = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

In [63]:
trainset, testset = train_test_split(ratings_surprise, test_size=0.2, random_state=42)

In [64]:
print('Number of users: ', trainset.n_users, '\n')
print('Number of items: ', trainset.n_items, '\n')

Number of users:  610 

Number of items:  8928 



#### Implementing the basic SVD algorithm

In [52]:
svd = SVD()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1adcc337100>

In [53]:
preds = svd.test(testset)

In [54]:
rmse = accuracy.rmse(preds)
mae = accuracy.mae(preds)

RMSE: 0.8813
MAE:  0.6771


In [None]:
params = {'n_factors': [20, 25, 50, 100],
         'reg_all': [0.02, 0.03, 0.04, 0.05, 0.06, 0.07], 'n_epochs': [20, 25, 30, 35]}
g_s_svd = GridSearchCV(SVD,param_grid=params,n_jobs=-1, cv=3)
g_s_svd.fit(ratings_surprise)

In [None]:
g_s_svd.best_params['rmse']

In [None]:
# Cross validate
results = cross_validate(svd, ratings_surprise, measures=['RMSE'], cv=3, n_jobs = -1, verbose=True)

In [None]:
svd.fit(trainset)
predictions = svd.test(testset)
svd_baseline = accuracy.rmse(predictions)

In [None]:
# Re-doing the SVD model with the best params from the GridSearchCV
svd_bestparams = SVD(n_factors=100, n_epochs=30, biased=True, reg_all=0.07, random_state=42)

svd_bestparams.fit(trainset)
predictions = svd_bestparams.test(testset)
svd_gs1 = accuracy.rmse(predictions)

The RMSE score is very slightly better than the score of the baseline model with nothing tuned.

In [None]:
params = {'n_factors': [50, 100, 125, 150],
         'reg_all': [0.06, 0.07, 0.08], 'n_epochs': [30], 'lr_all': [0.02, 0.05]}
g_s_svd2 = GridSearchCV(SVD,param_grid=params,n_jobs=-1, cv=3)
g_s_svd2.fit(ratings_surprise)

In [None]:
g_s_svd2.best_params['rmse']

In [None]:
params = {'n_factors': [150, 175, 200],
         'reg_all': [0.07, 0.08, 0.09, 0.1], 'n_epochs': [30], 'lr_all': [0.01, 0.02]}
g_s_svd3 = GridSearchCV(SVD,param_grid=params,n_jobs=-1, cv=3)
g_s_svd3.fit(ratings_surprise)

In [None]:
g_s_svd3.best_params['rmse']

In [None]:
params = {'n_factors': [200, 300, 500],
         'reg_all': [0.1, 0.3, 0.5], 'n_epochs': [30], 'lr_all': [0.02]}
g_s_svd4 = GridSearchCV(SVD,param_grid=params,n_jobs=-1, cv=3)
g_s_svd4.fit(ratings_surprise)

In [None]:
g_s_svd4.best_params['rmse']

In [None]:
params = {'n_factors': [1, 500, 1000, 1500],
         'reg_all': [0.1], 'n_epochs': [30], 'lr_all': [0.02]}
g_s_svd5 = GridSearchCV(SVD,param_grid=params,n_jobs=-1, cv=3)
g_s_svd5.fit(ratings_surprise)

In [None]:
g_s_svd5.best_params['rmse']

In [None]:
svd_bestparams2 = SVD(n_factors=500, n_epochs=30, biased=True, reg_all=0.1, lr_all=0.02, random_state=42)

svd_bestparams2.fit(trainset)
predictions = svd_bestparams2.test(testset)
svd_gs2 = accuracy.rmse(predictions)

### Other type of models

In [68]:
#KNN
sim = {'user_based': True, 'name': 'pearson'}
KNN = KNNBasic(sim_options=sim)

In [69]:
trainset, testset = train_test_split(ratings_surprise, test_size=0.2, random_state=42)

In [70]:
KNN.fit(trainset)

Computing the msd similarity matrix...


AttributeError: module 'numpy' has no attribute 'int'.
`np.int` was a deprecated alias for the builtin `int`. To avoid this error in existing code, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations

In [72]:
nmf = NMF()

In [73]:
nmf.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x1adcc5707c0>

In [74]:
nmf_preds = nmf.test(testset)

In [77]:
rmse_nmf = accuracy.rmse(nmf_preds)
print("RMSE (NMF):", rmse_nmf)

RMSE: 0.9267
RMSE (NMF): 0.9266807876671836


In [80]:
NMF_bestparamsSVD = NMF(n_factors=500, n_epochs=30, biased=True, random_state=42)

In [81]:
NMF_bestparamsSVD.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x1adcc187f10>

In [82]:
nmf_preds2 = NMF_bestparamsSVD.test(testset)

In [83]:
rmse_nmf2 = accuracy.rmse(nmf_preds2)
print("RMSE (NMF):", rmse_nmf2)

RMSE: 1.8088
RMSE (NMF): 1.8087746792544073


In [86]:
params = {'n_factors': [10, 100, 500],
         'n_epochs': [75, 100, 135]}
g_s_nmf1 = GridSearchCV(NMF,param_grid=params,n_jobs=-1, cv=3)
g_s_nmf1.fit(ratings_surprise)

In [87]:
g_s_nmf1.best_params['rmse']

{'n_factors': 500, 'n_epochs': 100}

In [88]:
NMF2 = NMF(n_factors=500, n_epochs=100, biased=True, random_state=42)

In [90]:
NMF2.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x1add8c8ddf0>

In [91]:
nmf_preds3 = NMF2.test(testset)

In [92]:
rmse_nmf3 = accuracy.rmse(nmf_preds3)
print("RMSE (NMF):", rmse_nmf3)

RMSE: 1.9353
RMSE (NMF): 1.9353167735262842


### SVD++

In [94]:
svdpp = SVDpp()

In [95]:
svdpp.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x1add8c8d340>

In [96]:
preds_svdpp = svdpp.test(testset)

In [97]:
rmse_svdpp = accuracy.rmse(preds_svdpp)
print("RMSE (SVD++):", rmse_svdpp)

RMSE: 0.8668
RMSE (SVD++): 0.8668344774542038


In [100]:
svdpp_bestparams = SVDpp(n_factors=25, n_epochs=30, reg_all=0.1, lr_all=0.02, random_state=42)

svdpp_bestparams.fit(trainset)
predictions = svdpp_bestparams.test(testset)
svdpp_gs = accuracy.rmse(predictions)

KeyboardInterrupt: 