In [2]:
import pandas as pd
from surprise import Reader, Dataset

In [4]:
# Read the ratings CSV from the ml-latest-small folder and print a summary of its
# columns, dtypes and non-null counts.
df = pd.read_csv(filepath_or_buffer='./ml-latest-small/ratings.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [5]:
# Preview the first three rows of the raw ratings table.
df.head(n=3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


In [9]:
# Keep only the user, movie and rating columns; the timestamp is not used by the
# steps that follow. `df` itself is left untouched.
new_df = df.drop('timestamp', axis='columns')
new_df.head(n=3)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0


In [11]:
# Build the Surprise dataset from the (userId, movieId, rating) frame.
# Reader() defaults to rating_scale=(1, 5), but this data uses half-star ratings
# (MovieLens ratings start at 0.5), and Surprise clips predictions to the declared
# scale. Take the bounds from the data so that no real rating is out of range.
rating_scale = (new_df['rating'].min(), new_df['rating'].max())
reader = Reader(rating_scale=rating_scale)
movies = Dataset.load_from_df(new_df, reader)
movies

<surprise.dataset.DatasetAutoFolds at 0x1190c0e48>

In [12]:
# Grab the dataset's list of raw rating tuples. This binds the very same list
# object (no copy is made), so any in-place change to `raw_ratings` -- such as the
# shuffle performed later -- is also visible through `movies.raw_ratings`.
raw_ratings = movies.raw_ratings

In [14]:
import random

In [15]:
# Shuffle the ratings in place before splitting them. A dedicated, seeded generator
# is used so the A/B split below is identical on every Restart & Run All, without
# touching the global `random` state.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(raw_ratings)

In [17]:
# Peek at the first five shuffled rating tuples.
raw_ratings[0:5]

[(584, 377, 5.0, None),
 (57, 10, 3.0, None),
 (480, 2291, 4.0, None),
 (105, 3462, 4.5, None),
 (182, 1237, 5.0, None)]

In [42]:
# Split the shuffled ratings 80/20: set A (first 80%) is used for tuning and
# training, set B (remaining 20%) is held out for the final evaluation.
threshold = int(.8 * len(raw_ratings))
A_raw_ratings, B_raw_ratings = raw_ratings[:threshold], raw_ratings[threshold:]

In [32]:
# Point the dataset at set A only, so that every later step that cross-validates or
# trains on `movies` never sees the held-out set B.
# NOTE: this overwrites the dataset's ratings; re-running the earlier extraction
# cell after this one would read back only set A, not the full data.
movies.raw_ratings = A_raw_ratings #The data is now the A set

In [33]:
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import SVD
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise.model_selection import GridSearchCV
import numpy as np

In [34]:
# Hyper-parameter search for SVD with GridSearchCV.
# The grid below has 3 * 2 * 2 * 3 = 36 combinations; each one is cross-validated
# on `movies` using all available cores (n_jobs=-1), with joblib progress logging.
param_grid = {
    'n_factors': [5, 20, 100],
    'n_epochs': [5, 10],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.02, 0.05, 0.5],
}
gs_model = GridSearchCV(SVD, param_grid, n_jobs=-1, joblib_verbose=3)
gs_model.fit(movies)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   17.7s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  3.3min finished


In [30]:
# Report the best cross-validated score per measure, then the parameter set that
# achieved it (one dict per line).
print(gs_model.best_score, gs_model.best_params, sep='\n')

{'rmse': 0.8817626184585396, 'mae': 0.6800445780532369}
{'rmse': {'n_factors': 5, 'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.02}, 'mae': {'n_factors': 5, 'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.02}}


In [35]:
# Take the estimator that the grid search selected for the RMSE measure. It is
# fitted explicitly on the full A set in the following step.
our_algo = gs_model.best_estimator['rmse']

In [36]:
# Retrain on the whole set A: `movies` currently holds only the A ratings, so the
# "full" trainset built here is all of A, with nothing held out.
trainset = movies.build_full_trainset()
our_algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x11dcc5b70>

In [39]:
from surprise import accuracy

In [44]:
# Biased accuracy: the model is scored on set A, the same ratings it was just
# trained on, so this RMSE is optimistic. `accuracy.rmse` prints the value after
# the label and also returns it.
predictions = our_algo.test(trainset.build_testset())
print('Biased accuracy on A,', end=' ')
accuracy.rmse(predictions)

Biased accuract on A  RMSE: 0.8416


0.8415655196781596

In [45]:
# Unbiased accuracy: score the model on set B, the 20% of ratings that were held
# out before tuning and training.
testset = movies.construct_testset(B_raw_ratings)  # testset is now the set B
predictions = our_algo.test(testset, verbose=False)
print('Unbiased accuracy on B,', end=' ')
accuracy.rmse(predictions, verbose=True)

Unbiased accuracy on B, RMSE: 0.8834


0.8834117006142889