In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from surprise import Dataset, Reader, accuracy, SVD


In [2]:
ratings = pd.read_csv("ratings.csv")
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
df = ratings.drop("timestamp", axis=1)
df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [4]:
# using S_train = 80%, S_dev = 10%, S_test = 10%
trainset, testset = train_test_split(df, test_size=0.1)

print("Test set size: ", testset.shape)

training, valset = train_test_split(trainset, test_size=0.1)

print("Validation set size : ", valset.shape)
print("Training set size: ", training.shape)

Test set size:  (10084, 3)
Validation set size :  (9076, 3)
Training set size:  (81676, 3)


In [5]:
reader = Reader(rating_scale = (1, 5))

train_data = Dataset.load_from_df(training, reader)
val_data = Dataset.load_from_df(valset, reader)
test_data = Dataset.load_from_df(testset, reader)

In [6]:
train_data = train_data.build_full_trainset()
val_data = val_data.build_full_trainset()
test_data = test_data.build_full_trainset()

In [7]:
val_data = val_data.build_testset()
test_data = test_data.build_testset()

In [8]:
list1 = []

# Training using 1 to 100 factors on S_train and checking RMSE values on S_dev to get optimal number of factors
for factors in range(1, 101):
    algo = SVD(n_factors=factors)
    algo.fit(train_data)
    predictions = algo.test(val_data)
    acc_values = accuracy.rmse(predictions, verbose=False)
    list1.append(acc_values)

print(list1)


[0.8645664711163066, 0.8654550546717052, 0.8650307165384136, 0.8651429873230916, 0.8641789365177175, 0.8654334700046908, 0.8637940613768109, 0.8651494738238669, 0.8659219683117503, 0.8657768454551286, 0.8650839613540467, 0.8654072925100432, 0.8643779468122967, 0.8644044183196675, 0.864929860591056, 0.8651578701141828, 0.8663287803407169, 0.8654048289722133, 0.8662332259641885, 0.865392835720467, 0.8646101191314839, 0.8632962501314123, 0.864427533830159, 0.8630809604186438, 0.8670371805094856, 0.8651802661667876, 0.8639873051467005, 0.8646589085907657, 0.8652654337325382, 0.8657851085742451, 0.8686633383615423, 0.8656145783477994, 0.8656678650262946, 0.8643698854963461, 0.8668253223386989, 0.8654598846603945, 0.866380311613333, 0.8689993654200789, 0.8664398661186351, 0.867764746476761, 0.865916880506605, 0.8651134475738436, 0.8667584953117654, 0.864249405549858, 0.8682734359420771, 0.867013307815377, 0.867231632974084, 0.8648799443432121, 0.8669701923219975, 0.8690859083369019, 0.869706

In [9]:
# Number of factors [optimal] (gives lowest RMSE on validation set S_dev)
opt_factors = list1.index(min(list1)) + 1
print(opt_factors)

24


In [10]:
# Now training on train_data using optimal number of factors
final_algo = SVD(n_factors=opt_factors)
final_algo.fit(train_data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x242a7adf7f0>

In [11]:
# Finally, predicting and calculating accuracy on test_data (S_test)
pred_test = final_algo.test(test_data)
accuracy.rmse(pred_test)

RMSE: 0.8723


0.87231705753882