In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from surprise import Dataset, Reader, accuracy, SVD


In [2]:
ratings = pd.read_csv("ratings.csv")
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
df = ratings.drop("timestamp", axis=1)
df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [4]:
# using S_train = 80%, S_dev = 10%, S_test = 10%
trainset, testset = train_test_split(df, test_size=0.1)

print("Test set size: ", testset.shape)

training, valset = train_test_split(trainset, test_size=0.1)

print("Validation set size : ", valset.shape)
print("Training set size: ", training.shape)

Test set size:  (10084, 3)
Validation set size :  (9076, 3)
Training set size:  (81676, 3)


In [5]:
reader = Reader(rating_scale = (1, 5))

train_data = Dataset.load_from_df(training, reader)
val_data = Dataset.load_from_df(valset, reader)
test_data = Dataset.load_from_df(testset, reader)

In [6]:
train_data = train_data.build_full_trainset()
val_data = val_data.build_full_trainset()
test_data = test_data.build_full_trainset()

In [7]:
val_data = val_data.build_testset()
test_data = test_data.build_testset()

In [8]:
list1 = []

# Training using 1 to 100 factors on S_train and checking RMSE values on S_dev to get optimal number of factors
for factors in range(1, 101):
    algo = SVD(n_factors=factors)
    algo.fit(train_data)
    predictions = algo.test(val_data)
    acc_values = accuracy.rmse(predictions, verbose=False)
    list1.append(acc_values)

print(list1)


[0.8706554367270425, 0.8716178038929369, 0.8708931553408468, 0.8709141262473133, 0.8696360076073671, 0.8676784355216187, 0.8713919816523402, 0.8707364425337059, 0.8712997626411186, 0.8701156140972122, 0.8690151878048915, 0.8716909691827773, 0.8702718240545058, 0.8700016042030735, 0.8697509326466418, 0.8689609045085197, 0.8710125434319774, 0.8708918477776573, 0.8700962213140853, 0.8710667021415142, 0.8709117284479786, 0.8694498309837835, 0.8695850530316718, 0.872410963136513, 0.8694329386973177, 0.8717059368642105, 0.8703633709346543, 0.872368090379602, 0.8693225741436846, 0.8691888264272146, 0.8717946360021606, 0.8725380167944922, 0.8676141318161071, 0.8712532686347592, 0.8702047332366324, 0.8699919013570836, 0.870240564704669, 0.8703457632251452, 0.8713759485930013, 0.8698516506959284, 0.8707550877927872, 0.8701533027923684, 0.8715490727539076, 0.8720521050564329, 0.8708126973529037, 0.8691461467851574, 0.8718125718227394, 0.8695921187329756, 0.8721394714001137, 0.8719545864279279, 0.

In [9]:
# Number of factors [optimal] (gives lowest RMSE on validation set S_dev)
opt_factors = list1.index(min(list1)) + 1
print(opt_factors)

33


In [10]:
# Now training on train_data using optimal number of factors
final_algo = SVD(n_factors=opt_factors)
final_algo.fit(train_data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1e06649b5b0>

In [11]:
# Finally, predicting and calculating accuracy on test_data (S_test)
pred_test = final_algo.test(test_data)
accuracy.rmse(pred_test)

RMSE: 0.8679


0.8679028429127571