In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from surprise import Dataset, Reader, accuracy, SVD


In [2]:
ratings = pd.read_csv("ratings.csv")
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
df = ratings.drop("timestamp", axis=1)
df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [4]:
# using S_train = 80%, S_dev = 10%, S_test = 10%
trainset, test_val = train_test_split(df, train_size=0.8)

print("Training set size: ", trainset.shape)

valset, testset = train_test_split(test_val, test_size=0.5)

print("Validation set size : ", valset.shape)
print("Test set size: ", testset.shape)

Training set size:  (80668, 3)
Validation set size :  (10084, 3)
Test set size:  (10084, 3)


In [5]:
reader = Reader(rating_scale = (1, 5))

train_data = Dataset.load_from_df(trainset, reader)
val_data = Dataset.load_from_df(valset, reader)
test_data = Dataset.load_from_df(testset, reader)

In [6]:
train_data = train_data.build_full_trainset()
val_data = val_data.build_full_trainset()
test_data = test_data.build_full_trainset()

In [7]:
val_data = val_data.build_testset()
test_data = test_data.build_testset()

In [8]:
list1 = []

# Training using 1 to 100 factors on S_train and checking RMSE values on S_dev to get optimal number of factors
for factors in range(1, 101):
    algo = SVD(n_factors=factors)
    algo.fit(train_data)
    predictions = algo.test(val_data)
    acc_values = accuracy.rmse(predictions, verbose=False)
    list1.append(acc_values)

print(list1)


[0.884999060154812, 0.8846475831326711, 0.8850642512087848, 0.8846784529321446, 0.883437222568176, 0.8851263291236368, 0.8854301737139209, 0.8862186481114975, 0.8860128604075619, 0.885696191333881, 0.8851412512017515, 0.8838265223587652, 0.884573998216873, 0.8841849070054747, 0.8852502786759966, 0.8865738782505199, 0.8842459745890754, 0.8843420893457867, 0.8844640136581743, 0.88307370298517, 0.8836111686281845, 0.8849502320909314, 0.884985876880839, 0.8851230884198125, 0.8851849162993487, 0.8874503794701862, 0.885252967406262, 0.8853175674404303, 0.8857308948708853, 0.8825525547351636, 0.8860544193446729, 0.8851943103341736, 0.8850200841697218, 0.8853012200828946, 0.8858600501986055, 0.8853615542570892, 0.884815969405308, 0.8869396983929385, 0.884845141360928, 0.8845089509914983, 0.8833322199367543, 0.8858127815449548, 0.8854083540031372, 0.8859326513599786, 0.8833630271504498, 0.8817823945233061, 0.8862000252904362, 0.8838332104200877, 0.8859577111197314, 0.8850769596294173, 0.8837848

In [9]:
# Number of factors [optimal] (gives lowest RMSE on validation set S_dev)
opt_factors = list1.index(min(list1)) + 1
print(opt_factors)

46


In [10]:
# Now training on train_data using optimal number of factors
final_algo = SVD(n_factors=opt_factors)
final_algo.fit(train_data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x20c9c65f550>

In [11]:
# Finally, predicting and calculating accuracy on test_data (S_test)
pred_test = final_algo.test(test_data)
accuracy.rmse(pred_test)

RMSE: 0.8570


0.8569593657080649