In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from surprise import Dataset, Reader, accuracy, SVD


In [2]:
ratings = pd.read_csv("ratings.csv")
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
df = ratings.drop("timestamp", axis=1)
df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [4]:
trainset, testset = train_test_split(df, test_size=0.05)

print(testset.shape)

(5042, 3)


In [5]:
training, valset = train_test_split(trainset, test_size=0.05)

print(training.shape)
print(valset.shape)

(91004, 3)
(4790, 3)


In [6]:
reader = Reader(rating_scale = (1, 5))

train_data = Dataset.load_from_df(training, reader)
val_data = Dataset.load_from_df(valset, reader)
test_data = Dataset.load_from_df(testset, reader)

In [7]:
train_data = train_data.build_full_trainset()
val_data = val_data.build_full_trainset()
test_data = test_data.build_full_trainset()

In [8]:
val_data = val_data.build_testset()
test_data = test_data.build_testset()

In [9]:
list1 = []

# Training using 1 to 100 factors on S_train and checking RMSE values on S_dev to get optimal number of factors
for factors in range(1, 101):
    algo = SVD(n_factors=factors)
    algo.fit(train_data)
    predictions = algo.test(val_data)
    acc_values = accuracy.rmse(predictions, verbose=False)
    list1.append(acc_values)

print(list1)


[0.8699021616491726, 0.8708493844173502, 0.869115137195342, 0.8706247906044619, 0.8683682327289861, 0.8663650521154901, 0.8690330142351681, 0.8689856245667221, 0.8711799767444491, 0.8695719639954813, 0.86956841271716, 0.868773531502751, 0.8667602487016599, 0.8661527396316129, 0.8672801603068241, 0.8688893008462167, 0.8683793975966896, 0.8680351340065955, 0.8663384389479762, 0.8662693451322793, 0.8691427039478079, 0.868425329944009, 0.8710910373962533, 0.8654180250257706, 0.8660822259738816, 0.8713345529772276, 0.8694330205195522, 0.8647659938856499, 0.8710616546759765, 0.8682021839786248, 0.8706601014637462, 0.8697154529186663, 0.8647465465393213, 0.8677392991321122, 0.8646836454330612, 0.8665666540707497, 0.873624305831948, 0.8672158299459064, 0.8675485394290581, 0.8695312583983458, 0.8685768674647818, 0.8688730815719012, 0.8677126936414624, 0.8676769904141317, 0.870824777953473, 0.8766976431685655, 0.8699764625045032, 0.8654916894243779, 0.8696638792180398, 0.8696480643699769, 0.8698

In [10]:
# Number of factors [optimal] (gives lowest RMSE on validation set S_dev)
opt_factors = list1.index(min(list1)) + 1
print(opt_factors)

35


In [11]:
# Now training on train_data using optimal number of factors
algo_f = SVD(n_factors=opt_factors)
algo.fit(train_data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x202bcc66a30>

In [12]:
# Finally, predicting and calculating accuracy on test_data (S_test)
pred_test = algo.test(test_data)
accuracy.rmse(pred_test)

RMSE: 0.8706


0.8706442809607582