In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from surprise import Dataset, Reader, accuracy, SVD


In [2]:
ratings = pd.read_csv("ratings.csv")
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
df = ratings.drop("timestamp", axis=1)
df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [4]:
trainset, testset = train_test_split(df, test_size=0.05)

print(testset.shape)

(5042, 3)


In [5]:
training, valset = train_test_split(trainset, test_size=0.05)

print(training.shape)
print(valset.shape)

(91004, 3)
(4790, 3)


In [6]:
reader = Reader(rating_scale = (1, 5))

train_data = Dataset.load_from_df(training, reader)
val_data = Dataset.load_from_df(valset, reader)
test_data = Dataset.load_from_df(testset, reader)

In [7]:
train_data = train_data.build_full_trainset()
val_data = val_data.build_full_trainset()
test_data = test_data.build_full_trainset()

In [8]:
val_data = val_data.build_testset()
test_data = test_data.build_testset()

In [9]:
list1 = []

# Training using 1 to 100 factors on S_train and checking RMSE values on S_dev to get optimal number of factors
for factors in range(1, 101):
    algo = SVD(n_factors=factors)
    algo.fit(train_data)
    predictions = algo.test(val_data)
    acc_values = accuracy.rmse(predictions, verbose=False)
    list1.append(acc_values)

print(list1)


[0.8509953486295496, 0.8506072789203213, 0.8500723633740075, 0.8493870851053701, 0.8519449643075017, 0.8509316930999661, 0.852188827937195, 0.8491452695350885, 0.8509333717196569, 0.8497492209131638, 0.8505060471911446, 0.8507211913660261, 0.8518193609213476, 0.8512452408141901, 0.8516055055487216, 0.8471517695020013, 0.8503870883887377, 0.8510951728658619, 0.8490784653954893, 0.8514123773591831, 0.8488986194966021, 0.8506065206058898, 0.8504492818427538, 0.8511755844308243, 0.8492459365088008, 0.8500559290683306, 0.8475293154244163, 0.8499938956918252, 0.8530531723058091, 0.8512834323428441, 0.8518429875724957, 0.8537688474865337, 0.8502932826313994, 0.8526866958183548, 0.8536591236723646, 0.8547230634595536, 0.8500299753613982, 0.8523184252244892, 0.8496040064004139, 0.848193556497039, 0.8492960806502443, 0.8495232362442259, 0.8473143780230503, 0.8472238941889388, 0.8463907348650097, 0.8509896178150427, 0.8529466373912395, 0.8506423332991642, 0.84981196761103, 0.846185281068882, 0.85

In [10]:
# Number of factors [optimal] (gives lowest RMSE on validation set S_dev)
opt_factors = list1.index(min(list1)) + 1
print(opt_factors)

50


In [11]:
# Now training on train_data using optimal number of factors
final_algo = SVD(n_factors=opt_factors)
final_algo.fit(train_data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1aa587278b0>

In [12]:
# Finally, predicting and calculating accuracy on test_data (S_test)
pred_test = final_algo.test(test_data)
accuracy.rmse(pred_test)

RMSE: 0.8764


0.8764299463198998