In [8]:
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
# from surprise.model_selection.split import train_test_split
from sklearn.model_selection import train_test_split
from surprise.model_selection import cross_validate, GridSearchCV
import pandas as pd
import numpy as np
import json
import os, io
from surprise import KNNBasic, KNNWithMeans
from surprise import SVDpp
from surprise import SVD
from surprise import accuracy
from io import StringIO
from surprise.model_selection import KFold

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/lthingratingpreprocessed/Lthing_rating_preprocessed.txt


# Load pre-processed Data

In [9]:
ratings_df = pd.read_csv("/kaggle/input/lthingratingpreprocessed/Lthing_rating_preprocessed.txt", sep=' ', names = ["user", "work", "stars"])
ratings_df

Unnamed: 0,user,work,stars
0,0,266338,5
1,1,189305,3
2,2,44118,4
3,2,307829,4
4,2,188812,5
...,...,...,...
1706974,83192,154240,5
1706975,83192,152760,4
1706976,83192,30617,3
1706977,83192,459414,5


In [10]:
#create training set
trainingSet, testSet = train_test_split(ratings_df, test_size=0.2, train_size=None, random_state=None, shuffle=True)

In [11]:
trainingSet.shape,testSet.shape,type(trainingSet)

((1365583, 3), (341396, 3), pandas.core.frame.DataFrame)

In [12]:
reader = Reader(rating_scale=(0.5,5))  #invoke reader instance of surprise library
data=Dataset.load_from_df(trainingSet,reader)

In [13]:
test_set = Dataset.load_from_df(testSet, reader=reader)
test_set = test_set.build_full_trainset().build_testset()
test_set

[(35103, 229972, 0.0),
 (35103, 94016, 0.0),
 (35103, 447944, 0.0),
 (20964, 450770, 2.0),
 (20964, 144118, 4.0),
 (20964, 304711, 4.0),
 (20964, 331029, 4.0),
 (20964, 2637, 4.0),
 (20964, 406562, 4.0),
 (20964, 27253, 4.0),
 (20964, 492007, 4.0),
 (20964, 48775, 3.0),
 (20964, 100228, 4.0),
 (20964, 495274, 4.0),
 (20964, 360148, 2.0),
 (20964, 220989, 4.0),
 (20964, 394276, 4.0),
 (20964, 389113, 3.0),
 (20964, 158089, 4.0),
 (20964, 231149, 3.0),
 (20964, 324233, 3.0),
 (20964, 413263, 4.0),
 (20964, 339135, 4.0),
 (20964, 464540, 3.0),
 (20964, 331032, 4.0),
 (20964, 410934, 4.0),
 (20964, 57580, 2.0),
 (20964, 258986, 4.0),
 (20964, 135884, 4.0),
 (20964, 145932, 3.0),
 (20964, 319650, 4.0),
 (20964, 203312, 4.0),
 (20964, 31910, 5.0),
 (20964, 52451, 4.0),
 (20964, 410028, 2.0),
 (20964, 330029, 4.0),
 (20964, 428327, 3.0),
 (20964, 10088, 3.0),
 (20964, 335712, 4.0),
 (20964, 207424, 4.0),
 (20964, 48687, 4.0),
 (20964, 488483, 3.0),
 (20964, 436456, 4.0),
 (20964, 320167, 4.0)

# K-fold cross validation

In [14]:
kf = KFold(n_splits=5)
algo = SVD()
for trainset, valset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(valset)
    accuracy.rmse(predictions, verbose=True)

RMSE: 1.1909
RMSE: 1.1882
RMSE: 1.1852
RMSE: 1.1897
RMSE: 1.1883


In [15]:
trainset = data.build_full_trainset()

In [16]:
# Build an algorithm, and train it. Follow methodology provided previously
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fb5c6b850d0>

In [17]:
predictions = algo.test(test_set)
predictions

[Prediction(uid=35103, iid=229972, r_ui=0.0, est=1.0840552265083274, details={'was_impossible': False}),
 Prediction(uid=35103, iid=94016, r_ui=0.0, est=1.1739267531562612, details={'was_impossible': False}),
 Prediction(uid=35103, iid=447944, r_ui=0.0, est=1.330109603455842, details={'was_impossible': False}),
 Prediction(uid=20964, iid=450770, r_ui=2.0, est=4.306320578168511, details={'was_impossible': False}),
 Prediction(uid=20964, iid=144118, r_ui=4.0, est=3.306808274137546, details={'was_impossible': False}),
 Prediction(uid=20964, iid=304711, r_ui=4.0, est=4.068692927889034, details={'was_impossible': False}),
 Prediction(uid=20964, iid=331029, r_ui=4.0, est=3.579186758257173, details={'was_impossible': False}),
 Prediction(uid=20964, iid=2637, r_ui=4.0, est=3.3557423333543452, details={'was_impossible': False}),
 Prediction(uid=20964, iid=406562, r_ui=4.0, est=3.579186758257173, details={'was_impossible': False}),
 Prediction(uid=20964, iid=27253, r_ui=4.0, est=3.64351503712640

In [18]:
accuracy.rmse(predictions, verbose=True)

RMSE: 1.1797


1.179662895099649

# GridSearchCV

In [19]:
param_grid = {'n_epochs': [10,20], 'lr_all': [0.001,0.01,0.1,1],
              'reg_all': [0.1,0.3,1],'n_factors': [20,30]}

In [20]:
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5) 

In [21]:
gs.fit(data)
gs

<surprise.model_selection.search.GridSearchCV at 0x7fb5b9286f50>

In [22]:
print(gs.best_score['rmse'])

1.166019086796885


In [23]:
print(gs.best_params['rmse'])

{'n_epochs': 20, 'lr_all': 0.01, 'reg_all': 0.1, 'n_factors': 20}


In [24]:
svd = gs.best_estimator['rmse']
svd.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fb554051f90>

In [25]:
predictions = svd.test(test_set)
predictions

[Prediction(uid=35103, iid=229972, r_ui=0.0, est=0.5, details={'was_impossible': False}),
 Prediction(uid=35103, iid=94016, r_ui=0.0, est=0.5005381658620163, details={'was_impossible': False}),
 Prediction(uid=35103, iid=447944, r_ui=0.0, est=0.5191661196300443, details={'was_impossible': False}),
 Prediction(uid=20964, iid=450770, r_ui=2.0, est=3.5550443598098544, details={'was_impossible': False}),
 Prediction(uid=20964, iid=144118, r_ui=4.0, est=3.431251193140905, details={'was_impossible': False}),
 Prediction(uid=20964, iid=304711, r_ui=4.0, est=3.9550696765887214, details={'was_impossible': False}),
 Prediction(uid=20964, iid=331029, r_ui=4.0, est=3.5409238607375695, details={'was_impossible': False}),
 Prediction(uid=20964, iid=2637, r_ui=4.0, est=3.261301020840903, details={'was_impossible': False}),
 Prediction(uid=20964, iid=406562, r_ui=4.0, est=3.5409238607375695, details={'was_impossible': False}),
 Prediction(uid=20964, iid=27253, r_ui=4.0, est=3.7612549082194966, details

In [26]:
accuracy.rmse(predictions, verbose=True)

RMSE: 1.1580


1.1580327750837944