In [1]:
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
# from surprise.model_selection.split import train_test_split
from sklearn.model_selection import train_test_split
from surprise.model_selection import cross_validate, GridSearchCV
import pandas as pd
import numpy as np
import json
import os, io
from surprise import KNNBasic, KNNWithMeans
from surprise import SVDpp
from surprise import SVD
from surprise import accuracy
from io import StringIO
from surprise.model_selection import KFold

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/lthingratingpreprocessed/Lthing_rating_preprocessed.txt


# Load pre-processed Data

In [2]:
ratings_df = pd.read_csv("/kaggle/input/lthingratingpreprocessed/Lthing_rating_preprocessed.txt", sep=' ', names = ["user", "work", "stars"])
ratings_df

Unnamed: 0,user,work,stars
0,0,266338,5
1,1,189305,3
2,2,44118,4
3,2,307829,4
4,2,188812,5
...,...,...,...
1706974,83192,154240,5
1706975,83192,152760,4
1706976,83192,30617,3
1706977,83192,459414,5


In [3]:
#create training set
trainingSet, testSet = train_test_split(ratings_df, test_size=0.2, train_size=None, random_state=None, shuffle=True)

In [4]:
trainingSet.shape,testSet.shape,type(trainingSet)

((1365583, 3), (341396, 3), pandas.core.frame.DataFrame)

In [5]:
reader = Reader(rating_scale=(0.5,5))  #invoke reader instance of surprise library
data=Dataset.load_from_df(trainingSet,reader)

In [6]:
test_set = Dataset.load_from_df(testSet, reader=reader)
test_set = test_set.build_full_trainset().build_testset()
test_set

[(19755, 151272, 4.0),
 (19755, 134632, 4.0),
 (19755, 161939, 5.0),
 (6257, 283713, 0.0),
 (6257, 337293, 0.0),
 (6257, 401836, 0.0),
 (6257, 460592, 0.0),
 (6257, 74728, 0.0),
 (6257, 33625, 0.0),
 (6257, 498175, 0.0),
 (6257, 195546, 0.0),
 (6257, 46079, 0.0),
 (6257, 417429, 0.0),
 (6257, 178999, 0.0),
 (6257, 233425, 0.0),
 (6257, 174204, 0.0),
 (6257, 298220, 0.0),
 (71672, 116539, 0.0),
 (71672, 115598, 0.0),
 (71672, 119177, 0.0),
 (71672, 119264, 0.0),
 (71672, 120514, 0.0),
 (71672, 119061, 0.0),
 (71672, 119176, 0.0),
 (71672, 118325, 0.0),
 (71672, 118622, 0.0),
 (71672, 118696, 0.0),
 (71672, 179541, 0.0),
 (71672, 116438, 0.0),
 (71672, 116251, 0.0),
 (71672, 116388, 0.0),
 (71672, 455432, 0.0),
 (71672, 118489, 0.0),
 (71672, 38712, 0.0),
 (71672, 4361, 0.0),
 (71672, 120470, 0.0),
 (71672, 118486, 0.0),
 (71672, 120527, 0.0),
 (71672, 118494, 0.0),
 (71672, 117842, 0.0),
 (71672, 120141, 0.0),
 (71672, 120145, 0.0),
 (71672, 319491, 0.0),
 (71672, 127675, 0.0),
 (71672,

# K-fold cross validation

In [7]:
kf = KFold(n_splits=5)
algo = SVD()
for trainset, valset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(valset)
    accuracy.rmse(predictions, verbose=True)

RMSE: 1.1870
RMSE: 1.1919
RMSE: 1.1892
RMSE: 1.1889
RMSE: 1.1852


In [8]:
trainset = data.build_full_trainset()

In [9]:
# Build an algorithm, and train it. Follow methodology provided previously
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f12f206fd90>

In [10]:
predictions = algo.test(test_set)
predictions

[Prediction(uid=19755, iid=151272, r_ui=4.0, est=3.188335201811993, details={'was_impossible': False}),
 Prediction(uid=19755, iid=134632, r_ui=4.0, est=3.3220238498527266, details={'was_impossible': False}),
 Prediction(uid=19755, iid=161939, r_ui=5.0, est=3.188335201811993, details={'was_impossible': False}),
 Prediction(uid=6257, iid=283713, r_ui=0.0, est=0.5, details={'was_impossible': False}),
 Prediction(uid=6257, iid=337293, r_ui=0.0, est=0.5213075431402575, details={'was_impossible': False}),
 Prediction(uid=6257, iid=401836, r_ui=0.0, est=0.5, details={'was_impossible': False}),
 Prediction(uid=6257, iid=460592, r_ui=0.0, est=0.5520542512981843, details={'was_impossible': False}),
 Prediction(uid=6257, iid=74728, r_ui=0.0, est=0.5, details={'was_impossible': False}),
 Prediction(uid=6257, iid=33625, r_ui=0.0, est=0.5, details={'was_impossible': False}),
 Prediction(uid=6257, iid=498175, r_ui=0.0, est=0.5, details={'was_impossible': False}),
 Prediction(uid=6257, iid=195546, r_

In [11]:
accuracy.rmse(predictions, verbose=True)

RMSE: 1.1797


1.1796581077410584

# GridSearchCV

In [12]:
param_grid = {'n_epochs': [25], 'lr_all': [0.01],
              'reg_all': [0.001],'n_factors': [10]}

In [13]:
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5) 

In [14]:
gs.fit(data)
gs

<surprise.model_selection.search.GridSearchCV at 0x7f12d846ed10>

In [15]:
print(gs.best_score['rmse'])

1.2056855285570662


In [16]:
print(gs.best_params['rmse'])

{'n_epochs': 25, 'lr_all': 0.01, 'reg_all': 0.001, 'n_factors': 10}


In [17]:
svd = gs.best_estimator['rmse']
svd.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f1294f2ddd0>

In [18]:
predictions = svd.test(test_set)
predictions

[Prediction(uid=19755, iid=151272, r_ui=4.0, est=3.4695410480639493, details={'was_impossible': False}),
 Prediction(uid=19755, iid=134632, r_ui=4.0, est=3.427164045236081, details={'was_impossible': False}),
 Prediction(uid=19755, iid=161939, r_ui=5.0, est=3.4695410480639493, details={'was_impossible': False}),
 Prediction(uid=6257, iid=283713, r_ui=0.0, est=0.5, details={'was_impossible': False}),
 Prediction(uid=6257, iid=337293, r_ui=0.0, est=0.5, details={'was_impossible': False}),
 Prediction(uid=6257, iid=401836, r_ui=0.0, est=0.5, details={'was_impossible': False}),
 Prediction(uid=6257, iid=460592, r_ui=0.0, est=0.5, details={'was_impossible': False}),
 Prediction(uid=6257, iid=74728, r_ui=0.0, est=0.5, details={'was_impossible': False}),
 Prediction(uid=6257, iid=33625, r_ui=0.0, est=0.5, details={'was_impossible': False}),
 Prediction(uid=6257, iid=498175, r_ui=0.0, est=0.5, details={'was_impossible': False}),
 Prediction(uid=6257, iid=195546, r_ui=0.0, est=0.5, details={'wa

In [19]:
accuracy.rmse(predictions, verbose=True)

RMSE: 1.2045


1.2045117426429581