In [85]:
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
# from surprise.model_selection.split import train_test_split
from sklearn.model_selection import train_test_split
from surprise.model_selection import cross_validate, GridSearchCV
import pandas as pd
import numpy as np
import json
import os, io
from surprise import KNNBasic, KNNWithMeans
from surprise import SVDpp
from surprise import SVD
from surprise import accuracy
from io import StringIO
from surprise.model_selection import KFold

# Walk the Kaggle input mount and print the full path of every file found,
# so the dataset locations used in later cells can be checked.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

/kaggle/input/librarythingratings/Lthing_rating _final.txt
/kaggle/input/lthingratingpreprocessed/Lthing_rating_preprocessed.txt


# Load pre-processed data

In [86]:
# Read the space-separated ratings file. Column names are supplied explicitly
# through `names`: one row per (user, work, stars) rating.
ratings_df = pd.read_csv(
    "/kaggle/input/lthingratingpreprocessed/Lthing_rating_preprocessed.txt",
    sep=' ',
    names=["user", "work", "stars"],
)

In [87]:
# Hold out 20% of the ratings as a final test set; the remaining 80% is the
# training set used for cross-validation and model fitting.
# A fixed random_state makes the shuffled split identical on every run, so the
# RMSE figures reported further down can be reproduced and compared
# (random_state=None produced a different split on each execution).
trainingSet, testSet = train_test_split(
    ratings_df,
    test_size=0.2,
    train_size=None,
    random_state=42,
    shuffle=True,
)

In [88]:
# Sanity check on the split: (rows, columns) of each part and the container type.
(
    trainingSet.shape,
    testSet.shape,
    type(trainingSet),
)

((1365583, 3), (341396, 3), pandas.core.frame.DataFrame)

In [89]:
# Surprise Reader declaring that ratings lie on a 0.5 to 5 scale.
reader = Reader(rating_scale=(0.5, 5))

# Wrap the training DataFrame (columns: user, work, stars) as a Surprise Dataset.
data = Dataset.load_from_df(trainingSet, reader=reader)

In [90]:
# Turn the held-out DataFrame into the testset structure that `algo.test`
# is given later: load it as a Surprise Dataset, treat all of it as one
# trainset, then export that trainset's ratings as a testset.
test_set = (
    Dataset.load_from_df(testSet, reader=reader)
    .build_full_trainset()
    .build_testset()
)

# K-fold cross-validation

In [91]:
# 3-fold cross-validation of an SVD model with default hyper-parameters on the
# training data, printing the validation RMSE of each fold.
# Both the fold assignment (KFold) and the SVD initialisation are seeded so
# that the per-fold RMSE values are reproducible between runs.
kf = KFold(n_splits=3, random_state=42, shuffle=True)
algo = SVD(random_state=42)
for trainset, valset in kf.split(data):
    algo.fit(trainset)               # refit the same estimator on this fold's training part
    predictions = algo.test(valset)  # predict the fold's held-out ratings
    accuracy.rmse(predictions, verbose=True)

RMSE: 1.1988
RMSE: 1.1979
RMSE: 1.1985


In [92]:
# Build one Surprise trainset from all of `data` (the training split), with no
# folds held out, for fitting the final models.
trainset = data.build_full_trainset()

In [93]:
# Build an SVD model with default hyper-parameters and train it on the full
# training set, following the same methodology as the cross-validation above.
# random_state pins the model's random initialisation so the test RMSE
# computed from this model is reproducible.
algo = SVD(random_state=42)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f480d65f110>

In [94]:
# Predict every rating in the held-out test set with the fitted default SVD model.
predictions = algo.test(test_set)

In [95]:
# Print the test-set RMSE; the call also returns the value, which the notebook echoes.
accuracy.rmse(predictions, verbose=True)

RMSE: 1.1787


1.178675327645319

# GridSearchCV

In [96]:
# Hyper-parameter grid for the SVD grid search: 2 x 4 x 3 x 2 = 48 combinations.
param_grid = dict(
    n_epochs=[10, 20],
    lr_all=[0.001, 0.01, 0.1, 1],
    reg_all=[0.1, 0.3, 1],
    n_factors=[20, 30],
)

In [97]:
# Grid search over `param_grid` for SVD, scored by RMSE with 5-fold cross-validation.
gs = GridSearchCV(
    algo_class=SVD,
    param_grid=param_grid,
    measures=['rmse'],
    cv=5,
)

In [98]:
# Run the grid search on the training data. Every combination in `param_grid`
# is cross-validated (48 combinations x 5 folds), so this cell can be slow.
gs.fit(data)

In [99]:
# Best cross-validated RMSE found by the grid search.
print(gs.best_score['rmse'])

1.166239337704951


In [100]:
# Parameter combination that achieved the best RMSE.
print(gs.best_params['rmse'])

{'n_epochs': 20, 'lr_all': 0.01, 'reg_all': 0.1, 'n_factors': 20}


In [101]:
# Take the SVD instance configured with the best-RMSE parameters and fit it on
# the whole training set (all of `data`, no folds held out).
svd = gs.best_estimator['rmse']
svd.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f474b664f90>

In [102]:
# Predict the held-out test set with the tuned SVD model.
predictions = svd.test(test_set)

In [103]:
# Print the test-set RMSE of the tuned model; the value is also returned and echoed.
accuracy.rmse(predictions, verbose=True)

RMSE: 1.1569


1.1568978558992622