In [1]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3163000 sha256=bff45b8839c25ff3f825af3005aeb56cf33bd03fa0b4336b303e56bc84ca6a26
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


In [2]:
import pandas as pd
import numpy as np
import surprise

In [3]:
df = pd.read_excel("[final] April 2015 to Nov 30 2019 - Transformed Jester Data - .xlsx",
                   header=None)
ratings = pd.melt(df, id_vars=0)
ratings.columns=["uid", "iid", "rating"]
ratings = ratings[(ratings['rating']<=10) & (ratings['rating']>=-10)]
ratings.head()

Unnamed: 0,uid,iid,rating
46200,112,7,-4.45
46233,75,7,-10.0
46282,73,7,-5.76
46286,67,7,9.04
46304,121,7,0.0


In [4]:
lowest_rating = ratings['rating'].min()
highest_rating = ratings['rating'].max()
print("Ratings range between {0} and {1}".format(lowest_rating,highest_rating))
reader = surprise.Reader(rating_scale = (lowest_rating,highest_rating))

Ratings range between -10.0 and 10.0


Converting the data into **surprise** format:

In [5]:
data = surprise.Dataset.load_from_df(ratings,reader)
type(data)

**SVD** Tuning with different hyper-parameters

In [6]:
from surprise.model_selection import GridSearchCV
from surprise.model_selection.split import KFold

param_grid = {'n_epochs': np.arange(5,50,10),
              'lr_all':np.linspace(0.001,1,5),
              'reg_all': np.linspace(0.01,0.8,5),
              'n_factors':[50,100]}
kfold = KFold(n_splits=5, random_state=24, shuffle=True)
gs = GridSearchCV(surprise.SVD, param_grid, measures=['rmse', 'mae'], n_jobs=-1,
                  cv=kfold, joblib_verbose=3)

Running the Grid Search CV

In [7]:
gs.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:   13.2s
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:   53.4s
[Parallel(n_jobs=-1)]: Done 284 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 508 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed: 10.2min
[Parallel(n_jobs=-1)]: Done 1148 tasks      | elapsed: 18.9min
[Parallel(n_jobs=-1)]: Done 1250 out of 1250 | elapsed: 21.7min finished


**Best Score**

In [8]:
print(gs.best_score['rmse'])

4.7839474917231835


**Best Param**

In [9]:
print(gs.best_params['rmse'])

{'n_epochs': 25, 'lr_all': 0.001, 'reg_all': 0.20750000000000002, 'n_factors': 50}
