In [1]:
# Data manipulation
import numpy as np
import pandas as pd
pd.options.display.max_rows = 100

# Recommender systems
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
import surprise as sp

# Other
import os
import random
import sys

# Reload imported code
%load_ext autoreload
%autoreload 2

# Custom modules
# Resolve the parent of the current working directory to an absolute path
# (presumably the project root) so the local `src` package is importable.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:  # skip if already present, e.g. when the cell is re-run
    sys.path.append(module_path)
    
# NOTE(review): the star import hides which names `src` provides; BaselineModel
# and SVD used further down presumably come from here -- consider importing
# them explicitly.
from src import *

## Load data

**The MovieLens rating data can be downloaded from https://grouplens.org/datasets/movielens/ (the cells below read the `ml-1m` ratings file).**

In [2]:
# Field names for ratings.dat; only the first three fields are loaded below.
cols = ['u_id', 'i_id', 'rating', 'timestamp']

# '::' is a multi-character separator, which pandas' C parser does not support.
# Request the Python engine explicitly rather than relying on the implicit
# fallback (which emits a ParserWarning on every run).
movie_data = pd.read_csv(
    '../data/ml-1m/ratings.dat',
    names = cols,
    sep = '::',
    usecols=[0, 1, 2],
    engine='python',
)

# Prepare data: random 80/20 train/test split.
# A fixed random_state keeps the split -- and therefore every RMSE reported
# below -- reproducible across kernel restarts.
train = movie_data.sample(frac=0.8, random_state=2)
test = movie_data.drop(train.index)

## Simple Model with global mean

In [3]:
# Simplest possible model: predict the training-set mean rating for every test row.
global_mean = train['rating'].mean()
pred = [global_mean] * test.shape[0]

# Take the square root of the MSE explicitly: the `squared` keyword of
# mean_squared_error is deprecated (and later removed) in scikit-learn.
rmse = np.sqrt(mean_squared_error(test['rating'], pred))

# '.4f' = four decimals, matching the RMSE reports of the other models.
print(f'\nTest RMSE: {rmse:.4f}')


Test RMSE: 1.117268


## Baseline Model with biases

In [6]:
%%time

baseline_model = BaselineModel(n_epochs = 20, reg = 0.005)
baseline_model.fit(train)

pred = baseline_model.predict(test)
rmse = mean_squared_error(test['rating'], pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 0.12743109045394502
Epoch  2 / 20  -  train_rmse: 0.2279631897517531
Epoch  3 / 20  -  train_rmse: 0.33766202245142213
Epoch  4 / 20  -  train_rmse: 0.42638235786208967
Epoch  5 / 20  -  train_rmse: 0.4916609904263343
Epoch  6 / 20  -  train_rmse: 0.5381164059789344
Epoch  7 / 20  -  train_rmse: 0.5708272955763287
Epoch  8 / 20  -  train_rmse: 0.5938506550749628
Epoch  9 / 20  -  train_rmse: 0.6101406370252674
Epoch  10 / 20  -  train_rmse: 0.621770036561658
Epoch  11 / 20  -  train_rmse: 0.630169029453688
Epoch  12 / 20  -  train_rmse: 0.6363173234609523
Epoch  13 / 20  -  train_rmse: 0.6408849295459511
Epoch  14 / 20  -  train_rmse: 0.6443309987010555
Epoch  15 / 20  -  train_rmse: 0.6469716980846663
Epoch  16 / 20  -  train_rmse: 0.6490262719430819
Epoch  17 / 20  -  train_rmse: 0.6506481109593726
Epoch  18 / 20  -  train_rmse: 0.651945663000393
Epoch  19 / 20  -  train_rmse: 0.6529965131523516
Epoch  20 / 20  -  train_rmse: 0.6538568878300737

Test RMS

## Custom SVD

In [6]:
%%time 
random.seed(2)
np.random.seed(2)
svd = SVD(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.001, reg = 0.005)
svd.fit(train)

pred = svd.predict(test)
rmse = mean_squared_error(test['rating'], pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 0.4200119423824712
Epoch  2 / 20  -  train_rmse: 0.5134161673945182
Epoch  3 / 20  -  train_rmse: 0.5490851682852552
Epoch  4 / 20  -  train_rmse: 0.5547686906688019
Epoch  5 / 20  -  train_rmse: 0.5466300037437639
Epoch  6 / 20  -  train_rmse: 0.5328242936133687
Epoch  7 / 20  -  train_rmse: 0.5171638382987964
Epoch  8 / 20  -  train_rmse: 0.5013124692639045
Epoch  9 / 20  -  train_rmse: 0.485925623190334
Epoch  10 / 20  -  train_rmse: 0.47120743485351135
Epoch  11 / 20  -  train_rmse: 0.45717300346984496
Epoch  12 / 20  -  train_rmse: 0.44376765423753367
Epoch  13 / 20  -  train_rmse: 0.43091867509787507
Epoch  14 / 20  -  train_rmse: 0.4185559896239487
Epoch  15 / 20  -  train_rmse: 0.4066190389080836
Epoch  16 / 20  -  train_rmse: 0.3950578946796036
Epoch  17 / 20  -  train_rmse: 0.38383222864034194
Epoch  18 / 20  -  train_rmse: 0.37290970429994413
Epoch  19 / 20  -  train_rmse: 0.3622644141818156
Epoch  20 / 20  -  train_rmse: 0.35187556748446197

Te

## Surprise

In [90]:
# Fetch Surprise's built-in MovieLens-100k dataset (downloaded if needed).
# NOTE(review): this is ml-100k, whereas the cells above train on ml-1m.
builtin_name = 'ml-100k'
data = sp.Dataset.load_builtin(builtin_name)

In [91]:
%%time
# We'll use the famous SVD algorithm.
algo = sp.SVD()

# Run 5-fold cross-validation and print results
sp.model_selection.cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9265  0.9351  0.9324  0.9415  0.9408  0.9353  0.0056  
MAE (testset)     0.7294  0.7344  0.7358  0.7421  0.7399  0.7363  0.0044  
Fit time          4.16    4.19    4.22    4.25    4.20    4.20    0.03    
Test time         0.13    0.45    0.11    0.11    0.11    0.18    0.13    
Wall time: 22.6 s


{'test_rmse': array([0.92651162, 0.93507166, 0.93241232, 0.94154717, 0.94084358]),
 'test_mae': array([0.72938989, 0.73444756, 0.73576219, 0.74210202, 0.73993852]),
 'fit_time': (4.163815021514893,
  4.187639236450195,
  4.216679334640503,
  4.247923374176025,
  4.1983418464660645),
 'test_time': (0.12666082382202148,
  0.45206427574157715,
  0.11275815963745117,
  0.1120297908782959,
  0.11228561401367188)}

In [93]:
%%time

surprise_svd = sp.SVD(lr_all=0.001, reg_all=0.005, n_epochs=20,
          n_factors=50)

reader = sp.Reader()

data_train = sp.Dataset.load_from_df(train[['u_id', 'i_id', 'rating']], reader = reader).build_full_trainset()
data_test = sp.Dataset.load_from_df(test[['u_id', 'i_id', 'rating']], reader = reader).build_full_trainset().build_testset()
surprise_svd.fit(data_train)

pred = surprise_svd.test(data_test)
sp.accuracy.rmse(pred)

RMSE: 0.9169
Wall time: 31.1 s


0.9168845476049273

In [21]:
# Preview the first 20 predictions. Slice the test set *before* predicting so
# only those 20 rows are scored instead of re-predicting the entire test set.
surprise_svd.test(data_test[:20])

[Prediction(uid=259, iid=200, r_ui=4.0, est=4.1670259763161255, details={'was_impossible': False}),
 Prediction(uid=259, iid=357, r_ui=5.0, est=4.430772728105106, details={'was_impossible': False}),
 Prediction(uid=259, iid=179, r_ui=4.0, est=4.20424005908081, details={'was_impossible': False}),
 Prediction(uid=259, iid=1135, r_ui=5.0, est=3.231236971348656, details={'was_impossible': False}),
 Prediction(uid=259, iid=15, r_ui=3.0, est=3.91758055378108, details={'was_impossible': False}),
 Prediction(uid=259, iid=121, r_ui=3.0, est=3.590538369392122, details={'was_impossible': False}),
 Prediction(uid=259, iid=294, r_ui=3.0, est=3.2821077351936805, details={'was_impossible': False}),
 Prediction(uid=259, iid=748, r_ui=4.0, est=3.323876222072292, details={'was_impossible': False}),
 Prediction(uid=259, iid=147, r_ui=4.0, est=3.8508097317210708, details={'was_impossible': False}),
 Prediction(uid=851, iid=696, r_ui=3.0, est=3.3294724032153775, details={'was_impossible': False}),
 Predict