In [13]:
# Data manipulation
import numpy as np
import pandas as pd
pd.options.display.max_rows = 100

# Recommender systems
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
import surprise as sp

# Other
import os
import random
import sys

# Reload imported code
%load_ext autoreload
%autoreload 2

# Custom modules: put the parent of the working directory on sys.path so that
# the local `src` package can be imported from this notebook.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

# NOTE(review): wildcard import. BaselineModel and MatrixFactorization, used in
# the cells below, are not imported anywhere else, so they must come from here.
from src import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load data

**The MovieLens data is available at https://grouplens.org/datasets/movielens/ (the cell below reads the `ml-1m` ratings file).**

In [2]:
# Ratings file: `::`-separated records without a header row; column names are supplied here.
cols = ['u_id', 'i_id', 'rating', 'timestamp']
movie_data = pd.read_csv(
    '../data/ml-1m/ratings.dat',
    names=cols,
    sep='::',
    usecols=[0, 1, 2],  # keep user, item and rating; the timestamp column is dropped
    engine='python',    # the C parser does not support multi-character separators
)

# Prepare data: 80/20 train/test split.
# A fixed random_state keeps the split (and every RMSE reported below) reproducible
# across kernel restarts; 2 matches the seed used for the matrix-factorization cell.
SPLIT_SEED = 2
train = movie_data.sample(frac=0.8, random_state=SPLIT_SEED)
test = movie_data.drop(train.index)

## Simple Model with global mean

In [6]:
# Simplest possible model: predict the mean training rating for every test row.
global_mean = train['rating'].mean()
pred = np.full(len(test), global_mean)

# Take the square root explicitly: the `squared=False` flag of
# mean_squared_error is deprecated in recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(test['rating'], pred))

# `.4f` (four decimals) to match the other cells; the previous `4f` was a width, not a precision.
print(f'\nTest RMSE: {rmse:.4f}')


Test RMSE: 1.117850


## Baseline Model with biases

### SGD

In [26]:
%%time

baseline_model = BaselineModel(n_epochs = 20, reg = 0.005, verbose=1)
baseline_model.fit(train)

pred = baseline_model.predict(test)
rmse = mean_squared_error(test['rating'], pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 0.871980978214218
Epoch  2 / 20  -  train_rmse: 0.8374227238174543
Epoch  3 / 20  -  train_rmse: 0.8245628365555198
Epoch  4 / 20  -  train_rmse: 0.8181313766158749
Epoch  5 / 20  -  train_rmse: 0.8144161721838907
Epoch  6 / 20  -  train_rmse: 0.8120702230072291
Epoch  7 / 20  -  train_rmse: 0.8104958250944153
Epoch  8 / 20  -  train_rmse: 0.8093910914643307
Epoch  9 / 20  -  train_rmse: 0.8085888629632715
Epoch  10 / 20  -  train_rmse: 0.8079900351762842
Epoch  11 / 20  -  train_rmse: 0.8075326908098902
Epoch  12 / 20  -  train_rmse: 0.8071765151351332
Epoch  13 / 20  -  train_rmse: 0.8068943680095578
Epoch  14 / 20  -  train_rmse: 0.8066674668501668
Epoch  15 / 20  -  train_rmse: 0.8064825062164301
Epoch  16 / 20  -  train_rmse: 0.8063298697247823
Epoch  17 / 20  -  train_rmse: 0.8062024845531691
Epoch  18 / 20  -  train_rmse: 0.8060950676200299
Epoch  19 / 20  -  train_rmse: 0.8060036178439534
Epoch  20 / 20  -  train_rmse: 0.8059250671127095

Test RMSE

### ALS

In [28]:
%%time

baseline_model = BaselineModel(method='als', n_epochs = 20, reg = 0.005, verbose=1)
baseline_model.fit(train)

pred = baseline_model.predict(test)
rmse = mean_squared_error(test['rating'], pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 0.8431114587618241
Epoch  2 / 20  -  train_rmse: 0.8054503687558924
Epoch  3 / 20  -  train_rmse: 0.8037076611607052
Epoch  4 / 20  -  train_rmse: 0.803611324323333
Epoch  5 / 20  -  train_rmse: 0.8036057069019398
Epoch  6 / 20  -  train_rmse: 0.8036053692857239
Epoch  7 / 20  -  train_rmse: 0.8036053477643588
Epoch  8 / 20  -  train_rmse: 0.8036053461252116
Epoch  9 / 20  -  train_rmse: 0.8036053459404653
Epoch  10 / 20  -  train_rmse: 0.8036053459081435
Epoch  11 / 20  -  train_rmse: 0.803605345900974
Epoch  12 / 20  -  train_rmse: 0.8036053458992236
Epoch  13 / 20  -  train_rmse: 0.8036053458987541
Epoch  14 / 20  -  train_rmse: 0.8036053458985319
Epoch  15 / 20  -  train_rmse: 0.8036053458984738
Epoch  16 / 20  -  train_rmse: 0.8036053458983718
Epoch  17 / 20  -  train_rmse: 0.8036053458982573
Epoch  18 / 20  -  train_rmse: 0.8036053458981735
Epoch  19 / 20  -  train_rmse: 0.8036053458980996
Epoch  20 / 20  -  train_rmse: 0.803605345898018

Test RMSE: 

## SVD by Simon Funk

In [9]:
%%time 
random.seed(2)
np.random.seed(2)
matrix_fact = MatrixFactorization(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.001, reg = 0.005)
matrix_fact.fit(train)

pred = matrix_fact.predict(test)
rmse = mean_squared_error(test['rating'], pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 4.7519163235869755
Epoch  2 / 20  -  train_rmse: 4.07125950177922
Epoch  3 / 20  -  train_rmse: 3.6994707653121295
Epoch  4 / 20  -  train_rmse: 3.487612945175548
Epoch  5 / 20  -  train_rmse: 3.361677161458169
Epoch  6 / 20  -  train_rmse: 3.2830956906142963
Epoch  7 / 20  -  train_rmse: 3.2310618451542217
Epoch  8 / 20  -  train_rmse: 3.194092982634919
Epoch  9 / 20  -  train_rmse: 3.1657616120427647
Epoch  10 / 20  -  train_rmse: 3.14243679531888
Epoch  11 / 20  -  train_rmse: 3.122052194110065
Epoch  12 / 20  -  train_rmse: 3.1034212511623207
Epoch  13 / 20  -  train_rmse: 3.085852518150945
Epoch  14 / 20  -  train_rmse: 3.068932643826073
Epoch  15 / 20  -  train_rmse: 3.052403973330817
Epoch  16 / 20  -  train_rmse: 3.036095775003364
Epoch  17 / 20  -  train_rmse: 3.019885868491791
Epoch  18 / 20  -  train_rmse: 3.003679427500367
Epoch  19 / 20  -  train_rmse: 2.9873974174428293
Epoch  20 / 20  -  train_rmse: 2.9709703793155584

Test RMSE: 0.9163
Wall

## Surprise

In [13]:
# Fetch Surprise's built-in MovieLens-100k dataset (downloaded on first use if it
# is not already available). Note that this is a different dataset from the
# ml-1m ratings file loaded at the top of the notebook.
data = sp.Dataset.load_builtin(name='ml-100k')

In [14]:
%%time
# We'll use the famous SVD algorithm.
algo = sp.AlgoBase()

# Run 5-fold cross-validation and print results
sp.model_selection.cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9315  0.9319  0.9371  0.9483  0.9311  0.9360  0.0065  
MAE (testset)     0.7346  0.7348  0.7381  0.7454  0.7353  0.7376  0.0041  
Fit time          5.25    4.75    4.70    4.66    4.71    4.81    0.22    
Test time         0.16    0.13    0.20    0.14    0.12    0.15    0.03    
Wall time: 25.7 s


{'test_rmse': array([0.93150639, 0.93190626, 0.93712494, 0.94831838, 0.93106516]),
 'test_mae': array([0.7345864 , 0.73479604, 0.73812511, 0.74543866, 0.7352659 ]),
 'fit_time': (5.246900796890259,
  4.746541976928711,
  4.704091310501099,
  4.6643593311309814,
  4.708273649215698),
 'test_time': (0.1593332290649414,
  0.132645845413208,
  0.20246028900146484,
  0.140625,
  0.12167477607727051)}

In [90]:
%%time

# surprise_svd = sp.SVD(lr_all=0.001, reg_all=0.005, n_epochs=20, n_factors=50)
surprise_svd = sp.BaselineOnly()

reader = sp.Reader()

data_train = sp.Dataset.load_from_df(train[['u_id', 'i_id', 'rating']], reader = reader).build_full_trainset()
data_test = sp.Dataset.load_from_df(test[['u_id', 'i_id', 'rating']], reader = reader).build_full_trainset().build_testset()
surprise_svd.fit(data_train)

pred = surprise_svd.test(data_test)
sp.accuracy.rmse(pred)

Estimating biases using als...
RMSE: 0.9091
Wall time: 7.84 s


0.9090794242564598