In [1]:
# Data manipulation
import numpy as np
import pandas as pd
pd.options.display.max_rows = 100

# Recommender systems
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
import surprise as sp

# Other
import os
import random
import sys

# Reload imported code
%load_ext autoreload
%autoreload 2

# Custom modules
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src import *

rand_seed = 2

## Load data

**The MovieLens rating data can be downloaded from https://grouplens.org/datasets/movielens/ (this notebook reads the `ml-1m` release).**

In [2]:
# Read user, item and rating from the '::'-separated ratings file;
# the fourth column (timestamp) is named but not loaded.
cols = ['user_id', 'item_id', 'rating', 'timestamp']
movie_data = pd.read_csv(
    '../data/ml-1m/ratings.dat',
    sep='::',
    names=cols,
    usecols=[0, 1, 2],
    engine='python',
)

# Random 80/20 split: the test set is every row not sampled into train
train = movie_data.sample(frac=0.8, random_state=rand_seed)
test = movie_data.drop(index=train.index)

# Hold three users out of the initial fit so they can be added later,
# then split their ratings 80/20 as well
users_update = [2, 500, 700]
is_update_user = movie_data['user_id'].isin(users_update)
train_start = movie_data[~is_update_user]
data_update = movie_data[is_update_user]
train_update = data_update.sample(frac=0.8, random_state=rand_seed)
test_update = data_update.drop(index=train_update.index)

## Simple Model with global mean

Predicting the global mean rating for every test row gives an RMSE that is essentially the standard deviation of the ratings.

In [3]:
# Predict the training-set mean rating for every row of the test set
global_mean = train['rating'].mean()
pred = np.full(test.shape[0], global_mean)

# Take the square root explicitly: the `squared` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(test['rating'], pred))

# `.4f` (four decimals) matches the other cells; the previous `4f` was a
# minimum width of 4 with the default six decimals.
print(f'\nTest RMSE: {rmse:.4f}')

print(f"\n Or equivalently the standard deviation: {train['rating'].std()}")


Test RMSE: 1.119044

 Or equivalently the standard deviation: 1.1166159863745446


## Baseline Model with biases

### SGD

In [4]:
%%time

baseline_model = BaselineModel(n_epochs = 20, reg = 0.005, verbose=1)
baseline_model.fit(train)

pred = baseline_model.predict(test)
rmse = mean_squared_error(test['rating'], pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 0.933494571686373
Epoch  2 / 20  -  train_rmse: 0.9148579764576709
Epoch  3 / 20  -  train_rmse: 0.9078240789151573
Epoch  4 / 20  -  train_rmse: 0.9042828382011496
Epoch  5 / 20  -  train_rmse: 0.9022289689573211
Epoch  6 / 20  -  train_rmse: 0.9009285538160273
Epoch  7 / 20  -  train_rmse: 0.9000541124488959
Epoch  8 / 20  -  train_rmse: 0.8994395808144476
Epoch  9 / 20  -  train_rmse: 0.8989927407083671
Epoch  10 / 20  -  train_rmse: 0.8986588050589224
Epoch  11 / 20  -  train_rmse: 0.898403494044068
Epoch  12 / 20  -  train_rmse: 0.8982044636330987
Epoch  13 / 20  -  train_rmse: 0.8980466587695454
Epoch  14 / 20  -  train_rmse: 0.8979196525732741
Epoch  15 / 20  -  train_rmse: 0.8978160532085211
Epoch  16 / 20  -  train_rmse: 0.8977305140627257
Epoch  17 / 20  -  train_rmse: 0.8976590993040879
Epoch  18 / 20  -  train_rmse: 0.8975988662385087
Epoch  19 / 20  -  train_rmse: 0.8975475839461394
Epoch  20 / 20  -  train_rmse: 0.8975035398315944

Test RMSE:

### ALS

In [5]:
%%time

baseline_model = BaselineModel(method='als', n_epochs = 20, reg = 0.005, verbose=1)
baseline_model.fit(train)

pred = baseline_model.predict(test)
rmse = mean_squared_error(test['rating'], pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 0.9178374472516706
Epoch  2 / 20  -  train_rmse: 0.897307281620994
Epoch  3 / 20  -  train_rmse: 0.8963564345409184
Epoch  4 / 20  -  train_rmse: 0.8963042132630377
Epoch  5 / 20  -  train_rmse: 0.8963011841494454
Epoch  6 / 20  -  train_rmse: 0.8963010029148099
Epoch  7 / 20  -  train_rmse: 0.8963009914026157
Epoch  8 / 20  -  train_rmse: 0.8963009905265219
Epoch  9 / 20  -  train_rmse: 0.8963009904274769
Epoch  10 / 20  -  train_rmse: 0.896300990410144
Epoch  11 / 20  -  train_rmse: 0.8963009904063131
Epoch  12 / 20  -  train_rmse: 0.8963009904053619
Epoch  13 / 20  -  train_rmse: 0.8963009904050973
Epoch  14 / 20  -  train_rmse: 0.8963009904050061
Epoch  15 / 20  -  train_rmse: 0.8963009904049439
Epoch  16 / 20  -  train_rmse: 0.8963009904049123
Epoch  17 / 20  -  train_rmse: 0.896300990404848
Epoch  18 / 20  -  train_rmse: 0.8963009904047935
Epoch  19 / 20  -  train_rmse: 0.8963009904047731
Epoch  20 / 20  -  train_rmse: 0.8963009904046988

Test RMSE: 

### Updating with new users

In [7]:
%%time
baseline_model = BaselineModel(method='als', n_epochs = 20, reg = 0.05, verbose=1)
baseline_model.fit(train_start)

print('\nUpdate model')
baseline_model.update_users(train_update, verbose=1)
pred = baseline_model.predict(test_update)
rmse = mean_squared_error(test_update['rating'], pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 0.9191803443219464
Epoch  2 / 20  -  train_rmse: 0.8987037513062358
Epoch  3 / 20  -  train_rmse: 0.8977609464578302
Epoch  4 / 20  -  train_rmse: 0.8977092277636958
Epoch  5 / 20  -  train_rmse: 0.8977061626808502
Epoch  6 / 20  -  train_rmse: 0.8977059587033653
Epoch  7 / 20  -  train_rmse: 0.8977059405810108
Epoch  8 / 20  -  train_rmse: 0.8977059380120525
Epoch  9 / 20  -  train_rmse: 0.8977059374795142
Epoch  10 / 20  -  train_rmse: 0.897705937337243
Epoch  11 / 20  -  train_rmse: 0.8977059372831977
Epoch  12 / 20  -  train_rmse: 0.8977059372501258
Epoch  13 / 20  -  train_rmse: 0.8977059372221129
Epoch  14 / 20  -  train_rmse: 0.8977059371953395
Epoch  15 / 20  -  train_rmse: 0.8977059371688736
Epoch  16 / 20  -  train_rmse: 0.8977059371424928
Epoch  17 / 20  -  train_rmse: 0.8977059371161288
Epoch  18 / 20  -  train_rmse: 0.8977059370898027
Epoch  19 / 20  -  train_rmse: 0.8977059370635059
Epoch  20 / 20  -  train_rmse: 0.8977059370372027

Update mo

## Matrix Factorization (FunkSVD) by Simon Funk

In [7]:
%%time 
random.seed(2)
np.random.seed(2)
matrix_fact = MatrixFactorization(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.001, reg = 0.005)
matrix_fact.fit(train)

pred = matrix_fact.predict(test)
rmse = mean_squared_error(test['rating'], pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.0127035036057546
Epoch  2 / 20  -  train_rmse: 0.973399904124025
Epoch  3 / 20  -  train_rmse: 0.9522201402680913
Epoch  4 / 20  -  train_rmse: 0.9386090696979279
Epoch  5 / 20  -  train_rmse: 0.9288639526799425
Epoch  6 / 20  -  train_rmse: 0.9213635578752073
Epoch  7 / 20  -  train_rmse: 0.9152878929802353
Epoch  8 / 20  -  train_rmse: 0.9101754454103971
Epoch  9 / 20  -  train_rmse: 0.9057441442060727
Epoch  10 / 20  -  train_rmse: 0.9018099719719699
Epoch  11 / 20  -  train_rmse: 0.8982463727688216
Epoch  12 / 20  -  train_rmse: 0.8949624194855501
Epoch  13 / 20  -  train_rmse: 0.8918903129755686
Epoch  14 / 20  -  train_rmse: 0.8889778398420741
Epoch  15 / 20  -  train_rmse: 0.8861836186228284
Epoch  16 / 20  -  train_rmse: 0.8834739938870996
Epoch  17 / 20  -  train_rmse: 0.8808209488836901
Epoch  18 / 20  -  train_rmse: 0.8782006746379216
Epoch  19 / 20  -  train_rmse: 0.8755925796334392
Epoch  20 / 20  -  train_rmse: 0.8729786073783997

Test RMSE

## Surprise

In [13]:
# Fetch Surprise's built-in MovieLens-100k dataset (it is downloaded on first use).
data = sp.Dataset.load_builtin(name='ml-100k')

In [14]:
%%time
# We'll use the famous SVD algorithm.
algo = sp.AlgoBase()

# Run 5-fold cross-validation and print results
sp.model_selection.cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9315  0.9319  0.9371  0.9483  0.9311  0.9360  0.0065  
MAE (testset)     0.7346  0.7348  0.7381  0.7454  0.7353  0.7376  0.0041  
Fit time          5.25    4.75    4.70    4.66    4.71    4.81    0.22    
Test time         0.16    0.13    0.20    0.14    0.12    0.15    0.03    
Wall time: 25.7 s


{'test_rmse': array([0.93150639, 0.93190626, 0.93712494, 0.94831838, 0.93106516]),
 'test_mae': array([0.7345864 , 0.73479604, 0.73812511, 0.74543866, 0.7352659 ]),
 'fit_time': (5.246900796890259,
  4.746541976928711,
  4.704091310501099,
  4.6643593311309814,
  4.708273649215698),
 'test_time': (0.1593332290649414,
  0.132645845413208,
  0.20246028900146484,
  0.140625,
  0.12167477607727051)}

In [90]:
%%time

# surprise_svd = sp.SVD(lr_all=0.001, reg_all=0.005, n_epochs=20, n_factors=50)
surprise_svd = sp.BaselineOnly()

reader = sp.Reader()

data_train = sp.Dataset.load_from_df(train[['u_id', 'i_id', 'rating']], reader = reader).build_full_trainset()
data_test = sp.Dataset.load_from_df(test[['u_id', 'i_id', 'rating']], reader = reader).build_full_trainset().build_testset()
surprise_svd.fit(data_train)

pred = surprise_svd.test(data_test)
sp.accuracy.rmse(pred)

Estimating biases using als...
RMSE: 0.9091
Wall time: 7.84 s


0.9090794242564598