In [106]:
# Data manipulation
import numpy as np
import pandas as pd
pd.options.display.max_rows = 100

# Recommender systems
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
import surprise as sp

# Other
import os
import random
import sys

# Custom modules
# The local `src` package is imported from the parent of the current working
# directory, so that directory is put on sys.path first (only once, so that
# re-running this cell does not add duplicate entries).
module_path = os.path.abspath('..')
if module_path not in sys.path:
    sys.path.append(module_path)
from src import BaselineModel, MatrixFactorization, train_update_test_split

# Reload imported code
%load_ext autoreload
%autoreload 2

# One seed shared by both global random number generators used in this
# notebook: Python's `random` module and NumPy's legacy global RNG.
rand_seed = 2
random.seed(rand_seed)
np.random.seed(rand_seed)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Purpose

This notebook shows examples of how to use the package on the classic MovieLens dataset.

## Load data

**The MovieLens data can be downloaded from https://grouplens.org/datasets/movielens/ — this notebook reads the 1M ratings file from `../data/ml-1m/ratings.dat`.**

In [212]:
cols = ['user_id', 'item_id', 'rating', 'timestamp']
# '::' is a multi-character separator, which pandas only supports with the
# python parsing engine. usecols keeps the first three columns, so the
# timestamp column is named but never loaded.
movie_data = pd.read_csv(
    '../data/ml-1m/ratings.dat',
    names=cols,
    sep='::',
    usecols=[0, 1, 2],
    engine='python',
)

# Prepare data
# The seed is passed explicitly: without random_state the split depends on how
# far the global NumPy RNG has advanced, so re-running this cell would silently
# produce a different train/test split.
train, test = train_test_split(movie_data, test_size=0.2, random_state=rand_seed)

# Prepare data for online learning
# NOTE(review): presumably holds back a fraction of users (frac_new_users) so
# they can be added later via `update_users` -- confirm against
# src.train_update_test_split, including whether it accepts a seed.
train_initial, train_update, test_update = train_update_test_split(movie_data, frac_new_users=0.2)

## Simple model with global mean

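Predicting the global mean for every rating gives an RMSE roughly equal to the standard deviation of the ratings, so this is the reference point the models below should beat.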

In [213]:
# Constant predictor: every test rating is predicted as the mean training rating.
global_mean = train['rating'].mean()
pred = np.full(len(test), global_mean)

# RMSE is computed as sqrt(MSE) instead of mean_squared_error(..., squared=False):
# the `squared` keyword is deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(test['rating'], pred))

# '.4f' (four decimals) to match the other cells; the previous '4f' spec meant
# "minimum width 4" and printed six decimals.
print(f'\nTest RMSE: {rmse:.4f}')


Test RMSE: 1.117783


## Baseline Model with biases

### SGD

In [214]:
%%time

baseline_model = BaselineModel(n_epochs = 20, reg = 0.005, verbose=1)
baseline_model.fit(train)

pred = baseline_model.predict(test)
rmse = mean_squared_error(test['rating'], pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 0.916291295920267
Epoch  2 / 20  -  train_rmse: 0.9058303139253466
Epoch  3 / 20  -  train_rmse: 0.9025221071941208
Epoch  4 / 20  -  train_rmse: 0.9010567557199297
Epoch  5 / 20  -  train_rmse: 0.900289130516374
Epoch  6 / 20  -  train_rmse: 0.8998424440258098
Epoch  7 / 20  -  train_rmse: 0.899562213999701
Epoch  8 / 20  -  train_rmse: 0.89937586020524
Epoch  9 / 20  -  train_rmse: 0.8992459097976333
Epoch  10 / 20  -  train_rmse: 0.8991516023604073
Epoch  11 / 20  -  train_rmse: 0.8990807803821095
Epoch  12 / 20  -  train_rmse: 0.8990259950662489
Epoch  13 / 20  -  train_rmse: 0.8989825065050737
Epoch  14 / 20  -  train_rmse: 0.8989471990614153
Epoch  15 / 20  -  train_rmse: 0.8989179657736619
Epoch  16 / 20  -  train_rmse: 0.8988933452724307
Epoch  17 / 20  -  train_rmse: 0.8988723004606296
Epoch  18 / 20  -  train_rmse: 0.8988540796981623
Epoch  19 / 20  -  train_rmse: 0.8988381275269515
Epoch  20 / 20  -  train_rmse: 0.8988240259640466

Test RMSE: 0.

### ALS

In [215]:
%%time

baseline_model = BaselineModel(method='als', n_epochs = 20, reg = 0.5, verbose=1)
baseline_model.fit(train)

pred = baseline_model.predict(test)
rmse = mean_squared_error(test['rating'], pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 0.9180820099926396
Epoch  2 / 20  -  train_rmse: 0.8976375884887955
Epoch  3 / 20  -  train_rmse: 0.8966705160265632
Epoch  4 / 20  -  train_rmse: 0.89661202822231
Epoch  5 / 20  -  train_rmse: 0.8966073177045732
Epoch  6 / 20  -  train_rmse: 0.8966067032334587
Epoch  7 / 20  -  train_rmse: 0.8966065678960716
Epoch  8 / 20  -  train_rmse: 0.8966065182762025
Epoch  9 / 20  -  train_rmse: 0.896606487693043
Epoch  10 / 20  -  train_rmse: 0.8966064617486386
Epoch  11 / 20  -  train_rmse: 0.8966064370680196
Epoch  12 / 20  -  train_rmse: 0.8966064128404503
Epoch  13 / 20  -  train_rmse: 0.8966063888688445
Epoch  14 / 20  -  train_rmse: 0.8966063651041888
Epoch  15 / 20  -  train_rmse: 0.8966063415332692
Epoch  16 / 20  -  train_rmse: 0.8966063181516986
Epoch  17 / 20  -  train_rmse: 0.8966062949570798
Epoch  18 / 20  -  train_rmse: 0.8966062719475106
Epoch  19 / 20  -  train_rmse: 0.8966062491213108
Epoch  20 / 20  -  train_rmse: 0.8966062264769168

Test RMSE: 

### Updating with new users

In [216]:
# Fit the bias-only model on the initial training split only; the remaining
# split (train_update) is fed to `update_users` in the next cell.
baseline_model = BaselineModel(
    method='sgd',
    n_epochs=20,
    lr=0.01,
    reg=0.05,
    verbose=1,
)
baseline_model.fit(train_initial)

# Keep a reference to the fitted model's `user_biases` attribute for inspection.
user_biases = baseline_model.user_biases

Epoch  1 / 20  -  train_rmse: 0.9222551151837987
Epoch  2 / 20  -  train_rmse: 0.9104955177975218
Epoch  3 / 20  -  train_rmse: 0.9073286619098914
Epoch  4 / 20  -  train_rmse: 0.9059619115473901
Epoch  5 / 20  -  train_rmse: 0.9052502769074252
Epoch  6 / 20  -  train_rmse: 0.9048341738159744
Epoch  7 / 20  -  train_rmse: 0.9045697185313493
Epoch  8 / 20  -  train_rmse: 0.9043903944996496
Epoch  9 / 20  -  train_rmse: 0.9042622655119018
Epoch  10 / 20  -  train_rmse: 0.9041666970748231
Epoch  11 / 20  -  train_rmse: 0.9040928412098005
Epoch  12 / 20  -  train_rmse: 0.9040340673839535
Epoch  13 / 20  -  train_rmse: 0.903986148670051
Epoch  14 / 20  -  train_rmse: 0.9039462883988072
Epoch  15 / 20  -  train_rmse: 0.9039125736631315
Epoch  16 / 20  -  train_rmse: 0.9038836563415112
Epoch  17 / 20  -  train_rmse: 0.9038585604478493
Epoch  18 / 20  -  train_rmse: 0.9038365621496433
Epoch  19 / 20  -  train_rmse: 0.903817112916474
Epoch  20 / 20  -  train_rmse: 0.9037997890025076


In [217]:
%%time
baseline_model.update_users(train_update, n_epochs=20, lr=0.001, verbose=1)
pred = baseline_model.predict(test_update)
rmse = mean_squared_error(test_update['rating'], pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 0.9488961953765018
Epoch  2 / 20  -  train_rmse: 0.9362227321026241
Epoch  3 / 20  -  train_rmse: 0.9279229038443937
Epoch  4 / 20  -  train_rmse: 0.922089363344399
Epoch  5 / 20  -  train_rmse: 0.9177689098423024
Epoch  6 / 20  -  train_rmse: 0.9144398208093097
Epoch  7 / 20  -  train_rmse: 0.9117947393896905
Epoch  8 / 20  -  train_rmse: 0.9096415235664184
Epoch  9 / 20  -  train_rmse: 0.9078541105214077
Epoch  10 / 20  -  train_rmse: 0.9063464234556696
Epoch  11 / 20  -  train_rmse: 0.9050576869564634
Epoch  12 / 20  -  train_rmse: 0.9039437495333479
Epoch  13 / 20  -  train_rmse: 0.9029717394330362
Epoch  14 / 20  -  train_rmse: 0.9021166547079046
Epoch  15 / 20  -  train_rmse: 0.9013591197496272
Epoch  16 / 20  -  train_rmse: 0.9006838691369248
Epoch  17 / 20  -  train_rmse: 0.9000786984057629
Epoch  18 / 20  -  train_rmse: 0.8995337223935118
Epoch  19 / 20  -  train_rmse: 0.8990408408895711
Epoch  20 / 20  -  train_rmse: 0.8985933469208418

Test RMSE

## Matrix Factorization

In [218]:
%%time 
matrix_fact = MatrixFactorization(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.001, reg = 0.005)
matrix_fact.fit(train)

pred = matrix_fact.predict(test)
rmse = mean_squared_error(test['rating'], pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.0133183676130273
Epoch  2 / 20  -  train_rmse: 0.9739939909084588
Epoch  3 / 20  -  train_rmse: 0.9527725729940393
Epoch  4 / 20  -  train_rmse: 0.9391270521722095
Epoch  5 / 20  -  train_rmse: 0.9293582679036712
Epoch  6 / 20  -  train_rmse: 0.9218435556790232
Epoch  7 / 20  -  train_rmse: 0.9157609902775945
Epoch  8 / 20  -  train_rmse: 0.9106475148366705
Epoch  9 / 20  -  train_rmse: 0.9062199750714944
Epoch  10 / 20  -  train_rmse: 0.9022936451703484
Epoch  11 / 20  -  train_rmse: 0.8987415394475611
Epoch  12 / 20  -  train_rmse: 0.895472502322115
Epoch  13 / 20  -  train_rmse: 0.89241865011766
Epoch  14 / 20  -  train_rmse: 0.8895277869678401
Epoch  15 / 20  -  train_rmse: 0.886758619591924
Epoch  16 / 20  -  train_rmse: 0.8840776263193174
Epoch  17 / 20  -  train_rmse: 0.8814569478488512
Epoch  18 / 20  -  train_rmse: 0.878872935340008
Epoch  19 / 20  -  train_rmse: 0.8763051383832647
Epoch  20 / 20  -  train_rmse: 0.8737355991999893

Test RMSE: 0.

### Getting list of recommendations for a user

In [229]:
user = 200
# Items this user has already rated in the training split. They are handed to
# `recommend` as `items_known` -- presumably so they are left out of the
# returned list; confirm against MatrixFactorization.recommend.
user_train_ratings = train.query('user_id == @user')
items_known = user_train_ratings['item_id']
matrix_fact.recommend(user=user, items_known=items_known)

Unnamed: 0,user_id,item_id,rating_pred
1250,200,922,4.758032
1330,200,1207,4.717769
1199,200,2019,4.647636
66,200,1250,4.636333
593,200,50,4.634492
71,200,260,4.630086
1206,200,905,4.622763
633,200,908,4.594477
322,200,1208,4.58469
45,200,2858,4.579518


### Updating with new users

In [208]:
# Fit the factorisation model on the initial training split only, so that the
# remaining ratings can be added afterwards with `update_users`.
# NOTE(review): reg=0.5 here, whereas the fit on the full training split
# earlier in the notebook uses reg=0.005 -- confirm this is intentional.
matrix_fact = MatrixFactorization(
    n_epochs=20,
    n_factors=100,
    verbose=1,
    lr=0.001,
    reg=0.5,
)
matrix_fact.fit(train_initial)

Epoch  1 / 20  -  train_rmse: 1.0197651740162348
Epoch  2 / 20  -  train_rmse: 0.9868537886030636
Epoch  3 / 20  -  train_rmse: 0.9705247035806774
Epoch  4 / 20  -  train_rmse: 0.9607933752841796
Epoch  5 / 20  -  train_rmse: 0.9543514988978699
Epoch  6 / 20  -  train_rmse: 0.9497838824946246
Epoch  7 / 20  -  train_rmse: 0.9463846087803535
Epoch  8 / 20  -  train_rmse: 0.9437624710821482
Epoch  9 / 20  -  train_rmse: 0.9416832366111585
Epoch  10 / 20  -  train_rmse: 0.9399980808855553
Epoch  11 / 20  -  train_rmse: 0.9386078918598724
Epoch  12 / 20  -  train_rmse: 0.9374440972651416
Epoch  13 / 20  -  train_rmse: 0.9364577341770643
Epoch  14 / 20  -  train_rmse: 0.9356129035878681
Epoch  15 / 20  -  train_rmse: 0.93488268564152
Epoch  16 / 20  -  train_rmse: 0.9342464992142986
Epoch  17 / 20  -  train_rmse: 0.9336883425899541
Epoch  18 / 20  -  train_rmse: 0.9331955898802641
Epoch  19 / 20  -  train_rmse: 0.9327581483658285
Epoch  20 / 20  -  train_rmse: 0.9323678563411486


MatrixFactorization(lr=0.001, n_epochs=20, reg=0.5)

In [209]:
%%time

# Update model with new users
matrix_fact.update_users(train_update, lr=0.001, n_epochs=20, verbose=1)
pred = matrix_fact.predict(test_update)
rmse = mean_squared_error(test_update['rating'], pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 0.9646540237204959
Epoch  2 / 20  -  train_rmse: 0.9518434410454569
Epoch  3 / 20  -  train_rmse: 0.9442733344172113
Epoch  4 / 20  -  train_rmse: 0.9392690285421632
Epoch  5 / 20  -  train_rmse: 0.9357021214843568
Epoch  6 / 20  -  train_rmse: 0.9330206454181479
Epoch  7 / 20  -  train_rmse: 0.9309240664004891
Epoch  8 / 20  -  train_rmse: 0.9292350818398317
Epoch  9 / 20  -  train_rmse: 0.927842347411242
Epoch  10 / 20  -  train_rmse: 0.9266723429219486
Epoch  11 / 20  -  train_rmse: 0.9256744833955226
Epoch  12 / 20  -  train_rmse: 0.9248127495947855
Epoch  13 / 20  -  train_rmse: 0.9240607410385152
Epoch  14 / 20  -  train_rmse: 0.9233986246488626
Epoch  15 / 20  -  train_rmse: 0.9228111822962562
Epoch  16 / 20  -  train_rmse: 0.9222865209610683
Epoch  17 / 20  -  train_rmse: 0.9218151964373533
Epoch  18 / 20  -  train_rmse: 0.9213896030893373
Epoch  19 / 20  -  train_rmse: 0.9210035394539674
Epoch  20 / 20  -  train_rmse: 0.9206518929042135

Test RMSE

## Surprise

In [11]:
# Load Surprise's built-in MovieLens 100k dataset (downloaded if it is not
# available locally).
# NOTE(review): this is ml-100k, while the custom models in this notebook are
# trained on ml-1m, so the cross-validation scores in the next cell are not
# directly comparable with the RMSE values reported for the custom models.
data = sp.Dataset.load_builtin(name='ml-100k')

In [12]:
%%time
# We'll use the famous SVD algorithm.
algo = sp.SVD()

# Run 5-fold cross-validation and print results
sp.model_selection.cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9397  0.9355  0.9292  0.9388  0.9341  0.9355  0.0037  
MAE (testset)     0.7399  0.7374  0.7332  0.7386  0.7353  0.7369  0.0024  
Fit time          4.25    4.16    4.22    4.22    4.23    4.22    0.03    
Test time         0.17    0.12    0.11    0.16    0.12    0.14    0.02    
Wall time: 22.4 s


{'test_rmse': array([0.93967479, 0.93553074, 0.92923385, 0.93884081, 0.93411477]),
 'test_mae': array([0.73994964, 0.73736547, 0.73319142, 0.738612  , 0.73534611]),
 'fit_time': (4.249607563018799,
  4.164863586425781,
  4.2217137813568115,
  4.221713066101074,
  4.229688882827759),
 'test_time': (0.16755151748657227,
  0.12466549873352051,
  0.11167263984680176,
  0.16453123092651367,
  0.12067866325378418)}

In [167]:
%%time

# surprise_svd = sp.SVD(lr_all=0.001, reg_all=0.005, n_epochs=20, n_factors=50)
surprise_svd = sp.BaselineOnly()

reader = sp.Reader()

data_train = sp.Dataset.load_from_df(train[['user_id', 'item_id', 'rating']], reader = reader).build_full_trainset()
data_test = sp.Dataset.load_from_df(test[['user_id', 'item_id', 'rating']], reader = reader).build_full_trainset().build_testset()
surprise_svd.fit(data_train)

pred = surprise_svd.test(data_test)
sp.accuracy.rmse(pred)

Estimating biases using als...
RMSE: 0.9106
Wall time: 7.84 s


0.9106051680456124