In [1]:
# Data manipulation
import numpy as np
import pandas as pd
pd.options.display.max_rows = 100

# Recommender systems
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
import surprise as sp

# Other
import os
import random
import sys

# Custom modules
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from src import BaselineModel, MatrixFactorization, train_update_test_split

# Reload imported code
%load_ext autoreload
%autoreload 2

rand_seed = 2
np.random.seed(rand_seed)
random.seed(rand_seed)

## Load data

**Movie data found here https://grouplens.org/datasets/movielens/**

In [2]:
cols = ['user_id', 'item_id', 'rating', 'timestamp']
movie_data = pd.read_csv('../data/ml-1m/ratings.dat', names = cols, sep = '::', usecols=[0, 1, 2], engine='python')
movie_data = movie_data.sample(frac=1)
# Prepare data
train, test = train_test_split(movie_data, test_size=0.2)

# Prepare data for online learning
train_initial, train_update, test_update = train_update_test_split(movie_data, frac_new_users=0.2)

## Simple model with global mean

This is similar to just the global standard deviation

In [3]:
global_mean = train['rating'].mean()
pred = [global_mean for _ in range(test.shape[0])]

rmse = mean_squared_error(test['rating'], pred, squared = False)

print(f'\nTest RMSE: {rmse:4f}')


Test RMSE: 1.117193


## Baseline Model with biases

### SGD

In [4]:
%%time

baseline_model = BaselineModel(n_epochs = 20, reg = 0.005, verbose=1)
baseline_model.fit(train)

pred = baseline_model.predict(test)
rmse = mean_squared_error(test['rating'], pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 0.916138002653919
Epoch  2 / 20  -  train_rmse: 0.9056374391849454
Epoch  3 / 20  -  train_rmse: 0.9023088396050357
Epoch  4 / 20  -  train_rmse: 0.9008342120744669
Epoch  5 / 20  -  train_rmse: 0.9000620153896943
Epoch  6 / 20  -  train_rmse: 0.8996126503672814
Epoch  7 / 20  -  train_rmse: 0.8993305444154954
Epoch  8 / 20  -  train_rmse: 0.8991426935936087
Epoch  9 / 20  -  train_rmse: 0.8990114538702991
Epoch  10 / 20  -  train_rmse: 0.8989159921593428
Epoch  11 / 20  -  train_rmse: 0.8988441181155185
Epoch  12 / 20  -  train_rmse: 0.8987883664895142
Epoch  13 / 20  -  train_rmse: 0.8987439875478945
Epoch  14 / 20  -  train_rmse: 0.898707858368828
Epoch  15 / 20  -  train_rmse: 0.898677865613586
Epoch  16 / 20  -  train_rmse: 0.8986525418810812
Epoch  17 / 20  -  train_rmse: 0.8986308442739734
Epoch  18 / 20  -  train_rmse: 0.8986120156337635
Epoch  19 / 20  -  train_rmse: 0.8985954953519975
Epoch  20 / 20  -  train_rmse: 0.8985808607360918

Test RMSE: 

### ALS

In [5]:
%%time

baseline_model = BaselineModel(method='als', n_epochs = 20, reg = 0.5, verbose=1)
baseline_model.fit(train)

pred = baseline_model.predict(test)
rmse = mean_squared_error(test['rating'], pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 0.9178315267402595
Epoch  2 / 20  -  train_rmse: 0.8973630281853254
Epoch  3 / 20  -  train_rmse: 0.8963963969718824
Epoch  4 / 20  -  train_rmse: 0.896338149748741
Epoch  5 / 20  -  train_rmse: 0.8963334709750764
Epoch  6 / 20  -  train_rmse: 0.8963328597251499
Epoch  7 / 20  -  train_rmse: 0.8963327236190923
Epoch  8 / 20  -  train_rmse: 0.8963326724357833
Epoch  9 / 20  -  train_rmse: 0.8963326400913297
Epoch  10 / 20  -  train_rmse: 0.8963326123394056
Epoch  11 / 20  -  train_rmse: 0.8963325858475254
Epoch  12 / 20  -  train_rmse: 0.8963325598168373
Epoch  13 / 20  -  train_rmse: 0.8963325340534692
Epoch  14 / 20  -  train_rmse: 0.8963325085093153
Epoch  15 / 20  -  train_rmse: 0.896332483171204
Epoch  16 / 20  -  train_rmse: 0.8963324580347655
Epoch  17 / 20  -  train_rmse: 0.8963324330974767
Epoch  18 / 20  -  train_rmse: 0.8963324083573896
Epoch  19 / 20  -  train_rmse: 0.8963323838127704
Epoch  20 / 20  -  train_rmse: 0.8963323594618783

Test RMSE:

### Updating with new users

In [6]:
baseline_model = BaselineModel(method='sgd', n_epochs = 20, lr=0.01, reg = 0.05, verbose=1)
baseline_model.fit(train_initial)

user_biases = baseline_model.user_biases

Epoch  1 / 20  -  train_rmse: 0.916681766152793
Epoch  2 / 20  -  train_rmse: 0.9067150383041128
Epoch  3 / 20  -  train_rmse: 0.9035766076888032
Epoch  4 / 20  -  train_rmse: 0.9021866734700877
Epoch  5 / 20  -  train_rmse: 0.9014559633972776
Epoch  6 / 20  -  train_rmse: 0.9010273176238084
Epoch  7 / 20  -  train_rmse: 0.900755015190394
Epoch  8 / 20  -  train_rmse: 0.9005709217404749
Epoch  9 / 20  -  train_rmse: 0.900440005002407
Epoch  10 / 20  -  train_rmse: 0.9003429185430207
Epoch  11 / 20  -  train_rmse: 0.9002683530229807
Epoch  12 / 20  -  train_rmse: 0.9002093771575651
Epoch  13 / 20  -  train_rmse: 0.9001615668582349
Epoch  14 / 20  -  train_rmse: 0.900121995882471
Epoch  15 / 20  -  train_rmse: 0.9000886663428845
Epoch  16 / 20  -  train_rmse: 0.9000601748224009
Epoch  17 / 20  -  train_rmse: 0.9000355099217533
Epoch  18 / 20  -  train_rmse: 0.9000139257510138
Epoch  19 / 20  -  train_rmse: 0.8999948606903906
Epoch  20 / 20  -  train_rmse: 0.8999778838996778


In [7]:
%%time
baseline_model.update_users(train_update, n_epochs=20, lr=0.001, verbose=1)
pred = baseline_model.predict(test_update)
rmse = mean_squared_error(test_update['rating'], pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 0.9577807977795892
Epoch  2 / 20  -  train_rmse: 0.943916088654542
Epoch  3 / 20  -  train_rmse: 0.9348871838092219
Epoch  4 / 20  -  train_rmse: 0.9286451402415307
Epoch  5 / 20  -  train_rmse: 0.9241347000803877
Epoch  6 / 20  -  train_rmse: 0.9207594879844894
Epoch  7 / 20  -  train_rmse: 0.9181600603545007
Epoch  8 / 20  -  train_rmse: 0.9161091321065541
Epoch  9 / 20  -  train_rmse: 0.9144573577035976
Epoch  10 / 20  -  train_rmse: 0.9131033878076178
Epoch  11 / 20  -  train_rmse: 0.9119765029276343
Epoch  12 / 20  -  train_rmse: 0.9110261357359404
Epoch  13 / 20  -  train_rmse: 0.910215334968973
Epoch  14 / 20  -  train_rmse: 0.90951656842701
Epoch  15 / 20  -  train_rmse: 0.9089089585373543
Epoch  16 / 20  -  train_rmse: 0.9083764198816021
Epoch  17 / 20  -  train_rmse: 0.9079063786664788
Epoch  18 / 20  -  train_rmse: 0.9074888759194341
Epoch  19 / 20  -  train_rmse: 0.9071159286957357
Epoch  20 / 20  -  train_rmse: 0.9067810678715049

Test RMSE: 0

## Matrix Factorization

In [8]:
%%time 
matrix_fact = MatrixFactorization(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.001, reg = 0.005)
matrix_fact.fit(train)

pred = matrix_fact.predict(test)
rmse = mean_squared_error(test['rating'], pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.0131156932692604
Epoch  2 / 20  -  train_rmse: 0.9737452999285628
Epoch  3 / 20  -  train_rmse: 0.952540753861357
Epoch  4 / 20  -  train_rmse: 0.938917032409236
Epoch  5 / 20  -  train_rmse: 0.9291655197256019
Epoch  6 / 20  -  train_rmse: 0.9216639521191695
Epoch  7 / 20  -  train_rmse: 0.9155920554121091
Epoch  8 / 20  -  train_rmse: 0.9104881151741863
Epoch  9 / 20  -  train_rmse: 0.9060698901459235
Epoch  10 / 20  -  train_rmse: 0.9021532505217927
Epoch  11 / 20  -  train_rmse: 0.8986115997821935
Epoch  12 / 20  -  train_rmse: 0.8953540410437282
Epoch  13 / 20  -  train_rmse: 0.8923128675330885
Epoch  14 / 20  -  train_rmse: 0.8894360087812822
Epoch  15 / 20  -  train_rmse: 0.8866822640314471
Epoch  16 / 20  -  train_rmse: 0.8840181824246149
Epoch  17 / 20  -  train_rmse: 0.8814159599330976
Epoch  18 / 20  -  train_rmse: 0.8788519900983536
Epoch  19 / 20  -  train_rmse: 0.8763058519900032
Epoch  20 / 20  -  train_rmse: 0.8737596022833991

Test RMSE:

### Getting list of recommendations for a user

In [9]:
user = 200
items_known = train.query('user_id == @user')['item_id']
matrix_fact.recommend(user=user, items_known=items_known)

Unnamed: 0,user_id,item_id,rating_pred
815,200,1250,4.72179
317,200,908,4.674285
596,200,2019,4.655865
1113,200,3435,4.64275
37,200,318,4.635284
1209,200,1207,4.633416
1981,200,1178,4.619055
1988,200,745,4.615561
655,200,527,4.609948
189,200,2762,4.609722


### Updating with new users

In [10]:
matrix_fact = MatrixFactorization(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.001, reg = 0.5)
matrix_fact.fit(train_initial)

Epoch  1 / 20  -  train_rmse: 1.017453407623808
Epoch  2 / 20  -  train_rmse: 0.9844477150625434
Epoch  3 / 20  -  train_rmse: 0.9681259718442118
Epoch  4 / 20  -  train_rmse: 0.9583876429745246
Epoch  5 / 20  -  train_rmse: 0.9519254023581528
Epoch  6 / 20  -  train_rmse: 0.9473320649242977
Epoch  7 / 20  -  train_rmse: 0.9439066747027656
Epoch  8 / 20  -  train_rmse: 0.9412604236187501
Epoch  9 / 20  -  train_rmse: 0.9391600145529355
Epoch  10 / 20  -  train_rmse: 0.9374568087602531
Epoch  11 / 20  -  train_rmse: 0.9360515262276466
Epoch  12 / 20  -  train_rmse: 0.9348752818165803
Epoch  13 / 20  -  train_rmse: 0.9338787603871875
Epoch  14 / 20  -  train_rmse: 0.9330257216590568
Epoch  15 / 20  -  train_rmse: 0.9322889368641933
Epoch  16 / 20  -  train_rmse: 0.9316475551427175
Epoch  17 / 20  -  train_rmse: 0.9310853440176857
Epoch  18 / 20  -  train_rmse: 0.9305894825490518
Epoch  19 / 20  -  train_rmse: 0.9301497143363141
Epoch  20 / 20  -  train_rmse: 0.9297577409183222


MatrixFactorization(lr=0.001, n_epochs=20, reg=0.5)

In [11]:
%%time

# Update model with new users
matrix_fact.update_users(train_update, lr=0.001, n_epochs=20, verbose=1)
pred = matrix_fact.predict(test_update)
rmse = mean_squared_error(test_update['rating'], pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 0.9753709546570367
Epoch  2 / 20  -  train_rmse: 0.9627583441585484
Epoch  3 / 20  -  train_rmse: 0.9547804232691787
Epoch  4 / 20  -  train_rmse: 0.9493349624667294
Epoch  5 / 20  -  train_rmse: 0.9454156730751521
Epoch  6 / 20  -  train_rmse: 0.9424806133583435
Epoch  7 / 20  -  train_rmse: 0.940213282664107
Epoch  8 / 20  -  train_rmse: 0.938417298298735
Epoch  9 / 20  -  train_rmse: 0.9369648911723226
Epoch  10 / 20  -  train_rmse: 0.9357696846776115
Epoch  11 / 20  -  train_rmse: 0.9347713982456956
Epoch  12 / 20  -  train_rmse: 0.9339268154225837
Epoch  13 / 20  -  train_rmse: 0.9332042285318105
Epoch  14 / 20  -  train_rmse: 0.9325799017752063
Epoch  15 / 20  -  train_rmse: 0.9320357513480894
Epoch  16 / 20  -  train_rmse: 0.9315577835310822
Epoch  17 / 20  -  train_rmse: 0.9311350184767716
Epoch  18 / 20  -  train_rmse: 0.930758733206125
Epoch  19 / 20  -  train_rmse: 0.9304219192355778
Epoch  20 / 20  -  train_rmse: 0.9301188875204278

Test RMSE: 

## Surprise

In [12]:
# Load the movielens-100k dataset (download it if needed),
data = sp.Dataset.load_builtin('ml-100k')

In [13]:
%%time
# We'll use the famous SVD algorithm.
algo = sp.SVD()

# Run 5-fold cross-validation and print results
sp.model_selection.cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9427  0.9346  0.9333  0.9366  0.9352  0.9365  0.0033  
MAE (testset)     0.7440  0.7368  0.7351  0.7370  0.7379  0.7382  0.0030  
Fit time          4.86    4.76    4.85    5.03    5.19    4.94    0.15    
Test time         0.13    0.16    0.18    0.14    0.21    0.16    0.03    
Wall time: 26.5 s


{'test_rmse': array([0.94266318, 0.93461611, 0.93329959, 0.93660388, 0.93519107]),
 'test_mae': array([0.74398474, 0.73679712, 0.73513794, 0.73699482, 0.73788392]),
 'fit_time': (4.858710289001465,
  4.762438774108887,
  4.845961332321167,
  5.028306245803833,
  5.1944146156311035),
 'test_time': (0.13063931465148926,
  0.16353535652160645,
  0.17874550819396973,
  0.13814067840576172,
  0.21229290962219238)}

In [14]:
%%time

# surprise_svd = sp.SVD(lr_all=0.001, reg_all=0.005, n_epochs=20, n_factors=50)
surprise_svd = sp.BaselineOnly()

reader = sp.Reader()

data_train = sp.Dataset.load_from_df(train[['user_id', 'item_id', 'rating']], reader = reader).build_full_trainset()
data_test = sp.Dataset.load_from_df(test[['user_id', 'item_id', 'rating']], reader = reader).build_full_trainset().build_testset()
surprise_svd.fit(data_train)

pred = surprise_svd.test(data_test)
sp.accuracy.rmse(pred)

Estimating biases using als...
RMSE: 0.9096
Wall time: 7.85 s


0.9095729293919449