In [1]:
# Data manipulation
import numpy as np
import pandas as pd
pd.options.display.max_rows = 100

# Recommender systems
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import surprise as sp

# Other
import os
import random
import sys

# Custom modules
# Put the notebook's parent directory on sys.path (once) so the local `src` package can be imported
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
from src import BaselineModel, MatrixFactorization, train_update_test_split

# Enable IPython's autoreload extension in mode 2 so that edits to imported modules
# (such as the local `src` package) are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

# One shared seed for both Python's `random` module and NumPy's global RNG,
# so the stochastic steps in this notebook repeat across fresh runs
rand_seed = 2
random.seed(rand_seed)
np.random.seed(rand_seed)

# Load data

**The MovieLens rating data used here can be downloaded from https://grouplens.org/datasets/movielens/**

In [2]:
# Column layout of the raw ratings file. Only the first three columns are loaded
# (usecols=[0, 1, 2]), so the timestamp is named here but never read.
cols = ['user_id', 'item_id', 'rating', 'timestamp']
# The '::' separator is longer than one character, which is why the Python parsing engine is requested.
movie_data = pd.read_csv('../data/ml-1m/ratings.dat', names = cols, sep = '::', usecols=[0, 1, 2], engine='python')
# movie_data = pd.read_csv('../data/ml-100k/u.data', names = cols, sep = '\t', usecols=[0, 1, 2], engine='python')

# Prepare data: hold out 20% of the ratings for testing.
# The seed is passed explicitly so that re-running this cell always yields the same split,
# instead of depending on how much of NumPy's global random stream was consumed beforehand.
train, test = train_test_split(movie_data, test_size=0.2, random_state=rand_seed)

# Prepare data for online learning
# NOTE(review): train_update_test_split is a project helper from `src`; presumably it sets aside
# frac_new_users of the users as "new" users for the update/test frames — confirm against its source.
train_initial, train_update, test_update = train_update_test_split(movie_data, frac_new_users=0.2)

movie_data.head(10)

Unnamed: 0,user_id,item_id,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
5,1,1197,3
6,1,1287,5
7,1,2804,5
8,1,594,4
9,1,919,4


# Simple model with global mean

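Predicting the global mean for every rating gives an RMSE that is approximately the standard deviation of the ratings, so this score is the floor that every model below should beat.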

In [3]:
# Simplest possible predictor: the mean training rating, repeated for every test row
global_mean = train['rating'].mean()
pred = np.full(len(test), global_mean)

# Take the root explicitly rather than passing `squared=False`,
# which newer scikit-learn releases no longer accept.
rmse = np.sqrt(mean_squared_error(test['rating'], pred))

# Four decimal places, consistent with the other evaluation cells
print(f'\nTest RMSE: {rmse:.4f}')


Test RMSE: 1.117284


# Baseline Model with biases

## SGD

In [4]:
%%time

# Fit the bias baseline on the training split; verbose=1 prints the train RMSE after every epoch
baseline_model = BaselineModel(n_epochs = 20, reg = 0.005, verbose=1)
baseline_model.fit(train)

# Score on the held-out ratings. The root is taken explicitly rather than passing
# `squared=False`, which newer scikit-learn releases no longer accept.
pred = baseline_model.predict(test)
rmse = np.sqrt(mean_squared_error(test['rating'], pred))

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 0.9159252285487107
Epoch  2 / 20  -  train_rmse: 0.9054648413828678
Epoch  3 / 20  -  train_rmse: 0.9021557010752497
Epoch  4 / 20  -  train_rmse: 0.9006867132228016
Epoch  5 / 20  -  train_rmse: 0.8999153812555668
Epoch  6 / 20  -  train_rmse: 0.8994658823750281
Epoch  7 / 20  -  train_rmse: 0.8991838099413846
Epoch  8 / 20  -  train_rmse: 0.898996392067281
Epoch  9 / 20  -  train_rmse: 0.8988659401531301
Epoch  10 / 20  -  train_rmse: 0.8987715199772648
Epoch  11 / 20  -  train_rmse: 0.8987008477189359
Epoch  12 / 20  -  train_rmse: 0.8986463860326711
Epoch  13 / 20  -  train_rmse: 0.8986033334840108
Epoch  14 / 20  -  train_rmse: 0.8985685315752041
Epoch  15 / 20  -  train_rmse: 0.8985398432976371
Epoch  16 / 20  -  train_rmse: 0.8985157860978665
Epoch  17 / 20  -  train_rmse: 0.8984953079118734
Epoch  18 / 20  -  train_rmse: 0.8984776465644875
Epoch  19 / 20  -  train_rmse: 0.8984622392707795
Epoch  20 / 20  -  train_rmse: 0.8984486630733863

Test RMSE

In [5]:
# Show the baseline model's recommendations for user 200 (a frame of user_id, item_id, rating_pred)
# NOTE(review): unlike the matrix-factorization recommend call later in this notebook, no
# items_known is passed here — confirm that already-rated items are handled as intended.
baseline_model.recommend(user=200)

Unnamed: 0,user_id,item_id,rating_pred
147,200,318,4.794188
1999,200,2905,4.782433
2229,200,1178,4.722444
501,200,50,4.713932
3401,200,2503,4.697496
1875,200,1212,4.674459
15,200,2019,4.667138
402,200,527,4.664419
942,200,745,4.645939
94,200,1148,4.632405


## ALS

In [6]:
%%time

# Same bias baseline, this time fitted with the ALS method and a stronger regularisation term
baseline_model = BaselineModel(method='als', n_epochs = 20, reg = 0.5, verbose=1)
baseline_model.fit(train)

# Score on the held-out ratings. The root is taken explicitly rather than passing
# `squared=False`, which newer scikit-learn releases no longer accept.
pred = baseline_model.predict(test)
rmse = np.sqrt(mean_squared_error(test['rating'], pred))

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 0.9176617885927157
Epoch  2 / 20  -  train_rmse: 0.8971604837483601
Epoch  3 / 20  -  train_rmse: 0.896194271222296
Epoch  4 / 20  -  train_rmse: 0.8961360919877157
Epoch  5 / 20  -  train_rmse: 0.8961314250532887
Epoch  6 / 20  -  train_rmse: 0.896130818212106
Epoch  7 / 20  -  train_rmse: 0.8961306848554195
Epoch  8 / 20  -  train_rmse: 0.8961306360024889
Epoch  9 / 20  -  train_rmse: 0.8961306058768953
Epoch  10 / 20  -  train_rmse: 0.8961305803066902
Epoch  11 / 20  -  train_rmse: 0.8961305559767178
Epoch  12 / 20  -  train_rmse: 0.8961305320922529
Epoch  13 / 20  -  train_rmse: 0.8961305084604018
Epoch  14 / 20  -  train_rmse: 0.896130485033271
Epoch  15 / 20  -  train_rmse: 0.8961304617979958
Epoch  16 / 20  -  train_rmse: 0.8961304387501737
Epoch  17 / 20  -  train_rmse: 0.8961304158874858
Epoch  18 / 20  -  train_rmse: 0.8961303932080759
Epoch  19 / 20  -  train_rmse: 0.896130370710279
Epoch  20 / 20  -  train_rmse: 0.8961303483924202

Test RMSE: 0

## Updating with new users

In [7]:
# SGD baseline fitted on train_initial only (presumably the ratings of the already-known users);
# the users in train_update are added to this fitted model in the following cell
baseline_model = BaselineModel(
    method='sgd',
    n_epochs=20,
    lr=0.01,
    reg=0.05,
    verbose=1,
)
baseline_model.fit(train_initial)

Epoch  1 / 20  -  train_rmse: 0.9146801195330119
Epoch  2 / 20  -  train_rmse: 0.9050223594671496
Epoch  3 / 20  -  train_rmse: 0.9019987055625731
Epoch  4 / 20  -  train_rmse: 0.9006686569140377
Epoch  5 / 20  -  train_rmse: 0.8999729081939808
Epoch  6 / 20  -  train_rmse: 0.8995657145779568
Epoch  7 / 20  -  train_rmse: 0.89930691061048
Epoch  8 / 20  -  train_rmse: 0.8991314210618228
Epoch  9 / 20  -  train_rmse: 0.8990060235508048
Epoch  10 / 20  -  train_rmse: 0.8989124896959825
Epoch  11 / 20  -  train_rmse: 0.8988402184047977
Epoch  12 / 20  -  train_rmse: 0.8987827332620895
Epoch  13 / 20  -  train_rmse: 0.8987359043007539
Epoch  14 / 20  -  train_rmse: 0.8986969958933293
Epoch  15 / 20  -  train_rmse: 0.8986641335567704
Epoch  16 / 20  -  train_rmse: 0.8986359934290962
Epoch  17 / 20  -  train_rmse: 0.8986116149856502
Epoch  18 / 20  -  train_rmse: 0.8985902844349936
Epoch  19 / 20  -  train_rmse: 0.8985714599717707
Epoch  20 / 20  -  train_rmse: 0.8985547225547444


BaselineModel(n_epochs=20, reg=0.05)

In [8]:
%%time
# Update the already-fitted baseline with the ratings in train_update (20 more epochs, lr=0.001)
baseline_model.update_users(train_update, n_epochs=20, lr=0.001, verbose=1)

# Score on the held-out ratings of the update set. The root is taken explicitly rather than
# passing `squared=False`, which newer scikit-learn releases no longer accept.
pred = baseline_model.predict(test_update)
rmse = np.sqrt(mean_squared_error(test_update['rating'], pred))

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 0.9635727193182636
Epoch  2 / 20  -  train_rmse: 0.9500646758033183
Epoch  3 / 20  -  train_rmse: 0.9415386383955896
Epoch  4 / 20  -  train_rmse: 0.9357062825863587
Epoch  5 / 20  -  train_rmse: 0.9314783619674327
Epoch  6 / 20  -  train_rmse: 0.9282781751952418
Epoch  7 / 20  -  train_rmse: 0.9257735866453076
Epoch  8 / 20  -  train_rmse: 0.9237606324250288
Epoch  9 / 20  -  train_rmse: 0.9221076203068104
Epoch  10 / 20  -  train_rmse: 0.9207259734128911
Epoch  11 / 20  -  train_rmse: 0.9195540213709754
Epoch  12 / 20  -  train_rmse: 0.9185475311638134
Epoch  13 / 20  -  train_rmse: 0.9176739474537375
Epoch  14 / 20  -  train_rmse: 0.9169087659571536
Epoch  15 / 20  -  train_rmse: 0.916233178625095
Epoch  16 / 20  -  train_rmse: 0.9156325014702582
Epoch  17 / 20  -  train_rmse: 0.915095097997138
Epoch  18 / 20  -  train_rmse: 0.9146116248523293
Epoch  19 / 20  -  train_rmse: 0.914174492147414
Epoch  20 / 20  -  train_rmse: 0.9137774700652217

Test RMSE: 

# Matrix Factorization

## Linear Kernel

In [9]:
%%time 
# Matrix factorization with 100 latent factors; no kernel is given, so the class default is used
matrix_fact = MatrixFactorization(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.001, reg = 0.005)
matrix_fact.fit(train)

# Score on the held-out ratings. The root is taken explicitly rather than passing
# `squared=False`, which newer scikit-learn releases no longer accept.
pred = matrix_fact.predict(test)
rmse = np.sqrt(mean_squared_error(test['rating'], pred))

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.0131011252294773
Epoch  2 / 20  -  train_rmse: 0.9736666969388077
Epoch  3 / 20  -  train_rmse: 0.9523944671540961
Epoch  4 / 20  -  train_rmse: 0.9387221552225186
Epoch  5 / 20  -  train_rmse: 0.928938460851335
Epoch  6 / 20  -  train_rmse: 0.9214163756438052
Epoch  7 / 20  -  train_rmse: 0.9153321030137026
Epoch  8 / 20  -  train_rmse: 0.9102216297519944
Epoch  9 / 20  -  train_rmse: 0.9058012779631329
Epoch  10 / 20  -  train_rmse: 0.9018860547022041
Epoch  11 / 20  -  train_rmse: 0.8983488825183165
Epoch  12 / 20  -  train_rmse: 0.8950986431955273
Epoch  13 / 20  -  train_rmse: 0.8920675889254348
Epoch  14 / 20  -  train_rmse: 0.8892037355110048
Epoch  15 / 20  -  train_rmse: 0.8864660582431466
Epoch  16 / 20  -  train_rmse: 0.8838213430103019
Epoch  17 / 20  -  train_rmse: 0.8812420579217561
Epoch  18 / 20  -  train_rmse: 0.8787048792138804
Epoch  19 / 20  -  train_rmse: 0.8761896524743615
Epoch  20 / 20  -  train_rmse: 0.8736786543849769

Test RMSE

## Getting list of recommendations for a user

In [10]:
# Collect the items this user rated in the training split and hand them to recommend()
# as items_known (presumably so they are left out of the returned list — verify in `src`)
user = 200
items_known = train.loc[train['user_id'] == user, 'item_id']
matrix_fact.recommend(user=user, items_known=items_known)

Unnamed: 0,user_id,item_id,rating_pred
1468,200,922,4.799596
219,200,750,4.754314
465,200,745,4.751673
687,200,912,4.717364
2139,200,1178,4.678136
367,200,3435,4.658557
215,200,527,4.65154
888,200,904,4.641466
445,200,50,4.64054
229,200,318,4.631874


## Updating with new users

In [11]:
# RBF-kernel factorization fitted on train_initial only; the users in train_update are added afterwards.
# (An earlier variant of this cell used lr = 0.001 and reg = 0.5 with the same kernel.)
matrix_fact = MatrixFactorization(
    kernel='rbf',
    n_factors=100,
    n_epochs=20,
    lr=0.5,
    reg=0.005,
    verbose=1,
)
matrix_fact.fit(train_initial)

Epoch  1 / 20  -  train_rmse: 1.0568610899535198
Epoch  2 / 20  -  train_rmse: 0.9781091388392953
Epoch  3 / 20  -  train_rmse: 0.9429025413735869
Epoch  4 / 20  -  train_rmse: 0.926466677767547
Epoch  5 / 20  -  train_rmse: 0.9187575999414697
Epoch  6 / 20  -  train_rmse: 0.9150129094612953
Epoch  7 / 20  -  train_rmse: 0.9131338912290606
Epoch  8 / 20  -  train_rmse: 0.9121715039702479
Epoch  9 / 20  -  train_rmse: 0.9116745732404521
Epoch  10 / 20  -  train_rmse: 0.911418889611707
Epoch  11 / 20  -  train_rmse: 0.911289048442608
Epoch  12 / 20  -  train_rmse: 0.9112240375586808
Epoch  13 / 20  -  train_rmse: 0.9111910006598881
Epoch  14 / 20  -  train_rmse: 0.9111720883027641
Epoch  15 / 20  -  train_rmse: 0.9111576447321493
Epoch  16 / 20  -  train_rmse: 0.9111425728282068
Epoch  17 / 20  -  train_rmse: 0.9111243439852966
Epoch  18 / 20  -  train_rmse: 0.9111018837050248
Epoch  19 / 20  -  train_rmse: 0.9110749354880822
Epoch  20 / 20  -  train_rmse: 0.9110436921486067


MatrixFactorization(gamma=0.01, kernel='rbf', lr=0.5, n_epochs=20, reg=0.005)

In [12]:
%%time

# Update model with new users: 20 more epochs on train_update at the same learning rate as the initial fit
matrix_fact.update_users(train_update, lr=0.5, n_epochs=20, verbose=1)

# Score on the held-out ratings of the update set. The root is taken explicitly rather than
# passing `squared=False`, which newer scikit-learn releases no longer accept.
pred = matrix_fact.predict(test_update)
rmse = np.sqrt(mean_squared_error(test_update['rating'], pred))

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.0119572386648612
Epoch  2 / 20  -  train_rmse: 0.9654911833226616
Epoch  3 / 20  -  train_rmse: 0.9484999592886597
Epoch  4 / 20  -  train_rmse: 0.9398415276904925
Epoch  5 / 20  -  train_rmse: 0.9347056320743945
Epoch  6 / 20  -  train_rmse: 0.9313874481586881
Epoch  7 / 20  -  train_rmse: 0.9291214126947662
Epoch  8 / 20  -  train_rmse: 0.9275107388390131
Epoch  9 / 20  -  train_rmse: 0.9263297441589416
Epoch  10 / 20  -  train_rmse: 0.925441414717506
Epoch  11 / 20  -  train_rmse: 0.9247584207037933
Epoch  12 / 20  -  train_rmse: 0.924222974314968
Epoch  13 / 20  -  train_rmse: 0.9237956942759937
Epoch  14 / 20  -  train_rmse: 0.9234491018299616
Epoch  15 / 20  -  train_rmse: 0.923163645988194
Epoch  16 / 20  -  train_rmse: 0.9229251831565568
Epoch  17 / 20  -  train_rmse: 0.9227233306625443
Epoch  18 / 20  -  train_rmse: 0.9225503649866644
Epoch  19 / 20  -  train_rmse: 0.9224004698361524
Epoch  20 / 20  -  train_rmse: 0.9222692144472568

Test RMSE: 

## Sigmoid kernel

In [13]:
%%time 
# Matrix factorization with the sigmoid kernel, fitted on the full training split
matrix_fact = MatrixFactorization(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.01, reg = 0.005, kernel='sigmoid')
matrix_fact.fit(train)

# Score on the held-out ratings. The root is taken explicitly rather than passing
# `squared=False`, which newer scikit-learn releases no longer accept.
pred = matrix_fact.predict(test)
rmse = np.sqrt(mean_squared_error(test['rating'], pred))

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.6534171739501924
Epoch  2 / 20  -  train_rmse: 1.5339356687644987
Epoch  3 / 20  -  train_rmse: 1.3752401796661198
Epoch  4 / 20  -  train_rmse: 1.259981827279546
Epoch  5 / 20  -  train_rmse: 1.1824029931387572
Epoch  6 / 20  -  train_rmse: 1.125426262537537
Epoch  7 / 20  -  train_rmse: 1.0817863732252446
Epoch  8 / 20  -  train_rmse: 1.0477191326241873
Epoch  9 / 20  -  train_rmse: 1.020687976383859
Epoch  10 / 20  -  train_rmse: 0.9988500814222605
Epoch  11 / 20  -  train_rmse: 0.9808577080462594
Epoch  12 / 20  -  train_rmse: 0.9657270998941859
Epoch  13 / 20  -  train_rmse: 0.9527413393270071
Epoch  14 / 20  -  train_rmse: 0.9413766531192692
Epoch  15 / 20  -  train_rmse: 0.9312469296837175
Epoch  16 / 20  -  train_rmse: 0.9220631672265107
Epoch  17 / 20  -  train_rmse: 0.9136049542540458
Epoch  18 / 20  -  train_rmse: 0.9057012603106378
Epoch  19 / 20  -  train_rmse: 0.8982180481120441
Epoch  20 / 20  -  train_rmse: 0.8910505137982465

Test RMSE: 

## RBF Kernel

In [14]:
%%time 
# Matrix factorization with the RBF kernel, fitted on the full training split
matrix_fact = MatrixFactorization(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.5, reg = 0.005, kernel='rbf')
matrix_fact.fit(train)

# Score on the held-out ratings. The root is taken explicitly rather than passing
# `squared=False`, which newer scikit-learn releases no longer accept.
pred = matrix_fact.predict(test)
rmse = np.sqrt(mean_squared_error(test['rating'], pred))

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.0649949743072618
Epoch  2 / 20  -  train_rmse: 0.9855529577287307
Epoch  3 / 20  -  train_rmse: 0.9486559856493921
Epoch  4 / 20  -  train_rmse: 0.9294799458611337
Epoch  5 / 20  -  train_rmse: 0.9197192571756231
Epoch  6 / 20  -  train_rmse: 0.9146795522785521
Epoch  7 / 20  -  train_rmse: 0.9120177912276821
Epoch  8 / 20  -  train_rmse: 0.9105960686919143
Epoch  9 / 20  -  train_rmse: 0.9098426971136084
Epoch  10 / 20  -  train_rmse: 0.9094579620113128
Epoch  11 / 20  -  train_rmse: 0.9092788657997173
Epoch  12 / 20  -  train_rmse: 0.9092141470872686
Epoch  13 / 20  -  train_rmse: 0.9092115984347159
Epoch  14 / 20  -  train_rmse: 0.9092409432466452
Epoch  15 / 20  -  train_rmse: 0.9092845351018265
Epoch  16 / 20  -  train_rmse: 0.9093321455718028
Epoch  17 / 20  -  train_rmse: 0.9093779654274577
Epoch  18 / 20  -  train_rmse: 0.9094188416885474
Epoch  19 / 20  -  train_rmse: 0.9094532230504574
Epoch  20 / 20  -  train_rmse: 0.9094805203330077

Test RMS

# Surprise

In [None]:
# Load Surprise's built-in MovieLens-100k dataset (downloaded if it is not already available locally).
# NOTE(review): the custom models above are fitted on ml-1m (see the read_csv call), so scores
# obtained on this dataset are not directly comparable with theirs — confirm this is intended.
data = sp.Dataset.load_builtin('ml-100k')

In [22]:
%%time
# SVD from Surprise, constructed with its default hyperparameters
algo = sp.SVD()

# 5-fold cross-validation on `data`, reporting RMSE and MAE; verbose=True prints the per-fold table
sp.model_selection.cross_validate(
    algo,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9333  0.9435  0.9348  0.9342  0.9390  0.9370  0.0038  
MAE (testset)     0.7394  0.7446  0.7382  0.7324  0.7399  0.7389  0.0039  
Fit time          4.88    4.94    4.82    4.79    4.65    4.82    0.10    
Test time         0.13    0.14    0.13    0.21    0.13    0.15    0.03    
Wall time: 25.6 s


{'test_rmse': array([0.93326446, 0.94347125, 0.93482663, 0.93419445, 0.93902724]),
 'test_mae': array([0.73941536, 0.74455453, 0.73824316, 0.73239435, 0.73991508]),
 'fit_time': (4.881997585296631,
  4.935999631881714,
  4.823998212814331,
  4.791991949081421,
  4.647997856140137),
 'test_time': (0.12800049781799316,
  0.136000394821167,
  0.13100290298461914,
  0.2109992504119873,
  0.12800121307373047)}

In [24]:
%%time

# Wrap the same train/test frames used by the custom models in Surprise's own containers
reader = sp.Reader()
data_train = (
    sp.Dataset
    .load_from_df(train[['user_id', 'item_id', 'rating']], reader=reader)
    .build_full_trainset()
)
# The test frame is first turned into a trainset and then into the testset that `test()` consumes
data_test = (
    sp.Dataset
    .load_from_df(test[['user_id', 'item_id', 'rating']], reader=reader)
    .build_full_trainset()
    .build_testset()
)

# surprise_svd = sp.BaselineOnly()
surprise_svd = sp.SVD(n_factors=50, n_epochs=20, lr_all=0.001, reg_all=0.005)
surprise_svd.fit(data_train)

# accuracy.rmse prints the score and, as the cell's last expression, also displays the returned value
pred = surprise_svd.test(data_test)
sp.accuracy.rmse(pred)

RMSE: 0.9161
Wall time: 35.9 s


0.916061209544704