In [1]:
# Data manipulation
import numpy as np
import pandas as pd
pd.options.display.max_rows = 100

# Modeling
from matrix_factorization import BaselineModel, KernelMF, train_update_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Other
import os
import random
import sys

# Reload imported code: load IPython's autoreload extension and switch it to
# mode 2 so edits to imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Print all output: show the value of every top-level expression in a cell,
# not only the last one. Later cells rely on this to display several results.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
    
# Seed NumPy's global RNG and Python's `random` module with the same value so
# that code drawing from either global generator is repeatable on a fresh run.
rand_seed = 2
np.random.seed(rand_seed)
random.seed(rand_seed)

# Load data

**The MovieLens rating data used below can be downloaded from <https://grouplens.org/datasets/movielens/>.**

In [2]:
# Column names for the raw ratings file. Only the first three columns are
# loaded (see `usecols`), so the timestamp is dropped at read time.
cols = ['user_id', 'item_id', 'rating', 'timestamp']

# Alternative dataset (MovieLens 1M); swap it in for the line below to use it:
# movie_data = pd.read_csv('../data/ml-1m/ratings.dat', names = cols, sep = '::', usecols=[0, 1, 2], engine='python')
movie_data = pd.read_csv('../data/ml-100k/u.data', names = cols, sep = '\t', usecols=[0, 1, 2], engine='python')

# Features are the (user, item) pair; the target is the rating.
X = movie_data[['user_id', 'item_id']]
y = movie_data['rating']

# Prepare data
# Pass the seed explicitly: without `random_state` the split depends on how much
# of the global NumPy RNG has been consumed, so re-running this cell on its own
# would silently produce a different train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=rand_seed
)

# Prepare data for online learning
# NOTE(review): `frac_new_users=0.2` presumably holds out 20% of users as "new"
# users for the update step; confirm against train_update_test_split.
(
    X_train_initial,
    y_train_initial,
    X_train_update,
    y_train_update,
    X_test_update,
    y_test_update,
) = train_update_test_split(movie_data, frac_new_users=0.2)

movie_data.head(10)

Unnamed: 0,user_id,item_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1
5,298,474,4
6,115,265,2
7,253,465,5
8,305,451,3
9,6,86,3


# Simple model with global mean

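Predicting the global mean rating for every user–item pair gives an RMSE that is roughly equal to the standard deviation of the ratings, so this is the score every later model has to beat.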

In [3]:
# Constant predictor: every test rating is predicted as the mean training rating.
global_mean = y_train.mean()
pred = np.full(y_test.shape[0], global_mean)

# RMSE is computed as sqrt(MSE) rather than with `squared=False`, which was
# deprecated in scikit-learn 1.4 and later removed.
rmse = np.sqrt(mean_squared_error(y_test, pred))

# `.4f` (precision) rather than `4f` (minimum width), matching the other cells.
print(f'\nTest RMSE: {rmse:.4f}')


Test RMSE: 1.120652


# Baseline Model with biases

## SGD

In [4]:
%%time

baseline_model = BaselineModel(method='sgd', n_epochs = 20, reg = 0.005, lr = 0.01, verbose=1)
baseline_model.fit(X_train, y_train)

pred = baseline_model.predict(X_test)
rmse = mean_squared_error(y_test, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 0.9685443987174238
Epoch  2 / 20  -  train_rmse: 0.945448032425675
Epoch  3 / 20  -  train_rmse: 0.9350744230954693
Epoch  4 / 20  -  train_rmse: 0.9294774771346712
Epoch  5 / 20  -  train_rmse: 0.9258635943145475
Epoch  6 / 20  -  train_rmse: 0.9235995589398913
Epoch  7 / 20  -  train_rmse: 0.9218589129974872
Epoch  8 / 20  -  train_rmse: 0.9205752967946901
Epoch  9 / 20  -  train_rmse: 0.9197497680553437
Epoch  10 / 20  -  train_rmse: 0.9189075470532244
Epoch  11 / 20  -  train_rmse: 0.9184605627485326
Epoch  12 / 20  -  train_rmse: 0.9180274072268116
Epoch  13 / 20  -  train_rmse: 0.9174771346162836
Epoch  14 / 20  -  train_rmse: 0.9172615435062336
Epoch  15 / 20  -  train_rmse: 0.9169118664096015
Epoch  16 / 20  -  train_rmse: 0.916762599540885
Epoch  17 / 20  -  train_rmse: 0.9165916401686293
Epoch  18 / 20  -  train_rmse: 0.9164009881488299
Epoch  19 / 20  -  train_rmse: 0.9161039428103391
Epoch  20 / 20  -  train_rmse: 0.9160441667784996

Test RMSE:

In [5]:
# Show the baseline model's recommendations for user 200.
# NOTE(review): unlike the KernelMF recommendation cell later in the notebook,
# no `items_known` is passed here, so items this user has already rated are
# presumably not excluded - confirm against BaselineModel.recommend.
baseline_model.recommend(user=200)

Unnamed: 0,user_id,item_id,rating_pred
378,200,318,5.0
457,200,357,5.0
388,200,408,5.0
988,200,1449,5.0
281,200,483,5.0
790,200,114,5.0
109,200,127,5.0
562,200,12,5.0
212,200,169,5.0
54,200,603,5.0


## ALS

In [6]:
%%time

baseline_model = BaselineModel(method='als', n_epochs = 20, reg = 0.5, verbose=1)
baseline_model.fit(X_train, y_train)

pred = baseline_model.predict(X_test)
rmse = mean_squared_error(y_test, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 0.9312489364350157
Epoch  2 / 20  -  train_rmse: 0.9144875214764501
Epoch  3 / 20  -  train_rmse: 0.9134856911195807
Epoch  4 / 20  -  train_rmse: 0.9133800448918423
Epoch  5 / 20  -  train_rmse: 0.9133615794862777
Epoch  6 / 20  -  train_rmse: 0.9133565857003941
Epoch  7 / 20  -  train_rmse: 0.9133544601244424
Epoch  8 / 20  -  train_rmse: 0.9133531004630441
Epoch  9 / 20  -  train_rmse: 0.9133519902067218
Epoch  10 / 20  -  train_rmse: 0.9133509792033206
Epoch  11 / 20  -  train_rmse: 0.9133500175542733
Epoch  12 / 20  -  train_rmse: 0.9133490869495551
Epoch  13 / 20  -  train_rmse: 0.9133481801287349
Epoch  14 / 20  -  train_rmse: 0.9133472939684136
Epoch  15 / 20  -  train_rmse: 0.9133464269599311
Epoch  16 / 20  -  train_rmse: 0.9133455782426871
Epoch  17 / 20  -  train_rmse: 0.9133447472230197
Epoch  18 / 20  -  train_rmse: 0.9133439334215674
Epoch  19 / 20  -  train_rmse: 0.9133431364114416
Epoch  20 / 20  -  train_rmse: 0.9133423557930989

Test RMS

## Updating with new users

In [7]:
# Fit the SGD baseline on the initial split only; the users in the update split
# are introduced to the model in the next cell.
baseline_model = BaselineModel(
    method='sgd',
    n_epochs=20,
    lr=0.01,
    reg=0.05,
    verbose=1,
)
baseline_model.fit(X_train_initial, y_train_initial)

Epoch  1 / 20  -  train_rmse: 0.9650236406922229
Epoch  2 / 20  -  train_rmse: 0.9428226226596799
Epoch  3 / 20  -  train_rmse: 0.9331705124882925
Epoch  4 / 20  -  train_rmse: 0.9279749973416741
Epoch  5 / 20  -  train_rmse: 0.9247974571263335
Epoch  6 / 20  -  train_rmse: 0.9226517575035114
Epoch  7 / 20  -  train_rmse: 0.920835039334346
Epoch  8 / 20  -  train_rmse: 0.9197367786245378
Epoch  9 / 20  -  train_rmse: 0.9189681287833118
Epoch  10 / 20  -  train_rmse: 0.9181493468113285
Epoch  11 / 20  -  train_rmse: 0.9177119438426637
Epoch  12 / 20  -  train_rmse: 0.9172589415232193
Epoch  13 / 20  -  train_rmse: 0.9168827001131301
Epoch  14 / 20  -  train_rmse: 0.9164445680503323
Epoch  15 / 20  -  train_rmse: 0.9164404466859075
Epoch  16 / 20  -  train_rmse: 0.9160093360322635
Epoch  17 / 20  -  train_rmse: 0.9158025569643043
Epoch  18 / 20  -  train_rmse: 0.9157375955425434
Epoch  19 / 20  -  train_rmse: 0.9156845197413601
Epoch  20 / 20  -  train_rmse: 0.9153536272183195


BaselineModel(n_epochs=20, reg=0.05)

In [8]:
%%time
baseline_model.update_users(X_train_update, y_train_update, n_epochs=20, lr=0.001, verbose=1)
pred = baseline_model.predict(X_test_update)
rmse = mean_squared_error(y_test_update, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.0192369838658015
Epoch  2 / 20  -  train_rmse: 1.0025765882013635
Epoch  3 / 20  -  train_rmse: 0.9901259692095271
Epoch  4 / 20  -  train_rmse: 0.9807144030582827
Epoch  5 / 20  -  train_rmse: 0.9734408997442995
Epoch  6 / 20  -  train_rmse: 0.9677156773644434
Epoch  7 / 20  -  train_rmse: 0.9631030982793267
Epoch  8 / 20  -  train_rmse: 0.9593444020925831
Epoch  9 / 20  -  train_rmse: 0.9562283345776661
Epoch  10 / 20  -  train_rmse: 0.9536075629675317
Epoch  11 / 20  -  train_rmse: 0.9513672180603409
Epoch  12 / 20  -  train_rmse: 0.9494208315066158
Epoch  13 / 20  -  train_rmse: 0.9477253749191763
Epoch  14 / 20  -  train_rmse: 0.946229927618241
Epoch  15 / 20  -  train_rmse: 0.9449080911468511
Epoch  16 / 20  -  train_rmse: 0.943720843305453
Epoch  17 / 20  -  train_rmse: 0.9426516413656599
Epoch  18 / 20  -  train_rmse: 0.9416762680286268
Epoch  19 / 20  -  train_rmse: 0.9407955983703769
Epoch  20 / 20  -  train_rmse: 0.9399846956755161

Test RMSE:

# Matrix Factorization

## Linear Kernel

In [9]:
%%time 
matrix_fact = KernelMF(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.001, reg = 0.005)
matrix_fact.fit(X_train, y_train)

pred = matrix_fact.predict(X_test)
rmse = mean_squared_error(y_test, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.0801330309911932
Epoch  2 / 20  -  train_rmse: 1.0473476509450943
Epoch  3 / 20  -  train_rmse: 1.0244646832888804
Epoch  4 / 20  -  train_rmse: 1.0074920647400105
Epoch  5 / 20  -  train_rmse: 0.994246835724601
Epoch  6 / 20  -  train_rmse: 0.9835051043916838
Epoch  7 / 20  -  train_rmse: 0.9745225390156432
Epoch  8 / 20  -  train_rmse: 0.9668223717422572
Epoch  9 / 20  -  train_rmse: 0.9600683414209181
Epoch  10 / 20  -  train_rmse: 0.9540555205061302
Epoch  11 / 20  -  train_rmse: 0.9486137679667849
Epoch  12 / 20  -  train_rmse: 0.9436380921221055
Epoch  13 / 20  -  train_rmse: 0.9390299858326666
Epoch  14 / 20  -  train_rmse: 0.9347250023203936
Epoch  15 / 20  -  train_rmse: 0.9306721252709302
Epoch  16 / 20  -  train_rmse: 0.9268329678953544
Epoch  17 / 20  -  train_rmse: 0.9231713443339361
Epoch  18 / 20  -  train_rmse: 0.919660317751421
Epoch  19 / 20  -  train_rmse: 0.9162775396770947
Epoch  20 / 20  -  train_rmse: 0.9130048063578868

Test RMSE:

## Getting list of recommendations for a user

In [10]:
# Recommend for a single user, handing the model the items that user has
# already rated in the training split via `items_known`.
user = 200
items_known = X_train.loc[X_train['user_id'] == user, 'item_id']
matrix_fact.recommend(user=user, items_known=items_known)

Unnamed: 0,user_id,item_id,rating_pred
37,200,64,5.0
242,200,357,4.953382
11,200,127,4.91476
61,200,272,4.904249
710,200,479,4.83706
395,200,480,4.836522
275,200,12,4.816657
655,200,427,4.808555
55,200,511,4.804192
144,200,285,4.797472


## Updating with new users

In [11]:
# Fit a fresh KernelMF on the initial split only; the users in the update split
# are introduced to the model in the next cell.
matrix_fact = KernelMF(
    n_epochs=20,
    n_factors=100,
    verbose=1,
    lr=0.001,
    reg=0.005,
)
matrix_fact.fit(X_train_initial, y_train_initial)

Epoch  1 / 20  -  train_rmse: 1.0706518319392073
Epoch  2 / 20  -  train_rmse: 1.0382624779438394
Epoch  3 / 20  -  train_rmse: 1.016232308328001
Epoch  4 / 20  -  train_rmse: 0.9999366805279928
Epoch  5 / 20  -  train_rmse: 0.9872308367922817
Epoch  6 / 20  -  train_rmse: 0.9769357406601346
Epoch  7 / 20  -  train_rmse: 0.9683129631342807
Epoch  8 / 20  -  train_rmse: 0.9609022720622064
Epoch  9 / 20  -  train_rmse: 0.9543972792347011
Epoch  10 / 20  -  train_rmse: 0.9485809462916166
Epoch  11 / 20  -  train_rmse: 0.9433059630075376
Epoch  12 / 20  -  train_rmse: 0.9384619306949283
Epoch  13 / 20  -  train_rmse: 0.9339642725110164
Epoch  14 / 20  -  train_rmse: 0.9297503741854064
Epoch  15 / 20  -  train_rmse: 0.9257711482478324
Epoch  16 / 20  -  train_rmse: 0.921985910287917
Epoch  17 / 20  -  train_rmse: 0.9183647974387779
Epoch  18 / 20  -  train_rmse: 0.9148839852245906
Epoch  19 / 20  -  train_rmse: 0.9115179356050906
Epoch  20 / 20  -  train_rmse: 0.9082510051903396


KernelMF(gamma=0.01, lr=0.001, n_epochs=20, reg=0.005)

In [12]:
%%time
# Update model with new users
matrix_fact.update_users(X_train_update, y_train_update, lr=0.001, n_epochs=20, verbose=1)
pred = matrix_fact.predict(X_test_update)
rmse = mean_squared_error(y_test_update, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.0397682004761137
Epoch  2 / 20  -  train_rmse: 1.0204525249402976
Epoch  3 / 20  -  train_rmse: 1.0058106710178145
Epoch  4 / 20  -  train_rmse: 0.9945551189184298
Epoch  5 / 20  -  train_rmse: 0.9856789782783212
Epoch  6 / 20  -  train_rmse: 0.9785788434701258
Epoch  7 / 20  -  train_rmse: 0.9727422998246427
Epoch  8 / 20  -  train_rmse: 0.9678438078577599
Epoch  9 / 20  -  train_rmse: 0.9636632891501984
Epoch  10 / 20  -  train_rmse: 0.9600308660297464
Epoch  11 / 20  -  train_rmse: 0.9568136464702428
Epoch  12 / 20  -  train_rmse: 0.9539161652784045
Epoch  13 / 20  -  train_rmse: 0.9512904364030054
Epoch  14 / 20  -  train_rmse: 0.9488745417666238
Epoch  15 / 20  -  train_rmse: 0.9466285136632905
Epoch  16 / 20  -  train_rmse: 0.94452616338993
Epoch  17 / 20  -  train_rmse: 0.9425492611358841
Epoch  18 / 20  -  train_rmse: 0.9406751136767649
Epoch  19 / 20  -  train_rmse: 0.9388943623139107
Epoch  20 / 20  -  train_rmse: 0.9371880494897803

Test RMSE:

## Sigmoid kernel

In [13]:
%%time 
matrix_fact = KernelMF(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.01, reg = 0.005, kernel='sigmoid')
matrix_fact.fit(X_train, y_train)

pred = matrix_fact.predict(X_test)
rmse = mean_squared_error(y_test, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.7254842363611376
Epoch  2 / 20  -  train_rmse: 1.700347578847924
Epoch  3 / 20  -  train_rmse: 1.6622359141199023
Epoch  4 / 20  -  train_rmse: 1.6210456578773018
Epoch  5 / 20  -  train_rmse: 1.5756597449133936
Epoch  6 / 20  -  train_rmse: 1.523379818791774
Epoch  7 / 20  -  train_rmse: 1.4657317754887915
Epoch  8 / 20  -  train_rmse: 1.4093479432787581
Epoch  9 / 20  -  train_rmse: 1.358332738938575
Epoch  10 / 20  -  train_rmse: 1.3133318818212163
Epoch  11 / 20  -  train_rmse: 1.2739396811494321
Epoch  12 / 20  -  train_rmse: 1.2393300382279362
Epoch  13 / 20  -  train_rmse: 1.2087120677746743
Epoch  14 / 20  -  train_rmse: 1.181458675550588
Epoch  15 / 20  -  train_rmse: 1.1570300259298787
Epoch  16 / 20  -  train_rmse: 1.1349358601708097
Epoch  17 / 20  -  train_rmse: 1.114946996505043
Epoch  18 / 20  -  train_rmse: 1.0966573702646067
Epoch  19 / 20  -  train_rmse: 1.079843880247601
Epoch  20 / 20  -  train_rmse: 1.0642701656384883

Test RMSE: 1.1

## RBF Kernel

In [14]:
%%time 
matrix_fact = KernelMF(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.5, reg = 0.005, kernel='rbf')
matrix_fact.fit(X_train, y_train)

pred = matrix_fact.predict(X_test)
rmse = mean_squared_error(y_test, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.261497709751721
Epoch  2 / 20  -  train_rmse: 1.1098240081612984
Epoch  3 / 20  -  train_rmse: 1.0469994987862579
Epoch  4 / 20  -  train_rmse: 1.005181914551291
Epoch  5 / 20  -  train_rmse: 0.9752579187861348
Epoch  6 / 20  -  train_rmse: 0.9515686603321364
Epoch  7 / 20  -  train_rmse: 0.9340638617221303
Epoch  8 / 20  -  train_rmse: 0.9213238773972364
Epoch  9 / 20  -  train_rmse: 0.9115143003092134
Epoch  10 / 20  -  train_rmse: 0.9039437993331968
Epoch  11 / 20  -  train_rmse: 0.899792715730062
Epoch  12 / 20  -  train_rmse: 0.8949836709174682
Epoch  13 / 20  -  train_rmse: 0.8934174679325033
Epoch  14 / 20  -  train_rmse: 0.8897947618902249
Epoch  15 / 20  -  train_rmse: 0.8861334672817339
Epoch  16 / 20  -  train_rmse: 0.8850958002049469
Epoch  17 / 20  -  train_rmse: 0.883513182070616
Epoch  18 / 20  -  train_rmse: 0.8818590959179743
Epoch  19 / 20  -  train_rmse: 0.8817834058789318
Epoch  20 / 20  -  train_rmse: 0.8826416261286896

Test RMSE: 0

# Scikit-learn compatibility

In [15]:
from sklearn.model_selection import GridSearchCV, ParameterGrid

# Candidate hyper-parameters: 3 kernels x 3 factor counts x 3 epoch counts
# x 3 regularization strengths = 81 combinations.
param_grid = dict(
    kernel=['linear', 'sigmoid', 'rbf'],
    n_factors=[10, 20, 50],
    n_epochs=[10, 20, 50],
    reg=[0, 0.005, 0.1],
)

# Exhaustive 5-fold search over the grid using all available cores. The scorer
# is negated RMSE, so higher (closer to zero) is better.
grid_search = GridSearchCV(
    estimator=KernelMF(verbose=0),
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   36.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:  3.4min finished


GridSearchCV(cv=5, estimator=KernelMF(gamma=0.01, verbose=0), n_jobs=-1,
             param_grid={'kernel': ['linear', 'sigmoid', 'rbf'],
                         'n_epochs': [10, 20, 50], 'n_factors': [10, 20, 50],
                         'reg': [0, 0.005, 0.1]},
             scoring='neg_root_mean_squared_error', verbose=1)

In [16]:
# Best cross-validated score and the parameters that achieved it. The score is
# negative because the search uses 'neg_root_mean_squared_error'; its absolute
# value is the RMSE. Both expressions are displayed because the first cell sets
# InteractiveShell.ast_node_interactivity = "all".
grid_search.best_score_
grid_search.best_params_

-0.9252872735695155

{'kernel': 'linear', 'n_epochs': 50, 'n_factors': 50, 'reg': 0.1}