In [1]:
# Data manipulation
import numpy as np
import pandas as pd
pd.options.display.max_rows = 100

# Modeling
from matrix_factorization import BaselineModel, KernelMF, train_update_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Other
import os
import random
import sys

# Reload imported modules automatically before each cell executes, so edits
# to imported source files are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

# Display the value of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Seed both NumPy's global RNG and the stdlib `random` module with the same value
rand_seed = 2
np.random.seed(rand_seed)
random.seed(rand_seed)

# Load data

**The MovieLens rating data used below can be downloaded from https://grouplens.org/datasets/movielens/**

In [2]:
# Column names for the raw ratings file; only the first three columns are
# loaded (usecols=[0, 1, 2]), so `timestamp` is dropped on read.
cols = ['user_id', 'item_id', 'rating', 'timestamp']
# Alternative dataset (MovieLens 1M) - swap this line in for the 100K line below to use it:
# movie_data = pd.read_csv('../data/ml-1m/ratings.dat', names = cols, sep = '::', usecols=[0, 1, 2], engine='python')
movie_data = pd.read_csv('../data/ml-100k/u.data', names = cols, sep = '\t', usecols=[0, 1, 2], engine='python')

# Features are the (user, item) pair; the target is the rating
X = movie_data[['user_id', 'item_id']]
y = movie_data['rating']

# Prepare data
# The split is seeded explicitly so that re-running this cell on its own
# reproduces the same train/test partition, instead of depending on how far
# the global NumPy RNG has advanced since the seeding cell ran.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rand_seed)

# Prepare data for online learning
# NOTE(review): presumably holds out `frac_new_users` of the users as "new"
# users for the update/test sets - confirm against matrix_factorization.
X_train_initial, y_train_initial, X_train_update, y_train_update, X_test_update, y_test_update = train_update_test_split(movie_data, frac_new_users=0.2)

movie_data.head(10)

Unnamed: 0,user_id,item_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1
5,298,474,4
6,115,265,2
7,253,465,5
8,305,451,3
9,6,86,3


# Simple model with global mean

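Predicting the global mean for every rating yields an RMSE that is approximately the standard deviation of the ratings, so this is the score any real model should beat.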

In [3]:
# Naive baseline: predict the training-set mean rating for every test example
global_mean = y_train.mean()
pred = np.full(y_test.shape[0], global_mean)

# RMSE is computed as sqrt(MSE) so the cell runs on every scikit-learn
# release; the `squared=False` keyword is deprecated and later removed.
rmse = np.sqrt(mean_squared_error(y_test, pred))

# `.4f` prints four decimals like the other cells; the previous `4f` spec
# set a minimum field width of 4 and left the precision at the default of 6.
print(f'\nTest RMSE: {rmse:.4f}')


Test RMSE: 1.120652


# Baseline Model with biases

## SGD

In [4]:
%%time

baseline_model = BaselineModel(method='sgd', n_epochs = 20, reg = 0.005, lr = 0.01, verbose=1)
baseline_model.fit(X_train, y_train)

pred = baseline_model.predict(X_test)
rmse = mean_squared_error(y_test, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 0.968584946965404
Epoch  2 / 20  -  train_rmse: 0.9454399191694888
Epoch  3 / 20  -  train_rmse: 0.9353488733388631
Epoch  4 / 20  -  train_rmse: 0.9296936848263799
Epoch  5 / 20  -  train_rmse: 0.926151311570964
Epoch  6 / 20  -  train_rmse: 0.923775016755705
Epoch  7 / 20  -  train_rmse: 0.922100268641781
Epoch  8 / 20  -  train_rmse: 0.9208735985370318
Epoch  9 / 20  -  train_rmse: 0.9199463965251449
Epoch  10 / 20  -  train_rmse: 0.9192267312470667
Epoch  11 / 20  -  train_rmse: 0.9186552785087078
Epoch  12 / 20  -  train_rmse: 0.9181924141457299
Epoch  13 / 20  -  train_rmse: 0.9178108986051953
Epoch  14 / 20  -  train_rmse: 0.9174915391758885
Epoch  15 / 20  -  train_rmse: 0.9172205193346755
Epoch  16 / 20  -  train_rmse: 0.9169877015010416
Epoch  17 / 20  -  train_rmse: 0.9167855187837213
Epoch  18 / 20  -  train_rmse: 0.9166082341717784
Epoch  19 / 20  -  train_rmse: 0.9164514350842512
Epoch  20 / 20  -  train_rmse: 0.9163116821781069

Test RMSE: 0

In [5]:
# Recommendations for user 200 from the baseline model fitted in the previous cell.
# NOTE(review): no `items_known` is passed here, so items the user has already
# rated may appear in the result - confirm BaselineModel.recommend's default.
baseline_model.recommend(user=200)

Unnamed: 0,user_id,item_id,rating_pred
790,200,114,5.0
34,200,50,5.0
338,200,64,5.0
212,200,169,5.0
988,200,1449,5.0
388,200,408,5.0
188,200,178,5.0
54,200,603,5.0
726,200,513,5.0
281,200,483,5.0


## ALS

In [6]:
%%time

baseline_model = BaselineModel(method='als', n_epochs = 20, reg = 0.5, verbose=1)
baseline_model.fit(X_train, y_train)

pred = baseline_model.predict(X_test)
rmse = mean_squared_error(y_test, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 0.9312489364350157
Epoch  2 / 20  -  train_rmse: 0.9144875214764501
Epoch  3 / 20  -  train_rmse: 0.9134856911195807
Epoch  4 / 20  -  train_rmse: 0.9133800448918423
Epoch  5 / 20  -  train_rmse: 0.9133615794862777
Epoch  6 / 20  -  train_rmse: 0.9133565857003941
Epoch  7 / 20  -  train_rmse: 0.9133544601244424
Epoch  8 / 20  -  train_rmse: 0.9133531004630441
Epoch  9 / 20  -  train_rmse: 0.9133519902067218
Epoch  10 / 20  -  train_rmse: 0.9133509792033206
Epoch  11 / 20  -  train_rmse: 0.9133500175542733
Epoch  12 / 20  -  train_rmse: 0.9133490869495551
Epoch  13 / 20  -  train_rmse: 0.9133481801287349
Epoch  14 / 20  -  train_rmse: 0.9133472939684136
Epoch  15 / 20  -  train_rmse: 0.9133464269599311
Epoch  16 / 20  -  train_rmse: 0.9133455782426871
Epoch  17 / 20  -  train_rmse: 0.9133447472230197
Epoch  18 / 20  -  train_rmse: 0.9133439334215674
Epoch  19 / 20  -  train_rmse: 0.9133431364114416
Epoch  20 / 20  -  train_rmse: 0.9133423557930989

Test RMS

## Updating with new users

In [7]:
# Train the SGD baseline on the initial ratings only; the ratings of the
# held-out users are supplied afterwards through `update_users`.
baseline_model = BaselineModel(
    method='sgd',
    n_epochs=20,
    lr=0.01,
    reg=0.05,
    verbose=1,
)
baseline_model.fit(X_train_initial, y_train_initial)

Epoch  1 / 20  -  train_rmse: 0.964499569923119
Epoch  2 / 20  -  train_rmse: 0.9428040600174458
Epoch  3 / 20  -  train_rmse: 0.9333717438673879
Epoch  4 / 20  -  train_rmse: 0.9281188215275641
Epoch  5 / 20  -  train_rmse: 0.9248381206356202
Epoch  6 / 20  -  train_rmse: 0.9226356513191752
Epoch  7 / 20  -  train_rmse: 0.9210775589005141
Epoch  8 / 20  -  train_rmse: 0.9199294720789966
Epoch  9 / 20  -  train_rmse: 0.9190550671651979
Epoch  10 / 20  -  train_rmse: 0.9183705215546366
Epoch  11 / 20  -  train_rmse: 0.917821977784321
Epoch  12 / 20  -  train_rmse: 0.9173735592119324
Epoch  13 / 20  -  train_rmse: 0.9170006265834857
Epoch  14 / 20  -  train_rmse: 0.9166858061210149
Epoch  15 / 20  -  train_rmse: 0.9164165597108449
Epoch  16 / 20  -  train_rmse: 0.9161836500513875
Epoch  17 / 20  -  train_rmse: 0.9159801439630649
Epoch  18 / 20  -  train_rmse: 0.91580074929513
Epoch  19 / 20  -  train_rmse: 0.9156413641563471
Epoch  20 / 20  -  train_rmse: 0.9154987644663583


BaselineModel(n_epochs=20, reg=0.05)

In [8]:
%%time
baseline_model.update_users(X_train_update, y_train_update, n_epochs=20, lr=0.001, verbose=1)
pred = baseline_model.predict(X_test_update)
rmse = mean_squared_error(y_test_update, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.0191142772546382
Epoch  2 / 20  -  train_rmse: 1.0024027918882694
Epoch  3 / 20  -  train_rmse: 0.9899293628487116
Epoch  4 / 20  -  train_rmse: 0.9804649221282987
Epoch  5 / 20  -  train_rmse: 0.9731535124309144
Epoch  6 / 20  -  train_rmse: 0.9673992319805947
Epoch  7 / 20  -  train_rmse: 0.9627858735178542
Epoch  8 / 20  -  train_rmse: 0.9590209162595595
Epoch  9 / 20  -  train_rmse: 0.9558969637901716
Epoch  10 / 20  -  train_rmse: 0.9532653733663553
Epoch  11 / 20  -  train_rmse: 0.9510182727624278
Epoch  12 / 20  -  train_rmse: 0.9490762937405616
Epoch  13 / 20  -  train_rmse: 0.9473801822317853
Epoch  14 / 20  -  train_rmse: 0.9458850326691678
Epoch  15 / 20  -  train_rmse: 0.9445562996890088
Epoch  16 / 20  -  train_rmse: 0.9433670167763138
Epoch  17 / 20  -  train_rmse: 0.9422958380100659
Epoch  18 / 20  -  train_rmse: 0.9413256443922408
Epoch  19 / 20  -  train_rmse: 0.9404425402236979
Epoch  20 / 20  -  train_rmse: 0.93963512123345

Test RMSE:

# Matrix Factorization

## Linear Kernel

In [9]:
%%time 
matrix_fact = KernelMF(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.001, reg = 0.005)
matrix_fact.fit(X_train, y_train)

pred = matrix_fact.predict(X_test)
rmse = mean_squared_error(y_test, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.0801222116631521
Epoch  2 / 20  -  train_rmse: 1.0473146520428591
Epoch  3 / 20  -  train_rmse: 1.0244607467803393
Epoch  4 / 20  -  train_rmse: 1.0074890819726983
Epoch  5 / 20  -  train_rmse: 0.9942491923413936
Epoch  6 / 20  -  train_rmse: 0.9835138901257094
Epoch  7 / 20  -  train_rmse: 0.9745356599913532
Epoch  8 / 20  -  train_rmse: 0.9668346620204138
Epoch  9 / 20  -  train_rmse: 0.9600896334525281
Epoch  10 / 20  -  train_rmse: 0.9540779001745824
Epoch  11 / 20  -  train_rmse: 0.9486405218111265
Epoch  12 / 20  -  train_rmse: 0.9436611119311279
Epoch  13 / 20  -  train_rmse: 0.9390524870345828
Epoch  14 / 20  -  train_rmse: 0.9347479880958213
Epoch  15 / 20  -  train_rmse: 0.9306956889311894
Epoch  16 / 20  -  train_rmse: 0.9268544411303467
Epoch  17 / 20  -  train_rmse: 0.9231911174153383
Epoch  18 / 20  -  train_rmse: 0.9196786547082368
Epoch  19 / 20  -  train_rmse: 0.9162946416048168
Epoch  20 / 20  -  train_rmse: 0.9130202831875339

Test RMS

## Getting list of recommendations for a user

In [10]:
# Pick a user and collect the items that user rated in the training split;
# these are handed to `recommend` as `items_known`
# (presumably so they are left out of the recommendations - confirm in KernelMF).
user = 200
items_known = X_train.loc[X_train['user_id'] == user, 'item_id']
matrix_fact.recommend(user=user, items_known=items_known)

Unnamed: 0,user_id,item_id,rating_pred
37,200,64,5.0
242,200,357,4.954014
11,200,127,4.91522
61,200,272,4.908755
395,200,480,4.841725
710,200,479,4.839973
275,200,12,4.820736
55,200,511,4.811989
655,200,427,4.809895
17,200,100,4.803464


## Updating with new users

In [11]:
# Train the matrix-factorization model on the initial ratings only; the
# ratings of the held-out users are supplied afterwards through `update_users`.
matrix_fact = KernelMF(
    n_factors=100,
    n_epochs=20,
    lr=0.001,
    reg=0.005,
    verbose=1,
)
matrix_fact.fit(X_train_initial, y_train_initial)

Epoch  1 / 20  -  train_rmse: 1.0705746932381526
Epoch  2 / 20  -  train_rmse: 1.0382660426957875
Epoch  3 / 20  -  train_rmse: 1.016215024525483
Epoch  4 / 20  -  train_rmse: 0.9999227294660362
Epoch  5 / 20  -  train_rmse: 0.9872249839452364
Epoch  6 / 20  -  train_rmse: 0.9769265257288504
Epoch  7 / 20  -  train_rmse: 0.9683048573697514
Epoch  8 / 20  -  train_rmse: 0.9608970562505023
Epoch  9 / 20  -  train_rmse: 0.9543939579409526
Epoch  10 / 20  -  train_rmse: 0.9485821217812994
Epoch  11 / 20  -  train_rmse: 0.9433098947660511
Epoch  12 / 20  -  train_rmse: 0.9384666913135622
Epoch  13 / 20  -  train_rmse: 0.9339699264597934
Epoch  14 / 20  -  train_rmse: 0.9297565525057904
Epoch  15 / 20  -  train_rmse: 0.9257774469205091
Epoch  16 / 20  -  train_rmse: 0.9219936103680543
Epoch  17 / 20  -  train_rmse: 0.918373539403859
Epoch  18 / 20  -  train_rmse: 0.9148913769568732
Epoch  19 / 20  -  train_rmse: 0.911525587551766
Epoch  20 / 20  -  train_rmse: 0.9082579928457778


KernelMF(gamma=0.01, lr=0.001, n_epochs=20, reg=0.005)

In [12]:
%%time
# Update model with new users
matrix_fact.update_users(X_train_update, y_train_update, lr=0.001, n_epochs=20, verbose=1)
pred = matrix_fact.predict(X_test_update)
rmse = mean_squared_error(y_test_update, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.0398656868356
Epoch  2 / 20  -  train_rmse: 1.0205576874442035
Epoch  3 / 20  -  train_rmse: 1.005932072259718
Epoch  4 / 20  -  train_rmse: 0.994657890390333
Epoch  5 / 20  -  train_rmse: 0.985793674422776
Epoch  6 / 20  -  train_rmse: 0.9786755773344993
Epoch  7 / 20  -  train_rmse: 0.9728353493432004
Epoch  8 / 20  -  train_rmse: 0.9679416274302333
Epoch  9 / 20  -  train_rmse: 0.9637585721547791
Epoch  10 / 20  -  train_rmse: 0.9601170133421498
Epoch  11 / 20  -  train_rmse: 0.9568944335345086
Epoch  12 / 20  -  train_rmse: 0.9540011144651367
Epoch  13 / 20  -  train_rmse: 0.9513705470566537
Epoch  14 / 20  -  train_rmse: 0.9489527779171221
Epoch  15 / 20  -  train_rmse: 0.9467097748161197
Epoch  16 / 20  -  train_rmse: 0.9446121807344438
Epoch  17 / 20  -  train_rmse: 0.9426370248154202
Epoch  18 / 20  -  train_rmse: 0.9407660949988312
Epoch  19 / 20  -  train_rmse: 0.9389847703547473
Epoch  20 / 20  -  train_rmse: 0.9372811746783785

Test RMSE: 0.9

## Sigmoid kernel

In [13]:
%%time 
matrix_fact = KernelMF(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.01, reg = 0.005, kernel='sigmoid')
matrix_fact.fit(X_train, y_train)

pred = matrix_fact.predict(X_test)
rmse = mean_squared_error(y_test, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.7254777443622737
Epoch  2 / 20  -  train_rmse: 1.7002770272866834
Epoch  3 / 20  -  train_rmse: 1.6621854353211043
Epoch  4 / 20  -  train_rmse: 1.6210519279111004
Epoch  5 / 20  -  train_rmse: 1.5756817133636316
Epoch  6 / 20  -  train_rmse: 1.5234665705656611
Epoch  7 / 20  -  train_rmse: 1.4658696361682801
Epoch  8 / 20  -  train_rmse: 1.4095143392284235
Epoch  9 / 20  -  train_rmse: 1.3585098674330331
Epoch  10 / 20  -  train_rmse: 1.3134812994218037
Epoch  11 / 20  -  train_rmse: 1.274038256061191
Epoch  12 / 20  -  train_rmse: 1.2394121109949494
Epoch  13 / 20  -  train_rmse: 1.2087791097194398
Epoch  14 / 20  -  train_rmse: 1.181472989275656
Epoch  15 / 20  -  train_rmse: 1.1569867880380225
Epoch  16 / 20  -  train_rmse: 1.1349140014111203
Epoch  17 / 20  -  train_rmse: 1.1149046640922917
Epoch  18 / 20  -  train_rmse: 1.0966454852532301
Epoch  19 / 20  -  train_rmse: 1.0798574505097602
Epoch  20 / 20  -  train_rmse: 1.0643001689478164

Test RMSE:

## RBF Kernel

In [14]:
%%time 
matrix_fact = KernelMF(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.5, reg = 0.005, kernel='rbf')
matrix_fact.fit(X_train, y_train)

pred = matrix_fact.predict(X_test)
rmse = mean_squared_error(y_test, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.2617769898308766
Epoch  2 / 20  -  train_rmse: 1.1099584186466687
Epoch  3 / 20  -  train_rmse: 1.0449538509447247
Epoch  4 / 20  -  train_rmse: 1.0032850543994067
Epoch  5 / 20  -  train_rmse: 0.9731173033230605
Epoch  6 / 20  -  train_rmse: 0.9504936444747265
Epoch  7 / 20  -  train_rmse: 0.9333982521769938
Epoch  8 / 20  -  train_rmse: 0.920477552089845
Epoch  9 / 20  -  train_rmse: 0.9107142631785093
Epoch  10 / 20  -  train_rmse: 0.9033277879070493
Epoch  11 / 20  -  train_rmse: 0.8977237427403485
Epoch  12 / 20  -  train_rmse: 0.8934545543507805
Epoch  13 / 20  -  train_rmse: 0.8901858675242217
Epoch  14 / 20  -  train_rmse: 0.8876689219940567
Epoch  15 / 20  -  train_rmse: 0.8857188281332395
Epoch  16 / 20  -  train_rmse: 0.8841980100807602
Epoch  17 / 20  -  train_rmse: 0.8830038246632579
Epoch  18 / 20  -  train_rmse: 0.8820594029251833
Epoch  19 / 20  -  train_rmse: 0.881306916237868
Epoch  20 / 20  -  train_rmse: 0.8807026398556485

Test RMSE:

# Scikit-learn compatibility

In [15]:
from sklearn.model_selection import GridSearchCV, ParameterGrid

# Search space: 3 kernels x 3 factor counts x 3 epoch counts x 3 reg values = 81 candidates
param_grid = dict(
    kernel=['linear', 'sigmoid', 'rbf'],
    n_factors=[10, 20, 50],
    n_epochs=[10, 20, 50],
    reg=[0, 0.005, 0.1],
)

# KernelMF is used as a regular scikit-learn estimator: 5-fold CV over the
# grid, scored by negated RMSE, parallelised across all available cores.
grid_search = GridSearchCV(
    estimator=KernelMF(verbose=0),
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   17.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   39.8s
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:  1.9min finished


GridSearchCV(cv=5, estimator=KernelMF(gamma=0.01, verbose=0), n_jobs=-1,
             param_grid={'kernel': ['linear', 'sigmoid', 'rbf'],
                         'n_epochs': [10, 20, 50], 'n_factors': [10, 20, 50],
                         'reg': [0, 0.005, 0.1]},
             scoring='neg_root_mean_squared_error', verbose=1)

In [16]:
# Both expressions below are displayed because the setup cell sets
# ast_node_interactivity to "all".
# Mean cross-validated score of the best candidate; the search was scored with
# 'neg_root_mean_squared_error', so this is the negated RMSE.
grid_search.best_score_
# Parameter combination that achieved that score
grid_search.best_params_

-0.9252857357209816

{'kernel': 'linear', 'n_epochs': 50, 'n_factors': 50, 'reg': 0.1}