In [1]:
# Data manipulation
import numpy as np
import pandas as pd
pd.options.display.max_rows = 100

# Modeling
# from matrix_factorization import BaselineModel, KernelMF, train_update_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Other
import os
import random
import sys

# Reload imported code 
%load_ext autoreload
%autoreload 2

# Print all output
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Custom modules
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from matrix_factorization import *
    
rand_seed = 2
np.random.seed(rand_seed)
random.seed(rand_seed)

# Load data

**The MovieLens data can be downloaded from https://grouplens.org/datasets/movielens/. This notebook reads the 100k dataset from `../data/ml-100k/u.data`.**

In [2]:
# Column names for the ratings file; only the first three columns are read (see usecols)
cols = ['user_id', 'item_id', 'rating', 'timestamp']

# MovieLens 100k ratings (tab separated).
# For MovieLens 1M, read '../data/ml-1m/ratings.dat' with sep='::' instead.
movie_data = pd.read_csv(
    '../data/ml-100k/u.data',
    sep='\t',
    names=cols,
    usecols=[0, 1, 2],
    engine='python',
)

# Features are the (user, item) pairs, target is the rating
X = movie_data[['user_id', 'item_id']]
y = movie_data['rating']

# Random 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Splits for the online-learning examples (initial fit / update / test).
# NOTE(review): frac_new_users is presumably the fraction of users held out as
# "new" users -- confirm in matrix_factorization.
(
    X_train_initial,
    y_train_initial,
    X_train_update,
    y_train_update,
    X_test_update,
    y_test_update,
) = train_update_test_split(movie_data, frac_new_users=0.2)

movie_data.head(10)

Unnamed: 0,user_id,item_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1
5,298,474,4
6,115,265,2
7,253,465,5
8,305,451,3
9,6,86,3


# Simple model with global mean

Predicting the global mean rating for every test example gives an RMSE that is approximately equal to the standard deviation of the ratings.

In [3]:
# Simplest possible model: predict the mean training rating for every test row
global_mean = y_train.mean()
pred = np.full(y_test.shape[0], global_mean)

# RMSE as sqrt(MSE): the `squared=False` argument of mean_squared_error was
# deprecated in scikit-learn 1.4 and later removed, so this form works on
# old and new versions alike
rmse = np.sqrt(mean_squared_error(y_test, pred))

# `.4f` (precision), not `4f` (minimum width), to match the other cells
print(f'\nTest RMSE: {rmse:.4f}')


Test RMSE: 1.120652


# Baseline Model with biases

## SGD

In [4]:
%%time

baseline_model = BaselineModel(method='sgd', n_epochs = 20, reg = 0.005, lr = 0.01, verbose=1)
baseline_model.fit(X_train, y_train)

pred = baseline_model.predict(X_test)
rmse = mean_squared_error(y_test, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 0.9686047785822739
Epoch  2 / 20  -  train_rmse: 0.9453664505621137
Epoch  3 / 20  -  train_rmse: 0.9351136783733202
Epoch  4 / 20  -  train_rmse: 0.9294704369882787
Epoch  5 / 20  -  train_rmse: 0.9259923829341642
Epoch  6 / 20  -  train_rmse: 0.9234802301835858
Epoch  7 / 20  -  train_rmse: 0.9218322975240733
Epoch  8 / 20  -  train_rmse: 0.9204146484742777
Epoch  9 / 20  -  train_rmse: 0.9195402387388252
Epoch  10 / 20  -  train_rmse: 0.9189683375712325
Epoch  11 / 20  -  train_rmse: 0.9183769991614104
Epoch  12 / 20  -  train_rmse: 0.9179874974998621
Epoch  13 / 20  -  train_rmse: 0.9176097352462143
Epoch  14 / 20  -  train_rmse: 0.9171260723903609
Epoch  15 / 20  -  train_rmse: 0.9168209946156023
Epoch  16 / 20  -  train_rmse: 0.9166864353013994
Epoch  17 / 20  -  train_rmse: 0.9165443126952022
Epoch  18 / 20  -  train_rmse: 0.9164152668839165
Epoch  19 / 20  -  train_rmse: 0.9161617476994287
Epoch  20 / 20  -  train_rmse: 0.9160456121895796

Test RMS

In [5]:
# Recommendations for user 200 from the baseline model fitted in the previous cell.
# NOTE(review): no `items_known` is passed here, unlike the KernelMF
# recommendation example later in this notebook -- confirm whether items the
# user has already rated should be excluded.
baseline_model.recommend(user=200)

Unnamed: 0,user_id,item_id,rating_pred
378,200,318,5.0
424,200,479,5.0
790,200,114,5.0
527,200,484,5.0
844,200,963,5.0
133,200,657,5.0
457,200,357,5.0
281,200,483,5.0
726,200,513,5.0
539,200,923,5.0


## ALS

In [6]:
%%time

baseline_model = BaselineModel(method='als', n_epochs = 20, reg = 0.5, verbose=1)
baseline_model.fit(X_train, y_train)

pred = baseline_model.predict(X_test)
rmse = mean_squared_error(y_test, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 0.9312489364350157
Epoch  2 / 20  -  train_rmse: 0.9144875214764501
Epoch  3 / 20  -  train_rmse: 0.9134856911195807
Epoch  4 / 20  -  train_rmse: 0.9133800448918423
Epoch  5 / 20  -  train_rmse: 0.9133615794862777
Epoch  6 / 20  -  train_rmse: 0.9133565857003941
Epoch  7 / 20  -  train_rmse: 0.9133544601244424
Epoch  8 / 20  -  train_rmse: 0.9133531004630441
Epoch  9 / 20  -  train_rmse: 0.9133519902067218
Epoch  10 / 20  -  train_rmse: 0.9133509792033206
Epoch  11 / 20  -  train_rmse: 0.9133500175542733
Epoch  12 / 20  -  train_rmse: 0.9133490869495551
Epoch  13 / 20  -  train_rmse: 0.9133481801287349
Epoch  14 / 20  -  train_rmse: 0.9133472939684136
Epoch  15 / 20  -  train_rmse: 0.9133464269599311
Epoch  16 / 20  -  train_rmse: 0.9133455782426871
Epoch  17 / 20  -  train_rmse: 0.9133447472230197
Epoch  18 / 20  -  train_rmse: 0.9133439334215674
Epoch  19 / 20  -  train_rmse: 0.9133431364114416
Epoch  20 / 20  -  train_rmse: 0.9133423557930989

Test RMS

## Updating with new users

In [7]:
# Fit the SGD baseline on the initial training split only; the update split
# is fed to the model in the next cell
baseline_model = BaselineModel(
    method='sgd',
    lr=0.01,
    reg=0.05,
    n_epochs=20,
    verbose=1,
)
baseline_model.fit(X_train_initial, y_train_initial)

Epoch  1 / 20  -  train_rmse: 0.9648469718465079
Epoch  2 / 20  -  train_rmse: 0.9428549007268147
Epoch  3 / 20  -  train_rmse: 0.933359815507563
Epoch  4 / 20  -  train_rmse: 0.9282869121659242
Epoch  5 / 20  -  train_rmse: 0.9246452147883015
Epoch  6 / 20  -  train_rmse: 0.9224963221592593
Epoch  7 / 20  -  train_rmse: 0.9210873751265738
Epoch  8 / 20  -  train_rmse: 0.9198148526842017
Epoch  9 / 20  -  train_rmse: 0.9190979962795893
Epoch  10 / 20  -  train_rmse: 0.9181924929724009
Epoch  11 / 20  -  train_rmse: 0.9175746422478163
Epoch  12 / 20  -  train_rmse: 0.9172790736796215
Epoch  13 / 20  -  train_rmse: 0.916719188200985
Epoch  14 / 20  -  train_rmse: 0.9164995522990769
Epoch  15 / 20  -  train_rmse: 0.9162652209705183
Epoch  16 / 20  -  train_rmse: 0.9161349468164517
Epoch  17 / 20  -  train_rmse: 0.9158049890489385
Epoch  18 / 20  -  train_rmse: 0.9156282513797119
Epoch  19 / 20  -  train_rmse: 0.9157443869486565
Epoch  20 / 20  -  train_rmse: 0.915292343522453


BaselineModel(n_epochs=20, reg=0.05)

In [8]:
%%time
baseline_model.update_users(X_train_update, y_train_update, n_epochs=20, lr=0.001, verbose=1)
pred = baseline_model.predict(X_test_update)
rmse = mean_squared_error(y_test_update, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.019344266663901
Epoch  2 / 20  -  train_rmse: 1.0026434848559835
Epoch  3 / 20  -  train_rmse: 0.9901902375419621
Epoch  4 / 20  -  train_rmse: 0.9807399554407025
Epoch  5 / 20  -  train_rmse: 0.9734393366405978
Epoch  6 / 20  -  train_rmse: 0.9677053535168566
Epoch  7 / 20  -  train_rmse: 0.9631010740660371
Epoch  8 / 20  -  train_rmse: 0.9593464832305924
Epoch  9 / 20  -  train_rmse: 0.956228132278306
Epoch  10 / 20  -  train_rmse: 0.953597496156807
Epoch  11 / 20  -  train_rmse: 0.9513585953281702
Epoch  12 / 20  -  train_rmse: 0.9494132780961599
Epoch  13 / 20  -  train_rmse: 0.9477215344137045
Epoch  14 / 20  -  train_rmse: 0.9462276496158921
Epoch  15 / 20  -  train_rmse: 0.9449016954637041
Epoch  16 / 20  -  train_rmse: 0.943701383621735
Epoch  17 / 20  -  train_rmse: 0.9426320119229815
Epoch  18 / 20  -  train_rmse: 0.94165973765909
Epoch  19 / 20  -  train_rmse: 0.9407697547984482
Epoch  20 / 20  -  train_rmse: 0.9399611302791129

Test RMSE: 0.9

# Matrix Factorization

## Linear Kernel

In [9]:
%%time 
matrix_fact = KernelMF(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.001, reg = 0.005)
matrix_fact.fit(X_train, y_train)

pred = matrix_fact.predict(X_test)
rmse = mean_squared_error(y_test, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.0801575298322
Epoch  2 / 20  -  train_rmse: 1.0473535756055057
Epoch  3 / 20  -  train_rmse: 1.0244942690795111
Epoch  4 / 20  -  train_rmse: 1.0075007581238407
Epoch  5 / 20  -  train_rmse: 0.9942400187850499
Epoch  6 / 20  -  train_rmse: 0.9834983608869214
Epoch  7 / 20  -  train_rmse: 0.9745173746067622
Epoch  8 / 20  -  train_rmse: 0.966813617253945
Epoch  9 / 20  -  train_rmse: 0.9600678085418083
Epoch  10 / 20  -  train_rmse: 0.9540556957314716
Epoch  11 / 20  -  train_rmse: 0.9486146196037182
Epoch  12 / 20  -  train_rmse: 0.9436378636687951
Epoch  13 / 20  -  train_rmse: 0.9390297626064091
Epoch  14 / 20  -  train_rmse: 0.9347241390534637
Epoch  15 / 20  -  train_rmse: 0.9306742385879399
Epoch  16 / 20  -  train_rmse: 0.9268325064246742
Epoch  17 / 20  -  train_rmse: 0.9231708515555644
Epoch  18 / 20  -  train_rmse: 0.9196592127479206
Epoch  19 / 20  -  train_rmse: 0.9162757110307747
Epoch  20 / 20  -  train_rmse: 0.9130011417801452

Test RMSE: 0

## Getting list of recommendations for a user

In [10]:
user = 200

# Items this user rated in the training split; passed as `items_known` to recommend()
items_known = X_train.loc[X_train['user_id'] == user, 'item_id']

matrix_fact.recommend(user=user, items_known=items_known)

Unnamed: 0,user_id,item_id,rating_pred
37,200,64,5.0
242,200,357,4.956367
11,200,127,4.913384
61,200,272,4.90369
395,200,480,4.838028
710,200,479,4.83752
275,200,12,4.819722
655,200,427,4.810242
55,200,511,4.805875
17,200,100,4.803776


## Updating with new users

In [11]:
# Fit a fresh KernelMF model on the initial training split only; the update
# split is fed to the model in the next cell
matrix_fact = KernelMF(
    n_factors=100,
    n_epochs=20,
    lr=0.001,
    reg=0.005,
    verbose=1,
)
matrix_fact.fit(X_train_initial, y_train_initial)

Epoch  1 / 20  -  train_rmse: 1.0705332940129852
Epoch  2 / 20  -  train_rmse: 1.0382345850581252
Epoch  3 / 20  -  train_rmse: 1.0162136164394588
Epoch  4 / 20  -  train_rmse: 0.9999132435689974
Epoch  5 / 20  -  train_rmse: 0.9872219881342071
Epoch  6 / 20  -  train_rmse: 0.9769264669882989
Epoch  7 / 20  -  train_rmse: 0.9683055466691725
Epoch  8 / 20  -  train_rmse: 0.9608919823567673
Epoch  9 / 20  -  train_rmse: 0.9543918764078243
Epoch  10 / 20  -  train_rmse: 0.9485794121612329
Epoch  11 / 20  -  train_rmse: 0.943304877324259
Epoch  12 / 20  -  train_rmse: 0.9384605161062898
Epoch  13 / 20  -  train_rmse: 0.9339626352883069
Epoch  14 / 20  -  train_rmse: 0.9297497730296204
Epoch  15 / 20  -  train_rmse: 0.9257708120894028
Epoch  16 / 20  -  train_rmse: 0.9219861472875341
Epoch  17 / 20  -  train_rmse: 0.9183660245607463
Epoch  18 / 20  -  train_rmse: 0.9148836025568047
Epoch  19 / 20  -  train_rmse: 0.9115182633538961
Epoch  20 / 20  -  train_rmse: 0.908251104737101


KernelMF(gamma=0.01, lr=0.001, n_epochs=20, reg=0.005)

In [12]:
%%time
# Update model with new users
matrix_fact.update_users(X_train_update, y_train_update, lr=0.001, n_epochs=20, verbose=1)
pred = matrix_fact.predict(X_test_update)
rmse = mean_squared_error(y_test_update, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.0398070512329172
Epoch  2 / 20  -  train_rmse: 1.0205023242966256
Epoch  3 / 20  -  train_rmse: 1.0058673036631753
Epoch  4 / 20  -  train_rmse: 0.994569994505876
Epoch  5 / 20  -  train_rmse: 0.9856862982326481
Epoch  6 / 20  -  train_rmse: 0.9785819446494046
Epoch  7 / 20  -  train_rmse: 0.9727304709667893
Epoch  8 / 20  -  train_rmse: 0.9678411090379412
Epoch  9 / 20  -  train_rmse: 0.9636493359569871
Epoch  10 / 20  -  train_rmse: 0.9600064702858608
Epoch  11 / 20  -  train_rmse: 0.9567916911608662
Epoch  12 / 20  -  train_rmse: 0.9539041661646682
Epoch  13 / 20  -  train_rmse: 0.9512798086695983
Epoch  14 / 20  -  train_rmse: 0.948865081665178
Epoch  15 / 20  -  train_rmse: 0.9466174922145003
Epoch  16 / 20  -  train_rmse: 0.9445214789672587
Epoch  17 / 20  -  train_rmse: 0.9425512951558327
Epoch  18 / 20  -  train_rmse: 0.9406809794251239
Epoch  19 / 20  -  train_rmse: 0.9388985732382609
Epoch  20 / 20  -  train_rmse: 0.9371936241883402

Test RMSE:

## Sigmoid kernel

In [13]:
%%time 
matrix_fact = KernelMF(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.01, reg = 0.005, kernel='sigmoid')
matrix_fact.fit(X_train, y_train)

pred = matrix_fact.predict(X_test)
rmse = mean_squared_error(y_test, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.725481923520382
Epoch  2 / 20  -  train_rmse: 1.7002825961415657
Epoch  3 / 20  -  train_rmse: 1.6621672871254909
Epoch  4 / 20  -  train_rmse: 1.6209576751533759
Epoch  5 / 20  -  train_rmse: 1.5755476317807886
Epoch  6 / 20  -  train_rmse: 1.5233231690476359
Epoch  7 / 20  -  train_rmse: 1.4656804966819397
Epoch  8 / 20  -  train_rmse: 1.4093258199041918
Epoch  9 / 20  -  train_rmse: 1.3583867662783806
Epoch  10 / 20  -  train_rmse: 1.3134218215860078
Epoch  11 / 20  -  train_rmse: 1.2739631301583274
Epoch  12 / 20  -  train_rmse: 1.2393720007790097
Epoch  13 / 20  -  train_rmse: 1.208724044999088
Epoch  14 / 20  -  train_rmse: 1.181439192432482
Epoch  15 / 20  -  train_rmse: 1.1569561025632533
Epoch  16 / 20  -  train_rmse: 1.1349060147014292
Epoch  17 / 20  -  train_rmse: 1.1148811095124267
Epoch  18 / 20  -  train_rmse: 1.0966100593068233
Epoch  19 / 20  -  train_rmse: 1.079846148221734
Epoch  20 / 20  -  train_rmse: 1.0642544113528227

Test RMSE: 1

## RBF Kernel

In [14]:
%%time 
matrix_fact = KernelMF(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.5, reg = 0.005, kernel='rbf')
matrix_fact.fit(X_train, y_train)

pred = matrix_fact.predict(X_test)
rmse = mean_squared_error(y_test, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.2635558927053938
Epoch  2 / 20  -  train_rmse: 1.1111584783217938
Epoch  3 / 20  -  train_rmse: 1.0457439238596105
Epoch  4 / 20  -  train_rmse: 1.0057746487964505
Epoch  5 / 20  -  train_rmse: 0.9742714803037644
Epoch  6 / 20  -  train_rmse: 0.9519457726606385
Epoch  7 / 20  -  train_rmse: 0.9340352891374744
Epoch  8 / 20  -  train_rmse: 0.922432432511659
Epoch  9 / 20  -  train_rmse: 0.9114725283772854
Epoch  10 / 20  -  train_rmse: 0.9042017586797012
Epoch  11 / 20  -  train_rmse: 0.8998170755722176
Epoch  12 / 20  -  train_rmse: 0.8947140275777387
Epoch  13 / 20  -  train_rmse: 0.892280077616079
Epoch  14 / 20  -  train_rmse: 0.888934438328414
Epoch  15 / 20  -  train_rmse: 0.8867558814208203
Epoch  16 / 20  -  train_rmse: 0.8844556175891591
Epoch  17 / 20  -  train_rmse: 0.8830174765348365
Epoch  18 / 20  -  train_rmse: 0.882521792804504
Epoch  19 / 20  -  train_rmse: 0.8822296243910285
Epoch  20 / 20  -  train_rmse: 0.8808961129475015

Test RMSE: 0

# Scikit-learn compatibility

In [None]:
from sklearn.model_selection import GridSearchCV, ParameterGrid

# Hyperparameter grid: 3 kernels x 3 factor counts x 3 epoch counts x 3
# regularization strengths = 81 candidates
param_grid = dict(
    kernel=['linear', 'sigmoid', 'rbf'],
    n_factors=[10, 20, 50],
    n_epochs=[10, 20, 50],
    reg=[0, 0.005, 0.1],
)

# KernelMF is passed straight to GridSearchCV as the estimator; candidates are
# scored by negated RMSE with 5-fold cross-validation on all available cores
grid_search = GridSearchCV(
    estimator=KernelMF(verbose=0),
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   23.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.3min


In [None]:
# Show the best cross-validated score and the parameters that achieved it.
# The search was built with scoring='neg_root_mean_squared_error', so the
# score is the negated RMSE (closer to zero is better).
# Both expressions are displayed because the setup cell sets
# InteractiveShell.ast_node_interactivity = "all".
grid_search.best_score_
grid_search.best_params_