In [9]:
# Clear all variables and conserve memory
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


In [185]:
# Data manipulation
import numpy as np
import pandas as pd
pd.options.display.max_rows = 100

# Recommender systems
import funk_svd
from funk_svd.dataset import fetch_ml_ratings
from sklearn.metrics import mean_absolute_error, mean_squared_error
import surprise as sp

# Other
import numba as nb
import os
import random
import sys
from sys import getsizeof

# Reload imported code
%load_ext autoreload
%autoreload 2

# Custom modules
# Put the parent directory of the notebook's working directory on sys.path
# (once) so that the `src` package can be imported from here.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
# NOTE(review): wildcard import - BaselineModel and SVD used further down are
# presumably provided by this line; prefer naming them explicitly once the
# full list of names the notebook needs is confirmed.
from src.models import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# MAL Recommendations

In [2]:
# Load data

# user_df = pd.read_csv('../data/raw/users_cleaned.csv')
# anime_df = pd.read_csv('../data/raw/anime_cleaned.csv')

# Only three columns are needed, so read just those (saves memory on a large
# file), fix their order, and rename them to the u_id / i_id / rating schema
# used by the rest of the notebook. Chaining returns a fresh frame instead of
# renaming a column-subset in place.
RATING_COLUMNS = {'username': 'u_id', 'anime_id': 'i_id', 'my_score': 'rating'}
user_animelist_df = (
    pd.read_csv('../data/raw/animelists_cleaned.csv', usecols=list(RATING_COLUMNS))
    .loc[:, list(RATING_COLUMNS)]
    .rename(columns=RATING_COLUMNS)
)

# Split data: 80% train, then the remaining 20% is halved into val / test.
train = user_animelist_df.sample(frac=0.8, random_state=7)
# Compute the non-training rows once and reuse them for both val and test.
holdout = user_animelist_df.drop(index=train.index)
val = holdout.sample(frac=0.5, random_state=8)
test = holdout.drop(index=val.index)

# The three parts must partition the full frame with no overlap or loss.
assert len(train) + len(val) + len(test) == len(user_animelist_df)

Let's start by comparing the Funk SVD and Surprise libraries on the same data.

## Funk SVD

In [12]:
%%time

svd = funk_svd.SVD(learning_rate=0.001, regularization=0.005, n_epochs=100,
          n_factors=15, min_rating=1, max_rating=5)

svd.fit(X=train, X_val=val, early_stopping=True, shuffle=False)

pred = svd.predict(test)
mae = mean_squared_error(test["rating"], pred)

print(f'Test MSE: {mae:.2f}')

Preprocessing data...

Epoch 1/100  | val_loss: 12.08 - val_rmse: 3.48 - val_mae: 3.11 - took 4.6 sec
Epoch 2/100  | val_loss: 10.88 - val_rmse: 3.30 - val_mae: 2.85 - took 3.2 sec
Epoch 3/100  | val_loss: 10.23 - val_rmse: 3.20 - val_mae: 2.66 - took 3.2 sec
Epoch 4/100  | val_loss: 9.82 - val_rmse: 3.13 - val_mae: 2.56 - took 3.0 sec
Epoch 5/100  | val_loss: 9.46 - val_rmse: 3.08 - val_mae: 2.47 - took 3.1 sec
Epoch 6/100  | val_loss: 9.18 - val_rmse: 3.03 - val_mae: 2.40 - took 3.3 sec
Epoch 7/100  | val_loss: 9.00 - val_rmse: 3.00 - val_mae: 2.36 - took 3.2 sec
Epoch 8/100  | val_loss: 8.87 - val_rmse: 2.98 - val_mae: 2.32 - took 3.1 sec
Epoch 9/100  | val_loss: 8.79 - val_rmse: 2.96 - val_mae: 2.30 - took 3.1 sec
Epoch 10/100 | val_loss: 8.72 - val_rmse: 2.95 - val_mae: 2.28 - took 3.1 sec
Epoch 11/100 | val_loss: 8.68 - val_rmse: 2.95 - val_mae: 2.27 - took 3.1 sec
Epoch 12/100 | val_loss: 8.64 - val_rmse: 2.94 - val_mae: 2.26 - took 3.1 sec
Epoch 13/100 | val_loss: 8.62 - val_rm

## Surprise

In [5]:
%%time

surprise_svd = sp.SVD(lr_all=0.001, reg_all=0.005, n_epochs=100,
          n_factors=15)

reader = sp.Reader()

data_train = sp.Dataset.load_from_df(train[['u_id', 'i_id', 'rating']], reader = reader).build_full_trainset()
data_test = sp.Dataset.load_from_df(test[['u_id', 'i_id', 'rating']], reader = reader).build_full_trainset().build_testset()
surprise_svd.fit(data_train)

Wall time: 50min 33s


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1bfe3c06108>

In [6]:
%%time

# Get predictions and calculate test RMSE

sp_predictions = surprise_svd.test(data_test)
sp.accuracy.rmse(sp_predictions)

RMSE: 3.2992
Wall time: 1min 2s


3.2991533921074048

## Comments

Overall they produce similar and decent results; however, the Funk SVD package is much faster. Judging by the GitHub repositories, the speed difference seems to come down to Numba (Funk SVD) versus Cython (Surprise). The downside is that Funk SVD only provides an SVD implementation and no other algorithms.

# Experimenting algorithms with movie data

In [187]:
df = fetch_ml_ratings(variant='1m')

# Prepare data: 80% train; the remaining rows are halved into val and test
train = df.sample(frac=0.8, random_state=7)
val = df.drop(index=train.index).sample(frac=0.5, random_state=8)
test = df.drop(index=train.index).drop(index=val.index)

## Simple Model with global mean

In [161]:
# Predict the training-set mean rating for every test row.
global_mean = train['rating'].mean()
pred = np.full(len(test), global_mean)

# np.sqrt(MSE) gives the RMSE on every scikit-learn version.
rmse = np.sqrt(mean_squared_error(test['rating'], pred))

# ':.4f' = four decimals (the previous ':4f' was a field width, not a precision).
print(f'\nTest RMSE: {rmse:.4f}')


Test RMSE: 1.116911


## Baseline Model with biases

In [175]:
global_mean = train['rating'].mean()

# Per-user and per-item bias = that group's mean rating minus the global mean.
# Selecting [['rating']] keeps each result a one-column DataFrame indexed by
# id, and stops any other column in `train` from being averaged as well.
user_biases = train.groupby('u_id')[['rating']].mean() - global_mean
item_biases = train.groupby('i_id')[['rating']].mean() - global_mean

In [176]:
# Look up each test row's user and item bias by id. Ids that never appear in
# the training data have no bias entry and fall back to 0, so their
# prediction degrades gracefully towards the global mean.
user_term = test['u_id'].map(user_biases['rating']).fillna(0)
item_term = test['i_id'].map(item_biases['rating']).fillna(0)
pred = global_mean + user_term + item_term

# np.sqrt(MSE) gives the RMSE on every scikit-learn version.
rmse = np.sqrt(mean_squared_error(test['rating'], pred))

# ':.4f' = four decimals (the previous ':4f' was a field width, not a precision).
print(f'\nTest RMSE: {rmse:.4f}')


Test RMSE: 0.938412


## Baseline Model with biases - SGD

In [194]:
%%time

baseline_model = BaselineModel(n_epochs = 20, reg = 0.005)
baseline_model.fit(train)

Epoch  1 / 20  -  train_rmse: 6.196910058098994e-05
Epoch  2 / 20  -  train_rmse: 3.0374388476043757e-06
Epoch  3 / 20  -  train_rmse: 2.1143773117482846e-06
Epoch  4 / 20  -  train_rmse: 9.998486170829027e-07
Epoch  5 / 20  -  train_rmse: 4.164351069822115e-07
Epoch  6 / 20  -  train_rmse: 1.3317714122277283e-07
Epoch  7 / 20  -  train_rmse: 1.4631216438962827e-08
Epoch  8 / 20  -  train_rmse: 1.0867114685708993e-08
Epoch  9 / 20  -  train_rmse: 9.998871521273606e-08
Epoch  10 / 20  -  train_rmse: 2.6577130552643495e-07
Epoch  11 / 20  -  train_rmse: 4.920030231244285e-07
Epoch  12 / 20  -  train_rmse: 7.624426932040988e-07
Epoch  13 / 20  -  train_rmse: 1.061983624065085e-06
Epoch  14 / 20  -  train_rmse: 1.3775795316492494e-06
Epoch  15 / 20  -  train_rmse: 1.698652439713734e-06
Epoch  16 / 20  -  train_rmse: 2.0170840514535404e-06
Epoch  17 / 20  -  train_rmse: 2.326962516121101e-06
Epoch  18 / 20  -  train_rmse: 2.6242243599428368e-06
Epoch  19 / 20  -  train_rmse: 2.9062805812922

BaselineModel(lr=0.005, max_rating=5, min_rating=0, n_epochs=20, reg=0.005,
              verbose=1)

## Custom SVD

In [205]:
%%time 
random.seed(2)
np.random.seed(2)
svd = SVD(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.001, reg = 0.005)
svd.fit(train)

pred, pred_possible = svd.predict(test)
rmse = mean_squared_error(test['rating'], pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 0.06055417459619898
Epoch  2 / 20  -  train_rmse: 0.012014171581518273
Epoch  3 / 20  -  train_rmse: 0.0026045595794161954
Epoch  4 / 20  -  train_rmse: 0.0007084621474111399
Epoch  5 / 20  -  train_rmse: 0.0002769511151859069
Epoch  6 / 20  -  train_rmse: 0.00015685467501911417
Epoch  7 / 20  -  train_rmse: 0.00011258677295116814
Epoch  8 / 20  -  train_rmse: 8.804513578312554e-05
Epoch  9 / 20  -  train_rmse: 6.771620039728805e-05
Epoch  10 / 20  -  train_rmse: 4.773444095496389e-05
Epoch  11 / 20  -  train_rmse: 2.843289274759688e-05
Epoch  12 / 20  -  train_rmse: 1.1980960720196331e-05
Epoch  13 / 20  -  train_rmse: 1.624799926434436e-06
Epoch  14 / 20  -  train_rmse: 1.5582180956251972e-06
Epoch  15 / 20  -  train_rmse: 1.7073258054052865e-05
Epoch  16 / 20  -  train_rmse: 5.485147537296227e-05
Epoch  17 / 20  -  train_rmse: 0.0001233444015317176
Epoch  18 / 20  -  train_rmse: 0.00023322848125212976
Epoch  19 / 20  -  train_rmse: 0.0003979330411642640

## Funk SVD

In [184]:
%%time

np.random.seed(2)
random.seed(2)

train = df.sample(frac=0.8, random_state=7)
val = df.drop(train.index.tolist()).sample(frac=0.5, random_state=8)
test = df.drop(train.index.tolist()).drop(val.index.tolist())

svd = funk_svd.SVD(learning_rate=0.001, regularization=0.005, n_epochs=20,
          n_factors=100, min_rating=1, max_rating=5)

svd.fit(X=train, X_val=val, early_stopping=False, shuffle=False)

pred_funk = svd.predict(test)
mae = mean_squared_error(test["rating"], pred_funk, squared = False)

print(f'Test RMSE: {mae:.2f}')

Preprocessing data...

Epoch 1/20  | took 0.2 sec
Epoch 2/20  | took 0.3 sec
Epoch 3/20  | took 0.3 sec
Epoch 4/20  | took 0.3 sec
Epoch 5/20  | took 0.3 sec
Epoch 6/20  | took 0.3 sec
Epoch 7/20  | took 0.3 sec
Epoch 8/20  | took 0.3 sec
Epoch 9/20  | took 0.3 sec
Epoch 10/20 | took 0.3 sec
Epoch 11/20 | took 0.3 sec
Epoch 12/20 | took 0.3 sec
Epoch 13/20 | took 0.3 sec
Epoch 14/20 | took 0.3 sec
Epoch 15/20 | took 0.3 sec
Epoch 16/20 | took 0.2 sec
Epoch 17/20 | took 0.3 sec
Epoch 18/20 | took 0.3 sec
Epoch 19/20 | took 0.2 sec
Epoch 20/20 | took 0.2 sec

Training took 6 sec
Test RMSE: 0.92
Wall time: 6.76 s


## Surprise

In [90]:
# Fetch Surprise's built-in movielens-100k ratings (downloaded on first use).
# NOTE(review): this is the 100k variant, whereas the cells above use the
# '1m' variant - keep that in mind when comparing numbers.
data = sp.Dataset.load_builtin(name='ml-100k')

In [91]:
%%time
# We'll use the famous SVD algorithm.
algo = sp.SVD()

# Run 5-fold cross-validation and print results
sp.model_selection.cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9265  0.9351  0.9324  0.9415  0.9408  0.9353  0.0056  
MAE (testset)     0.7294  0.7344  0.7358  0.7421  0.7399  0.7363  0.0044  
Fit time          4.16    4.19    4.22    4.25    4.20    4.20    0.03    
Test time         0.13    0.45    0.11    0.11    0.11    0.18    0.13    
Wall time: 22.6 s


{'test_rmse': array([0.92651162, 0.93507166, 0.93241232, 0.94154717, 0.94084358]),
 'test_mae': array([0.72938989, 0.73444756, 0.73576219, 0.74210202, 0.73993852]),
 'fit_time': (4.163815021514893,
  4.187639236450195,
  4.216679334640503,
  4.247923374176025,
  4.1983418464660645),
 'test_time': (0.12666082382202148,
  0.45206427574157715,
  0.11275815963745117,
  0.1120297908782959,
  0.11228561401367188)}

In [93]:
%%time

surprise_svd = sp.SVD(lr_all=0.001, reg_all=0.005, n_epochs=20,
          n_factors=50)

reader = sp.Reader()

data_train = sp.Dataset.load_from_df(train[['u_id', 'i_id', 'rating']], reader = reader).build_full_trainset()
data_test = sp.Dataset.load_from_df(test[['u_id', 'i_id', 'rating']], reader = reader).build_full_trainset().build_testset()
surprise_svd.fit(data_train)

pred = surprise_svd.test(data_test)
sp.accuracy.rmse(pred)

RMSE: 0.9169
Wall time: 31.1 s


0.9168845476049273

In [21]:
# Preview the first 20 predictions. Slicing the test set *before* predicting
# gives the same 20 rows without scoring the entire test set again.
surprise_svd.test(data_test[:20])

[Prediction(uid=259, iid=200, r_ui=4.0, est=4.1670259763161255, details={'was_impossible': False}),
 Prediction(uid=259, iid=357, r_ui=5.0, est=4.430772728105106, details={'was_impossible': False}),
 Prediction(uid=259, iid=179, r_ui=4.0, est=4.20424005908081, details={'was_impossible': False}),
 Prediction(uid=259, iid=1135, r_ui=5.0, est=3.231236971348656, details={'was_impossible': False}),
 Prediction(uid=259, iid=15, r_ui=3.0, est=3.91758055378108, details={'was_impossible': False}),
 Prediction(uid=259, iid=121, r_ui=3.0, est=3.590538369392122, details={'was_impossible': False}),
 Prediction(uid=259, iid=294, r_ui=3.0, est=3.2821077351936805, details={'was_impossible': False}),
 Prediction(uid=259, iid=748, r_ui=4.0, est=3.323876222072292, details={'was_impossible': False}),
 Prediction(uid=259, iid=147, r_ui=4.0, est=3.8508097317210708, details={'was_impossible': False}),
 Prediction(uid=851, iid=696, r_ui=3.0, est=3.3294724032153775, details={'was_impossible': False}),
 Predict