In [1]:
import sys
sys.path.append('../../')

from os.path import join as pjoin

import pandas as pd
import numpy as np

from src.utils import read_json_df
from src.models.factorization import AlternatingLeastSquaresModel, FunkSVDModel
from src.models.nn import NNColaborativeModel

In [2]:
# Directory holding the Yelp dataset JSON files, given relative to this notebook's location.
DATASET_PATH = "../../data/yelp_dataset/"

In [3]:
# Read the three Yelp JSON dumps (reviews, businesses, users) from DATASET_PATH,
# in that order, and bind each result to its own frame.
review_df, business_df, user_df = (
    read_json_df(pjoin(DATASET_PATH, filename))
    for filename in (
        "yelp_academic_dataset_review.json",
        "yelp_academic_dataset_business.json",
        "yelp_academic_dataset_user.json",
    )
)

In [4]:
# Convert the review 'date' column to pandas datetimes, overwriting the column in place.
# NOTE(review): presumably required by the models' evaluate() for time-aware handling - confirm.
review_df['date'] = pd.to_datetime(review_df['date'])

## ALS

Hyperparameter tuning:

In [8]:
# Grid search for ALS: every (regularization, n_factors) pair is fitted and
# evaluated with short_eval=True and short_eval_train_samples=10_000, and one
# summary line (RMSE / accuracy / MAP@K) is printed per configuration.
for lambda_, n_factors in (
    (reg, dim)
    for reg in [0.1, 1, 10, 100]
    for dim in [10, 20, 50, 100, 200]
):
    als_model = AlternatingLeastSquaresModel(
        n_factors=n_factors,
        reguralization_param=lambda_,
        eps=1e-2,
    )
    metrics = als_model.evaluate(
        review_df,
        user_df,
        business_df,
        short_eval=True,
        short_eval_train_samples=10_000,
    )

    # `{n_factors=}` prints the variable name as well, so the loop target name matters.
    print(
        f"Lambda={lambda_}, {n_factors=}: "
        f"RMSE={metrics['rmse']}, Accuracy={metrics['accuracy']}, MAP@K={metrics['MAP@K']}"
    )

Evaluation fold: 100%|██████████| 1/1 [00:23<00:00, 23.90s/it]


Lambda=0.1, n_factors=10: RMSE=1.2166583370891606, Accuracy=0.301, MAP@K=0.0


Evaluation fold: 100%|██████████| 1/1 [00:30<00:00, 30.03s/it]


Lambda=0.1, n_factors=20: RMSE=1.2337812911032755, Accuracy=0.302, MAP@K=0.0009520574534161491


Evaluation fold: 100%|██████████| 1/1 [00:35<00:00, 35.28s/it]


Lambda=0.1, n_factors=50: RMSE=1.216983639456942, Accuracy=0.308, MAP@K=0.00029772688060731537


Evaluation fold: 100%|██████████| 1/1 [00:42<00:00, 42.41s/it]


Lambda=0.1, n_factors=100: RMSE=1.206947928120437, Accuracy=0.304, MAP@K=0.0005241761559696342


Evaluation fold: 100%|██████████| 1/1 [01:20<00:00, 80.46s/it]


Lambda=0.1, n_factors=200: RMSE=1.1910057974850807, Accuracy=0.317, MAP@K=0.0010483523119392685


Evaluation fold: 100%|██████████| 1/1 [00:17<00:00, 17.39s/it]


Lambda=1, n_factors=10: RMSE=1.3613160062176735, Accuracy=0.21, MAP@K=2.7173913043478262e-05


Evaluation fold: 100%|██████████| 1/1 [00:18<00:00, 18.50s/it]


Lambda=1, n_factors=20: RMSE=1.1948659552566443, Accuracy=0.3, MAP@K=0.0


Evaluation fold: 100%|██████████| 1/1 [00:24<00:00, 24.26s/it]


Lambda=1, n_factors=50: RMSE=1.1908634443652306, Accuracy=0.312, MAP@K=0.0006350284679089025


Evaluation fold: 100%|██████████| 1/1 [00:41<00:00, 41.76s/it]


Lambda=1, n_factors=100: RMSE=1.2057957597985491, Accuracy=0.305, MAP@K=0.0002667788129744651


Evaluation fold: 100%|██████████| 1/1 [01:16<00:00, 76.26s/it]


Lambda=1, n_factors=200: RMSE=1.2460602227003066, Accuracy=0.239, MAP@K=0.00022979209799861973


Evaluation fold: 100%|██████████| 1/1 [00:17<00:00, 17.38s/it]


Lambda=10, n_factors=10: RMSE=1.148321494043257, Accuracy=0.32, MAP@K=0.0010483523119392685


Evaluation fold: 100%|██████████| 1/1 [00:19<00:00, 19.11s/it]


Lambda=10, n_factors=20: RMSE=1.2046292604353062, Accuracy=0.25, MAP@K=0.0


Evaluation fold: 100%|██████████| 1/1 [00:23<00:00, 23.64s/it]


Lambda=10, n_factors=50: RMSE=1.1499304809642, Accuracy=0.325, MAP@K=0.0


Evaluation fold: 100%|██████████| 1/1 [00:36<00:00, 36.36s/it]


Lambda=10, n_factors=100: RMSE=1.190070551334774, Accuracy=0.257, MAP@K=0.0


Evaluation fold: 100%|██████████| 1/1 [01:08<00:00, 68.56s/it]


Lambda=10, n_factors=200: RMSE=1.1537055530924052, Accuracy=0.32, MAP@K=0.0


Evaluation fold: 100%|██████████| 1/1 [00:20<00:00, 20.09s/it]


Lambda=100, n_factors=10: RMSE=1.139492087613765, Accuracy=0.328, MAP@K=0.0


Evaluation fold: 100%|██████████| 1/1 [00:24<00:00, 24.01s/it]


Lambda=100, n_factors=20: RMSE=1.1462382084204559, Accuracy=0.328, MAP@K=0.0


Evaluation fold: 100%|██████████| 1/1 [00:34<00:00, 34.45s/it]


Lambda=100, n_factors=50: RMSE=1.1354198762837842, Accuracy=0.328, MAP@K=0.0


Evaluation fold: 100%|██████████| 1/1 [00:40<00:00, 40.07s/it]


Lambda=100, n_factors=100: RMSE=1.138961064068638, Accuracy=0.328, MAP@K=0.0


Evaluation fold: 100%|██████████| 1/1 [01:52<00:00, 112.50s/it]

Lambda=100, n_factors=200: RMSE=1.1348668592973437, Accuracy=0.328, MAP@K=0.0





Best model found:

In [6]:
# Re-evaluate the selected ALS configuration (n_factors=10, regularization=10)
# with a larger short-eval training sample (100_000 instead of 10_000).
als_model = AlternatingLeastSquaresModel(
    n_factors=10,
    reguralization_param=10,
    eps=1e-2,
)
# Last expression of the cell: the returned metrics dict is displayed as the cell output.
als_model.evaluate(
    review_df,
    user_df,
    business_df,
    short_eval=True,
    short_eval_train_samples=100_000,
)

Evaluation fold: 100%|██████████| 1/1 [12:23<00:00, 743.33s/it]


{'rmse': 1.2592897376561798,
 'mae': 1.0770081622258794,
 'accuracy': 0.2119,
 'f1': 0.11503925313041885,
 'precision': 0.5417094352467278,
 'recall': 0.20759367098066414,
 'AP@1': 0.0,
 'AP@3': 8.804366966015144e-05,
 'AP@K': 0.00013206550449022716,
 'MAP@K': 6.3349516598328e-05}

## FunkSVD

Hyperparameter tuning:

In [7]:
# Grid search for FunkSVD over regularization, latent factors, learning rate
# and epoch count. Each configuration is evaluated with short_eval=True and
# short_eval_train_samples=10_000; one summary line is printed per configuration.
for lambda_, n_factors, learning_rate, n_epoch in (
    (reg, dim, step, passes)
    for reg in [0.1, 1, 10]
    for dim in [10, 20, 50]
    for step in [1e-2, 0.1, 1]
    for passes in [5, 10, 50]
):
    fsvdmodel = FunkSVDModel(
        n_factors=n_factors,
        reguralization_param=lambda_,
        learning_rate=learning_rate,
        n_epoch=n_epoch,
    )
    metrics = fsvdmodel.evaluate(
        review_df,
        user_df,
        business_df,
        short_eval=True,
        short_eval_train_samples=10_000,
    )

    # The `{name=}` specifiers print the variable names, so the loop targets keep their names.
    print(
        f"Lambda={lambda_}, {n_factors=}, {learning_rate=}, {n_epoch=}: "
        f"RMSE={metrics['rmse']}, Accuracy={metrics['accuracy']}, MAP@K={metrics['MAP@K']}"
    )

100%|██████████| 10/10 [00:03<00:00,  2.52it/s] ?it/s]
Evaluation fold: 100%|██████████| 1/1 [00:17<00:00, 17.47s/it]


Lambda=0.1, n_factors=10, learning_rate=0.01, n_epoch=5: RMSE=1.105692823347056, Accuracy=0.352, MAP@K=0.0007180598688750863


100%|██████████| 10/10 [00:07<00:00,  1.30it/s] ?it/s]
Evaluation fold:   0%|          | 0/1 [00:10<?, ?it/s]


KeyboardInterrupt: 

Best model found (note: the grid search above was interrupted, so only one configuration was fully evaluated):

In [None]:
# Hyperparameters are spelled out explicitly instead of being read from the
# variables leaked by the tuning loop above. After a completed sweep those
# variables hold the *last* grid point, and after an interrupted sweep they hold
# the configuration that was running when it stopped - neither is the best model,
# and on a fresh kernel without the loop they do not exist at all.
# NOTE(review): the recorded sweep was interrupted after a single configuration
# (Lambda=0.1, n_factors=10, learning_rate=0.01, n_epoch=5), so that is the only
# evaluated candidate. Revisit these values once the full grid has been run.
fsvdmodel = FunkSVDModel(n_factors=10, reguralization_param=0.1, learning_rate=1e-2, n_epoch=5)
# Last expression of the cell: the returned metrics are displayed as the cell output.
fsvdmodel.evaluate(review_df, user_df, business_df, short_eval=True, short_eval_train_samples=100_000)

## NN Collaborative Filtering

Hyperparameter tuning:

In [29]:
# Grid search for the NN collaborative model over epochs, embedding size
# (passed as n_embed) and learning rate, with a fixed batch size of 1024.
# Each configuration is evaluated with short_eval=True and
# short_eval_train_samples=10_000; one summary line is printed per configuration.
for epochs, n_factors, learning_rate in (
    (passes, dim, step)
    for passes in [1, 3, 10, 30]
    for dim in [5, 10, 20, 50]
    for step in [1e-3, 1e-2, 0.1]
):
    nnmodel = NNColaborativeModel(
        learning_rate=learning_rate,
        n_embed=n_factors,
        epochs=epochs,
        batch_size=1024,
    )
    metrics = nnmodel.evaluate(
        review_df,
        user_df,
        business_df,
        short_eval=True,
        short_eval_train_samples=10_000,
    )
    # The `{name=}` specifiers print the variable names, so the loop targets keep their names.
    print(
        f"Epochs={epochs}, {n_factors=}, {learning_rate=}: "
        f"RMSE={metrics['rmse']}, Accuracy={metrics['accuracy']}, MAP@K={metrics['MAP@K']}"
    )


Evaluation fold: 100%|██████████| 1/1 [00:13<00:00, 13.63s/it]


Epochs=1, n_factors=5, learning_rate=0.001: RMSE=1.153872540990505, Accuracy=0.272, MAP@K=0.0001754442719116632


Evaluation fold: 100%|██████████| 1/1 [00:13<00:00, 13.47s/it]


Epochs=1, n_factors=5, learning_rate=0.01: RMSE=1.1301813317687588, Accuracy=0.328, MAP@K=0.0002667788129744651


Evaluation fold: 100%|██████████| 1/1 [00:13<00:00, 13.35s/it]


Epochs=1, n_factors=5, learning_rate=0.1: RMSE=1.2853015210447702, Accuracy=0.312, MAP@K=0.0004512810559006211


Evaluation fold: 100%|██████████| 1/1 [00:13<00:00, 13.44s/it]


Epochs=1, n_factors=10, learning_rate=0.001: RMSE=1.1318634554135296, Accuracy=0.329, MAP@K=2.7173913043478262e-05


Evaluation fold: 100%|██████████| 1/1 [00:13<00:00, 13.64s/it]


Epochs=1, n_factors=10, learning_rate=0.01: RMSE=1.1281237396820485, Accuracy=0.323, MAP@K=0.00013015441683919944


Evaluation fold: 100%|██████████| 1/1 [00:13<00:00, 13.71s/it]


Epochs=1, n_factors=10, learning_rate=0.1: RMSE=1.2853015210447702, Accuracy=0.312, MAP@K=0.0011422748447204969


Evaluation fold: 100%|██████████| 1/1 [00:13<00:00, 13.94s/it]


Epochs=1, n_factors=20, learning_rate=0.001: RMSE=1.1438469043725836, Accuracy=0.278, MAP@K=0.00047964113181504493


Evaluation fold: 100%|██████████| 1/1 [00:13<00:00, 13.55s/it]


Epochs=1, n_factors=20, learning_rate=0.01: RMSE=1.1273760659326866, Accuracy=0.329, MAP@K=0.00029772688060731537


Evaluation fold: 100%|██████████| 1/1 [00:13<00:00, 13.44s/it]


Epochs=1, n_factors=20, learning_rate=0.1: RMSE=1.2853015210447702, Accuracy=0.312, MAP@K=0.00035994651483781917


Evaluation fold: 100%|██████████| 1/1 [00:13<00:00, 13.65s/it]


Epochs=1, n_factors=50, learning_rate=0.001: RMSE=1.1464346142984096, Accuracy=0.281, MAP@K=0.0


Evaluation fold: 100%|██████████| 1/1 [00:13<00:00, 13.64s/it]


Epochs=1, n_factors=50, learning_rate=0.01: RMSE=1.1316969601128872, Accuracy=0.327, MAP@K=0.0001754442719116632


Evaluation fold: 100%|██████████| 1/1 [00:13<00:00, 13.45s/it]


Epochs=1, n_factors=50, learning_rate=0.1: RMSE=1.2853015210447702, Accuracy=0.312, MAP@K=0.00022979209799861973


Evaluation fold: 100%|██████████| 1/1 [00:13<00:00, 13.89s/it]


Epochs=3, n_factors=5, learning_rate=0.001: RMSE=1.1500563314078949, Accuracy=0.329, MAP@K=0.0


Evaluation fold: 100%|██████████| 1/1 [00:13<00:00, 13.77s/it]


Epochs=3, n_factors=5, learning_rate=0.01: RMSE=1.1318809574245388, Accuracy=0.328, MAP@K=0.0005241761559696342


Evaluation fold: 100%|██████████| 1/1 [00:13<00:00, 13.60s/it]


Epochs=3, n_factors=5, learning_rate=0.1: RMSE=1.2853015210447702, Accuracy=0.312, MAP@K=0.00022979209799861973


Evaluation fold: 100%|██████████| 1/1 [00:14<00:00, 14.00s/it]


Epochs=3, n_factors=10, learning_rate=0.001: RMSE=1.1339387291341594, Accuracy=0.328, MAP@K=0.0


Evaluation fold: 100%|██████████| 1/1 [00:13<00:00, 13.83s/it]


Epochs=3, n_factors=10, learning_rate=0.01: RMSE=1.1334180680775807, Accuracy=0.329, MAP@K=0.0005241761559696342


Evaluation fold: 100%|██████████| 1/1 [00:13<00:00, 13.45s/it]


Epochs=3, n_factors=10, learning_rate=0.1: RMSE=1.2853015210447702, Accuracy=0.312, MAP@K=0.00022979209799861973


Evaluation fold: 100%|██████████| 1/1 [00:13<00:00, 13.88s/it]


Epochs=3, n_factors=20, learning_rate=0.001: RMSE=1.1372480086277565, Accuracy=0.328, MAP@K=0.00029772688060731537


Evaluation fold: 100%|██████████| 1/1 [00:13<00:00, 13.87s/it]


Epochs=3, n_factors=20, learning_rate=0.01: RMSE=1.1279079507008258, Accuracy=0.328, MAP@K=8.454106280193236e-05


Evaluation fold: 100%|██████████| 1/1 [00:13<00:00, 13.69s/it]


Epochs=3, n_factors=20, learning_rate=0.1: RMSE=1.2853015210447702, Accuracy=0.312, MAP@K=0.0008113354037267079


Evaluation fold: 100%|██████████| 1/1 [00:14<00:00, 14.01s/it]


Epochs=3, n_factors=50, learning_rate=0.001: RMSE=1.1407995550938346, Accuracy=0.328, MAP@K=0.0


Evaluation fold: 100%|██████████| 1/1 [00:14<00:00, 14.07s/it]


Epochs=3, n_factors=50, learning_rate=0.01: RMSE=1.1239369937246773, Accuracy=0.335, MAP@K=0.0


Evaluation fold: 100%|██████████| 1/1 [00:13<00:00, 13.76s/it]


Epochs=3, n_factors=50, learning_rate=0.1: RMSE=1.2853015210447702, Accuracy=0.312, MAP@K=0.000405236369910283


Evaluation fold: 100%|██████████| 1/1 [00:14<00:00, 14.82s/it]


Epochs=10, n_factors=5, learning_rate=0.001: RMSE=1.1347128152284665, Accuracy=0.328, MAP@K=0.0007959152864044168


Evaluation fold: 100%|██████████| 1/1 [00:14<00:00, 14.93s/it]


Epochs=10, n_factors=5, learning_rate=0.01: RMSE=1.1324496216423574, Accuracy=0.329, MAP@K=0.0009132375776397516


Evaluation fold: 100%|██████████| 1/1 [00:14<00:00, 14.68s/it]


Epochs=10, n_factors=5, learning_rate=0.1: RMSE=1.2853015210447702, Accuracy=0.312, MAP@K=0.0010516951345755695


Evaluation fold: 100%|██████████| 1/1 [00:14<00:00, 14.78s/it]


Epochs=10, n_factors=10, learning_rate=0.001: RMSE=1.1353685860754918, Accuracy=0.328, MAP@K=0.0


Evaluation fold: 100%|██████████| 1/1 [00:14<00:00, 14.86s/it]


Epochs=10, n_factors=10, learning_rate=0.01: RMSE=1.1456049065404936, Accuracy=0.326, MAP@K=0.0001754442719116632


Evaluation fold: 100%|██████████| 1/1 [00:14<00:00, 14.80s/it]


Epochs=10, n_factors=10, learning_rate=0.1: RMSE=1.2853015210447702, Accuracy=0.312, MAP@K=0.0005275189786059352


Evaluation fold: 100%|██████████| 1/1 [00:14<00:00, 14.95s/it]


Epochs=10, n_factors=20, learning_rate=0.001: RMSE=1.1332656918708586, Accuracy=0.327, MAP@K=0.0


Evaluation fold: 100%|██████████| 1/1 [00:14<00:00, 14.87s/it]


Epochs=10, n_factors=20, learning_rate=0.01: RMSE=1.1313971834512522, Accuracy=0.323, MAP@K=0.0005241761559696342


Evaluation fold: 100%|██████████| 1/1 [00:14<00:00, 14.89s/it]


Epochs=10, n_factors=20, learning_rate=0.1: RMSE=1.2853015210447702, Accuracy=0.312, MAP@K=0.00022979209799861973


Evaluation fold: 100%|██████████| 1/1 [00:15<00:00, 15.55s/it]


Epochs=10, n_factors=50, learning_rate=0.001: RMSE=1.134891179761967, Accuracy=0.328, MAP@K=0.00018752156659765357


Evaluation fold: 100%|██████████| 1/1 [00:15<00:00, 15.26s/it]


Epochs=10, n_factors=50, learning_rate=0.01: RMSE=1.1383307417835673, Accuracy=0.341, MAP@K=0.0010483523119392685


Evaluation fold: 100%|██████████| 1/1 [00:15<00:00, 15.08s/it]


Epochs=10, n_factors=50, learning_rate=0.1: RMSE=1.2853015210447702, Accuracy=0.312, MAP@K=0.00045958419599723946


Evaluation fold: 100%|██████████| 1/1 [00:17<00:00, 17.72s/it]


Epochs=30, n_factors=5, learning_rate=0.001: RMSE=1.1360938406369039, Accuracy=0.328, MAP@K=0.000753968253968254


Evaluation fold: 100%|██████████| 1/1 [00:17<00:00, 17.91s/it]


Epochs=30, n_factors=5, learning_rate=0.01: RMSE=1.1782114124390242, Accuracy=0.313, MAP@K=0.00035994651483781917


Evaluation fold: 100%|██████████| 1/1 [00:17<00:00, 17.80s/it]


Epochs=30, n_factors=5, learning_rate=0.1: RMSE=1.2853015210447702, Accuracy=0.312, MAP@K=0.00022979209799861973


Evaluation fold: 100%|██████████| 1/1 [00:17<00:00, 17.98s/it]


Epochs=30, n_factors=10, learning_rate=0.001: RMSE=1.1343548323155923, Accuracy=0.319, MAP@K=0.0002788561076604555


Evaluation fold: 100%|██████████| 1/1 [00:17<00:00, 17.74s/it]


Epochs=30, n_factors=10, learning_rate=0.01: RMSE=1.1642129200080746, Accuracy=0.327, MAP@K=0.0004278812974465148


Evaluation fold: 100%|██████████| 1/1 [00:17<00:00, 17.86s/it]


Epochs=30, n_factors=10, learning_rate=0.1: RMSE=1.2853015210447702, Accuracy=0.312, MAP@K=0.00022979209799861973


Evaluation fold: 100%|██████████| 1/1 [00:18<00:00, 18.11s/it]


Epochs=30, n_factors=20, learning_rate=0.001: RMSE=1.1184675289774038, Accuracy=0.34, MAP@K=0.0005241761559696342


Evaluation fold: 100%|██████████| 1/1 [00:18<00:00, 18.41s/it]


Epochs=30, n_factors=20, learning_rate=0.01: RMSE=1.1574101316838323, Accuracy=0.334, MAP@K=0.00013015441683919944


Evaluation fold: 100%|██████████| 1/1 [00:18<00:00, 18.10s/it]


Epochs=30, n_factors=20, learning_rate=0.1: RMSE=1.2853015210447702, Accuracy=0.312, MAP@K=0.000405236369910283


Evaluation fold: 100%|██████████| 1/1 [00:19<00:00, 19.09s/it]


Epochs=30, n_factors=50, learning_rate=0.001: RMSE=1.1251291662050267, Accuracy=0.337, MAP@K=0.0


Evaluation fold: 100%|██████████| 1/1 [00:18<00:00, 18.78s/it]


Epochs=30, n_factors=50, learning_rate=0.01: RMSE=1.1371887272715828, Accuracy=0.348, MAP@K=0.00022979209799861973


Evaluation fold: 100%|██████████| 1/1 [00:18<00:00, 18.69s/it]

Epochs=30, n_factors=50, learning_rate=0.1: RMSE=1.2853015210447702, Accuracy=0.312, MAP@K=0.00022979209799861973





Best model found:

In [7]:
# Re-evaluate the selected NN configuration (learning_rate=0.01, n_embed=50,
# epochs=10) with a larger short-eval training sample (100_000 instead of 10_000).
nnmodel = NNColaborativeModel(
    learning_rate=0.01,
    n_embed=50,
    epochs=10,
    batch_size=1024,
)
# Last expression of the cell: the returned metrics dict is displayed as the cell output.
nnmodel.evaluate(
    review_df,
    user_df,
    business_df,
    short_eval=True,
    short_eval_train_samples=100_000,
)

Evaluation fold: 100%|██████████| 1/1 [10:19<00:00, 619.19s/it]


{'rmse': 1.2567935022053274,
 'mae': 0.9297294791340828,
 'accuracy': 0.3495,
 'f1': 0.21502334180449365,
 'precision': 0.313925154998302,
 'recall': 0.23354447453903235,
 'AP@1': 0.0,
 'AP@3': 8.804366966015144e-05,
 'AP@K': 0.00010565240359218173,
 'MAP@K': 7.805490570942235e-05}