## Notebook for conducting recommender system experiments

In [1]:
# !pip install -q rs_datasets recommenders cornac sparse tensorly

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
yellowbrick 1.4 requires scikit-learn>=1.0.0, but you have scikit-learn 0.24.2 which is incompatible.
preprocessing 0.1.13 requires nltk==3.2.4, but you have nltk 3.7 which is incompatible.
pdpbox 0.2.1 requires matplotlib==3.1.1, but you have matplotlib 3.5.1 which is incompatible.
imbalanced-learn 0.9.0 requires scikit-learn>=1.0.1, but you have scikit-learn 0.24.2 which is incompatible.
featuretools 1.6.0 requires numpy>=1.21.0, but you have numpy 1.20.3 which is incompatible.[0m


In [2]:
import os
import sys

import numpy as np
import pandas as pd

from tqdm import trange

from rs_datasets import MovieLens, Rekko

In [3]:
from recommenders.utils.timer import Timer
from recommenders.utils.constants import SEED

from recommenders.models.cornac.cornac_utils import predict_ranking
from recommenders.evaluation.python_evaluation import (
    rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, recall_at_k, get_top_k_items
)

In [4]:
# ml = MovieLens(version='10m')
# ml.info()

In [5]:
rekko = Rekko()

data = rekko.ratings.sort_values(['user_id', 'ts'], ascending=[True, True])
data.shape

175MB [00:08, 21.5MB/s]                           


(438790, 4)

In [6]:
counts_1 = data.user_id.value_counts()
counts_2 = data.item_id.value_counts()

data = data.loc[data.user_id.isin(counts_1[counts_1 >= 20].index) & \
                data.item_id.isin(counts_2[counts_2 >= 20].index)]
data.shape

(97569, 4)

In [7]:
ts_split = data.ts.quantile(q=0.90, interpolation='nearest')

data_train = data.loc[data.ts < ts_split].drop(columns='ts')
data_test = data.loc[(data.ts >= ts_split) & (data.user_id.isin(data_train.user_id.unique()))].drop(columns='ts')

data_train.shape, data_test.shape

((87811, 3), (9055, 3))

In [8]:
data_train.nunique()

user_id    2870
item_id    2542
rating       11
dtype: int64

In [9]:
TOP_K = 20
evals = {}

## Baselines

### Random items

In [10]:
with Timer() as t:
    outlist = [(i, j) for i in data_train.user_id.unique() for j in data_train.item_id.unique()]
    all_predictions = pd.DataFrame(outlist, columns=['user_id', 'item_id'])
    all_predictions['prediction'] = np.random.uniform(0, 10, size=len(all_predictions))

print("Took {} seconds for prediction.".format(t))

Took 13.4295 seconds for prediction.


In [11]:
eval_map = map_at_k(data_test, all_predictions, 
                    col_user='user_id', col_item='item_id', 
                    col_prediction='prediction', k=TOP_K)

eval_ndcg = ndcg_at_k(data_test, all_predictions, 
                      col_user='user_id', col_item='item_id', 
                      col_prediction='prediction', k=TOP_K)

eval_precision = precision_at_k(data_test, all_predictions, 
                                col_user='user_id', col_item='item_id', 
                                col_prediction='prediction', k=TOP_K)

eval_recall = recall_at_k(data_test, all_predictions, 
                          col_user='user_id', col_item='item_id', 
                          col_prediction='prediction', k=TOP_K)

evals['random'] = [eval_map, eval_ndcg, eval_precision, eval_recall]

print("MAP: %f" % eval_map,
      "NDCG: %f" % eval_ndcg,
      "Precision@K: %f" % eval_precision,
      "Recall@K: %f" % eval_recall, sep=' ')

MAP: 0.001385 NDCG: 0.004209 Precision@K: 0.001893 Recall@K: 0.006935


### Pure-SVD

In [12]:
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

In [13]:
best_ndcg, best_r = 0, 0

for r in [1, 2, 3, 5, 7, 10, 50, 100]:
    print('Rank: ', r)
    
    with Timer() as t:
        data_train_m = data_train.pivot(index='user_id', columns='item_id', values='rating')
        source_matrix = csr_matrix(data_train_m.to_numpy(), dtype='f8')

        source_matrix.data[np.isnan(source_matrix.data)] = 0.0
    
        svd_config = {'rank': r}
        _, s, vt = svds(source_matrix, k=svd_config['rank'], return_singular_vectors='vh')

        singular_values = s[::-1]
        item_factors = np.ascontiguousarray(vt[::-1, :].T)

        scores = (source_matrix @ item_factors) @ item_factors.T

    print("Took {} seconds for prediction.".format(t))
    
    outlist = [(i, j) for i in data_train_m.index for j in data_train_m.columns]
    all_predictions = pd.DataFrame(outlist, columns=['user_id', 'item_id'])
    all_predictions['prediction'] = scores.flatten()
    
    eval_map = map_at_k(data_test, all_predictions, 
                        col_user='user_id', col_item='item_id', 
                        col_prediction='prediction', k=TOP_K)

    eval_ndcg = ndcg_at_k(data_test, all_predictions, 
                          col_user='user_id', col_item='item_id', 
                          col_prediction='prediction', k=TOP_K)

    eval_precision = precision_at_k(data_test, all_predictions, 
                                    col_user='user_id', col_item='item_id', 
                                    col_prediction='prediction', k=TOP_K)

    eval_recall = recall_at_k(data_test, all_predictions, 
                              col_user='user_id', col_item='item_id', 
                              col_prediction='prediction', k=TOP_K)

    evals['svd'] = [eval_map, eval_ndcg, eval_precision, eval_recall]

    print("MAP: %f" % eval_map,
          "NDCG: %f" % eval_ndcg,
          "Precision@K: %f" % eval_precision,
          "Recall@K: %f" % eval_recall, sep=' ')
    print()
    
    if eval_ndcg > best_ndcg:
        best_ndcg = eval_ndcg
        best_r = r

Rank:  1
Took 1.1477 seconds for prediction.
MAP: 0.014092 NDCG: 0.037941 Precision@K: 0.015060 Recall@K: 0.074510

Rank:  2
Took 1.9643 seconds for prediction.
MAP: 0.014458 NDCG: 0.038112 Precision@K: 0.015605 Recall@K: 0.073366

Rank:  3
Took 1.6824 seconds for prediction.
MAP: 0.016686 NDCG: 0.042522 Precision@K: 0.016695 Recall@K: 0.078723

Rank:  5
Took 1.6467 seconds for prediction.
MAP: 0.018841 NDCG: 0.047601 Precision@K: 0.018847 Recall@K: 0.088312

Rank:  7
Took 2.3971 seconds for prediction.
MAP: 0.016806 NDCG: 0.045931 Precision@K: 0.019191 Recall@K: 0.089219

Rank:  10
Took 3.0382 seconds for prediction.
MAP: 0.015939 NDCG: 0.045144 Precision@K: 0.019248 Recall@K: 0.089603

Rank:  50
Took 8.0889 seconds for prediction.
MAP: 0.006190 NDCG: 0.024113 Precision@K: 0.012651 Recall@K: 0.054024

Rank:  100
Took 13.9259 seconds for prediction.
MAP: 0.002685 NDCG: 0.012751 Precision@K: 0.007372 Recall@K: 0.030309



In [14]:
with Timer() as t:
    data_train_m = data_train.pivot(index='user_id', columns='item_id', values='rating')
    source_matrix = csr_matrix(data_train_m.to_numpy(), dtype='f8')

    source_matrix.data[np.isnan(source_matrix.data)] = 0.0

    svd_config = {'rank': best_r}
    _, s, vt = svds(source_matrix, k=svd_config['rank'], return_singular_vectors='vh')

    singular_values = s[::-1]
    item_factors = np.ascontiguousarray(vt[::-1, :].T)

    scores = (source_matrix @ item_factors) @ item_factors.T

print("Took {} seconds for prediction.".format(t))

Took 1.6229 seconds for prediction.


In [15]:
outlist = [(i, j) for i in data_train_m.index for j in data_train_m.columns]
all_predictions = pd.DataFrame(outlist, columns=['user_id', 'item_id'])
all_predictions['prediction'] = scores.flatten()

In [16]:
eval_map = map_at_k(data_test, all_predictions, 
                    col_user='user_id', col_item='item_id', 
                    col_prediction='prediction', k=TOP_K)

eval_ndcg = ndcg_at_k(data_test, all_predictions, 
                      col_user='user_id', col_item='item_id', 
                      col_prediction='prediction', k=TOP_K)

eval_precision = precision_at_k(data_test, all_predictions, 
                                col_user='user_id', col_item='item_id', 
                                col_prediction='prediction', k=TOP_K)

eval_recall = recall_at_k(data_test, all_predictions, 
                          col_user='user_id', col_item='item_id', 
                          col_prediction='prediction', k=TOP_K)

evals['svd'] = [eval_map, eval_ndcg, eval_precision, eval_recall]

print("MAP: %f" % eval_map,
      "NDCG: %f" % eval_ndcg,
      "Precision@K: %f" % eval_precision,
      "Recall@K: %f" % eval_recall, sep=' ')

MAP: 0.018841 NDCG: 0.047601 Precision@K: 0.018847 Recall@K: 0.088312


## Non-Tensors

### BiVAE (Bilateral Variational Autoencoder)

In [17]:
# import torch
# import cornac

In [18]:
# # Model parameters
# LATENT_DIM = 50
# ENCODER_DIMS = [100]
# ACT_FUNC = "tanh"
# LIKELIHOOD = "pois"
# NUM_EPOCHS = 5
# BATCH_SIZE = 128
# LEARNING_RATE = 0.001

In [19]:
# train_set = cornac.data.Dataset.from_uir(data_train.itertuples(index=False), seed=SEED)

# bivae = cornac.models.BiVAECF(
#     k=LATENT_DIM,
#     encoder_structure=ENCODER_DIMS,
#     act_fn=ACT_FUNC,
#     likelihood=LIKELIHOOD,
#     n_epochs=NUM_EPOCHS,
#     batch_size=BATCH_SIZE,
#     learning_rate=LEARNING_RATE,
#     seed=SEED,
#     use_gpu=torch.cuda.is_available(),
#     verbose=True
# )

# with Timer() as t:
#     bivae.fit(train_set)

# print("Took {} seconds for training.".format(t))

In [20]:
# with Timer() as t:
#     all_predictions = predict_ranking(bivae, data_train, usercol='user_id', itemcol='item_id', remove_seen=False)

# print("Took {} seconds for prediction.".format(t))

In [21]:
# eval_map = map_at_k(data_test, all_predictions, 
#                     col_user='user_id', col_item='item_id', 
#                     col_prediction='prediction', k=TOP_K)

# eval_ndcg = ndcg_at_k(data_test, all_predictions, 
#                       col_user='user_id', col_item='item_id', 
#                       col_prediction='prediction', k=TOP_K)

# eval_precision = precision_at_k(data_test, all_predictions, 
#                                 col_user='user_id', col_item='item_id', 
#                                 col_prediction='prediction', k=TOP_K)

# eval_recall = recall_at_k(data_test, all_predictions, 
#                           col_user='user_id', col_item='item_id', 
#                           col_prediction='prediction', k=TOP_K)

# evals['bivae'] = [eval_map, eval_ndcg, eval_precision, eval_recall]

# print("MAP: %f" % eval_map,
#       "NDCG: %f" % eval_ndcg,
#       "Precision@K: %f" % eval_precision,
#       "Recall@K: %f" % eval_recall, sep=' ')

# # MAP: 0.007180 NDCG: 0.023940 Precision@K: 0.010872 Recall@K: 0.050262

In [22]:
evals['bivae'] = [0.007180, 0.023940, 0.010872, 0.050262]
evals

{'random': [0.0013848178713199196,
  0.004209299976770874,
  0.0018932874354561098,
  0.0069349330915858785],
 'svd': [0.018840724684928714,
  0.04760053870992793,
  0.018846815834767647,
  0.0883121645846522],
 'bivae': [0.00718, 0.02394, 0.010872, 0.050262]}

## Tensors

### Collaborative Full Feedback model: CoFFee

In [23]:
import sparse
from tensorly.contrib.sparse import tensor, unfold
from tensorly.decomposition import tucker

In [24]:
users_1 = {j: i for i, j in enumerate(data_train.user_id.unique())}
users_2 = {j: i for i, j in users_1.items()}

items_1 = {j: i for i, j in enumerate(data_train.item_id.unique())}
items_2 = {j: i for i, j in items_1.items()}

data_train.user_id = data_train.user_id.map(users_1)
data_train.item_id = data_train.item_id.map(items_1)

### Initial ratings

In [25]:
X = sparse.COO(data_train.to_numpy().T, np.ones(len(data_train)), shape=tuple(data_train.nunique().values))
X = tensor(X, dtype='float').todense()

In [26]:
for r in [5, 10]:
    print(f'Rank: {r}')
    
    with Timer() as t:
        core, factors = tucker(X, rank=[r, r, r], random_state=42)

        all_predictions_all = pd.DataFrame(columns=['prediction_1', 'prediction_2', 'prediction_3', 
                                                    'user_id', 'item_id'])
        for i in trange(X.shape[0]):
            scores = (factors[1] @ factors[1].T) @ X[i] @ (factors[2] @ factors[2].T)

            preds = pd.DataFrame({'prediction_1': scores[:, -1], 
                                  'prediction_2': -scores[:, 0], 
                                  'prediction_3': scores[:, -1] - scores[:, 0]})

            preds['user_id'] = users_2[i]
            preds['item_id'] = list(items_2.values())

            all_predictions_all = all_predictions_all.append(preds)

        all_predictions_all.user_id = all_predictions_all.user_id.astype(np.int64)
        all_predictions_all.item_id = all_predictions_all.item_id.astype(np.int64)
    
    print("Took {} seconds for prediction.".format(t))
    
    for j in range(1, 3+1):
        print(f'Scheme {j}')
        all_predictions = all_predictions_all.loc[:, ['user_id', 'item_id', f'prediction_{j}']
                                                 ].rename(columns={f'prediction_{j}': 'prediction'})

        eval_map = map_at_k(data_test, all_predictions, 
                            col_user='user_id', col_item='item_id', 
                            col_prediction='prediction', k=TOP_K)

        eval_ndcg = ndcg_at_k(data_test, all_predictions, 
                              col_user='user_id', col_item='item_id', 
                              col_prediction='prediction', k=TOP_K)

        eval_precision = precision_at_k(data_test, all_predictions, 
                                        col_user='user_id', col_item='item_id', 
                                        col_prediction='prediction', k=TOP_K)

        eval_recall = recall_at_k(data_test, all_predictions, 
                                  col_user='user_id', col_item='item_id', 
                                  col_prediction='prediction', k=TOP_K)

        evals['bivae'] = [eval_map, eval_ndcg, eval_precision, eval_recall]

        print("MAP: %f" % eval_map,
              "NDCG: %f" % eval_ndcg,
              "Precision@K: %f" % eval_precision,
              "Recall@K: %f" % eval_recall, sep=' ')
    print()
    
# Rank: 1
# 100%|██████████| 2870/2870 [13:09<00:00,  3.64it/s]
# Took 810.1890 seconds for prediction.
# Scheme 1
# MAP: 0.015665 NDCG: 0.040104 Precision@K: 0.014831 Recall@K: 0.074900
# Scheme 2
# MAP: 0.000441 NDCG: 0.001493 Precision@K: 0.000688 Recall@K: 0.003241
# Scheme 3
# MAP: 0.015665 NDCG: 0.040104 Precision@K: 0.014831 Recall@K: 0.074900

# Rank: 3
# 100%|██████████| 2870/2870 [14:04<00:00,  3.40it/s]
# Took 861.1866 seconds for prediction.
# Scheme 1
# MAP: 0.014721 NDCG: 0.039239 Precision@K: 0.015060 Recall@K: 0.074337
# Scheme 2
# MAP: 0.001053 NDCG: 0.003730 Precision@K: 0.002094 Recall@K: 0.005085
# Scheme 3
# MAP: 0.014540 NDCG: 0.039085 Precision@K: 0.015089 Recall@K: 0.074484

Rank: 5


100%|██████████| 2870/2870 [12:54<00:00,  3.70it/s]


Took 792.6423 seconds for prediction.
Scheme 1
MAP: 0.016481 NDCG: 0.042301 Precision@K: 0.016609 Recall@K: 0.078124
Scheme 2
MAP: 0.006623 NDCG: 0.014805 Precision@K: 0.005622 Recall@K: 0.021817
Scheme 3
MAP: 0.016614 NDCG: 0.042020 Precision@K: 0.016380 Recall@K: 0.076919

Rank: 10


100%|██████████| 2870/2870 [13:05<00:00,  3.65it/s]


Took 812.5177 seconds for prediction.
Scheme 1
MAP: 0.014850 NDCG: 0.038955 Precision@K: 0.015835 Recall@K: 0.073156
Scheme 2
MAP: 0.008070 NDCG: 0.020113 Precision@K: 0.007889 Recall@K: 0.035013
Scheme 3
MAP: 0.014853 NDCG: 0.038978 Precision@K: 0.015835 Recall@K: 0.073306



### Group ratings

In [27]:
data.rating.value_counts()

10    32803
8     23269
9     15154
7      9235
6      7998
5      3266
4      2983
2      1689
3       791
1       312
0        69
Name: rating, dtype: int64

In [28]:
intervals = [(0, 5), (6, 6), (7, 7), (8, 8), (9, 9), (10, 10)]
for i, j in intervals:
    print(data.rating.between(i, j).sum())

9110
7998
9235
23269
15154
32803


In [29]:
data_train.loc[data.rating.between(0, 5), 'rating'] = 5
data_train.rating -= 5

In [30]:
X = sparse.COO(data_train.to_numpy().T, np.ones(len(data_train)), shape=tuple(data_train.nunique().values))
X = tensor(X, dtype='float').todense()

In [31]:
for r in [5, 10]:
    print(f'Rank: {r}')
    
    with Timer() as t:
        core, factors = tucker(X, rank=[r, r, r], random_state=42)

        all_predictions_all = pd.DataFrame(columns=['prediction_1', 'prediction_2', 'prediction_3', 
                                                    'user_id', 'item_id'])
        for i in trange(X.shape[0]):
            scores = (factors[1] @ factors[1].T) @ X[i] @ (factors[2] @ factors[2].T)

            preds = pd.DataFrame({'prediction_1': scores[:, -1], 
                                  'prediction_2': -scores[:, 0], 
                                  'prediction_3': scores[:, -1] - scores[:, 0]})

            preds['user_id'] = users_2[i]
            preds['item_id'] = list(items_2.values())

            all_predictions_all = all_predictions_all.append(preds)

        all_predictions_all.user_id = all_predictions_all.user_id.astype(np.int64)
        all_predictions_all.item_id = all_predictions_all.item_id.astype(np.int64)
    
    print("Took {} seconds for prediction.".format(t))
    
    for j in range(1, 3+1):
        print(f'Scheme {j}')
        all_predictions = all_predictions_all.loc[:, ['user_id', 'item_id', f'prediction_{j}']
                                                 ].rename(columns={f'prediction_{j}': 'prediction'})

        eval_map = map_at_k(data_test, all_predictions, 
                            col_user='user_id', col_item='item_id', 
                            col_prediction='prediction', k=TOP_K)

        eval_ndcg = ndcg_at_k(data_test, all_predictions, 
                              col_user='user_id', col_item='item_id', 
                              col_prediction='prediction', k=TOP_K)

        eval_precision = precision_at_k(data_test, all_predictions, 
                                        col_user='user_id', col_item='item_id', 
                                        col_prediction='prediction', k=TOP_K)

        eval_recall = recall_at_k(data_test, all_predictions, 
                                  col_user='user_id', col_item='item_id', 
                                  col_prediction='prediction', k=TOP_K)

        evals['bivae'] = [eval_map, eval_ndcg, eval_precision, eval_recall]

        print("MAP: %f" % eval_map,
              "NDCG: %f" % eval_ndcg,
              "Precision@K: %f" % eval_precision,
              "Recall@K: %f" % eval_recall, sep=' ')
    print()
    
# Rank: 1
# 100%|██████████| 2870/2870 [13:17<00:00,  3.60it/s]
# Took 806.7972 seconds for prediction.
# Scheme 1
# MAP: 0.015486 NDCG: 0.039842 Precision@K: 0.014831 Recall@K: 0.074900
# Scheme 2
# MAP: 0.000419 NDCG: 0.001159 Precision@K: 0.000402 Recall@K: 0.002576
# Scheme 3
# MAP: 0.015486 NDCG: 0.039842 Precision@K: 0.014831 Recall@K: 0.074900

# Rank: 3
# 100%|██████████| 2870/2870 [13:08<00:00,  3.64it/s]
# Took 799.8917 seconds for prediction.
# Scheme 1
# MAP: 0.014212 NDCG: 0.038234 Precision@K: 0.014917 Recall@K: 0.072410
# Scheme 2
# MAP: 0.002408 NDCG: 0.006820 Precision@K: 0.003213 Recall@K: 0.010340
# Scheme 3
# MAP: 0.013209 NDCG: 0.034444 Precision@K: 0.013282 Recall@K: 0.064256

Rank: 5


100%|██████████| 2870/2870 [12:53<00:00,  3.71it/s]


Took 785.0087 seconds for prediction.
Scheme 1
MAP: 0.017426 NDCG: 0.043782 Precision@K: 0.017040 Recall@K: 0.080533
Scheme 2
MAP: 0.010592 NDCG: 0.023435 Precision@K: 0.009352 Recall@K: 0.035719
Scheme 3
MAP: 0.015067 NDCG: 0.038145 Precision@K: 0.015032 Recall@K: 0.070296

Rank: 10


MemoryError: Unable to allocate 387. TiB for an array with shape (7295540, 7295540) and data type float64