In [1]:
import os
import sys

# Restrict CUDA to device 0 (devices ordered by PCI bus id) and put the parent
# directory on the import path; the `src` package is imported from there below.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
})
sys.path.append("../")

In [2]:
import gc
from typing import Callable, List, Tuple

import pandas as pd
import numpy as np
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix, diags
from scipy.sparse.linalg import svds
import time
from tqdm import tqdm

from src.preprocess import add_time_idx
from src.prepare_data import train_val_test_split, get_users_history
from src.SVD import SVD
from src.unbiased_metrics import get_metrics, hr, mrr, ndcg

## Load Data

In [3]:
# Ratings file uses '::' as the field separator; column names are supplied
# explicitly. A multi-character separator requires the python parsing engine.
columns = ['user_id', 'item_id', 'rating', 'timestamp']
data = pd.read_csv(
    '../data/ml-1m.dat',
    sep="::",
    names=columns,
    encoding="latin1",
    engine='python',
)

In [4]:
# Produce the train / two validation / test frames plus the two lookup dicts
# that get_users_history consumes. filter_negative=False is passed through as-is.
(
    train, val_1, val_2, test,
    train_dict, test_dict,
) = train_val_test_split(data, filter_negative=False)

In [5]:
# Build the history frame for the test split (fed to svd.predict_folding_in later).
# This is by far the slowest cell: the recorded run took ~2h44m for 47862 iterations,
# which is why the result is exported to CSV further down.
test_users_history = get_users_history(test, train_dict, test_dict)

47862it [2:44:03,  4.86it/s]


In [6]:
# History frame for validation split 1 (used by the hyperparameter grid search).
# Recorded run: ~40 minutes for 23931 iterations.
# NOTE(review): test_dict is passed here for a validation split — confirm this is intended.
val_users_history_1 = get_users_history(val_1, train_dict, test_dict)

23931it [39:42, 10.04it/s]


In [7]:
# History frame for validation split 2 (passed to get_metrics via preds_val later).
# Recorded run: ~43 minutes for 23931 iterations.
# NOTE(review): test_dict is passed here for a validation split — confirm this is intended.
val_users_history_2 = get_users_history(val_2, train_dict, test_dict)

23931it [42:35,  9.37it/s]


In [8]:
# Persist every split and the slow-to-build history frames so that a later
# session can reload them from disk instead of recomputing.
for _frame, _csv_path in (
    (train, 'train.csv'),
    (val_1, 'val_1.csv'),
    (val_2, 'val_2.csv'),
    (test, 'test.csv'),
    (test_users_history, 'test_users_history.csv'),
    (val_users_history_1, 'val_users_history_1.csv'),
    (val_users_history_2, 'val_users_history_2.csv'),
):
    _frame.to_csv(_csv_path, index=False)
del _frame, _csv_path

In [3]:
# Reload the splits and history frames written by the export cell, so a fresh
# kernel can skip the multi-hour get_users_history computation.
# val_1 and val_users_history_1 are reloaded as well: the grid-search cell
# needs both, and without them it fails with NameError on a fresh kernel.
train = pd.read_csv('train.csv')
val_1 = pd.read_csv('val_1.csv')
val_2 = pd.read_csv('val_2.csv')
test = pd.read_csv('test.csv')
test_users_history = pd.read_csv('test_users_history.csv')
val_users_history_1 = pd.read_csv('val_users_history_1.csv')
val_users_history_2 = pd.read_csv('val_users_history_2.csv')

# Hyperparameter selection

In [9]:
def grid_search(param_list: List[dict],
                train: pd.DataFrame,
                val_history: pd.DataFrame,
                val: pd.DataFrame,
                rating_threshold: float = 3.5,
                user_col: str = 'test_user_idx') -> Tuple[dict, float]:
    """Pick the SVD parameter set with the highest validation nDCG.

    For every entry of ``param_list`` a fresh ``SVD(**params)`` is fitted on
    ``train``, scores are produced with ``predict_folding_in(val_history)``,
    the top-k frame is joined with ``val`` on ``user_col`` and only rows whose
    ``rating`` is at least ``rating_threshold`` are passed to ``ndcg``.

    Args:
        param_list: Keyword-argument dicts for the ``SVD`` constructor,
            e.g. ``list(ParameterGrid(...))``.
        train: Training interactions passed to ``SVD.fit``.
        val_history: History frame passed to ``SVD.predict_folding_in``.
        val: Validation frame; must contain ``user_col`` and ``rating``.
        rating_threshold: Minimum rating for a held-out row to be kept
            before computing nDCG (defaults to the previous fixed 3.5).
        user_col: Name of the user column used for top-k extraction and for
            the join with ``val``.

    Returns:
        ``(best_params, max_ndcg)``. The first parameter set wins ties because
        the comparison is strict. If no score exceeds -1 (for example every
        score is NaN), the first entry of ``param_list`` is returned with -1.

    Raises:
        IndexError: If ``param_list`` is empty.
    """
    best_params = param_list[0]
    max_ndcg = -1
    for params in param_list:
        svd = SVD(**params)
        svd.fit(train)
        scores = svd.predict_folding_in(val_history)
        preds = svd.get_top_k(scores, user_col=user_col)

        # Attach the held-out interactions, then keep only those rated
        # at or above the relevance threshold.
        preds = preds.merge(val, on=user_col, how='inner')
        preds = preds[preds['rating'] >= rating_threshold]

        cur_ndcg = ndcg(preds)
        print(f'params = {params}')
        print(f'ndcg = {cur_ndcg}')

        if cur_ndcg > max_ndcg:
            max_ndcg = cur_ndcg
            best_params = params
    return best_params, max_ndcg

In [10]:
# 8 alpha values (0.1 .. 0.8) crossed with 10 n_factors values (50 .. 500).
param_grid = dict(
    alpha=np.linspace(0.1, 0.8, 8),
    n_factors=np.linspace(50, 500, 10).astype(int),
)
param_list = list(ParameterGrid(param_grid))

In [11]:
# Tune on validation split 1 only; split 2 is left for the final metric computation.
best_params, max_ndcg = grid_search(
    param_list=param_list,
    train=train,
    val_history=val_users_history_1,
    val=val_1,
)

100%|██████████| 1/1 [00:01<00:00,  1.16s/it]


params = {'alpha': 0.1, 'n_factors': 50}
ndcg = 0.032193


100%|██████████| 1/1 [00:01<00:00,  1.12s/it]


params = {'alpha': 0.1, 'n_factors': 100}
ndcg = 0.034343


100%|██████████| 1/1 [00:01<00:00,  1.11s/it]


params = {'alpha': 0.1, 'n_factors': 150}
ndcg = 0.035178


100%|██████████| 1/1 [00:01<00:00,  1.12s/it]


params = {'alpha': 0.1, 'n_factors': 200}
ndcg = 0.037096


100%|██████████| 1/1 [00:01<00:00,  1.09s/it]


params = {'alpha': 0.1, 'n_factors': 250}
ndcg = 0.036119


100%|██████████| 1/1 [00:01<00:00,  1.10s/it]


params = {'alpha': 0.1, 'n_factors': 300}
ndcg = 0.037319


100%|██████████| 1/1 [00:01<00:00,  1.11s/it]


params = {'alpha': 0.1, 'n_factors': 350}
ndcg = 0.037289


100%|██████████| 1/1 [00:01<00:00,  1.05s/it]


params = {'alpha': 0.1, 'n_factors': 400}
ndcg = 0.035919


100%|██████████| 1/1 [00:01<00:00,  1.10s/it]


params = {'alpha': 0.1, 'n_factors': 450}
ndcg = 0.035173


100%|██████████| 1/1 [00:01<00:00,  1.13s/it]


params = {'alpha': 0.1, 'n_factors': 500}
ndcg = 0.037208


100%|██████████| 1/1 [00:01<00:00,  1.19s/it]


params = {'alpha': 0.2, 'n_factors': 50}
ndcg = 0.03351


100%|██████████| 1/1 [00:01<00:00,  1.21s/it]


params = {'alpha': 0.2, 'n_factors': 100}
ndcg = 0.035345


100%|██████████| 1/1 [00:01<00:00,  1.15s/it]


params = {'alpha': 0.2, 'n_factors': 150}
ndcg = 0.034959


100%|██████████| 1/1 [00:01<00:00,  1.18s/it]


params = {'alpha': 0.2, 'n_factors': 200}
ndcg = 0.035894


100%|██████████| 1/1 [00:01<00:00,  1.14s/it]


params = {'alpha': 0.2, 'n_factors': 250}
ndcg = 0.037417


100%|██████████| 1/1 [00:01<00:00,  1.12s/it]


params = {'alpha': 0.2, 'n_factors': 300}
ndcg = 0.036098


100%|██████████| 1/1 [00:01<00:00,  1.14s/it]


params = {'alpha': 0.2, 'n_factors': 350}
ndcg = 0.03623


100%|██████████| 1/1 [00:01<00:00,  1.12s/it]


params = {'alpha': 0.2, 'n_factors': 400}
ndcg = 0.036819


100%|██████████| 1/1 [00:01<00:00,  1.11s/it]


params = {'alpha': 0.2, 'n_factors': 450}
ndcg = 0.035835


100%|██████████| 1/1 [00:01<00:00,  1.14s/it]


params = {'alpha': 0.2, 'n_factors': 500}
ndcg = 0.036118


100%|██████████| 1/1 [00:01<00:00,  1.12s/it]


params = {'alpha': 0.30000000000000004, 'n_factors': 50}
ndcg = 0.03365


100%|██████████| 1/1 [00:01<00:00,  1.14s/it]


params = {'alpha': 0.30000000000000004, 'n_factors': 100}
ndcg = 0.036451


100%|██████████| 1/1 [00:01<00:00,  1.14s/it]


params = {'alpha': 0.30000000000000004, 'n_factors': 150}
ndcg = 0.035767


100%|██████████| 1/1 [00:01<00:00,  1.13s/it]


params = {'alpha': 0.30000000000000004, 'n_factors': 200}
ndcg = 0.037034


100%|██████████| 1/1 [00:01<00:00,  1.14s/it]


params = {'alpha': 0.30000000000000004, 'n_factors': 250}
ndcg = 0.035014


100%|██████████| 1/1 [00:01<00:00,  1.13s/it]


params = {'alpha': 0.30000000000000004, 'n_factors': 300}
ndcg = 0.036201


100%|██████████| 1/1 [00:01<00:00,  1.12s/it]


params = {'alpha': 0.30000000000000004, 'n_factors': 350}
ndcg = 0.035109


100%|██████████| 1/1 [00:01<00:00,  1.12s/it]


params = {'alpha': 0.30000000000000004, 'n_factors': 400}
ndcg = 0.034813


100%|██████████| 1/1 [00:01<00:00,  1.11s/it]


params = {'alpha': 0.30000000000000004, 'n_factors': 450}
ndcg = 0.033662


100%|██████████| 1/1 [00:01<00:00,  1.14s/it]


params = {'alpha': 0.30000000000000004, 'n_factors': 500}
ndcg = 0.035265


100%|██████████| 1/1 [00:01<00:00,  1.12s/it]


params = {'alpha': 0.4, 'n_factors': 50}
ndcg = 0.035951


100%|██████████| 1/1 [00:01<00:00,  1.14s/it]


params = {'alpha': 0.4, 'n_factors': 100}
ndcg = 0.036707


100%|██████████| 1/1 [00:01<00:00,  1.16s/it]


params = {'alpha': 0.4, 'n_factors': 150}
ndcg = 0.037926


100%|██████████| 1/1 [00:01<00:00,  1.15s/it]


params = {'alpha': 0.4, 'n_factors': 200}
ndcg = 0.036264


100%|██████████| 1/1 [00:01<00:00,  1.13s/it]


params = {'alpha': 0.4, 'n_factors': 250}
ndcg = 0.034958


100%|██████████| 1/1 [00:01<00:00,  1.13s/it]


params = {'alpha': 0.4, 'n_factors': 300}
ndcg = 0.034336


100%|██████████| 1/1 [00:01<00:00,  1.16s/it]


params = {'alpha': 0.4, 'n_factors': 350}
ndcg = 0.033631


100%|██████████| 1/1 [00:01<00:00,  1.14s/it]


params = {'alpha': 0.4, 'n_factors': 400}
ndcg = 0.034395


100%|██████████| 1/1 [00:01<00:00,  1.13s/it]


params = {'alpha': 0.4, 'n_factors': 450}
ndcg = 0.032515


100%|██████████| 1/1 [00:01<00:00,  1.13s/it]


params = {'alpha': 0.4, 'n_factors': 500}
ndcg = 0.031578


100%|██████████| 1/1 [00:01<00:00,  1.12s/it]


params = {'alpha': 0.5, 'n_factors': 50}
ndcg = 0.035703


100%|██████████| 1/1 [00:01<00:00,  1.13s/it]


params = {'alpha': 0.5, 'n_factors': 100}
ndcg = 0.036047


100%|██████████| 1/1 [00:01<00:00,  1.13s/it]


params = {'alpha': 0.5, 'n_factors': 150}
ndcg = 0.035084


100%|██████████| 1/1 [00:01<00:00,  1.14s/it]


params = {'alpha': 0.5, 'n_factors': 200}
ndcg = 0.034937


100%|██████████| 1/1 [00:01<00:00,  1.13s/it]


params = {'alpha': 0.5, 'n_factors': 250}
ndcg = 0.034595


100%|██████████| 1/1 [00:01<00:00,  1.15s/it]


params = {'alpha': 0.5, 'n_factors': 300}
ndcg = 0.03262


100%|██████████| 1/1 [00:01<00:00,  1.16s/it]


params = {'alpha': 0.5, 'n_factors': 350}
ndcg = 0.031239


100%|██████████| 1/1 [00:01<00:00,  1.13s/it]


params = {'alpha': 0.5, 'n_factors': 400}
ndcg = 0.033171


100%|██████████| 1/1 [00:01<00:00,  1.13s/it]


params = {'alpha': 0.5, 'n_factors': 450}
ndcg = 0.029579


100%|██████████| 1/1 [00:01<00:00,  1.14s/it]


params = {'alpha': 0.5, 'n_factors': 500}
ndcg = 0.030016


100%|██████████| 1/1 [00:01<00:00,  1.12s/it]


params = {'alpha': 0.6, 'n_factors': 50}
ndcg = 0.0367


100%|██████████| 1/1 [00:01<00:00,  1.16s/it]


params = {'alpha': 0.6, 'n_factors': 100}
ndcg = 0.035407


100%|██████████| 1/1 [00:01<00:00,  1.13s/it]


params = {'alpha': 0.6, 'n_factors': 150}
ndcg = 0.034764


100%|██████████| 1/1 [00:01<00:00,  1.14s/it]


params = {'alpha': 0.6, 'n_factors': 200}
ndcg = 0.032362


100%|██████████| 1/1 [00:01<00:00,  1.14s/it]


params = {'alpha': 0.6, 'n_factors': 250}
ndcg = 0.03276


100%|██████████| 1/1 [00:01<00:00,  1.13s/it]


params = {'alpha': 0.6, 'n_factors': 300}
ndcg = 0.031303


100%|██████████| 1/1 [00:01<00:00,  1.13s/it]


params = {'alpha': 0.6, 'n_factors': 350}
ndcg = 0.029794


100%|██████████| 1/1 [00:01<00:00,  1.14s/it]


params = {'alpha': 0.6, 'n_factors': 400}
ndcg = 0.029842


100%|██████████| 1/1 [00:01<00:00,  1.14s/it]


params = {'alpha': 0.6, 'n_factors': 450}
ndcg = 0.026275


100%|██████████| 1/1 [00:01<00:00,  1.14s/it]


params = {'alpha': 0.6, 'n_factors': 500}
ndcg = 0.026454


100%|██████████| 1/1 [00:01<00:00,  1.11s/it]


params = {'alpha': 0.7000000000000001, 'n_factors': 50}
ndcg = 0.036222


100%|██████████| 1/1 [00:01<00:00,  1.15s/it]


params = {'alpha': 0.7000000000000001, 'n_factors': 100}
ndcg = 0.034786


100%|██████████| 1/1 [00:01<00:00,  1.19s/it]


params = {'alpha': 0.7000000000000001, 'n_factors': 150}
ndcg = 0.034823


100%|██████████| 1/1 [00:01<00:00,  1.16s/it]


params = {'alpha': 0.7000000000000001, 'n_factors': 200}
ndcg = 0.032208


100%|██████████| 1/1 [00:01<00:00,  1.17s/it]


params = {'alpha': 0.7000000000000001, 'n_factors': 250}
ndcg = 0.03087


100%|██████████| 1/1 [00:01<00:00,  1.15s/it]


params = {'alpha': 0.7000000000000001, 'n_factors': 300}
ndcg = 0.029988


100%|██████████| 1/1 [00:01<00:00,  1.14s/it]


params = {'alpha': 0.7000000000000001, 'n_factors': 350}
ndcg = 0.027585


100%|██████████| 1/1 [00:01<00:00,  1.16s/it]


params = {'alpha': 0.7000000000000001, 'n_factors': 400}
ndcg = 0.027099


100%|██████████| 1/1 [00:01<00:00,  1.17s/it]


params = {'alpha': 0.7000000000000001, 'n_factors': 450}
ndcg = 0.024789


100%|██████████| 1/1 [00:01<00:00,  1.19s/it]


params = {'alpha': 0.7000000000000001, 'n_factors': 500}
ndcg = 0.023848


100%|██████████| 1/1 [00:01<00:00,  1.17s/it]


params = {'alpha': 0.8, 'n_factors': 50}
ndcg = 0.035865


100%|██████████| 1/1 [00:01<00:00,  1.14s/it]


params = {'alpha': 0.8, 'n_factors': 100}
ndcg = 0.036027


100%|██████████| 1/1 [00:01<00:00,  1.15s/it]


params = {'alpha': 0.8, 'n_factors': 150}
ndcg = 0.032741


100%|██████████| 1/1 [00:01<00:00,  1.15s/it]


params = {'alpha': 0.8, 'n_factors': 200}
ndcg = 0.032402


100%|██████████| 1/1 [00:01<00:00,  1.14s/it]


params = {'alpha': 0.8, 'n_factors': 250}
ndcg = 0.028527


100%|██████████| 1/1 [00:01<00:00,  1.18s/it]


params = {'alpha': 0.8, 'n_factors': 300}
ndcg = 0.026435


100%|██████████| 1/1 [00:01<00:00,  1.14s/it]


params = {'alpha': 0.8, 'n_factors': 350}
ndcg = 0.025954


100%|██████████| 1/1 [00:01<00:00,  1.18s/it]


params = {'alpha': 0.8, 'n_factors': 400}
ndcg = 0.023651


100%|██████████| 1/1 [00:01<00:00,  1.20s/it]


params = {'alpha': 0.8, 'n_factors': 450}
ndcg = 0.022186


100%|██████████| 1/1 [00:01<00:00,  1.23s/it]


params = {'alpha': 0.8, 'n_factors': 500}
ndcg = 0.021361


In [12]:
# Display the selected parameter set together with its validation nDCG.
best_params, max_ndcg

({'alpha': 0.4, 'n_factors': 150}, 0.037926)

In [4]:
# Result of the grid search above, hard-coded so that a fresh session can
# skip the search and go straight to evaluation.
best_params = dict(alpha=0.4, n_factors=150)

In [5]:
# Refit the model with the selected parameters and score both validation
# split 2 and the test split from their history frames.
svd = SVD(**best_params)
svd.fit(train)
scores_val = svd.predict_folding_in(val_users_history_2)
scores_test = svd.predict_folding_in(test_users_history)

# Turn raw scores into top-k recommendation frames keyed by 'test_user_idx'.
preds_val = svd.get_top_k(scores_val, user_col='test_user_idx') 
preds_test = svd.get_top_k(scores_test, user_col='test_user_idx')

# Attach the held-out interactions of each split to its recommendations.
preds_val = preds_val.merge(val_2, on='test_user_idx', how='inner')
preds_test = preds_test.merge(test, on='test_user_idx', how='inner') 

# NOTE(review): unlike grid_search, no `rating >= 3.5` filter is applied before
# get_metrics — presumably it filters internally; confirm in src.unbiased_metrics.
metrics_df, beta = get_metrics(preds_test, preds_val)

100%|██████████| 1/1 [00:01<00:00,  1.23s/it]
100%|██████████| 1/1 [00:02<00:00,  2.15s/it]


In [6]:
# Display the metrics table (the recorded output has Biased, Unbiased and
# Unbiased_feedback_sampling rows with HR / MRR / nDCG columns).
metrics_df

Unnamed: 0,type,HR,MRR,nDCG
0,Biased,0.088995,0.02231,0.037369
1,Unbiased,0.1478,0.081114,0.079055
2,Unbiased_feedback_sampling,0.53237,0.178634,0.047297


In [7]:
# Display the beta value returned by get_metrics; it is reused by the metric calls below.
beta

0.06454922265522797

In [8]:
# Keep only test rows whose held-out rating is at least 3.5, the same cut-off
# that grid_search applies on the validation data.
is_positive = preds_test['rating'] >= 3.5
preds_test_pos = preds_test.loc[is_positive]

In [9]:
# Hit rate on the positively rated test rows, using beta with feedback sampling.
# The recorded output is a pair — presumably (metric, confidence-interval width); confirm in src.unbiased_metrics.
hr(preds_test_pos, beta=beta, sample_feedback=True, return_confidence_interval=True)

(0.53237, 0.0012565594698326077)

In [10]:
# MRR on the positively rated test rows, using beta with feedback sampling.
# The recorded output is a pair — presumably (metric, confidence-interval width); confirm in src.unbiased_metrics.
mrr(preds_test_pos, beta=beta, sample_feedback=True, return_confidence_interval=True)

(0.178634, 0.0007327384206406162)

In [11]:
# nDCG on the positively rated test rows, using beta with feedback sampling.
# The recorded output is a pair — presumably (metric, confidence-interval width); confirm in src.unbiased_metrics.
ndcg(preds_test_pos, beta=beta, sample_feedback=True, return_confidence_interval=True)

(0.047297, 3.293627487450934e-05)