In [1]:
%matplotlib inline

In [2]:
import os 
import sys

# Resolve the parent of the current working directory and put it on the
# import path, so that the `src` package imported further down can be found.
parent_of_cwd = '{}/..'.format(os.getcwd())
prj_dir = os.path.abspath(parent_of_cwd)
print(prj_dir)
sys.path += [prj_dir]

/Users/raul.sanchez/k/santander_product_recommendation


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import ndcg_score

In [4]:
from src import config
from src.features import folds
from src.features.utils.features import PRODUCT_FEATURES, TOP_N

In [None]:
# Score every challenger model on the cold-start users of each fold.
#
# For each fold this cell records:
#   * the share and the count of test rows that belong to cold-start users
#     (customers present in the fold's test set but absent from its train set);
#   * NDCG@TOP_N of every challenger on those rows, once over all
#     PRODUCT_FEATURES and once with `excluded_product` left out.
# The results are collected into pandas objects keyed by the earliest
# 'fecha_dato' found in each fold's test set.
challengers = [
    'item_most_popular',
    'user_most_popular',
    'recsysnet',
    'net_multilabel',
    'fact_matrix']

# Product column dropped for the second NDCG variant.
excluded_product = 'ind_cco_fin_ult1'

ndcg_all, ndcg, coldstart_ratio, coldstart_nb_users = [], [], {}, {}
for fold_id in folds.list():
    print(fold_id)
    train, test, _ = folds.get(fold_id)

    cold_star_users = set(test['ncodpers'].unique()) - set(train['ncodpers'].unique())
    # Match on the 'ncodpers' column (the same column the cold-start ids are
    # derived from) instead of the row index, whose labels are not guaranteed
    # to be customer ids. A plain boolean array is kept so the mask can be
    # applied positionally to the challenger scores as well.
    is_cold_start = test['ncodpers'].isin(cold_star_users).values

    fold_date = test['fecha_dato'].min()
    coldstart_ratio[fold_date] = is_cold_start.mean()
    coldstart_nb_users[fold_date] = is_cold_start.sum()

    # The ground truth does not depend on the challenger: slice it once per fold.
    has_cold_start = bool(is_cold_start.any())
    y_true_all = test.loc[is_cold_start, PRODUCT_FEATURES]
    y_true = y_true_all.drop(excluded_product, axis=1)

    fold_ndcg_all, fold_ndcg = {}, {}
    for challenger_name in challengers:
        print(challenger_name)

        if not has_cold_start:
            # ndcg_score rejects empty input; record the metric as missing
            # for folds that contain no cold-start user.
            fold_ndcg_all[challenger_name] = np.nan
            fold_ndcg[challenger_name] = np.nan
            continue

        scores_path = os.path.join(
            config.PRJ_DIR,
            'data/processed/ranks/{0}/{1}.csv.gz'.format(
                challenger_name,
                '{:02d}'.format(fold_id)))
        scores = pd.read_csv(scores_path)

        # NOTE(review): the mask is applied positionally, which assumes the
        # rows of the scores file are in the same order as `test` -- confirm.
        y_score_all = scores.loc[is_cold_start, PRODUCT_FEATURES]
        y_score = y_score_all.drop(excluded_product, axis=1)

        fold_ndcg_all[challenger_name] = ndcg_score(
            y_true_all, y_score_all, k=TOP_N)
        fold_ndcg[challenger_name] = ndcg_score(
            y_true, y_score, k=TOP_N)

    ndcg_all.append(fold_ndcg_all)
    ndcg.append(fold_ndcg)

coldstart_nb_users = pd.Series(coldstart_nb_users)
coldstart_ratio = pd.Series(coldstart_ratio)
ndcg_all_df = pd.DataFrame(ndcg_all, index=coldstart_ratio.index)
ndcg_df = pd.DataFrame(ndcg, index=coldstart_ratio.index)

1


In [None]:
# Line chart of the number of cold-start users recorded for each fold.
nb_users_plot_options = {
    'marker': 'o',
    'grid': True,
    'title': 'Cold Star',
}
ax = coldstart_nb_users.plot(**nb_users_plot_options)

ax.set_ylabel('nb-users')

In [None]:
# NDCG of every challenger on cold-start users, computed over all products.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 5))

line_options = {'marker': 'o', 'grid': True}
ndcg_all_df.plot(ax=ax, **line_options)

ndcg_all_title = 'Normalized Discounted Cumulative Gain\n(Cold star NDCG@10)'
ax.set_title(ndcg_all_title)
ax.set_ylabel('NDCG')
ax.set_xlabel('Fold-id')

In [None]:
# NDCG of every challenger on cold-start users, computed with the
# 'ind_cco_fin_ult1' product column removed.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 5))

line_options = {'marker': 'o', 'grid': True}
ndcg_df.plot(ax=ax, **line_options)

ndcg_title = 'Normalized Discounted Cumulative Gain\n(Cold star NDCG@10 Most Pop. Removed)'
ax.set_title(ndcg_title)
ax.set_ylabel('NDCG')
ax.set_xlabel('Fold-id')