In [6]:
import pandas as pd
import numpy as np
import requests
from tqdm.auto import tqdm
from scipy.stats import mode 
from pprint import pprint
from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender, BM25Recommender
import warnings
warnings.filterwarnings("ignore")

from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, calc_metrics

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 200)

In [2]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join('../service/modelss/')))
from userknn import UserKnn


# Get KION dataset 

<a href="https://ods.ai/competitions/competition-recsys-21/data"> Dataset description [ru] </a>


# EDA

In [3]:
interactions = pd.read_csv('../data/raw/kion_train/interactions.csv')
users = pd.read_csv('../data/raw/kion_train/users.csv')
items = pd.read_csv('../data/raw/kion_train/items.csv')

In [4]:
# rename columns, convert timestamp
interactions.rename(columns={'last_watch_dt': Columns.Datetime,
                            'total_dur': Columns.Weight}, 
                    inplace=True) 

interactions['datetime'] = pd.to_datetime(interactions['datetime'])

## interactions

In [5]:
pd.concat([interactions.head(), interactions.tail()])

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0
5476246,648596,12225,2021-08-13,76,0.0
5476247,546862,9673,2021-04-13,2308,49.0
5476248,697262,15297,2021-08-20,18307,63.0
5476249,384202,16197,2021-04-19,6203,100.0
5476250,319709,4436,2021-08-15,3921,45.0


In [25]:
print(f"Interactions dataframe shape: {interactions.shape}")
print(f"Unique users in interactions: {interactions['user_id'].nunique():_}")
print(f"Unique items in interactions: {interactions['item_id'].nunique():_}")

Interactions dataframe shape: (5476251, 5)
Unique users in interactions: 962_179
Unique items in interactions: 15_706


In [26]:
max_date = interactions['datetime'].max()
min_date = interactions['datetime'].min()

print(f"min date in interactions: {min_date}")
print(f"max date in interactions: {max_date}")

min date in interactions: 2021-03-13 00:00:00
max date in interactions: 2021-08-22 00:00:00


In [27]:
interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5476251 entries, 0 to 5476250
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   user_id      int64         
 1   item_id      int64         
 2   datetime     datetime64[ns]
 3   weight       int64         
 4   watched_pct  float64       
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 208.9 MB


## users

In [28]:
pd.concat([users.head(), users.tail()])

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
2,1047345,age_45_54,income_40_60,Ж,0
3,721985,age_45_54,income_20_40,Ж,0
4,704055,age_35_44,income_60_90,Ж,0
840192,339025,age_65_inf,income_0_20,Ж,0
840193,983617,age_18_24,income_20_40,Ж,1
840194,251008,,,,0
840195,590706,,,Ж,0
840196,166555,age_65_inf,income_20_40,Ж,0


In [29]:
print(f"Users dataframe shape {users.shape}")
print(f"Unique users: {users['user_id'].nunique():_}")

Users dataframe shape (840197, 5)
Unique users: 840_197


## items

In [30]:
pd.concat([items.head(2), items.tail(2)])

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Грандинетти, Джеральдин Чаплин, Елена Анайя, Каэтано Велозо, Леонор Уотлинг, Лола Дуэньяс, Лолес Леон, Малу Айродо, Мариола Фуэнтес, Пас Вега, Пина Бауш, Ро...",Мелодрама легендарного Педро Альмодовара «Поговори с ней» в 2003 году получила премию «Оскар» за лучший сценарий. Журналист Марко берет интервью у знаменитой женщины-тореро Лидии и вскоре влюбляе...,"Поговори, ней, 2002, Испания, друзья, любовь, сильные, женщины, преодоление, трудностей, отношения, дружба, отношения, паре, отношения, мужчины, женщины, романтические, отношения, потеря, близких,..."
1,2508,film,Голые перцы,Search Party,2014.0,"зарубежные, приключения, комедии",США,,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон Манцукас, Джон Глейсер, Карл Грин, Кристен Риттер, Лэнс Реддик, Морис Комт, Патрик Кернс, Ребекка Коллинз, Роза Салазар, Росс П. Кук, Стеффи Гроут, Ти...","Уморительная современная комедия на популярную тему о том, как не надо отмечать мальчишник. Главный герой усвоил, что не надо звать на свадьбу своего друга Джейсона, из-за которого он вместо сваде...","Голые, перцы, 2014, США, друзья, свадьбы, преодоление, трудностей, расставания, отношения, дружба, риск, недоразумение, мужская, дружба, мальчишники, девичники"
15961,4538,series,Среди камней,Darklands,2019.0,"драмы, спорт, криминал",Россия,0.0,18.0,,"Марк О’Коннор, Конор МакМахон","Дэйн Уайт О’Хара, Томас Кэйн-Бирн, Джудит Родди, Марк О’Халлоран, Джимми Смоллхорн","Семнадцатилетний Дэмиен мечтает вырваться за пределы своего района и стать профессиональным бойцом. Когда его кумир и старший брат исчезает, парень попадает в чуждый ему мир насилия, наркотиков и ...","Среди, камней, 2019, Россия"
15962,3206,series,Гоша,,2019.0,комедии,Россия,0.0,16.0,,Михаил Миронов,"Мкртыч Арзуманян, Виктория Рунцова","Добродушный Гоша не может выйти из дома, чтобы не попасть в нелепую и курьёзную историю. Но даже неудачники мечтают о любви, и наш герой — не исключение, ведь оптимизма ему не занимать.","Гоша, 2019, Россия"


In [31]:
print(f"Items dataframe shape {items.shape}")
print(f"Unique item_id: {items['item_id'].nunique():_}")

Items dataframe shape (15963, 14)
Unique item_id: 15_963


#  userkNN model  CV

Compare implicit `CosineRecommender` and `TFIDFRecommender` as an ItemKnn base 



In [11]:
# setting for cv 
n_folds = 7
unit = "W"
n_units = 1

last_date = interactions[Columns.Datetime].max().normalize()
start_date = last_date - pd.Timedelta(n_folds * n_units + 1, unit=unit)  
print(f"Start date and last date of the test fold: {start_date, last_date}")

Start date and last date of the test fold: (Timestamp('2021-06-27 00:00:00'), Timestamp('2021-08-22 00:00:00'))


### Test fold borders

In [13]:
from rectools.model_selection import TimeRangeSplitter
from  rectools.dataset.interactions import Interactions

periods = n_folds + 1
freq = f"{n_units}{unit}"
print(
    f"start_date: {start_date}\n"
    f"last_date: {last_date}\n"
    f"periods: {periods}\n"
    f"freq: {freq}\n"
)
    
date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
print(f"Test fold borders: {date_range.values.astype('datetime64[D]')}")

# generator of folds
cv = TimeRangeSplitter(
    date_range=date_range,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
print(f"Real number of folds: {cv.get_n_splits(Interactions(interactions))}")

start_date: 2021-06-27 00:00:00
last_date: 2021-08-22 00:00:00
periods: 8
freq: 1W

Test fold borders: ['2021-06-27' '2021-07-04' '2021-07-11' '2021-07-18' '2021-07-25'
 '2021-08-01' '2021-08-08' '2021-08-15']
Real number of folds: 7


In [11]:
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, calc_metrics

# calculate several classic (precision@k and recall@k) and "beyond accuracy" metrics
metrics = {
    "prec@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "novelty": MeanInvUserFreq(k=10),
    "serendipity": Serendipity(k=10),
}

# few simple models to compare
models = {
    "cosine_itemknn": CosineRecommender(),
    "tfidf_itemknn": TFIDFRecommender(),
}


# Model training by fold

In [15]:
%%time

results = []

fold_iterator = cv.split(Interactions(interactions), collect_fold_stats=True)

for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):
    print(f"\n==================== Fold {i_fold}")
    pprint(fold_info)

    df_train = interactions.iloc[train_ids].copy()
    df_test = interactions.iloc[test_ids][Columns.UserItem].copy()

    catalog = df_train[Columns.Item].unique()
    
    for model_name, model in models.items():
        userknn_model = UserKnn(model=model, N_users=50)
        userknn_model.fit(df_train)
    
        recos = userknn_model.predict(df_test)
    
        metric_values = calc_metrics(
            metrics,
            reco=recos,
            interactions=df_test,
            prev_interactions=df_train,
            catalog=catalog,
        )
    
        fold = {"fold": i_fold, "model": model_name}
        fold.update(metric_values)
        results.append(fold)
        


{'End date': Timestamp('2021-07-04 00:00:00', freq='W-SUN'),
 'Start date': Timestamp('2021-06-27 00:00:00', freq='W-SUN'),
 'Test': 237414,
 'Test items': 5947,
 'Test users': 98930,
 'Train': 2533586,
 'Train items': 14092,
 'Train users': 536802}


  0%|          | 0/536802 [00:00<?, ?it/s]

  0%|          | 0/536802 [00:00<?, ?it/s]


{'End date': Timestamp('2021-07-11 00:00:00', freq='W-SUN'),
 'Start date': Timestamp('2021-07-04 00:00:00', freq='W-SUN'),
 'Test': 211146,
 'Test items': 6209,
 'Test users': 86167,
 'Train': 2886800,
 'Train items': 14357,
 'Train users': 595902}


  0%|          | 0/595902 [00:00<?, ?it/s]

  0%|          | 0/595902 [00:00<?, ?it/s]


{'End date': Timestamp('2021-07-18 00:00:00', freq='W-SUN'),
 'Start date': Timestamp('2021-07-11 00:00:00', freq='W-SUN'),
 'Test': 214489,
 'Test items': 6313,
 'Test users': 84234,
 'Train': 3192875,
 'Train items': 14711,
 'Train users': 640144}


  0%|          | 0/640144 [00:00<?, ?it/s]

  0%|          | 0/640144 [00:00<?, ?it/s]


{'End date': Timestamp('2021-07-25 00:00:00', freq='W-SUN'),
 'Start date': Timestamp('2021-07-18 00:00:00', freq='W-SUN'),
 'Test': 231207,
 'Test items': 6491,
 'Test users': 87632,
 'Train': 3506106,
 'Train items': 14928,
 'Train users': 687200}


  0%|          | 0/687200 [00:00<?, ?it/s]

  0%|          | 0/687200 [00:00<?, ?it/s]


{'End date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'),
 'Start date': Timestamp('2021-07-25 00:00:00', freq='W-SUN'),
 'Test': 249396,
 'Test items': 6611,
 'Test users': 93092,
 'Train': 3838180,
 'Train items': 15061,
 'Train users': 734701}


  0%|          | 0/734701 [00:00<?, ?it/s]

  0%|          | 0/734701 [00:00<?, ?it/s]


{'End date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'),
 'Start date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'),
 'Test': 264039,
 'Test items': 6609,
 'Test users': 98161,
 'Train': 4203885,
 'Train items': 15212,
 'Train users': 788721}


  0%|          | 0/788721 [00:00<?, ?it/s]

  0%|          | 0/788721 [00:00<?, ?it/s]


{'End date': Timestamp('2021-08-15 00:00:00', freq='W-SUN'),
 'Start date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'),
 'Test': 276699,
 'Test items': 6715,
 'Test users': 101983,
 'Train': 4587708,
 'Train items': 15404,
 'Train users': 842129}


  0%|          | 0/842129 [00:00<?, ?it/s]

  0%|          | 0/842129 [00:00<?, ?it/s]

CPU times: user 3h 47min 23s, sys: 23.7 s, total: 3h 47min 47s
Wall time: 2h 27min 33s


# 👌 Metrics 

`Metrics by fold`



## Metric description 
- ### <a href="https://rectools.readthedocs.io/en/latest/api/rectools.metrics.novelty.MeanInvUserFreq.html#rectools.metrics.novelty.MeanInvUserFreq"> Mean Inverse User Frequency (novelty)</a>

- ### <a href="https://rectools.readthedocs.io/en/latest/api/rectools.metrics.serendipity.Serendipity.html"> Serendipity = novelty and relevance</a>

In [16]:
df_metrics = pd.DataFrame(results)
df_metrics

Unnamed: 0,fold,model,prec@10,recall@10,novelty,serendipity
0,0,cosine_itemknn,0.004879,0.027282,7.783925,3.1e-05
1,0,tfidf_itemknn,0.008546,0.04834,7.799313,3.5e-05
2,1,cosine_itemknn,0.004807,0.028028,7.8136,3.3e-05
3,1,tfidf_itemknn,0.008504,0.05056,7.827066,3.9e-05
4,2,cosine_itemknn,0.004103,0.023102,7.95327,3.7e-05
5,2,tfidf_itemknn,0.00683,0.038295,7.952497,4.2e-05
6,3,cosine_itemknn,0.003865,0.020485,8.063779,4.4e-05
7,3,tfidf_itemknn,0.006591,0.035558,8.037984,5.3e-05
8,4,cosine_itemknn,0.0037,0.019591,8.118989,4.7e-05
9,4,tfidf_itemknn,0.006383,0.033929,8.077043,5.8e-05


In [17]:
# df_metrics.to_pickle("df_metrics.pickle")

## Metrics mean by fold
`we can compare two models`

In [18]:
df_metrics.groupby('model').mean()[metrics.keys()]

Unnamed: 0_level_0,prec@10,recall@10,novelty,serendipity
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cosine_itemknn,0.00405,0.022161,8.006506,4e-05
tfidf_itemknn,0.006997,0.038679,7.986296,4.9e-05


## Metrics std by fold

`If a diff between model metrics less than an std value => there is no significant difference observed`

- For instance, for the serendipity metric there is no such difference between cosine_itemknn and tfidf_itemknn model results

In [19]:
df_metrics.groupby('model').std()[metrics.keys()]

Unnamed: 0_level_0,prec@10,recall@10,novelty,serendipity
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cosine_itemknn,0.000589,0.004139,0.159162,6e-06
tfidf_itemknn,0.001085,0.007707,0.131361,9e-06


In [20]:
df = df_metrics.groupby('model').mean()[metrics.keys()]

In [21]:
diff = df.loc['cosine_itemknn'] - df.loc['tfidf_itemknn']
diff

prec@10       -0.002946
recall@10     -0.016518
novelty        0.020210
serendipity   -0.000009
dtype: float64

## Decrease N_users

In [7]:
models = {
    "tfidf_itemknn": TFIDFRecommender(),
#     "BM25": BM25Recommender(),
}
n_users = {
    "n_40": 40,
    "n_30": 30,
    "n_20": 20,
    "n_15": 15,
    "n_10": 10,
}
metrics = {
    "prec@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "novelty": MeanInvUserFreq(k=10),
    "serendipity": Serendipity(k=10),
}

In [8]:
# setting for cv 
n_folds = 7
unit = "D"
n_units = 5

last_date = interactions[Columns.Datetime].max().normalize()
start_date = last_date - pd.Timedelta(n_folds * n_units + 1, unit=unit)  
print(f"Start date and last date of the test fold: {start_date, last_date}")

Start date and last date of the test fold: (Timestamp('2021-07-17 00:00:00'), Timestamp('2021-08-22 00:00:00'))


In [9]:
from rectools.model_selection import TimeRangeSplitter
from  rectools.dataset.interactions import Interactions

periods = n_folds + 1
freq = f"{n_units}{unit}"
print(
    f"start_date: {start_date}\n"
    f"last_date: {last_date}\n"
    f"periods: {periods}\n"
    f"freq: {freq}\n"
)
    
date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
print(f"Test fold borders: {date_range.values.astype('datetime64[D]')}")

# generator of folds
cv = TimeRangeSplitter(
    date_range=date_range,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
print(f"Real number of folds: {cv.get_n_splits(Interactions(interactions))}")

start_date: 2021-07-17 00:00:00
last_date: 2021-08-22 00:00:00
periods: 8
freq: 5D

Test fold borders: ['2021-07-17' '2021-07-22' '2021-07-27' '2021-08-01' '2021-08-06'
 '2021-08-11' '2021-08-16' '2021-08-21']
Real number of folds: 7


In [10]:
%%time

results = []

fold_iterator = cv.split(Interactions(interactions), collect_fold_stats=True)

for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):
    print(f"\n==================== Fold {i_fold}")
    pprint(fold_info)

    df_train = interactions.iloc[train_ids].copy()
    df_test = interactions.iloc[test_ids][Columns.UserItem].copy()

    catalog = df_train[Columns.Item].unique()
    
    for model_name, model in models.items():
        for name, n in n_users.items():
            print(model_name, name)
            userknn_model = UserKnn(model=model, N_users=n)
            userknn_model.fit(df_train)

            recos = userknn_model.predict(df_test)

            metric_values = calc_metrics(
                metrics,
                reco=recos,
                interactions=df_test,
                prev_interactions=df_train,
                catalog=catalog,
            )

            fold = {"fold": i_fold, "model": model_name, "n_users": n}
            fold.update(metric_values)
            results.append(fold)
        


{'End date': Timestamp('2021-07-22 00:00:00', freq='5D'),
 'Start date': Timestamp('2021-07-17 00:00:00', freq='5D'),
 'Test': 171552,
 'Test items': 5965,
 'Test users': 71945,
 'Train': 3456392,
 'Train items': 14912,
 'Train users': 679748}
tfidf_itemknn n_40


  0%|          | 0/679748 [00:00<?, ?it/s]

tfidf_itemknn n_30


  0%|          | 0/679748 [00:00<?, ?it/s]

tfidf_itemknn n_20


  0%|          | 0/679748 [00:00<?, ?it/s]

tfidf_itemknn n_15


  0%|          | 0/679748 [00:00<?, ?it/s]

tfidf_itemknn n_10


  0%|          | 0/679748 [00:00<?, ?it/s]


{'End date': Timestamp('2021-07-27 00:00:00', freq='5D'),
 'Start date': Timestamp('2021-07-22 00:00:00', freq='5D'),
 'Test': 174592,
 'Test items': 6081,
 'Test users': 72506,
 'Train': 3694639,
 'Train items': 14998,
 'Train users': 714299}
tfidf_itemknn n_40


  0%|          | 0/714299 [00:00<?, ?it/s]

tfidf_itemknn n_30


  0%|          | 0/714299 [00:00<?, ?it/s]

tfidf_itemknn n_20


  0%|          | 0/714299 [00:00<?, ?it/s]

tfidf_itemknn n_15


  0%|          | 0/714299 [00:00<?, ?it/s]

tfidf_itemknn n_10


  0%|          | 0/714299 [00:00<?, ?it/s]


{'End date': Timestamp('2021-08-01 00:00:00', freq='5D'),
 'Start date': Timestamp('2021-07-27 00:00:00', freq='5D'),
 'Test': 186642,
 'Test items': 6088,
 'Test users': 77743,
 'Train': 3941089,
 'Train items': 15109,
 'Train users': 749447}
tfidf_itemknn n_40


  0%|          | 0/749447 [00:00<?, ?it/s]

tfidf_itemknn n_30


  0%|          | 0/749447 [00:00<?, ?it/s]

tfidf_itemknn n_20


  0%|          | 0/749447 [00:00<?, ?it/s]

tfidf_itemknn n_15


  0%|          | 0/749447 [00:00<?, ?it/s]

tfidf_itemknn n_10


  0%|          | 0/749447 [00:00<?, ?it/s]


{'End date': Timestamp('2021-08-06 00:00:00', freq='5D'),
 'Start date': Timestamp('2021-08-01 00:00:00', freq='5D'),
 'Test': 193921,
 'Test items': 6096,
 'Test users': 80069,
 'Train': 4203885,
 'Train items': 15212,
 'Train users': 788721}
tfidf_itemknn n_40


  0%|          | 0/788721 [00:00<?, ?it/s]

tfidf_itemknn n_30


  0%|          | 0/788721 [00:00<?, ?it/s]

tfidf_itemknn n_20


  0%|          | 0/788721 [00:00<?, ?it/s]

tfidf_itemknn n_15


  0%|          | 0/788721 [00:00<?, ?it/s]

tfidf_itemknn n_10


  0%|          | 0/788721 [00:00<?, ?it/s]


{'End date': Timestamp('2021-08-11 00:00:00', freq='5D'),
 'Start date': Timestamp('2021-08-06 00:00:00', freq='5D'),
 'Test': 203193,
 'Test items': 6123,
 'Test users': 82818,
 'Train': 4474647,
 'Train items': 15356,
 'Train users': 826429}
tfidf_itemknn n_40


  0%|          | 0/826429 [00:00<?, ?it/s]

tfidf_itemknn n_30


  0%|          | 0/826429 [00:00<?, ?it/s]

tfidf_itemknn n_20


  0%|          | 0/826429 [00:00<?, ?it/s]

tfidf_itemknn n_15


  0%|          | 0/826429 [00:00<?, ?it/s]

tfidf_itemknn n_10


  0%|          | 0/826429 [00:00<?, ?it/s]


{'End date': Timestamp('2021-08-16 00:00:00', freq='5D'),
 'Start date': Timestamp('2021-08-11 00:00:00', freq='5D'),
 'Test': 211952,
 'Test items': 6325,
 'Test users': 86828,
 'Train': 4755668,
 'Train items': 15480,
 'Train users': 865063}
tfidf_itemknn n_40


  0%|          | 0/865063 [00:00<?, ?it/s]

tfidf_itemknn n_30


  0%|          | 0/865063 [00:00<?, ?it/s]

tfidf_itemknn n_20


  0%|          | 0/865063 [00:00<?, ?it/s]

tfidf_itemknn n_15


  0%|          | 0/865063 [00:00<?, ?it/s]

tfidf_itemknn n_10


  0%|          | 0/865063 [00:00<?, ?it/s]


{'End date': Timestamp('2021-08-21 00:00:00', freq='5D'),
 'Start date': Timestamp('2021-08-16 00:00:00', freq='5D'),
 'Test': 207727,
 'Test items': 6109,
 'Test users': 85168,
 'Train': 5051815,
 'Train items': 15577,
 'Train users': 906071}
tfidf_itemknn n_40


  0%|          | 0/906071 [00:00<?, ?it/s]

tfidf_itemknn n_30


  0%|          | 0/906071 [00:00<?, ?it/s]

tfidf_itemknn n_20


  0%|          | 0/906071 [00:00<?, ?it/s]

tfidf_itemknn n_15


  0%|          | 0/906071 [00:00<?, ?it/s]

tfidf_itemknn n_10


  0%|          | 0/906071 [00:00<?, ?it/s]

CPU times: user 13h 45min 11s, sys: 1min 9s, total: 13h 46min 21s
Wall time: 8h 53min 29s


In [11]:
df_metrics = pd.DataFrame(results)
df_metrics

Unnamed: 0,fold,model,n_users,prec@10,recall@10,novelty,serendipity
0,0,tfidf_itemknn,40,0.006235,0.036011,8.088503,5e-05
1,0,tfidf_itemknn,30,0.006235,0.036011,8.088503,5e-05
2,0,tfidf_itemknn,20,0.006235,0.036011,8.088503,5e-05
3,0,tfidf_itemknn,15,0.00601,0.034749,8.016872,4.7e-05
4,0,tfidf_itemknn,10,0.005412,0.031378,7.957084,4e-05
5,1,tfidf_itemknn,40,0.005857,0.033396,8.145902,5.2e-05
6,1,tfidf_itemknn,30,0.005857,0.033396,8.145902,5.2e-05
7,1,tfidf_itemknn,20,0.005857,0.033396,8.145902,5.2e-05
8,1,tfidf_itemknn,15,0.005637,0.031693,8.061765,4.8e-05
9,1,tfidf_itemknn,10,0.005202,0.029139,7.99331,4.3e-05


In [13]:
df_metrics.groupby('n_users').mean()[metrics.keys()]

Unnamed: 0_level_0,prec@10,recall@10,novelty,serendipity
n_users,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10,0.005012,0.028036,8.009602,4.3e-05
15,0.005495,0.030875,8.080023,5e-05
20,0.005718,0.032345,8.158266,5.4e-05
30,0.005718,0.032345,8.158266,5.4e-05
40,0.005718,0.032345,8.158266,5.4e-05
