In [1]:
import pandas as pd
import numpy as np
import gc
from numba import jit, typeof, typed, types, prange

from catboost import CatBoostClassifier, Pool
from catboost.utils import eval_metric

from rec_lib.utils import reduce_mem_usage, set_target, customers_prep, articles_prep, df_lvl2_prep, top_12_recs
from rec_lib.metrics import precision_at_k, ap_k, recall, recall_at_k
from rec_lib.models import get_baseline_preds

import warnings
# NOTE(review): with no category or module argument this silences every warning
# raised for the rest of the kernel session; consider narrowing the filter.
warnings.filterwarnings("ignore")

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
%%time
# Load the three source tables from the local parquet archive, in the same
# order as before: result, customers, articles.
result, customers, articles = map(
    pd.read_parquet,
    (
        'archive/result.parquet',
        'archive/customers.parquet',
        'archive/articles.parquet',
    ),
)

CPU times: user 2.64 s, sys: 1.01 s, total: 3.65 s
Wall time: 2.69 s


### Подготовка данных

In [4]:
%%time
# Preprocess the customers table with the project helper rec_lib.utils.customers_prep.
# The name `customers` is rebound to the helper's output, so re-running this cell
# feeds already-prepared data back into the helper.
customers = customers_prep(customers)

CPU times: user 1.76 s, sys: 186 ms, total: 1.94 s
Wall time: 1.9 s


In [5]:
%%time
# Preprocess the articles table with the project helper rec_lib.utils.articles_prep.
# The name `articles` is rebound to the helper's output, so re-running this cell
# feeds already-prepared data back into the helper.
articles = articles_prep(articles)

CPU times: user 55.8 ms, sys: 1.01 ms, total: 56.8 ms
Wall time: 55 ms


### Собираем датафрейм для обучения модели 2-го уровня

In [6]:
%%time
# Build the frame used to train the second-level model via rec_lib.utils.df_lvl2_prep.
# `result` is overwritten here, so the frame as loaded from disk is no longer
# available in memory after this cell.
# NOTE(review): presumably this adds the `target` column used below - confirm in rec_lib.
result = df_lvl2_prep(result)

CPU times: user 1min 39s, sys: 7.64 s, total: 1min 46s
Wall time: 1min 45s


In [7]:
%%time
# Enrich the level-2 frame with the customer and article tables.
# Both are left joins, so every row already present in `result` is kept.
result = (
    result
    .merge(customers, how='left', on='customer_id')
    .merge(articles, how='left', on='article_id')
)

CPU times: user 21.4 s, sys: 6.21 s, total: 27.6 s
Wall time: 27.6 s


In [8]:
# The source tables have been merged into `result`; drop the references and
# trigger a collection pass (its return value is the cell output).
del customers, articles
gc.collect()

0

In [9]:
# Model inputs: every column of `result` except `target` and the two raw id columns.
# list.remove is kept on purpose: it raises ValueError if an expected column is missing.
features = list(result.columns)
for excluded in ('target', 'customer_id', 'article_id'):
    features.remove(excluded)

# Categorical inputs: all model inputs apart from `age` and the two short id columns.
cat_feats = list(features)
for non_categorical in ('age', 'customer_id_short', 'article_id_short'):
    cat_feats.remove(non_categorical)

### Обучим модель CatBoostClassifier 

In [10]:
%%time
# Call the project helper rec_lib.models.get_baseline_preds with the full frame,
# the model input columns and the categorical subset.
# Despite the helper's name, the returned object is used as a classifier below
# (`clf.predict_proba`).
# NOTE(review): presumably this fits a CatBoostClassifier, per the section heading - confirm in rec_lib.
clf = get_baseline_preds(result, features, cat_feats)

CPU times: user 2h 4min 2s, sys: 3min 39s, total: 2h 7min 41s
Wall time: 10min 5s


In [12]:
%%time
# Get the model's predictions: score every row of `result` and keep column 1
# of the probability matrix.
# NOTE(review): `result` is the same frame that was passed to get_baseline_preds,
# so unless that helper holds data out internally these are in-sample scores.
preds = clf.predict_proba(result[features])[:, 1]

CPU times: user 2min 12s, sys: 2.27 s, total: 2min 15s
Wall time: 1min 4s


In [14]:
# Pull the two short id columns and the target out of the frame as raw arrays,
# which are handed to top_12_recs in the next cell.
customers_id, article_id, target = (
    result[column].values
    for column in ('customer_id_short', 'article_id_short', 'target')
)

In [15]:
%%time
# Rank the predictions and take the top 12 per customer (rec_lib.utils.top_12_recs).
# NOTE(review): presumably returns one id per customer plus a flat array of
# 12 article ids per customer (it is reshaped to (n, 12) below) - confirm in rec_lib.
customers_id_arr, recs = top_12_recs(customers_id, article_id, target, preds)

CPU times: user 29.4 s, sys: 451 ms, total: 29.9 s
Wall time: 29.8 s


In [16]:
# Transform the recommendation arrays: reshape `recs` to one row of 12 article
# ids per distinct customer and cast the customer ids to int32.
# np.unique counts distinct ids in vectorised code; the previous len(set(...))
# boxed every array element into a Python object first.
# NOTE(review): assumes customer_id_short contains no NaN (np.unique collapses
# NaNs, a Python set does not) - confirm.
RECS_PER_CUSTOMER = 12
n_customers = np.unique(customers_id).size
recs = recs.reshape(n_customers, RECS_PER_CUSTOMER)
customers_id_arr = customers_id_arr.astype(np.int32)

In [17]:
# Build a two-column frame: customer id -> its array of recommended article ids.
# `recs` has one row per customer after the reshape in the previous cell, so the
# ids and rows are paired positionally with zip instead of rebuilding
# set(customers_id) just to obtain a loop bound.
# The guard keeps a too-short id array a hard failure rather than a silent truncation.
assert len(customers_id_arr) >= len(recs), 'fewer customer ids than recommendation rows'
spam = pd.DataFrame(
    list(zip(customers_id_arr, recs)),
    columns=['customer_id_short', 'top_12_recs'],
)

In [18]:
# Reload the frame from disk (the in-memory `result` was overwritten by
# df_lvl2_prep and the merges) and attach each customer's recommendation list.
# Left join: customers without a row in `spam` get NaN in `top_12_recs`.
result_test = (
    pd.read_parquet('archive/result.parquet')
    .merge(spam, how='left', on='customer_id_short')
)

In [19]:
# Compute rec_lib.metrics.recall row by row (recommended vs. actual article ids),
# then report the mean over all rows.
recall_per_row = result_test.apply(
    lambda row: recall(row['top_12_recs'], row['actual_article_id_short']),
    axis=1,
)
print('Recall_own_rec_12 : ', recall_per_row.mean())

Recall_own_rec_12 :  0.022902486423667465


In [21]:
# Compute rec_lib.metrics.ap_k row by row with 12 as its third argument
# (presumably the cut-off k - confirm in rec_lib), then report the mean over all rows.
ap_per_row = result_test.apply(
    lambda row: ap_k(row['top_12_recs'], row['actual_article_id_short'], 12),
    axis=1,
)
print('MAP_own_rec_12 : ', ap_per_row.mean())

MAP_own_rec_12 :  0.0016198802750783724
