In [5]:
import pandas as pd
import numpy as np
import gc
# from joblib import dump, load
from numba import jit, typeof, typed, types, prange

from catboost import CatBoostClassifier, Pool
from catboost.utils import eval_metric

from IPython.display import clear_output

from rec_lib.utils import reduce_mem_usage
from rec_lib.metrics import recall, precision_at_k, ap_k

import warnings
warnings.filterwarnings("ignore")

In [6]:
%load_ext autoreload
%autoreload 2

In [7]:
# from catboost.utils import get_gpu_device_count
# print('В наличие %i GPU' % get_gpu_device_count())

In [8]:
%%time
# Load the three input tables from the local parquet archive, in the same
# order as before: candidates/results, customers, articles.
result, customers, articles = (
    pd.read_parquet('archive/result.parquet'),
    pd.read_parquet('archive/customers.parquet'),
    pd.read_parquet('archive/articles.parquet'),
)

CPU times: user 2.51 s, sys: 793 ms, total: 3.3 s
Wall time: 2.16 s


### Подготовка данных

In [9]:
# Keep every customer column except the last one. The selection is positional,
# so re-running this cell drops one more column each time.
customers = customers.iloc[:, :-1].copy()

In [10]:
# These customer attributes end up in `cat_feats`, so missing values get an
# explicit placeholder and every value is stored as a string.
customers['FN'] = customers['FN'].fillna(0.0).astype(str)
customers['Active'] = customers['Active'].fillna(0.0).astype(str)
customers['club_member_status'] = customers['club_member_status'].fillna('NONE').astype(str)

# Collapse both spellings ('None' and missing) into the single label 'NONE'.
# The replacement is restricted to this one column: a row-wide assignment such
# as `customers.loc[mask] = 'NONE'` would also overwrite customer_id, age and
# every other attribute of the matching rows.
customers['fashion_news_frequency'] = (
    customers['fashion_news_frequency']
    .fillna('NONE')
    .replace('None', 'NONE')
    .astype(str)
)

# Impute missing (or non-numeric) ages with the most frequent age.
# `mode()` returns a Series because ties are possible, so take its first entry.
customers['age'] = pd.to_numeric(customers['age'], errors='coerce')
age_mode = float(customers['age'].mode().iloc[0])
customers['age'] = customers['age'].fillna(age_mode)

In [11]:
# Article columns kept for the model. 'product_type_name' and
# 'department_name' are not selected (only their *_no counterparts are).
art_feats = [
    'article_id',
    'product_type_no',
    'product_group_name',
    'graphical_appearance_name',
    'colour_group_name',
    'perceived_colour_value_name',
    'perceived_colour_master_name',
    'department_no',
    'index_name',
    'index_group_name',
    'garment_group_name',
]

# Restrict to the selected columns and store the two numeric codes as strings
# (both columns later become part of `cat_feats`).
articles = articles[art_feats].astype({'product_type_no': str, 'department_no': str})

### Собираем датафрейм для обучения модели 2-го уровня

In [12]:
%%time
# Turn the list-valued 'own_rec' column into a long Series: one entry per
# recommended item, indexed by the originating row of `result`.
s = (
    result
    .apply(lambda row: pd.Series(row['own_rec']), axis=1)
    .stack()
    .droplevel(1)
    .rename('article_id_short')
)

CPU times: user 15.7 s, sys: 772 ms, total: 16.5 s
Wall time: 16.5 s


In [13]:
# Drop the list-valued columns and join the long Series on the shared index,
# which repeats each original row once per entry of `s`.
result = result.drop(columns=['own_rec', 'sim_users']).join(s)

In [14]:
%%time
# Store each row's 'actual_article_id_short' value as a numpy array.
result['actual_article_id_short'] = result['actual_article_id_short'].apply(np.array)

CPU times: user 25.2 s, sys: 3.37 s, total: 28.6 s
Wall time: 28.6 s


In [15]:
# Pull both columns out of pandas as standalone numpy arrays (copies), so the
# per-row labelling below does not go through Series indexing.
actual_article_id_short_arr = result['actual_article_id_short'].to_numpy(copy=True)
article_id_short_arr = result['article_id_short'].to_numpy(copy=True)

In [16]:
@jit(nopython=True, fastmath=True)
def set_target(arr_actual, val):
    """Return 1 if ``val`` occurs in ``arr_actual``, otherwise 0.

    Parameters
    ----------
    arr_actual : 1-D numeric array
        Values to search.
    val : scalar
        Value to look for.

    Returns
    -------
    int
        1 when at least one element of ``arr_actual`` equals ``val``, else 0
        (also 0 for an empty array).
    """
    # Plain membership scan: no `isin` helper is imported or defined in this
    # notebook, and an undefined global cannot be compiled in nopython mode.
    # The loop stops at the first match.
    for item in arr_actual:
        if item == val:
            return 1
    return 0

In [None]:
%%time
# Label every candidate row by passing its 'actual_article_id_short' array and
# its candidate 'article_id_short' to set_target.
target_arr = [
    set_target(actual_ids, candidate_id)
    for actual_ids, candidate_id in zip(actual_article_id_short_arr, article_id_short_arr)
]

In [None]:
# Attach the labels, then drop the array-valued column they were derived from.
result['target'] = target_arr
result = result.drop(columns=['actual_article_id_short'])

In [None]:
# Release the intermediates used to build the labels and reclaim the memory.
del s, actual_article_id_short_arr, article_id_short_arr, target_arr
gc.collect()

In [None]:
# Pass the frame through the project helper `reduce_mem_usage` (rec_lib.utils).
# NOTE(review): judging by the name it shrinks the frame's memory footprint,
# presumably by downcasting dtypes — confirm in rec_lib.
result = reduce_mem_usage(result)

In [None]:
%%time
# Lookup tables from the short ids back to the original ids, read from the
# transactions file: one for customers, one for articles.
spam1 = (
    pd.read_parquet(
        'archive/transactions_train_for_power_bi.parquet',
        columns=['customer_id', 'customer_id_short'],
    )
    .drop_duplicates(keep='last')
)
spam2 = (
    pd.read_parquet(
        'archive/transactions_train_for_power_bi.parquet',
        columns=['article_id', 'article_id_short'],
    )
    .drop_duplicates(keep='last')
)

In [None]:
%%time
# Attach the original customer and article ids to every candidate row.
# Left joins keep all rows of `result`, including ones without a match.
result = (
    result
    .merge(spam1, on='customer_id_short', how='left')
    .merge(spam2, on='article_id_short', how='left')
)

In [None]:
# The lookup tables are no longer needed once the ids are merged in.
del spam1, spam2
gc.collect()

In [None]:
%%time
# Enrich every candidate row with the customer and article attributes.
# Left joins keep all rows of `result`, including ones without a match.
result = (
    result
    .merge(customers, on='customer_id', how='left')
    .merge(articles, on='article_id', how='left')
)

In [None]:
# Both side tables are now merged into `result`; free them.
del customers, articles
gc.collect()

In [None]:
# Model inputs: every column except the label and the original string ids.
features = result.columns.tolist()
for excluded in ('target', 'customer_id', 'article_id'):
    features.remove(excluded)

# Categorical inputs: all features except age and the two short ids.
cat_feats = list(features)
for non_categorical in ('age', 'customer_id_short', 'article_id_short'):
    cat_feats.remove(non_categorical)

In [None]:
# feats_num = 0
# for el in cat_feats:
#     feats_num += X_train[el].nunique()
#     print(el, X_train[el].nunique(), feats_num)

### Модели
- model 1 (до 24 лет — молодёжь)
- model 2 (25–44 — зрелое экономически активное население)
- model 3 (45–64 — пожилое экономически активное население)
- model 4 (65+ — пенсионный возраст, малый достаток)

##### Модели

In [None]:
# Observed age range; used as the outer bounds of the first and last bucket.
min_age, max_age = result['age'].min(), result['age'].max()

In [None]:
# (low, high) age bounds of the four buckets; get_preds treats both ends as
# inclusive. The inner bounds are whole numbers, so a fractional age such as
# 24.5 would fall into no bucket — presumably ages are integers; TODO confirm.
age_cats = [(min_age, 24), (25, 44), (45, 64), (65, max_age)]

In [None]:
def get_preds():
    """Fit one CatBoost classifier incrementally over the age buckets in ``age_cats``.

    Reads the notebook globals ``result``, ``features``, ``cat_feats`` and
    ``age_cats``. For each ``(low, high)`` bucket the rows with
    ``low <= age <= high`` are selected and the classifier is fitted on them.
    Every fit after the first passes the classifier itself as ``init_model``,
    so training continues from the previous bucket's state.

    Buckets without any rows are skipped.

    Returns
    -------
    CatBoostClassifier
        The classifier after the last fit. Despite the function's name, no
        predictions are computed here.
    """
    clf = CatBoostClassifier(iterations=100, eval_metric='Logloss', use_best_model=True, random_seed=42)
    # Tracked locally so the first fit does not depend on comparing a bucket
    # bound with the global `min_age`.
    is_first_fit = True

    for cat_min_age, cat_max_age in age_cats:
        print(f'Возрастная категория {cat_min_age}-{cat_max_age}')

        # Build the (inclusive) age mask once and reuse it for X and y.
        age_mask = result['age'].between(cat_min_age, cat_max_age)
        if not age_mask.any():
            print('Нет данных для этой возрастной категории, пропускаем')
            continue

        # Explicit copy: the dtype cast below must not write into `result`.
        X_train = result.loc[age_mask, features].copy()
        y_train = result.loc[age_mask, 'target']
        X_train[cat_feats] = X_train[cat_feats].astype('category')

        # NOTE(review): the eval set is built from the same rows as the
        # training data, so early stopping and `use_best_model` are judged on
        # training loss. Consider a held-out split.
        eval_data = Pool(X_train, y_train, cat_features=cat_feats)

        fit_kwargs = dict(
            cat_features=cat_feats,
            eval_set=eval_data,
            early_stopping_rounds=5,
            verbose=True,
        )
        if not is_first_fit:
            # Continue training from the state reached on the previous bucket.
            fit_kwargs['init_model'] = clf
        clf.fit(X_train, y_train, **fit_kwargs)
        is_first_fit = False

        # Replace the verbose training log once the next output arrives.
        clear_output(wait=True)

    return clf

In [None]:
%%time
# Train the classifier; get_preds returns the fitted model, not predictions.
clf = get_preds()

In [None]:
%%time
# Feature matrix for scoring: all candidate rows, not a single age bucket.
# NOTE(review): unlike in get_preds, the categorical columns are not cast to
# 'category' here — confirm CatBoost treats both representations the same.
X_train = result[features]

In [None]:
%%time
# Score every candidate row and keep only the second probability column
# (index 1), i.e. the probability of target == 1.
preds = clf.predict_proba(X_train)[:, 1]

In [None]:
# Extract the columns needed for ranking as plain arrays.
customers_id, article_id, target = (
    result[column].values
    for column in ('customer_id_short', 'article_id_short', 'target')
)

In [None]:
@jit(nopython=True, fastmath=True)
def top_12_recs(customers_id, article_id, target, preds):
    """Pick the 12 highest-scored articles for every customer.

    The input arrays are processed in consecutive chunks of 500 rows; each
    chunk is taken to be one customer, identified by the chunk's first
    ``customers_id`` entry. The number of chunks is the number of distinct
    values in ``customers_id``.

    Parameters
    ----------
    customers_id : 1-D array
        Customer id of every candidate row.
    article_id : 1-D array
        Candidate article id of every row.
    target : 1-D array
        Unused; kept so existing calls keep working.
    preds : 1-D array
        Model score of every row.

    Returns
    -------
    customers_id_arr : float64 array, one entry per chunk
        First customer id of each chunk.
    recs : float64 array, 12 entries per chunk, flattened
        Article ids of each chunk ordered by descending score.

    Raises
    ------
    ValueError
        If there are fewer than 500 rows per distinct customer id, which
        would otherwise lead to out-of-bounds reads.
    """
    rows_per_customer = 500
    top_k = 12

    n_customers = len(set(customers_id))
    if customers_id.shape[0] < n_customers * rows_per_customer:
        raise ValueError('top_12_recs expects 500 candidate rows per customer')

    # Preallocate the outputs: growing them with np.append would copy the
    # whole array on every iteration.
    customers_id_arr = np.empty(n_customers)
    recs = np.empty(n_customers * top_k)

    for c in range(n_customers):
        start = c * rows_per_customer
        stop = start + rows_per_customer

        # Ascending argsort, reversed: positions within the chunk ordered by
        # descending score.
        order = preds[start:stop].argsort()[::-1]

        customers_id_arr[c] = customers_id[start]
        for j in range(top_k):
            recs[c * top_k + j] = article_id[start + order[j]]

    return customers_id_arr, recs

In [None]:
%%time
# Rank the candidates: one customer id per 500-row chunk plus a flat array
# holding 12 article ids per chunk.
customers_id_arr, recs = top_12_recs(customers_id, article_id, target, preds)

In [None]:
# Reshape the flat recommendations into one row of 12 per distinct customer
# and store the customer ids as 32-bit integers.
recs = recs.reshape((np.unique(customers_id).size, 12))
customers_id_arr = customers_id_arr.astype('int32')

In [None]:
# One row per customer: the short id and its array of 12 recommended articles.
# Both inputs have one entry per customer, so zipping them pairs id and row.
spam = pd.DataFrame(
    list(zip(customers_id_arr, recs)),
    columns=['customer_id_short', 'top_12_recs'],
)

In [None]:
# Reload the original per-customer table (`result` itself was reshaped above)
# and attach the top-12 recommendations; customers without a match get NaN.
result_test = (
    pd.read_parquet('archive/result.parquet')
    .merge(spam, on='customer_id_short', how='left')
)

In [None]:
# Mean over all customers of rec_lib's `recall`, called per row with the
# top-12 recommendations and the 'actual_article_id_short' value.
print(
    'Recall_own_rec_12 : ',
    result_test.apply(
        lambda customer_row: recall(
            customer_row['top_12_recs'],
            customer_row['actual_article_id_short'],
        ),
        axis=1,
    ).mean(),
)

In [None]:
# Recall
# Logloss:       0.022902486423667465
# CrossEntropy:  0.022902486423667465
# BrierScore:    0.020891048007429233

In [None]:
# Mean over all customers of rec_lib's `ap_k` with k=12, called per row with
# the top-12 recommendations and the 'actual_article_id_short' value.
print(
    'MAP_own_rec_12 : ',
    result_test.apply(
        lambda customer_row: ap_k(
            customer_row['top_12_recs'],
            customer_row['actual_article_id_short'],
            12,
        ),
        axis=1,
    ).mean(),
)

In [126]:
# MAP@12
# Logloss:       0.0016198802750783724
# CrossEntropy:  0.0016198802750783724
# BrierScore:    0.0014921423739928075