In [1]:
import pandas as pd
import numpy as np
import gc
# from joblib import dump, load
from numba import jit, typeof, typed, types, prange

from catboost import CatBoostClassifier, Pool
from catboost.utils import eval_metric

from IPython.display import clear_output

from rec_lib.utils import reduce_mem_usage, precision_at_k, ap_k, prefilter_items, recall, recall_at_k, isin

import warnings
warnings.filterwarnings("ignore")

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# from catboost.utils import get_gpu_device_count
# print('В наличие %i GPU' % get_gpu_device_count())

In [4]:
%%time
result = pd.read_parquet('archive/result.parquet')
customers = pd.read_parquet('archive/customers.parquet')
articles = pd.read_parquet('archive/articles.parquet')

CPU times: user 2.52 s, sys: 657 ms, total: 3.18 s
Wall time: 2.43 s


### Подготовка данных

In [5]:
customers = customers[list(customers)[:-1]]

In [6]:
customers['FN'] = customers['FN'].fillna(0.0)
customers['FN'] = customers['FN'].apply(str)
customers['Active'] = customers['Active'].fillna(0.0)
customers['Active'] = customers['Active'].apply(str)
customers['club_member_status'] = customers['club_member_status'].fillna('NONE')
customers['club_member_status'] = customers['club_member_status'].apply(str)
customers['fashion_news_frequency'] = customers['fashion_news_frequency'].fillna('NONE')
customers.loc[customers['fashion_news_frequency'] == 'None'] = 'NONE'
customers['fashion_news_frequency'] = customers['fashion_news_frequency'].apply(str)

age_mode = float(customers['age'].mode())
# age_median = customers[customers['age'] != 'NONE']['age'].median()
customers['age'] = customers['age'].fillna(age_mode)
customers[customers['age'] == 'NONE'] = age_mode

In [7]:
art_feats = ['article_id',
            'product_type_no',
#             'product_type_name',
            'product_group_name',
            'graphical_appearance_name',
            'colour_group_name',
            'perceived_colour_value_name',
            'perceived_colour_master_name',
            'department_no',
#             'department_name',
            'index_name',
            'index_group_name',
            'garment_group_name']

articles = articles[art_feats]
articles['product_type_no'] = articles['product_type_no'].apply(str)
articles['department_no'] = articles['department_no'].apply(str)

### Собираем датафрейм для обучения модели 2го уровня

In [8]:
%%time
s = result.apply(lambda x: pd.Series(x['own_rec']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'article_id_short'

CPU times: user 14.6 s, sys: 825 ms, total: 15.4 s
Wall time: 15.4 s


In [9]:
result = result.drop(['own_rec', 'sim_users'], axis=1).join(s)

In [10]:
%%time
result['actual_article_id_short'] = result['actual_article_id_short'].apply(lambda x: np.array(x))

CPU times: user 24 s, sys: 3.27 s, total: 27.3 s
Wall time: 27.3 s


In [11]:
actual_article_id_short_arr = np.array(result['actual_article_id_short'])
article_id_short_arr = np.array(result['article_id_short'])

In [12]:
@jit(nopython=True, fastmath=True)
def set_target(arr_actual, val):
    if isin(arr_actual, val):
        return 1
    return 0

In [13]:
%%time
target_arr = [set_target(actual_article_id_short_arr[i], article_id_short_arr[i]) for i in range(len(article_id_short_arr))]

CPU times: user 24.6 s, sys: 117 ms, total: 24.8 s
Wall time: 24.8 s


In [14]:
result['target'] = target_arr
result = result.drop('actual_article_id_short', axis=1)

In [15]:
del s;
del actual_article_id_short_arr;
del article_id_short_arr;
del target_arr;
gc.collect()

6637

In [16]:
result = reduce_mem_usage(result)

Memory usage of dataframe is 1167.72 MB
Memory usage after optimization is: 620.35 MB
Decreased by 46.9%


In [17]:
%%time
# исходные id покупателей и категорий
spam1 = pd.read_parquet('archive/transactions_train_for_power_bi.parquet', columns=['customer_id', 'customer_id_short'])
spam1 = spam1.drop_duplicates(keep='last')
spam2 = pd.read_parquet('archive/transactions_train_for_power_bi.parquet', columns=['article_id', 'article_id_short'])
spam2 = spam2.drop_duplicates(keep='last')

CPU times: user 6.92 s, sys: 1.11 s, total: 8.04 s
Wall time: 6.72 s


In [18]:
%%time
# добавляем исходные id покупателей и категорий
result = result.merge(spam1, on='customer_id_short', how='left')
result = result.merge(spam2, on='article_id_short', how='left')

CPU times: user 9.69 s, sys: 1.3 s, total: 11 s
Wall time: 11 s


In [19]:
del spam1;
del spam2;
gc.collect()

0

In [20]:
%%time
# добавляем данные из датафремов покупателей и категорий
result = result.merge(customers, on='customer_id', how='left')
result = result.merge(articles, on='article_id', how='left')

CPU times: user 19.5 s, sys: 5.38 s, total: 24.9 s
Wall time: 24.9 s


In [21]:
del customers;
del articles;
gc.collect()

0

In [22]:
features = list(result)
features.remove('target')
features.remove('customer_id')
features.remove('article_id')
cat_feats = features.copy()
cat_feats.remove('age')
cat_feats.remove('customer_id_short')
cat_feats.remove('article_id_short')

In [23]:
# feats_num = 0
# for el in cat_feats:
#     feats_num += X_train[el].nunique()
#     print(el, X_train[el].nunique(), feats_num)

### Модели
- model 1 (-24 - молодежь)
- model 2 (25-44 - зрелое экономически активное население)
- model 3 (45-64 - пожилое экономически активное население)
- model 4 (65+ пенсионный возраст, малый достаток)

##### Модели

In [24]:
min_age = result['age'].min()
max_age = result['age'].max()

In [25]:
age_cats = [(min_age, 24), (25, 44), (45, 64), (65, max_age)]

In [26]:
result.shape[0] / result['target'].sum()

936.786955883073

In [27]:
def get_preds():
    
    preds = np.zeros(0)
    customers_id = np.zeros(0)
    article_id = np.zeros(0)
    target = np.zeros(0)
    
    for cat_range in age_cats:
        cat_min_age = cat_range[0]
        cat_max_age = cat_range[1]
        
        print(f'Возрастная категория {cat_min_age}-{cat_max_age}')
        
        X_train = result[features].loc[(result[features]['age'] >= cat_min_age) & (result[features]['age'] <= cat_max_age)]
        y_train = result.loc[(result['age'] >= cat_min_age) & (result['age'] <= cat_max_age)]['target']
        X_train[cat_feats] = X_train[cat_feats].astype('category')
        
        eval_data = Pool(X_train, y_train, cat_features=cat_feats)
        
        # CatBoostClassifier
        # score_function:Cosine, L2
        clf = CatBoostClassifier(iterations=150, eval_metric='Logloss', scale_pos_weight=50, learning_rate=0.6, use_best_model=True, random_seed=42) # scale_pos_weight
        clf.fit(X_train, y_train, cat_features=cat_feats, eval_set=eval_data, early_stopping_rounds=5, verbose=True)
        
        # model preds
        preds_spam = clf.predict_proba(X_train)
        preds_spam = preds_spam[:,1]
        
        customers_id_spam = result.loc[(result['age'] >= cat_min_age) & (result['age'] <= cat_max_age)]['customer_id_short'].values.astype(np.int32)
        article_id_spam = result.loc[(result['age'] >= cat_min_age) & (result['age'] <= cat_max_age)]['article_id_short'].values.astype(np.int32)
        target_spam = result.loc[(result['age'] >= cat_min_age) & (result['age'] <= cat_max_age)]['target'].values
    
        customers_id = np.append(customers_id, customers_id_spam)
        article_id = np.append(article_id, article_id_spam)
        target = np.append(target, target_spam)
        preds = np.append(preds, preds_spam)
        
        del X_train;
        del y_train;
        del eval_data;
        del clf;
        del preds_spam;
        del customers_id_spam;
        del target_spam;
        gc.collect()
        
        clear_output(wait=True)
        
    return customers_id, article_id, target, preds

In [28]:
%%time
customers_id, article_id, target, preds = get_preds()

CPU times: user 3h 25min 36s, sys: 5min 34s, total: 3h 31min 10s
Wall time: 16min 4s


In [29]:
customers_id = customers_id.astype(np.int32)
article_id = article_id.astype(np.int32)

In [30]:
@jit(nopython=True, fastmath=True)
def top_12_recs(customers_id, article_id, target, preds):
    customers_id_arr = np.zeros(0)
    recs = np.zeros(0)
    k1 = 0
    k2 = 500
    for i in range(len(set(customers_id))):
        customers_id_spam = customers_id[k1:k2]
        article_id_spam = article_id[k1:k2]
        target_spam = target[k1:k2]
        preds_spam = preds[k1:k2]
        
        mask = preds_spam.argsort()[::-1]
        rec = np.zeros(12)
        
        customers_id_arr = np.append(customers_id_arr, customers_id_spam[0])
        
        for i in range(12):
            rec[i] = article_id_spam[mask[i]]
            
        recs = np.append(recs, rec)
            
        k1 += 500
        k2 += 500
        
    return customers_id_arr, recs

In [31]:
%%time
customers_id_arr, recs = top_12_recs(customers_id, article_id, target, preds)

CPU times: user 28.5 s, sys: 489 ms, total: 29 s
Wall time: 29 s


In [32]:
recs = recs.reshape(len(set(customers_id)),12)
customers_id_arr = customers_id_arr.astype(np.int32)

In [33]:
spam = pd.DataFrame([(customers_id_arr[i], recs[i]) for i in range(len(set(customers_id)))], columns=['customer_id_short', 'top_12_recs'])

In [34]:
result_test = pd.read_parquet('archive/result.parquet')
result_test = result_test.merge(spam, on='customer_id_short', how='left')

In [35]:
print('Recall_own_rec_12 : ', result_test.apply(lambda row: recall(row['top_12_recs'], row['actual_article_id_short']), axis=1).mean())

Recall_own_rec_12 :  0.04481846805032747


In [36]:
# Recall iter_num=170
# Logloss scale_pos_weight=50:      0.044582620960084154   1ч 6мин

# Recall iter_num=150
# Logloss scale_pos_weight=50:      0.04481846805032747    16мин

In [37]:
print('MAP_own_rec_12 : ', result_test.apply(lambda row: ap_k(row['top_12_recs'], row['actual_article_id_short'], 12), axis=1).mean())

MAP_own_rec_12 :  0.003788928980852199


In [38]:
# MAP@12 iter_num=170
# Logloss scale_pos_weight=50:      0.0038094989590683973  1ч 6мин

# MAP@12 iter_num=150
# Logloss scale_pos_weight=50:      0.003788928980852199   16мин