In [21]:
fname = './Telegram Desktop'

Ниже описаны вспомогательные функции для подсчёта скора (мы решили, что веса будем брать в соответствии с Вашим услоием)

In [48]:
import os
import time
import tqdm
import keras
import datetime
import operator
import warnings
import pandas as pd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dateutil import parser
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.svm import NuSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingCVClassifier
from catboost import CatBoostClassifier
from sklearn.utils.testing import all_estimators
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from catboost import Pool, CatBoostClassifier
from keras.models import Model, model_from_json

warnings.simplefilter('ignore')

def calc_score(proba):
    return -10 * proba[0] + proba[1] * 0.5 - proba[2] * 0.1 + proba[3] * 0.1

def scale(x):
    if x > -0.026:
        return +1
    return -1

def weight(pr):
    if pr == "view":
        return 0.1
    if pr == "skip":
        return -0.1
    if pr == "like":
        return 0.5
    if pr == "dislike":
        return -10
    if pr == 0:
        return -10
    if pr == 1:
        return 0.5
    if pr == 2:
        return 0.1
    if pr == 3:
        return -0.1
    raise

def get_final_score(preds_proba, y_test, dry_run=False):
    my_score, max_score = 0, 0
    
    preds = [scale(calc_score(x_)) for x_ in preds_proba]

    if not dry_run:
        for i in range(len(preds)):
            my_score += weight(y_test[i]) * preds[i]
            max_score += abs(weight(y_test[i]))
        print(my_score)
        print(max_score)
        print(my_score / max_score)
    
    return preds

def normalize_and_clear(df, normalize_columns=[], drop_columns=[]):
    df_norm = df.copy()
    df_norm = df.fillna(0)
    
    for column in normalize_columns:
        status = df_norm[column].unique().tolist()
        if 0 in status:
            status.remove(0)
        df_norm = df_norm.replace(status, np.arange(1, len(status) + 1, 1))
    
    df_norm = df_norm.drop(drop_columns, axis=1)
    
    return df_norm.copy()

def get_average_embedding(x):
    texts = [y['text'] for y in x]
    words = " ".join(texts).split(" ")
    word_amount = {}
    for item in words:
        if item not in russian_stopwords:
            if item in word_amount:
                word_amount[item] += 1
            else:
                word_amount[item] = 1
    cnt = 0
    cnt_ok = 0
    result_emb = [0 for i in range(300)]
    for word in words:
        if word not in russian_stopwords:
            if word in ru_emb:
                cnt_ok += 1
                result_emb += ru_emb[word] / word_amount[word]
        cnt += 1
    print(cnt / cnt_ok)
    return result_emb

def embed_to_df(embeds, story_ids):
    story_ids = np.array(story_ids).reshape(-1,1)
    story_ids_embeds = np.concatenate((embeds, story_ids), axis=1)
    print(story_ids_embeds.shape)
    return pd.DataFrame(story_ids_embeds, columns = ["embed_avg_" + str(i) for i in range(300)] + ['story_id'])

def embed_stories_data_by_regex(stories_data):
    patterns = [
        ('text_has_advices', r'(правил|совет|не\ нужно)'),
        ('text_has_finances', r'(деньг|купюр|монет|финанс|вклад|копит|кэшбэк|денег|акци|инвестор)'),
        ('text_has_cinema', r'(кино|фильм|премьер)'),
        ('text_has_city', r'(москв|питер|киев|новгород|минск|нью\ ёрк|санкт\ петер|сеул)'),
        ('text_has_travel', r'(путешеств|поездк|перелет|перелёт|отель|отеля|турист)'),
        ('text_has_transport', r'(\ машин|автомоб|поезд|самолет|самолёт|джип|водитель)'),
        ('test_has_shop', r'(покупк|магазин|бутик|торгов|aliexpress|купит|купил|авито|прода)'),
        ('text_has_sport', r'(спорт|матч|футбол|хокке|баскетбол|тренировк|волейб|турнир|чемпионат)'),
        ('text_has_culture', r'(живопис|литерат|музык|композит|художн|скульпт|выставк)'),
        ('text_has_life_story', r'(жизнь\ в|жизнь\ на|жизнь\ около)'),
        ('text_has_rubl', r'(рубль|рубля|рублей|к\ рублю|млн|доллар)'),
        ('text_has_banc_card', r'(банковская\ карта|банковской\ карты|банковскую\ карту|банковских\ карт|банковской\ карте|кредитк|банкомат)'),
        ('text_has_tinkoff', r'тинькофф'),
        ('text_has_question_at_the_beginning', r'^(как|что)'),
        ('text_has_gadgets', r'(гаджет|ноут|айфон|макбук|notebook|phone|gadget|планшет|гейм)'),
        ('text_has_dopustim', r'^допустим\,'),
        ('text_has_test', r'пройти\ тест'),
        ('text_has_digit_at_the_beginning', r'^\d')
    ]
    for c, p in tqdm_notebook(patterns):
        stories_data[c] = stories_data.all_text.apply(lambda x : 1 if re.search(p, x.lower().strip()) else 0)
    return stories_data

all_features_columns = ['customer_id', 'story_id', 'show_time', 'gender_cd', 'age', 'marital_status_cd', 'children_cnt',
 'job_position_cd', 'open_products', 'utl_products', 'cls_products', 'view', 'skip', 'like', 'dislike', 'cust_time',
 'text_has_advices', 'text_has_finances', 'text_has_cinema', 'text_has_city', 'text_has_travel', 'text_has_transport',
 'test_has_shop', 'text_has_sport', 'text_has_culture', 'text_has_life_story', 'text_has_rubl', 'text_has_banc_card',
 'text_has_tinkoff', 'text_has_question_at_the_beginning', 'text_has_gadgets', 'pages', 'story_w_stat', 'like_by_story_id',
 'dislike_by_story_id', 'skip_by_story_id', 'view_by_story_id', 'name_cifra_dnya', 'like_of_nearest', 'vopros',
 'dot', 'length', 'font_size', 'has_images', 'has_text', 'text_avg', 'text_pages_ratio', 'images_pages_ratio',
 'mean_tr_before_story', 'max_tr_before_story', 'tr_count_before_story', 'tr_amount_before_story', 'last_tr_time_(ts)_before_story',
 'last_tr_time_(hours)_before_story', 'event_index', 'time_since_last_event_ts', 'time_since_last_event_days',
 'event_dttm_weekday', 'first_session_dttm_weekday', 'event_dttm_hour', 'first_session_dttm_hour', 'time_to_next_show_this',
 'time_to_next_show_any', 'time_to_prev_show_this', 'time_to_prev_show_any', 'will_watch_next_hour', 'will_watch_next_day',
 'watched_prev_hour', 'watched_prev_day', 'will_be_after_cnt', 'watched_this_story', 'will_be_after_this_story',
 '0', '1', 'event_dttm', 'event', 'weekday', 'is_weekend', 'group_size', 'group_number']

all_features_test_columns = ['Unnamed: 0', 'customer_id', 'story_id', 'show_time', 'gender_cd', 'age', 'marital_status_cd', 'children_cnt',
 'job_position_cd', 'open_products', 'utl_products', 'cls_products', 'view', 'skip', 'like', 'dislike', 'cust_time',
 'text_has_advices', 'text_has_finances', 'text_has_cinema', 'text_has_city', 'text_has_travel', 'text_has_transport',
 'test_has_shop', 'text_has_sport', 'text_has_culture', 'text_has_life_story', 'text_has_rubl', 'text_has_banc_card',
 'text_has_tinkoff', 'text_has_question_at_the_beginning', 'text_has_gadgets', 'pages', 'story_w_stat', 'like_by_story_id',
 'dislike_by_story_id', 'skip_by_story_id', 'view_by_story_id', 'name_cifra_dnya', 'like_of_nearest', 'vopros',
 'dot', 'length', 'font_size', 'has_images', 'has_text', 'text_avg', 'text_pages_ratio', 'images_pages_ratio',
 'mean_tr_before_story', 'max_tr_before_story', 'tr_count_before_story', 'tr_amount_before_story', 'last_tr_time_(ts)_before_story',
 'last_tr_time_(hours)_before_story', 'event_index', 'time_since_last_event_ts', 'time_since_last_event_days',
 'event_dttm_weekday', 'first_session_dttm_weekday', 'event_dttm_hour', 'first_session_dttm_hour', 'time_to_next_show_this',
 'time_to_next_show_any', 'time_to_prev_show_this', 'time_to_prev_show_any', 'will_watch_next_hour', 'will_watch_next_day',
 'watched_prev_hour', 'watched_prev_day', 'will_be_after_cnt', 'watched_this_story', 'will_be_after_this_story',
 '0', '1', 'event_dttm', 'event', 'weekday', 'is_weekend', 'group_size', 'group_number']

drop_data_columns = []

drop_data_test_columns = ["Unnamed: 0"]

drop_story_no_info = ['text_has_advices', 'text_has_finances', 
                      'text_has_cinema', 'text_has_city', 'text_has_travel', 
                      'text_has_transport', 'test_has_shop', 'text_has_sport', 
                      'text_has_culture', 'text_has_life_story', 'text_has_rubl', 
                      'text_has_banc_card', 'text_has_tinkoff', 'text_has_question_at_the_beginning', 
                      'text_has_gadgets', 'pages', 'name_cifra_dnya', 
                      'like_of_nearest', 'vopros', 'dot', 'length', 
                      'font_size', 'has_images', 'has_text', 'text_avg', 
                      'text_pages_ratio', 'images_pages_ratio']

drop_unk_customer = ['customer_id']
drop_unk_story = ['story_id']

normalize_customer = ['product_1', 'product_2', 'product_3', 
                            'product_5', 'gender_cd', 'marital_status_cd']
normalize_reaction = ['event']
drop_customer = ['first_session_dttm', 'job_title', 'product_0', 'product_4', 'product_6']
drop_reaction = ['event_dttm']

reactions_dict = {1: 0.1, 2: -0.1, 3: 0.5, 4:-10.0}
reactions_name_id = {'dislike': 0, 'like': 1, 'skip': 2, 'view': 3}

Здесь просто читаем файлы из входных данных. Vova3 - это просто распаршенные данные об историях(ведь они были даны в csv)

In [23]:
reaction_info_train = pd.read_csv(os.path.join(fname, 'customer_train.csv'))

embeddings = pd.read_csv(os.path.join(fname, 'all_stories_data.csv'))

stories_info_train = pd.read_csv(os.path.join(fname, 'vova_3.csv'))

reaction_info_train = pd.read_csv(os.path.join(fname, 'stories_reaction_train.csv'))

transactions_info = pd.read_csv(os.path.join(fname, 'transactions.csv'))

customer_info_test = pd.read_csv(os.path.join(fname, 'customer_test.csv'))
reaction_info_test = pd.read_csv(os.path.join(fname, 'stories_reaction_test.csv'))

Здесь начинаем экстрактить фичи по историям (одна из самых важных частей). Также банально чистим колонки от Null и удаляем ненужную инфу.

In [24]:
customer_info_train = pd.read_csv(os.path.join(fname, 'customer_train.csv'))

stories_info_train = pd.read_csv(os.path.join(fname, 'vova_3.csv'))
evalnp = np.vectorize(eval)

stories_info_train['images'] = evalnp(stories_info_train['images'])
stories_info_train['texts'] = evalnp(stories_info_train['texts'])
stories_info_train['image_height'] = stories_info_train['images'].apply(lambda x: x[0]['size'][0] if len(x) > 0 else 0)
stories_info_train['image_width'] = stories_info_train['images'].apply(lambda x: x[0]['size'][1] if len(x) > 0 else 0)
stories_info_train['images_amount'] = stories_info_train['images'].apply(lambda x: len(x))
stories_info_train['texts_amount'] = stories_info_train['texts'].apply(lambda x: len(x))
stories_info_train['texts_overall_len'] = embeddings['all_text'].apply(lambda x: len(x))
stories_info_train['texts_average_len'] = stories_info_train['texts_overall_len'] / stories_info_train['texts_amount']
stories_info_train['text_pages_ratio'] = stories_info_train['texts_amount'] / stories_info_train['pages']
stories_info_train['image_pages_ratio'] = stories_info_train['images_amount'] / stories_info_train['pages']
stories_info_train['story_name_len'] = stories_info_train['name'].apply(lambda x: len(str(x)))

stories_info_train = stories_info_train.drop(['texts', 'images', 'name', 'Unnamed: 0'], axis=1)

reaction_info_train = pd.read_csv(os.path.join(fname, 'stories_reaction_train.csv'))

transactions_info = pd.read_csv(os.path.join(fname, 'transactions.csv'))

customer_info_test = pd.read_csv(os.path.join(fname, 'customer_test.csv'))
reaction_info_test = pd.read_csv(os.path.join(fname, 'stories_reaction_test.csv'))

customer_info_train_norm = normalize_and_clear(customer_info_train, normalize_customer, drop_customer)
reaction_info_train_norm = normalize_and_clear(reaction_info_train, normalize_reaction, drop_reaction)

merged_train_info_norm = pd.merge(reaction_info_train_norm, customer_info_train_norm, on='customer_id')

customer_info_test_norm = normalize_and_clear(customer_info_test, normalize_customer, drop_customer)
reaction_info_test_norm = normalize_and_clear(reaction_info_test, [], drop_reaction)


merged_test_info_norm = pd.merge(reaction_info_test_norm, customer_info_test_norm, on='customer_id').drop(['answer_id'], axis=1)

Счётчики по пользователю и истории. Сколько лайков, дизлайков и тд.

In [25]:
def get_counters_by_type(id_type):
    events_grouped = reaction_info_train_norm.groupby([id_type, 'event']).count().reset_index()
    columns_must_have = [1, 2, 3, 4]
    grouped = events_grouped.groupby([id_type])
    events_counters = pd.DataFrame(columns = [id_type, 1, 2, 3, 4])
    ids = sorted(list(set(events_grouped[id_type].tolist())))
    for index, id_ in tqdm.tqdm(enumerate(ids)):
        new_record = grouped.get_group(id_).drop([id_type], axis=1).reset_index()
        new_record = new_record.T.drop(['index'])
        new_record.columns = new_record.iloc[0]
        new_record = new_record.drop(new_record.index[0])
        for column in columns_must_have:
            if column not in new_record:
                new_record[column] = 0
        new_record[id_type] = id_
        events_counters.append(new_record)
        events_counters = pd.concat([events_counters, new_record])    
    return events_counters

In [26]:
story_id_sum_events = reaction_info_train_norm.groupby('story_id')['event'].sum().reset_index()

In [27]:
story_id_events_counters = get_counters_by_type('story_id')

913it [00:04, 182.71it/s]


In [28]:
story_id_features = pd.merge(story_id_events_counters, story_id_sum_events, on='story_id').rename(columns = {1: 'view_by_story_id', 2: 'skip_by_story_id', 3: 'like_by_story_id', 4: 'dislike_by_story_id', 
                                                                                                    'event': 'sum_story_reactions'})
story_id_features

Unnamed: 0,story_id,view_by_story_id,skip_by_story_id,like_by_story_id,dislike_by_story_id,sum_story_reactions
0,123,673,468,0,0,1609
1,126,171,596,0,0,1363
2,127,20,57,0,0,134
3,128,542,401,6,0,1362
4,129,187,226,0,0,639
...,...,...,...,...,...,...
908,1100024,1,1,0,0,3
909,1100025,15,1,0,0,17
910,1100026,4,0,0,0,4
911,1100027,7,0,0,0,7


Фичи по транзакциям.

In [29]:
customer_sum_transactions = transactions_info.groupby(['customer_id']).sum().drop(['transaction_month',
                                                                              'transaction_day', 'merchant_mcc'],
                                                                             axis=1).reset_index().rename(columns={'transaction_amt': 'sum_transactions'})

customer_mean_transactions = transactions_info.groupby(['customer_id']).mean().drop(['transaction_month',
                                                                                'transaction_day','merchant_mcc'],
                                                                               axis=1).reset_index().rename(columns={'transaction_amt': 'mean_transactions'})


customer_event_amount = reaction_info_train_norm.groupby(['customer_id']).sum().reset_index().drop(['story_id'],axis=1).rename(columns={'event': 'events_amount'})

merchant_mean_transactions = transactions_info.groupby(['merchant_mcc']).sum().drop(['transaction_month',
                                                                                'transaction_day', 'merchant_id', 'customer_id'],
                                                                               axis=1).reset_index().rename(columns={'transaction_amt': 'sum_transactions_mcc'})

merchant_mean_transactions = transactions_info.groupby(['merchant_mcc']).mean().drop(['transaction_month',
                                                                                'transaction_day', 'merchant_id', 'customer_id'],
                                                                               axis=1).reset_index().rename(columns={'transaction_amt': 'mean_transactions_mcc'})

In [30]:
transactions_info

Unnamed: 0,customer_id,transaction_month,transaction_day,transaction_amt,merchant_id,merchant_mcc
0,855115,7,3,1500,4554547,5411
1,997036,6,6,0,1657528,5411
2,398237,5,24,2500,26375569,5813
3,997036,6,2,0,16304402,5411
4,291636,7,25,0,1259505,5411
...,...,...,...,...,...,...
3951110,153986,6,30,29500,92701772,5719
3951111,346407,6,18,500,120278116,4816
3951112,358612,6,6,2500,111686898,5691
3951113,346407,6,7,500,2765037,5411


In [32]:
customer_id_events_counters = get_counters_by_type('customer_id')

41001it [05:26, 125.55it/s]


In [33]:
customer_id_events_counters = customer_id_events_counters.rename(columns = 
                                                                 {1: 'view', 2: 'skip',
                                                                  3: 'like', 4: 'dislike'}).reset_index()

In [34]:
customer_mcc = transactions_info.groupby(['customer_id', 'merchant_mcc']).sum().drop(['transaction_month', 
                                                 'transaction_day',
                                                 'transaction_amt',
                                                 'merchant_id'], axis = 1).reset_index()
customer_mcc_count = customer_mcc.groupby(['customer_id']).count().rename(columns = {'merchant_mcc': 'amount_merchant_mcc'})

Мерджим все фичи, на этом заканчивается feature genertion. Здесь нет ещё фичей про будущее(мы не успели воспроизвести код, но при необходимости сделаем). Они говорят сколько и каких действий сделает пользователь в след день/час/неделю/месяц.

In [35]:
merged_train_all = pd.merge(merged_train_info_norm, customer_event_amount, how='left', left_on='customer_id', right_on='customer_id')
merged_train_all = pd.merge(merged_train_all, customer_mean_transactions, how='left', left_on='customer_id', right_on='customer_id')
merged_train_all = pd.merge(merged_train_all, customer_sum_transactions, how='left', left_on='customer_id', right_on='customer_id')
merged_train_all = pd.merge(merged_train_all, customer_id_events_counters, how='left', left_on='customer_id', right_on='customer_id')
merged_train_all = pd.merge(merged_train_all, customer_mcc_count, how='left', left_on='customer_id', right_on='customer_id')
merged_train_all = pd.merge(merged_train_all, story_id_features, how='left', left_on='story_id', right_on='story_id')
merged_train_all = pd.merge(merged_train_all, stories_info_train, how='left', left_on='story_id', right_on='story_id')
# merged_train_all = pd.merge(merged_train_all, stories_embeds, how='left', left_on='story_id', right_on='story_id')
merged_train_all = merged_train_all.fillna(0).drop(['index'], axis=1)

merged_test_all = pd.merge(merged_test_info_norm, customer_event_amount, how='left', left_on='customer_id', right_on='customer_id')
merged_test_all = pd.merge(merged_test_all, customer_mean_transactions, how='left', left_on='customer_id', right_on='customer_id')
merged_test_all = pd.merge(merged_test_all, customer_sum_transactions, how='left', left_on='customer_id', right_on='customer_id')
merged_test_all = pd.merge(merged_test_all, customer_id_events_counters, how='left', left_on='customer_id', right_on='customer_id')
merged_test_all = pd.merge(merged_test_all, customer_mcc_count, how='left', left_on='customer_id', right_on='customer_id')
merged_test_all = pd.merge(merged_test_all, story_id_features, how='left', left_on='story_id', right_on='story_id')
merged_test_all = pd.merge(merged_test_all, stories_info_train, how='left', left_on='story_id', right_on='story_id')
# merged_test_all = pd.merge(merged_test_all, stories_embeds, how='left', left_on='story_id', right_on='story_id')
merged_test_all = merged_test_all.fillna(0).drop(['index'], axis=1)
print(merged_train_all.shape)

(436544, 37)


Здесь мы счиываем наш файлик со сгенеренными фичами (предупреждали, что читаем из файла).

In [37]:
Yura_train_X

Unnamed: 0,customer_id,story_id,show_time,gender_cd,age,marital_status_cd,children_cnt,job_position_cd,open_products,utl_products,...,time_to_prev_show_this,time_to_prev_show_any,will_watch_next_hour,will_watch_next_day,watched_prev_hour,watched_prev_day,will_be_after_cnt,watched_this_story,will_be_after_this_story,ts
0,843538,522,11.500000,1.0,25.0,4.0,0.0,22.0,0.0,1.0,...,0.0,-0.0,2.0,2.0,2.0,2.0,8.0,0.0,0.0,1.522312e+09
1,843538,247,11.500000,1.0,25.0,4.0,0.0,22.0,0.0,1.0,...,0.0,0.0,1.0,1.0,1.0,1.0,9.0,0.0,0.0,1.522312e+09
2,749436,498,4.783333,1.0,35.0,4.0,0.0,19.0,0.0,2.0,...,0.0,0.0,1.0,1.0,1.0,1.0,15.0,0.0,0.0,1.522461e+09
3,843538,428,5.216667,1.0,25.0,4.0,0.0,22.0,0.0,1.0,...,0.0,-0.0,2.0,2.0,6.0,6.0,6.0,0.0,0.0,1.522722e+09
4,843538,419,5.216667,1.0,25.0,4.0,0.0,22.0,0.0,1.0,...,0.0,409374.0,1.0,1.0,5.0,5.0,7.0,0.0,0.0,1.522722e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
473136,36044,663,23.950000,0.0,30.0,1.0,0.0,22.0,1.0,1.0,...,0.0,4987203.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.533071e+09
473137,321133,1415,23.950000,0.0,50.0,4.0,0.0,22.0,0.0,1.0,...,0.0,-0.0,2.0,2.0,2.0,2.0,33.0,0.0,0.0,1.533071e+09
473138,321133,1383,23.950000,0.0,50.0,4.0,0.0,22.0,0.0,1.0,...,0.0,137571.0,1.0,1.0,1.0,1.0,34.0,0.0,0.0,1.533071e+09
473139,961646,409,23.950000,0.0,30.0,4.0,0.0,22.0,0.0,0.0,...,0.0,993815.0,1.0,1.0,1.0,1.0,22.0,0.0,0.0,1.533071e+09


Здесь закомменчены факторы типа размер группы в которой встретилась данная история (группа - это посдледовательность историй с одинаковым таймстемпом)

In [40]:
# reaction_info_train['timestamp'] = reaction_info_train['event_dttm'].apply(lambda x: time.mktime(datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S").timetuple()))
# reaction_info_test['timestamp'] = reaction_info_train['event_dttm'].apply(lambda x: time.mktime(datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S").timetuple()))
# reaction_info_train['weekday'] = reaction_info_train['event_dttm'].apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S").strftime('%A'))
# reaction_info_test['weekday'] = reaction_info_train['event_dttm'].apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S").strftime('%A'))
# reaction_info_train['is_weekend'] = reaction_info_train['weekday'].apply(lambda x: 1 if x in ['Sunday', 'Saturday'] else 0)
# reaction_info_test['is_weekend'] = reaction_info_train['weekday'].apply(lambda x: 1 if x in ['Sunday', 'Saturday'] else 0)

# time_groups_train = reaction_info_train.sort_values(by=['customer_id', 'timestamp'])
# time_groups_train = time_groups_train.groupby(['customer_id', 'event_dttm'])['event'] \
#          .count().reset_index().sort_values(by=['customer_id', 'event_dttm'])
# time_groups_train['group_number'] = time_groups_train.groupby(['customer_id']).cumcount() + 1
# time_groups_train = time_groups_train.rename(columns={'event': 'group_size'})

# time_groups_test = reaction_info_test.sort_values(by=['customer_id', 'timestamp'])
# time_groups_test = time_groups_test.groupby(['customer_id', 'event_dttm'])['story_id'] \
#          .count().reset_index().sort_values(by=['customer_id', 'event_dttm'])
# time_groups_test['group_number'] = time_groups_test.groupby(['customer_id']).cumcount() + 1
# time_groups_test = time_groups_test.rename(columns={'story_id': 'group_size'})

# merged_time_features_train = pd.merge(reaction_info_train, time_groups_train, how='left', left_on=['customer_id', 'event_dttm'],
#                                       right_on=['customer_id', 'event_dttm']).drop(columns=['timestamp'])

# merged_time_features_test = pd.merge(reaction_info_test, time_groups_test, how='left', left_on=['customer_id', 'event_dttm'],
#                                      right_on=['customer_id', 'event_dttm']).drop(columns=['timestamp'])

# all_features = pd.concat([Yura_train_X, merged_time_features_train], axis=1)
# all_features_test = pd.concat([Yura_test_X, merged_time_features_test], axis=1)

# all_features.columns = all_features_columns
# all_features_test.columns = all_features_test_columns



# all_features_train_norm = all_features.drop(columns = drop_data_columns)
# all_features_test_norm = all_features_test.drop(columns = drop_data_test_columns)
# all_features_test_norm['answer_id'] = np.arange(172049)

all_features_train_norm = Yura_train_X.drop(columns = drop_data_columns)
all_features_test_norm = Yura_test_X.drop(columns = drop_data_test_columns)
all_features_test_norm['answer_id'] = np.arange(172049)
all_features_test_norm = all_features_test_norm.set_index('answer_id')

В решении использовался катбуст. Но и как же без тюнинга параметров?)

In [None]:
# to run this you will probably need to do: pip install sklearn==0.19.1

from skopt import BayesSearchCV
category_cols = [
                   'customer_id',
                   'story_id',
#                    'age',
#                    'gender_cd',
#                    'marital_status_cd',
#                    'children_cnt',
#                    'event_dttm_weekday',
#                    'first_session_dttm_weekday',
#                    'event_dttm_hour'
]


cat_dims = [Yura_train_X.columns.get_loc(i) for i in category_cols]

kek =  {'bagging_temperature': 0.0,
 'depth': 8,
 'l2_leaf_reg': 30,
 'max_ctr_complexity': 2,
 'random_strength': 10.0,
        'iterations': 1000
 }
    
decision = CatBoostClassifier(
   thread_count=32,

   custom_loss=['MultiClass'],
   nan_mode="Forbidden",
   cat_features=cat_dims,
   **kek
)

opt = BayesSearchCV(decision,
                    search_spaces,
                    scoring=roc_auc,
                    cv=ps,
                    n_iter=2000,
                    n_jobs=1,  # use just 1 job with CatBoost in order to avoid segmentation fault
                    return_train_score=False,
                    refit=True,
                    optimizer_kwargs={'base_estimator': 'GP'},
                    random_state=42
                   )


best_params = report_perf(opt, X_like_dislike, y_like_dislike, 'CatBoost', 
                          callbacks=[VerboseCallback(100), 
                                     DeadlineStopper(60*60*3)])

Здесь реализована самая важная часть задачи. Мы заметили, что весь датасет можно разделить на непересекающиеся подптпсеты, каждый из которых характреизет свой набор данных.
Например, в трейне есть customer_id, что есть в тесте но нет инфы про историю. И так далее. Это важно для обучения разных моделей catboost под каждый тип данных со своими категориальными фичами.
Это делает модель более устойчивой.

In [41]:
set_customer_id_in_train = set(reaction_info_train['customer_id'])
set_customer_id_in_test = set(reaction_info_test['customer_id'])
set_story_id_all = set(stories_info_train['story_id'])
set_story_id_in_test = set(reaction_info_test['story_id'])
set_story_id_in_train = set(reaction_info_train['story_id'])

set_story_id_test_no_info = set_story_id_in_test - set_story_id_all
set_story_id_train_no_info = set_story_id_in_train - set_story_id_all
set_story_id_test_unk = set_story_id_in_test - set_story_id_in_train

set_customer_id_test_unk = set_customer_id_in_test - set_customer_id_in_train

In [43]:
features_known_customer_known_story_test = all_features_test_norm[all_features_test_norm['customer_id'].isin(set_customer_id_in_train)
                                           & all_features_test_norm['story_id'].isin(set_story_id_in_train) 
                                           & ~all_features_test_norm['story_id'].isin(set_story_id_test_no_info)]

features_known_customer_unk_story_test = all_features_test_norm[all_features_test_norm['customer_id'].isin(set_customer_id_in_train)
                                           & ~all_features_test_norm['story_id'].isin(set_story_id_in_train) 
                                           & ~all_features_test_norm['story_id'].isin(set_story_id_test_no_info)]

features_known_customer_unk_story_no_info_test = all_features_test_norm[all_features_test_norm['customer_id'].isin(set_customer_id_in_train)
                                           & ~all_features_test_norm['story_id'].isin(set_story_id_in_train) 
                                           & all_features_test_norm['story_id'].isin(set_story_id_test_no_info)]

features_known_customer_known_story_no_info_test = all_features_test_norm[all_features_test_norm['customer_id'].isin(set_customer_id_in_train)
                                           & all_features_test_norm['story_id'].isin(set_story_id_in_train) 
                                           & all_features_test_norm['story_id'].isin(set_story_id_test_no_info)]

features_unk_customer_known_story_test = all_features_test_norm[~all_features_test_norm['customer_id'].isin(set_customer_id_in_train)
                                           & all_features_test_norm['story_id'].isin(set_story_id_in_train) 
                                           & ~all_features_test_norm['story_id'].isin(set_story_id_test_no_info)]

features_unk_customer_unk_story_test = all_features_test_norm[~all_features_test_norm['customer_id'].isin(set_customer_id_in_train)
                                           & ~all_features_test_norm['story_id'].isin(set_story_id_in_train) 
                                           & ~all_features_test_norm['story_id'].isin(set_story_id_test_no_info)]

features_unk_customer_unk_story_no_info_test = all_features_test_norm[~all_features_test_norm['customer_id'].isin(set_customer_id_in_train)
                                           & ~all_features_test_norm['story_id'].isin(set_story_id_in_train) 
                                           & all_features_test_norm['story_id'].isin(set_story_id_test_no_info)]

features_unk_customer_known_story_no_info_test = all_features_test_norm[~all_features_test_norm['customer_id'].isin(set_customer_id_in_train)
                                           & all_features_test_norm['story_id'].isin(set_story_id_in_train) 
                                           & all_features_test_norm['story_id'].isin(set_story_id_test_no_info)]

assert(features_unk_customer_known_story_no_info_test.shape[0] + features_unk_customer_unk_story_no_info_test.shape[0]
      + features_unk_customer_unk_story_test.shape[0] + features_unk_customer_known_story_test.shape[0] 
      + features_known_customer_known_story_no_info_test.shape[0] + features_known_customer_unk_story_no_info_test.shape[0]
      + features_known_customer_unk_story_test.shape[0] + features_known_customer_known_story_test.shape[0] 
      == all_features_test_norm.shape[0])

In [44]:
all_features_sorted_by_event_dttm = pd \
                .concat([all_features_train_norm, reaction_info_train['event_dttm']], axis=1) \
                .sort_values('event_dttm').drop('event_dttm', axis=1)

In [45]:
def train_test_split_without_shuffle(X, y, test_size=0.3):
    tr_size = int((1 - test_size) * len(X))
    return X[:tr_size], X[tr_size:], y[:tr_size], y[tr_size:]

Здесь, посмотрев распределение лайков по времени на датаете выяснилось, что это распределение очень сильно скакнуло ближу к концу обучающей выборки, отсорченной по времени.
Что натолкнуло нас на мысль, что хорошо бы проводить валидацию, отрезав последние 30%.

In [46]:
# X_train, X_test, y_train, y_test = train_test_split(all_features_train_norm, Yura_train_y, test_size=0.3)
X_train, X_test, y_train, y_test = train_test_split_without_shuffle(all_features_sorted_by_event_dttm, Yura_train_y, test_size=0.3)

Здесь мы банально обучаем катбуст под каждый из типов датасетов.

In [267]:
def train_and_and_predict_cb_model(X_train, X_test, y_train, y_test, all_test_features, cat_features, save_path="model", iterations=10):
    
    train_dataset = Pool(data=X_train, label=y_train, cat_features=cat_features)

    eval_dataset = Pool(data=X_test, label=y_test, cat_features=cat_features)

    test_dataset = Pool(data=all_test_features, cat_features=cat_features)

    model = CatBoostClassifier(loss_function='MultiClass', **{'bagging_temperature': 0.0, 'border_count': 255,
                                                              'depth': 8, 'iterations': 2400, 'l2_leaf_reg': 30,
                                                              'max_ctr_complexity': 2, 'random_strength': 10.0})

    model.fit(train_dataset)

    preds_class_val = model.predict(eval_dataset)
    preds_proba_val = model.predict_proba(eval_dataset)
    preds_class_test = model.predict(test_dataset)
    preds_proba_test = model.predict_proba(test_dataset)
    model.save_model(save_path)

    model_val_score = get_final_score(preds_proba_val, y_test.to_numpy())
    model_test_score = get_final_score(preds_proba_test, y_test.to_numpy(), True)
    feature_score = {}
    for i, j in zip(model.feature_names_, model.feature_importances_):
        feature_score[i] = j

    feature_score = sorted(feature_score.items(), key=operator.itemgetter(1))
    
    return preds_proba_test, preds_class_test, feature_score, model_test_score

In [268]:
category_known_customer_known_story = ['customer_id', 'story_id']

cat_known_customer_known_story = [all_features_train_norm.columns.get_loc(i) for i in category_known_customer_known_story]

preds_proba_known_customer_known_story, preds_class_known_customer_known_story, _, model_known_customer_known_story_coef = train_and_and_predict_cb_model(X_train, X_test, y_train, y_test, 
                                                            features_known_customer_known_story_test, cat_known_customer_known_story, "model_known_customer_known_story")

0:	learn: 1.3695401	total: 424ms	remaining: 7m 3s
1:	learn: 1.3536690	total: 462ms	remaining: 3m 50s
2:	learn: 1.3390085	total: 795ms	remaining: 4m 24s
3:	learn: 1.3249735	total: 1.2s	remaining: 4m 59s
4:	learn: 1.3118508	total: 1.64s	remaining: 5m 26s
5:	learn: 1.2994008	total: 2.07s	remaining: 5m 42s
6:	learn: 1.2876899	total: 2.44s	remaining: 5m 46s
7:	learn: 1.2764540	total: 2.77s	remaining: 5m 43s
8:	learn: 1.2658782	total: 3.18s	remaining: 5m 49s
9:	learn: 1.2557403	total: 3.29s	remaining: 5m 26s
10:	learn: 1.2462969	total: 3.71s	remaining: 5m 34s
11:	learn: 1.2371975	total: 4.06s	remaining: 5m 34s
12:	learn: 1.2286228	total: 4.47s	remaining: 5m 39s
13:	learn: 1.2204717	total: 4.88s	remaining: 5m 44s
14:	learn: 1.2127519	total: 5.29s	remaining: 5m 47s
15:	learn: 1.2052924	total: 5.61s	remaining: 5m 45s
16:	learn: 1.1982160	total: 5.99s	remaining: 5m 46s
17:	learn: 1.1914264	total: 6.23s	remaining: 5m 39s
18:	learn: 1.1850788	total: 6.61s	remaining: 5m 41s
19:	learn: 1.1789626	tot

158:	learn: 1.0261008	total: 50.3s	remaining: 4m 25s
159:	learn: 1.0260798	total: 50.3s	remaining: 4m 24s
160:	learn: 1.0260431	total: 50.5s	remaining: 4m 23s
161:	learn: 1.0259448	total: 50.9s	remaining: 4m 23s
162:	learn: 1.0258929	total: 51.1s	remaining: 4m 22s
163:	learn: 1.0258508	total: 51.4s	remaining: 4m 22s
164:	learn: 1.0258224	total: 51.7s	remaining: 4m 21s
165:	learn: 1.0257677	total: 52s	remaining: 4m 21s
166:	learn: 1.0256604	total: 52.3s	remaining: 4m 21s
167:	learn: 1.0253066	total: 52.7s	remaining: 4m 21s
168:	learn: 1.0252979	total: 52.8s	remaining: 4m 19s
169:	learn: 1.0252628	total: 53.2s	remaining: 4m 19s
170:	learn: 1.0251269	total: 53.6s	remaining: 4m 19s
171:	learn: 1.0250936	total: 53.9s	remaining: 4m 19s
172:	learn: 1.0250575	total: 54.2s	remaining: 4m 19s
173:	learn: 1.0250419	total: 54.6s	remaining: 4m 19s
174:	learn: 1.0249442	total: 54.9s	remaining: 4m 18s
175:	learn: 1.0249110	total: 55.3s	remaining: 4m 18s
176:	learn: 1.0248867	total: 55.5s	remaining: 4m

313:	learn: 1.0196713	total: 1m 40s	remaining: 3m 39s
314:	learn: 1.0196588	total: 1m 40s	remaining: 3m 38s
315:	learn: 1.0196365	total: 1m 40s	remaining: 3m 38s
316:	learn: 1.0196348	total: 1m 41s	remaining: 3m 37s
317:	learn: 1.0196026	total: 1m 41s	remaining: 3m 37s
318:	learn: 1.0195867	total: 1m 41s	remaining: 3m 37s
319:	learn: 1.0195773	total: 1m 42s	remaining: 3m 37s
320:	learn: 1.0195434	total: 1m 42s	remaining: 3m 36s
321:	learn: 1.0195198	total: 1m 42s	remaining: 3m 36s
322:	learn: 1.0195040	total: 1m 43s	remaining: 3m 36s
323:	learn: 1.0194847	total: 1m 43s	remaining: 3m 35s
324:	learn: 1.0194757	total: 1m 43s	remaining: 3m 35s
325:	learn: 1.0193769	total: 1m 44s	remaining: 3m 35s
326:	learn: 1.0193427	total: 1m 44s	remaining: 3m 35s
327:	learn: 1.0192962	total: 1m 45s	remaining: 3m 35s
328:	learn: 1.0192909	total: 1m 45s	remaining: 3m 35s
329:	learn: 1.0192909	total: 1m 45s	remaining: 3m 34s
330:	learn: 1.0192870	total: 1m 45s	remaining: 3m 34s
331:	learn: 1.0192726	total:

467:	learn: 1.0155803	total: 2m 29s	remaining: 2m 49s
468:	learn: 1.0155358	total: 2m 29s	remaining: 2m 49s
469:	learn: 1.0154797	total: 2m 30s	remaining: 2m 49s
470:	learn: 1.0154627	total: 2m 30s	remaining: 2m 49s
471:	learn: 1.0153226	total: 2m 30s	remaining: 2m 48s
472:	learn: 1.0152776	total: 2m 31s	remaining: 2m 48s
473:	learn: 1.0152288	total: 2m 31s	remaining: 2m 48s
474:	learn: 1.0151991	total: 2m 32s	remaining: 2m 48s
475:	learn: 1.0151960	total: 2m 32s	remaining: 2m 47s
476:	learn: 1.0151484	total: 2m 32s	remaining: 2m 47s
477:	learn: 1.0151220	total: 2m 32s	remaining: 2m 46s
478:	learn: 1.0148697	total: 2m 33s	remaining: 2m 46s
479:	learn: 1.0147949	total: 2m 33s	remaining: 2m 46s
480:	learn: 1.0147703	total: 2m 33s	remaining: 2m 46s
481:	learn: 1.0146478	total: 2m 34s	remaining: 2m 45s
482:	learn: 1.0145717	total: 2m 34s	remaining: 2m 45s
483:	learn: 1.0145359	total: 2m 34s	remaining: 2m 45s
484:	learn: 1.0145140	total: 2m 35s	remaining: 2m 44s
485:	learn: 1.0143816	total:

620:	learn: 1.0023590	total: 3m 22s	remaining: 2m 3s
621:	learn: 1.0023293	total: 3m 22s	remaining: 2m 3s
622:	learn: 1.0022880	total: 3m 22s	remaining: 2m 2s
623:	learn: 1.0022139	total: 3m 23s	remaining: 2m 2s
624:	learn: 1.0021188	total: 3m 23s	remaining: 2m 2s
625:	learn: 1.0018417	total: 3m 24s	remaining: 2m 1s
626:	learn: 1.0017507	total: 3m 24s	remaining: 2m 1s
627:	learn: 1.0016858	total: 3m 24s	remaining: 2m 1s
628:	learn: 1.0016600	total: 3m 25s	remaining: 2m 1s
629:	learn: 1.0016041	total: 3m 25s	remaining: 2m
630:	learn: 1.0015320	total: 3m 26s	remaining: 2m
631:	learn: 1.0015073	total: 3m 26s	remaining: 2m
632:	learn: 1.0014281	total: 3m 26s	remaining: 1m 59s
633:	learn: 1.0014136	total: 3m 26s	remaining: 1m 59s
634:	learn: 1.0013829	total: 3m 27s	remaining: 1m 59s
635:	learn: 1.0013597	total: 3m 27s	remaining: 1m 58s
636:	learn: 1.0013387	total: 3m 27s	remaining: 1m 58s
637:	learn: 1.0013164	total: 3m 28s	remaining: 1m 58s
638:	learn: 1.0012498	total: 3m 28s	remaining: 1m

773:	learn: 0.9953605	total: 4m 15s	remaining: 1m 14s
774:	learn: 0.9953166	total: 4m 15s	remaining: 1m 14s
775:	learn: 0.9953073	total: 4m 16s	remaining: 1m 13s
776:	learn: 0.9952999	total: 4m 16s	remaining: 1m 13s
777:	learn: 0.9952993	total: 4m 16s	remaining: 1m 13s
778:	learn: 0.9951227	total: 4m 17s	remaining: 1m 12s
779:	learn: 0.9951053	total: 4m 17s	remaining: 1m 12s
780:	learn: 0.9950347	total: 4m 18s	remaining: 1m 12s
781:	learn: 0.9950096	total: 4m 18s	remaining: 1m 12s
782:	learn: 0.9949964	total: 4m 18s	remaining: 1m 11s
783:	learn: 0.9949897	total: 4m 19s	remaining: 1m 11s
784:	learn: 0.9949590	total: 4m 19s	remaining: 1m 11s
785:	learn: 0.9949289	total: 4m 19s	remaining: 1m 10s
786:	learn: 0.9949179	total: 4m 20s	remaining: 1m 10s
787:	learn: 0.9948587	total: 4m 20s	remaining: 1m 10s
788:	learn: 0.9948008	total: 4m 20s	remaining: 1m 9s
789:	learn: 0.9947353	total: 4m 21s	remaining: 1m 9s
790:	learn: 0.9947113	total: 4m 21s	remaining: 1m 9s
791:	learn: 0.9946847	total: 4m

929:	learn: 0.9914520	total: 5m 10s	remaining: 23.3s
930:	learn: 0.9914359	total: 5m 10s	remaining: 23s
931:	learn: 0.9914229	total: 5m 10s	remaining: 22.7s
932:	learn: 0.9914081	total: 5m 11s	remaining: 22.3s
933:	learn: 0.9913702	total: 5m 11s	remaining: 22s
934:	learn: 0.9913568	total: 5m 11s	remaining: 21.7s
935:	learn: 0.9913565	total: 5m 11s	remaining: 21.3s
936:	learn: 0.9913563	total: 5m 12s	remaining: 21s
937:	learn: 0.9913424	total: 5m 12s	remaining: 20.7s
938:	learn: 0.9912449	total: 5m 12s	remaining: 20.3s
939:	learn: 0.9912129	total: 5m 13s	remaining: 20s
940:	learn: 0.9911893	total: 5m 13s	remaining: 19.7s
941:	learn: 0.9911635	total: 5m 14s	remaining: 19.3s
942:	learn: 0.9911632	total: 5m 14s	remaining: 19s
943:	learn: 0.9911356	total: 5m 14s	remaining: 18.7s
944:	learn: 0.9911351	total: 5m 15s	remaining: 18.3s
945:	learn: 0.9911218	total: 5m 15s	remaining: 18s
946:	learn: 0.9911077	total: 5m 15s	remaining: 17.7s
947:	learn: 0.9909586	total: 5m 16s	remaining: 17.4s
948:	

In [269]:
category_known_customer_unk_story = ['customer_id']

_train, _test = X_train.drop(columns=drop_unk_story), X_test.drop(columns=drop_unk_story)
_predict = features_known_customer_unk_story_test.drop(columns=drop_unk_story)

cat_known_customer_unk_story = [_predict.columns.get_loc(i) for i in category_known_customer_unk_story]

preds_proba_known_customer_unk_story, preds_class_known_customer_unk_story, _, model_known_customer_unk_story_coef = train_and_and_predict_cb_model(_train, _test, y_train, y_test, 
                                                            _predict, cat_known_customer_unk_story, "model_known_customer_unk_story")

0:	learn: 1.3697323	total: 407ms	remaining: 6m 46s
1:	learn: 1.3542858	total: 803ms	remaining: 6m 40s
2:	learn: 1.3394632	total: 1.2s	remaining: 6m 38s
3:	learn: 1.3256062	total: 1.58s	remaining: 6m 34s
4:	learn: 1.3124285	total: 1.95s	remaining: 6m 28s
5:	learn: 1.3000427	total: 2.33s	remaining: 6m 26s
6:	learn: 1.2882160	total: 2.72s	remaining: 6m 25s
7:	learn: 1.2771455	total: 3.13s	remaining: 6m 28s
8:	learn: 1.2665794	total: 3.54s	remaining: 6m 29s
9:	learn: 1.2564687	total: 3.9s	remaining: 6m 26s
10:	learn: 1.2470498	total: 4.28s	remaining: 6m 24s
11:	learn: 1.2379746	total: 4.65s	remaining: 6m 23s
12:	learn: 1.2293814	total: 5.05s	remaining: 6m 23s
13:	learn: 1.2213893	total: 5.46s	remaining: 6m 24s
14:	learn: 1.2135710	total: 5.86s	remaining: 6m 24s
15:	learn: 1.2061113	total: 6.24s	remaining: 6m 23s
16:	learn: 1.1990189	total: 6.61s	remaining: 6m 22s
17:	learn: 1.1922725	total: 6.98s	remaining: 6m 20s
18:	learn: 1.1857960	total: 7.36s	remaining: 6m 20s
19:	learn: 1.1796389	tot

158:	learn: 1.0266920	total: 52.7s	remaining: 4m 38s
159:	learn: 1.0264872	total: 53s	remaining: 4m 38s
160:	learn: 1.0264316	total: 53.4s	remaining: 4m 38s
161:	learn: 1.0263914	total: 53.7s	remaining: 4m 37s
162:	learn: 1.0263203	total: 54s	remaining: 4m 37s
163:	learn: 1.0262779	total: 54.3s	remaining: 4m 36s
164:	learn: 1.0261973	total: 54.6s	remaining: 4m 36s
165:	learn: 1.0260948	total: 55s	remaining: 4m 36s
166:	learn: 1.0259988	total: 55.3s	remaining: 4m 35s
167:	learn: 1.0259821	total: 55.3s	remaining: 4m 34s
168:	learn: 1.0259374	total: 55.6s	remaining: 4m 33s
169:	learn: 1.0256366	total: 56s	remaining: 4m 33s
170:	learn: 1.0256025	total: 56.3s	remaining: 4m 32s
171:	learn: 1.0255562	total: 56.5s	remaining: 4m 31s
172:	learn: 1.0254740	total: 56.8s	remaining: 4m 31s
173:	learn: 1.0254334	total: 57.2s	remaining: 4m 31s
174:	learn: 1.0253476	total: 57.5s	remaining: 4m 31s
175:	learn: 1.0252960	total: 57.8s	remaining: 4m 30s
176:	learn: 1.0251979	total: 58.1s	remaining: 4m 30s
1

312:	learn: 1.0200878	total: 1m 41s	remaining: 3m 43s
313:	learn: 1.0200660	total: 1m 42s	remaining: 3m 43s
314:	learn: 1.0200356	total: 1m 42s	remaining: 3m 42s
315:	learn: 1.0200057	total: 1m 42s	remaining: 3m 42s
316:	learn: 1.0199898	total: 1m 43s	remaining: 3m 42s
317:	learn: 1.0199826	total: 1m 43s	remaining: 3m 41s
318:	learn: 1.0199703	total: 1m 43s	remaining: 3m 41s
319:	learn: 1.0199521	total: 1m 44s	remaining: 3m 41s
320:	learn: 1.0199379	total: 1m 44s	remaining: 3m 41s
321:	learn: 1.0199310	total: 1m 44s	remaining: 3m 40s
322:	learn: 1.0199044	total: 1m 45s	remaining: 3m 40s
323:	learn: 1.0198329	total: 1m 45s	remaining: 3m 40s
324:	learn: 1.0198234	total: 1m 45s	remaining: 3m 40s
325:	learn: 1.0198079	total: 1m 46s	remaining: 3m 39s
326:	learn: 1.0197806	total: 1m 46s	remaining: 3m 39s
327:	learn: 1.0197694	total: 1m 47s	remaining: 3m 39s
328:	learn: 1.0197495	total: 1m 47s	remaining: 3m 39s
329:	learn: 1.0197349	total: 1m 47s	remaining: 3m 38s
330:	learn: 1.0197265	total:

466:	learn: 1.0146623	total: 2m 32s	remaining: 2m 54s
467:	learn: 1.0146151	total: 2m 32s	remaining: 2m 53s
468:	learn: 1.0145484	total: 2m 33s	remaining: 2m 53s
469:	learn: 1.0144158	total: 2m 33s	remaining: 2m 53s
470:	learn: 1.0142815	total: 2m 33s	remaining: 2m 52s
471:	learn: 1.0140530	total: 2m 34s	remaining: 2m 52s
472:	learn: 1.0139663	total: 2m 34s	remaining: 2m 52s
473:	learn: 1.0139451	total: 2m 34s	remaining: 2m 51s
474:	learn: 1.0138531	total: 2m 35s	remaining: 2m 51s
475:	learn: 1.0138238	total: 2m 35s	remaining: 2m 51s
476:	learn: 1.0136988	total: 2m 35s	remaining: 2m 50s
477:	learn: 1.0136477	total: 2m 36s	remaining: 2m 50s
478:	learn: 1.0136000	total: 2m 36s	remaining: 2m 50s
479:	learn: 1.0135865	total: 2m 36s	remaining: 2m 49s
480:	learn: 1.0133369	total: 2m 36s	remaining: 2m 49s
481:	learn: 1.0132560	total: 2m 37s	remaining: 2m 49s
482:	learn: 1.0131908	total: 2m 37s	remaining: 2m 48s
483:	learn: 1.0131002	total: 2m 37s	remaining: 2m 48s
484:	learn: 1.0130428	total:

619:	learn: 1.0013988	total: 3m 20s	remaining: 2m 2s
620:	learn: 1.0012423	total: 3m 20s	remaining: 2m 2s
621:	learn: 1.0011634	total: 3m 20s	remaining: 2m 2s
622:	learn: 1.0011381	total: 3m 21s	remaining: 2m 1s
623:	learn: 1.0011036	total: 3m 21s	remaining: 2m 1s
624:	learn: 1.0010597	total: 3m 21s	remaining: 2m 1s
625:	learn: 1.0008945	total: 3m 22s	remaining: 2m
626:	learn: 1.0008427	total: 3m 22s	remaining: 2m
627:	learn: 1.0007771	total: 3m 22s	remaining: 2m
628:	learn: 1.0006624	total: 3m 23s	remaining: 1m 59s
629:	learn: 1.0005880	total: 3m 23s	remaining: 1m 59s
630:	learn: 1.0005607	total: 3m 23s	remaining: 1m 59s
631:	learn: 1.0005418	total: 3m 23s	remaining: 1m 58s
632:	learn: 1.0004636	total: 3m 24s	remaining: 1m 58s
633:	learn: 1.0003121	total: 3m 24s	remaining: 1m 58s
634:	learn: 1.0002338	total: 3m 24s	remaining: 1m 57s
635:	learn: 1.0002229	total: 3m 25s	remaining: 1m 57s
636:	learn: 1.0001919	total: 3m 25s	remaining: 1m 57s
637:	learn: 1.0001048	total: 3m 25s	remaining:

773:	learn: 0.9941020	total: 4m 7s	remaining: 1m 12s
774:	learn: 0.9940744	total: 4m 7s	remaining: 1m 11s
775:	learn: 0.9940463	total: 4m 8s	remaining: 1m 11s
776:	learn: 0.9940202	total: 4m 8s	remaining: 1m 11s
777:	learn: 0.9940201	total: 4m 8s	remaining: 1m 10s
778:	learn: 0.9939233	total: 4m 8s	remaining: 1m 10s
779:	learn: 0.9938455	total: 4m 9s	remaining: 1m 10s
780:	learn: 0.9938271	total: 4m 9s	remaining: 1m 9s
781:	learn: 0.9937881	total: 4m 9s	remaining: 1m 9s
782:	learn: 0.9937653	total: 4m 10s	remaining: 1m 9s
783:	learn: 0.9937065	total: 4m 10s	remaining: 1m 8s
784:	learn: 0.9936763	total: 4m 10s	remaining: 1m 8s
785:	learn: 0.9936538	total: 4m 10s	remaining: 1m 8s
786:	learn: 0.9936346	total: 4m 11s	remaining: 1m 7s
787:	learn: 0.9935976	total: 4m 11s	remaining: 1m 7s
788:	learn: 0.9935577	total: 4m 11s	remaining: 1m 7s
789:	learn: 0.9935362	total: 4m 11s	remaining: 1m 6s
790:	learn: 0.9935355	total: 4m 12s	remaining: 1m 6s
791:	learn: 0.9935281	total: 4m 12s	remaining: 1

929:	learn: 0.9895079	total: 4m 52s	remaining: 22s
930:	learn: 0.9893983	total: 4m 52s	remaining: 21.7s
931:	learn: 0.9893590	total: 4m 53s	remaining: 21.4s
932:	learn: 0.9893343	total: 4m 53s	remaining: 21.1s
933:	learn: 0.9893055	total: 4m 53s	remaining: 20.8s
934:	learn: 0.9893050	total: 4m 54s	remaining: 20.5s
935:	learn: 0.9892921	total: 4m 54s	remaining: 20.1s
936:	learn: 0.9892917	total: 4m 54s	remaining: 19.8s
937:	learn: 0.9892701	total: 4m 55s	remaining: 19.5s
938:	learn: 0.9892531	total: 4m 55s	remaining: 19.2s
939:	learn: 0.9892526	total: 4m 55s	remaining: 18.9s
940:	learn: 0.9892367	total: 4m 55s	remaining: 18.6s
941:	learn: 0.9892143	total: 4m 56s	remaining: 18.2s
942:	learn: 0.9892007	total: 4m 56s	remaining: 17.9s
943:	learn: 0.9891787	total: 4m 56s	remaining: 17.6s
944:	learn: 0.9890742	total: 4m 57s	remaining: 17.3s
945:	learn: 0.9890527	total: 4m 57s	remaining: 17s
946:	learn: 0.9890260	total: 4m 57s	remaining: 16.7s
947:	learn: 0.9890046	total: 4m 58s	remaining: 16.

In [None]:
category_known_customer_unk_story_no_info = ['customer_id']

_train, _test = X_train.drop(columns=drop_unk_story + drop_story_no_info), X_test.drop(columns=drop_unk_story + drop_story_no_info)
_predict = features_known_customer_unk_story_no_info_test.drop(columns=drop_unk_story + drop_story_no_info)

cat_known_customer_unk_story_no_info = [_predict.columns.get_loc(i) for i in category_known_customer_unk_story_no_info]

preds_proba_known_customer_unk_story_no_info, preds_class_known_customer_unk_story_no_info, _, model_known_customer_unk_story_no_info_coef = train_and_and_predict_cb_model(_train, _test, y_train, y_test, 
                                                            _predict, cat_known_customer_unk_story_no_info, "model_known_customer_unk_story_no_info")

0:	learn: 1.3695479	total: 392ms	remaining: 6m 32s
1:	learn: 1.3540373	total: 801ms	remaining: 6m 39s
2:	learn: 1.3392140	total: 1.17s	remaining: 6m 27s
3:	learn: 1.3253085	total: 1.54s	remaining: 6m 23s
4:	learn: 1.3121581	total: 1.9s	remaining: 6m 18s
5:	learn: 1.2996737	total: 2.27s	remaining: 6m 15s
6:	learn: 1.2879173	total: 2.63s	remaining: 6m 13s
7:	learn: 1.2767181	total: 3.01s	remaining: 6m 13s
8:	learn: 1.2660834	total: 3.36s	remaining: 6m 10s
9:	learn: 1.2559684	total: 3.58s	remaining: 5m 54s
10:	learn: 1.2464367	total: 3.94s	remaining: 5m 54s
11:	learn: 1.2373170	total: 4.29s	remaining: 5m 53s
12:	learn: 1.2287846	total: 4.64s	remaining: 5m 52s
13:	learn: 1.2205440	total: 4.99s	remaining: 5m 51s
14:	learn: 1.2127340	total: 5.34s	remaining: 5m 50s
15:	learn: 1.2055752	total: 5.71s	remaining: 5m 51s
16:	learn: 1.1984998	total: 6.06s	remaining: 5m 50s
17:	learn: 1.1917112	total: 6.43s	remaining: 5m 50s
18:	learn: 1.1852560	total: 6.79s	remaining: 5m 50s
19:	learn: 1.1791484	to

160:	learn: 1.0253811	total: 50.5s	remaining: 4m 22s
161:	learn: 1.0253155	total: 50.8s	remaining: 4m 22s
162:	learn: 1.0252433	total: 51.1s	remaining: 4m 22s
163:	learn: 1.0251930	total: 51.2s	remaining: 4m 20s
164:	learn: 1.0251529	total: 51.4s	remaining: 4m 20s
165:	learn: 1.0251084	total: 51.7s	remaining: 4m 19s
166:	learn: 1.0250616	total: 51.8s	remaining: 4m 18s
167:	learn: 1.0250175	total: 52.1s	remaining: 4m 18s
168:	learn: 1.0249845	total: 52.4s	remaining: 4m 17s
169:	learn: 1.0248833	total: 52.7s	remaining: 4m 17s
170:	learn: 1.0248450	total: 53.1s	remaining: 4m 17s
171:	learn: 1.0246401	total: 53.4s	remaining: 4m 17s
172:	learn: 1.0245609	total: 53.7s	remaining: 4m 16s
173:	learn: 1.0245223	total: 54s	remaining: 4m 16s
174:	learn: 1.0244936	total: 54.3s	remaining: 4m 16s
175:	learn: 1.0244531	total: 54.7s	remaining: 4m 15s
176:	learn: 1.0244263	total: 55s	remaining: 4m 15s
177:	learn: 1.0244002	total: 55.3s	remaining: 4m 15s
178:	learn: 1.0243022	total: 55.6s	remaining: 4m 1

314:	learn: 1.0176660	total: 1m 36s	remaining: 3m 29s
315:	learn: 1.0176478	total: 1m 36s	remaining: 3m 29s
316:	learn: 1.0176418	total: 1m 36s	remaining: 3m 28s
317:	learn: 1.0176371	total: 1m 37s	remaining: 3m 28s
318:	learn: 1.0176269	total: 1m 37s	remaining: 3m 28s
319:	learn: 1.0176026	total: 1m 37s	remaining: 3m 28s
320:	learn: 1.0175819	total: 1m 38s	remaining: 3m 27s
321:	learn: 1.0175616	total: 1m 38s	remaining: 3m 27s
322:	learn: 1.0175304	total: 1m 38s	remaining: 3m 27s
323:	learn: 1.0175216	total: 1m 39s	remaining: 3m 26s
324:	learn: 1.0175022	total: 1m 39s	remaining: 3m 26s
325:	learn: 1.0175000	total: 1m 39s	remaining: 3m 25s
326:	learn: 1.0174678	total: 1m 39s	remaining: 3m 25s
327:	learn: 1.0174522	total: 1m 40s	remaining: 3m 25s
328:	learn: 1.0174522	total: 1m 40s	remaining: 3m 24s
329:	learn: 1.0174506	total: 1m 40s	remaining: 3m 23s
330:	learn: 1.0174371	total: 1m 40s	remaining: 3m 23s
331:	learn: 1.0174282	total: 1m 40s	remaining: 3m 23s
332:	learn: 1.0174234	total:

In [None]:
category_known_customer_known_story_no_info = ['customer_id', 'story_id']

_train, _test = X_train.drop(columns=drop_story_no_info), X_test.drop(columns=drop_story_no_info)
_predict = features_known_customer_known_story_no_info_test.drop(columns=drop_story_no_info)

cat_known_customer_known_story_no_info = [_predict.columns.get_loc(i) for i in category_known_customer_known_story_no_info]

preds_proba_known_customer_known_story_no_info, preds_class_known_customer_known_story_no_info, _, model_known_customer_known_story_no_info_coef = train_and_and_predict_cb_model(_train, _test, y_train, y_test, 
                                                            _predict, cat_known_customer_known_story_no_info, "model_known_customer_known_story_no_info")

In [None]:
category_unk_customer_known_story = ['story_id']

_train, _test = X_train.drop(columns=drop_unk_customer), X_test.drop(columns=drop_unk_customer)
_predict = features_unk_customer_known_story_test.drop(columns=drop_unk_customer)submit_csvsubmit_csvsubmit_csv
cat_unk_customer_known_story = [_predict.columns.get_loc(i) for i in category_unk_customer_known_story]

preds_proba_unk_customer_known_story, preds_class_unk_customer_known_story, _, model_unk_customer_known_story_coef = train_and_and_predict_cb_model(_train, _test, y_train, y_test, 
                                                            _predict, cat_unk_customer_known_story, "model_unk_customer_known_story")

In [None]:
category_unk_customer_unk_story = []

_train, _test = X_train.drop(columns=drop_unk_customer + drop_unk_story), X_test.drop(columns=drop_unk_customer + drop_unk_story)
_predict = features_unk_customer_unk_story_test.drop(columns=drop_unk_customer + drop_unk_story)
cat_unk_customer_unk_story = [_predict.columns.get_loc(i) for i in category_unk_customer_unk_story]

preds_proba_unk_customer_unk_story, preds_class_unk_customer_unk_story, _, model_unk_customer_unk_story_coef = train_and_and_predict_cb_model(_train, _test, y_train, y_test, 
                                                            _predict, cat_unk_customer_unk_story, "model_unk_customer_unk_story")

In [None]:
category_unk_customer_unk_story_no_info = []

_train, _test = X_train.drop(columns=drop_unk_customer + drop_unk_story + drop_story_no_info), X_test.drop(columns=drop_unk_customer + drop_unk_story + drop_story_no_info)
_predict = features_unk_customer_unk_story_no_info_test.drop(columns=drop_unk_customer + drop_unk_story + drop_story_no_info)
cat_unk_customer_unk_story_no_info = [_predict.columns.get_loc(i) for i in category_unk_customer_unk_story_no_info]

preds_proba_unk_customer_unk_story_no_info, preds_class_unk_customer_unk_story_no_info, _, model_unk_customer_unk_story_no_info_coef = train_and_and_predict_cb_model(_train, _test, y_train, y_test, 
                                                            _predict, cat_unk_customer_unk_story_no_info, "model_unk_customer_unk_story_no_info")

In [None]:
category_unk_customer_known_story_no_info = ['story_id']

_train, _test = X_train.drop(columns=drop_unk_customer + drop_story_no_info), X_test.drop(columns=drop_unk_customer + drop_story_no_info)
_predict = features_unk_customer_known_story_no_info_test.drop(columns=drop_unk_customer + drop_story_no_info)
cat_unk_customer_known_story_no_info = [_predict.columns.get_loc(i) for i in category_unk_customer_known_story_no_info]

preds_proba_unk_customer_known_story_no_info, preds_class_unk_customer_known_story_no_info, _, model_unk_customer_known_story_no_info_coef = train_and_and_predict_cb_model(_train, _test, y_train, y_test, 
                                                            _predict, cat_unk_customer_known_story_no_info, "model_unk_customer_known_story_no_info")

In [None]:
all_tests_stacked = pd.concat([features_known_customer_known_story_test, features_known_customer_unk_story_test,
                           features_known_customer_unk_story_no_info_test, features_known_customer_known_story_no_info_test,
                           features_unk_customer_known_story_test, features_unk_customer_unk_story_test,
                           features_unk_customer_unk_story_no_info_test, features_unk_customer_known_story_no_info_test])

all_tests_coef_stacked = model_known_customer_known_story_coef + model_known_customer_unk_story_coef + model_known_customer_unk_story_no_info_coef + model_known_customer_known_story_no_info_coef + model_unk_customer_known_story_coef + model_unk_customer_unk_story_coef + model_unk_customer_unk_story_no_info_coef + model_unk_customer_known_story_no_info_coef
all_tests_stacked = pd.concat([pd.DataFrame(all_tests_coef_stacked, columns=['score']), all_tests_stacked.reset_index()], axis=1)

In [None]:
submit_csv = all_tests_stacked.sort_values('answer_id')[["answer_id", 'score']]

In [None]:
submit_csv.to_csv('submit_cb.csv', index=False)

In [None]:
plt.hist(submit_csv['score'])
submit_csv['score'].describe()