In [19]:
import pandas as pd
import numpy as np

# подготовка очищенного датасета на базе обработки в HW_6_baseline
from src.data.clean_data import clean_data
# подготовка датасета (Pool-ов) для обучения CatBoostClassifier аналогично HW_6_baseline
from src.data.preprocessing_catboost import preprocessing_catboost
from src.data.preprocessing_catboost_test import preprocessing_catboost_test
from src.data.preprocessing_catboost_train import preprocessing_catboost_train

from catboost import CatBoostClassifier
from catboost.utils import get_roc_curve, get_confusion_matrix, select_threshold
from catboost import Pool

from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split

# Чистим данные и добавляем "секретные" признаки в датасет

In [20]:
dwarves = clean_data(path="../data/raw/train.csv", secret=False) # функция оптимизирована позднее
dwarves

Unnamed: 0,Secret_dwarf_info_1,Secret_dwarf_info_2,Secret_dwarf_info_3,Successful_deals_count,Tavern,Hashed_deal_detail_1,Hashed_deal_detail_2,Hashed_deal_detail_3,Hashed_deal_detail_4,Hashed_deal_detail_5,...,Tavern_district_2,Tavern_district_3,Tavern_district_4,Tavern_district_5,Tavern_district_6,Tavern_district_7,day_before_first_defolt,Deal_day,Deal_month,is_weekend
0,,,,0.0,7,2.5,-3,8,2.5,-3,...,0,1,0,0,0,0,68,5,11,0
1,3.5,-2.0,5.0,2.0,7,2.5,-3,14,3.5,-3,...,0,0,1,0,0,0,222,26,8,0
2,,,,0.0,7,2.5,-3,8,2.5,-3,...,0,0,0,0,1,0,99,18,2,0
3,,,,0.0,13,2.5,-2,5,2.5,-3,...,1,0,0,0,0,0,39,30,4,1
4,,,,0.0,39,2.5,-3,7,2.5,-3,...,0,0,1,0,0,0,60,19,9,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3163,,,,0.0,7,2.5,-3,10,2.5,-3,...,0,1,0,0,0,0,154,9,4,1
3164,,,,0.0,7,2.5,-3,10,2.5,-3,...,0,0,0,1,0,0,158,23,5,0
3165,,,,0.0,7,2.5,-3,5,3.5,-3,...,0,0,1,0,0,0,30,14,9,0
3166,,,,0.0,7,2.5,-3,11,2.5,-3,...,0,0,0,0,0,1,179,9,9,0


In [21]:
# Заполним пропуски медианой. Любая из статистик практически не изменяется от таргета.
dwarves['Secret_dwarf_info_1'] = dwarves['Secret_dwarf_info_1'].fillna(dwarves['Secret_dwarf_info_1'].median())
dwarves['Secret_dwarf_info_2'] = dwarves['Secret_dwarf_info_2'].fillna(dwarves['Secret_dwarf_info_2'].median())
dwarves['Secret_dwarf_info_3'] = dwarves['Secret_dwarf_info_3'].fillna(dwarves['Secret_dwarf_info_3'].median())

# сохраним промежуточный датасет
dwarves.to_csv("../data/interim/dwarves_secret_info.csv", index=False)

In [22]:
dwarves = pd.read_csv("../data/interim/dwarves_secret_info.csv")
dwarves.head()

Unnamed: 0,Secret_dwarf_info_1,Secret_dwarf_info_2,Secret_dwarf_info_3,Successful_deals_count,Tavern,Hashed_deal_detail_1,Hashed_deal_detail_2,Hashed_deal_detail_3,Hashed_deal_detail_4,Hashed_deal_detail_5,...,Tavern_district_2,Tavern_district_3,Tavern_district_4,Tavern_district_5,Tavern_district_6,Tavern_district_7,day_before_first_defolt,Deal_day,Deal_month,is_weekend
0,3.5,-2.0,5.0,0.0,7,2.5,-3,8,2.5,-3,...,0,1,0,0,0,0,68,5,11,0
1,3.5,-2.0,5.0,2.0,7,2.5,-3,14,3.5,-3,...,0,0,1,0,0,0,222,26,8,0
2,3.5,-2.0,5.0,0.0,7,2.5,-3,8,2.5,-3,...,0,0,0,0,1,0,99,18,2,0
3,3.5,-2.0,5.0,0.0,13,2.5,-2,5,2.5,-3,...,1,0,0,0,0,0,39,30,4,1
4,3.5,-2.0,5.0,0.0,39,2.5,-3,7,2.5,-3,...,0,0,1,0,0,0,60,19,9,0


# Подготавливаем данные к catboost и заводим его

In [23]:
# Подготовим данные для catboost и оценки
train_pool, test_pool, X_test, y_test = preprocessing_catboost(dwarves, train=True, test_size=0.25)

In [24]:
clf = CatBoostClassifier(l2_leaf_reg=2000, depth=7, learning_rate=0.99, iterations=75, random_seed=2023)
clf.fit(train_pool, verbose=False)

0:	learn: 0.4094017	total: 12.4ms	remaining: 919ms
1:	learn: 0.3642086	total: 26.7ms	remaining: 976ms
2:	learn: 0.3390968	total: 42.1ms	remaining: 1.01s
3:	learn: 0.3307394	total: 57.5ms	remaining: 1.02s
4:	learn: 0.3279244	total: 67.7ms	remaining: 947ms
5:	learn: 0.3245519	total: 75.4ms	remaining: 867ms
6:	learn: 0.3220726	total: 86.3ms	remaining: 838ms
7:	learn: 0.3205327	total: 97ms	remaining: 812ms
8:	learn: 0.3196598	total: 110ms	remaining: 808ms
9:	learn: 0.3184106	total: 123ms	remaining: 802ms
10:	learn: 0.3163516	total: 135ms	remaining: 786ms
11:	learn: 0.3152222	total: 146ms	remaining: 765ms
12:	learn: 0.3135106	total: 156ms	remaining: 746ms
13:	learn: 0.3126114	total: 168ms	remaining: 734ms
14:	learn: 0.3114002	total: 178ms	remaining: 714ms
15:	learn: 0.3105171	total: 189ms	remaining: 698ms
16:	learn: 0.3094027	total: 201ms	remaining: 686ms
17:	learn: 0.3086435	total: 212ms	remaining: 671ms
18:	learn: 0.3079148	total: 223ms	remaining: 658ms
19:	learn: 0.3072241	total: 235ms	r

<catboost.core.CatBoostClassifier at 0x1f71177aac0>

In [25]:
# Переберем порог для достижения максимального ROC-AUC (наша целевая метрика)
max_roc_auc = (0, 0)
for i in range(1,30):
    clf.set_probability_threshold(i * 0.005)
    y_pred = clf.predict(test_pool)
    if roc_auc_score(y_test, y_pred) > max_roc_auc[0]:
        max_roc_auc = (max(roc_auc_score(y_test, y_pred), max_roc_auc[0]), i * 0.005)
max_roc_auc #ROC-AUC, threshold

(0.6995738636363635, 0.09)

In [26]:
# Отметим важные факторы с текущим датасетом
importances = clf.feature_importances_
names = train_pool.get_feature_names()

named_importances = [[importances[i], names[i]] for i in range(len(names))]
print(np.array(sorted(named_importances, reverse=True)))

[['30.422571334909716' 'day_before_first_defolt']
 ['16.78219651492362' 'Age']
 ['15.49141341001342' 'Tavern_district_3']
 ['7.875545626574688' 'Gender']
 ['6.8330090747498735' 'Tavern_district_4']
 ['4.233403782068736' 'is_weekend']
 ['4.196891246363707' 'Secret_dwarf_info_2']
 ['3.2488819180084976' 'Hashed_deal_detail_1']
 ['2.6588126750719248' 'Secret_dwarf_info_3']
 ['2.461349072567868' 'Successful_deals_count']
 ['2.0045344595992662' 'Secret_dwarf_info_1']
 ['1.9180308169767577' 'Tavern']
 ['0.9417977338901993' 'Tavern_district_7']
 ['0.3375245296203689' 'Deal_month']
 ['0.27264473732652766' 'Hashed_deal_detail_4']
 ['0.20390923237345296' 'Tavern_district_2']
 ['0.07860077107107176' 'Tavern_district_5']
 ['0.03888306389030596' 'Tavern_district_6']
 ['0.0' 'Tavern_district_1']
 ['0.0' 'Hashed_deal_detail_5']
 ['0.0' 'Hashed_deal_detail_3']
 ['0.0' 'Hashed_deal_detail_2']
 ['0.0' 'Deal_day']]


# Готовимся к сдаче на тестовом датасете

- готовим базовый test датасет по тому же Pipeline-у,
- обучаем модель на всех тренировочных данных
- выставляем посчитанный threshold
- считаем предсказания
- выгружаем в csv

Модель на всем train датасете

In [27]:
dwarves_train_lms = clean_data(path="../data/raw/train.csv", secret=True)
dwarves_train_lms

Unnamed: 0,Secret_dwarf_info_1,Secret_dwarf_info_2,Secret_dwarf_info_3,Successful_deals_count,Tavern,Hashed_deal_detail_1,Hashed_deal_detail_2,Hashed_deal_detail_3,Hashed_deal_detail_4,Hashed_deal_detail_5,...,Tavern_district_2,Tavern_district_3,Tavern_district_4,Tavern_district_5,Tavern_district_6,Tavern_district_7,day_before_first_defolt,Deal_day,Deal_month,is_weekend
0,3.5,-2.0,5.0,0.0,7,2.5,-3,8,2.5,-3,...,0,1,0,0,0,0,68,5,11,0
1,3.5,-2.0,5.0,2.0,7,2.5,-3,14,3.5,-3,...,0,0,1,0,0,0,222,26,8,0
2,3.5,-2.0,5.0,0.0,7,2.5,-3,8,2.5,-3,...,0,0,0,0,1,0,99,18,2,0
3,3.5,-2.0,5.0,0.0,13,2.5,-2,5,2.5,-3,...,1,0,0,0,0,0,39,30,4,1
4,3.5,-2.0,5.0,0.0,39,2.5,-3,7,2.5,-3,...,0,0,1,0,0,0,60,19,9,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3163,3.5,-2.0,5.0,0.0,7,2.5,-3,10,2.5,-3,...,0,1,0,0,0,0,154,9,4,1
3164,3.5,-2.0,5.0,0.0,7,2.5,-3,10,2.5,-3,...,0,0,0,1,0,0,158,23,5,0
3165,3.5,-2.0,5.0,0.0,7,2.5,-3,5,3.5,-3,...,0,0,1,0,0,0,30,14,9,0
3166,3.5,-2.0,5.0,0.0,7,2.5,-3,11,2.5,-3,...,0,0,0,0,0,1,179,9,9,0


In [28]:
pool_train_lms, y_train_lms = preprocessing_catboost_train(dwarves_train_lms)

In [29]:
clf_train_lms = CatBoostClassifier(l2_leaf_reg=2000, depth=7, learning_rate=0.99, iterations=75, random_seed=2023)
clf_train_lms.fit(pool_train_lms)

0:	learn: 0.4452909	total: 14.2ms	remaining: 1.05s
1:	learn: 0.3509079	total: 22.7ms	remaining: 827ms
2:	learn: 0.3248142	total: 34.9ms	remaining: 838ms
3:	learn: 0.3229447	total: 47.8ms	remaining: 849ms
4:	learn: 0.3184980	total: 55.8ms	remaining: 781ms
5:	learn: 0.3165133	total: 67.7ms	remaining: 778ms
6:	learn: 0.3145512	total: 81.2ms	remaining: 788ms
7:	learn: 0.3136484	total: 94.2ms	remaining: 789ms
8:	learn: 0.3125217	total: 106ms	remaining: 775ms
9:	learn: 0.3114381	total: 118ms	remaining: 766ms
10:	learn: 0.3102978	total: 130ms	remaining: 757ms
11:	learn: 0.3095766	total: 142ms	remaining: 747ms
12:	learn: 0.3089323	total: 154ms	remaining: 736ms
13:	learn: 0.3079968	total: 165ms	remaining: 721ms
14:	learn: 0.3073789	total: 177ms	remaining: 706ms
15:	learn: 0.3066749	total: 187ms	remaining: 689ms
16:	learn: 0.3061248	total: 202ms	remaining: 690ms
17:	learn: 0.3055229	total: 214ms	remaining: 678ms
18:	learn: 0.3042801	total: 232ms	remaining: 684ms
19:	learn: 0.3036583	total: 248ms

<catboost.core.CatBoostClassifier at 0x1f7115c26a0>

In [30]:
max_roc_auc_lms = (0, 0)
for i in range(1,30):
    clf_train_lms.set_probability_threshold(i * 0.005)
    y_pred_lms = clf_train_lms.predict(pool_train_lms)
    if roc_auc_score(y_train_lms, y_pred_lms) > max_roc_auc_lms[0]:
        max_roc_auc_lms = (max(roc_auc_score(y_train_lms, y_pred_lms), max_roc_auc_lms[0]), i * 0.005)
max_roc_auc_lms #ROC-AUC, threshold

(0.7479360658274397, 0.105)

Предсказания на test выборке

In [31]:
# преобразуем test датасет к Pool-у для предсказаний
dwarves_test_lms = clean_data(path="../data/raw/test.csv", secret=True)
pool_test_lms = preprocessing_catboost_test(dwarves_test_lms)
# применяем threshold с обученной модели на всех данных
clf_train_lms.set_probability_threshold(max_roc_auc_lms[1])

In [32]:
# предсказания просят предоставить в вероятностях
y_pred = clf_train_lms.predict_proba(pool_test_lms)
prediction = pd.DataFrame(y_pred[:, 1], columns=['Prediction'])
prediction

Unnamed: 0,Prediction
0,0.039756
1,0.044325
2,0.129938
3,0.056032
4,0.159773
...,...
984,0.052453
985,0.307781
986,0.058433
987,0.146126


Собираем предсказания в csv

In [33]:
# датасет к сдаче в LMS
test_df = pd.read_csv("../data/raw/test.csv")
prediction_lms= test_df[['Deal_id']]
prediction_lms['Prediction'] = prediction
prediction_lms

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction_lms['Prediction'] = prediction


Unnamed: 0,Deal_id,Prediction
0,72875713,0.039756
1,75825544,0.044325
2,81809181,0.129938
3,87083256,0.056032
4,84651519,0.159773
...,...,...
984,86959667,0.052453
985,77574469,0.307781
986,73676542,0.058433
987,81633790,0.146126


In [34]:
prediction_lms.to_csv("../data/processed/prediction_0668_CB_2000_7_75.csv", index=False)

In [35]:
# 0.668 - требуется не ниже 0.7, не прошло при
# CatBoostClassifier(l2_leaf_reg=2000, depth=7, learning_rate=0.99, iterations=75, random_seed=2023)
# на всем датасете
# 0.645 - на clf с частью обучающей выборки. Профит от полной выборки есть!

In [36]:
# Отметим важные факторы с текущим датасетом
importances = clf_train_lms.feature_importances_
names = pool_test_lms.get_feature_names()

named_importances = [[importances[i], names[i]] for i in range(len(names))]
print(np.array(sorted(named_importances, reverse=True)))

[['33.92081499688401' 'Age']
 ['31.349584403756484' 'day_before_first_defolt']
 ['8.698990028751556' 'Secret_dwarf_info_3']
 ['7.528981762198153' 'Secret_dwarf_info_2']
 ['3.8926372841995174' 'Tavern']
 ['3.8204161397165564' 'Gender']
 ['2.3498520930685487' 'is_weekend']
 ['2.112035787798323' 'Secret_dwarf_info_1']
 ['1.580046261035941' 'Deal_month']
 ['1.2883889789990564' 'Hashed_deal_detail_4']
 ['0.7757616180177076' 'Tavern_district_3']
 ['0.7706826704450764' 'Hashed_deal_detail_1']
 ['0.6555486899397663' 'Successful_deals_count']
 ['0.4315757662309751' 'Tavern_district_4']
 ['0.39578650217635586' 'Tavern_district_6']
 ['0.36614942308924797' 'Tavern_district_2']
 ['0.06274759369271636' 'Hashed_deal_detail_3']
 ['0.0' 'Tavern_district_7']
 ['0.0' 'Tavern_district_5']
 ['0.0' 'Tavern_district_1']
 ['0.0' 'Hashed_deal_detail_5']
 ['0.0' 'Hashed_deal_detail_2']
 ['0.0' 'Deal_day']]


# Выводы:
- threshold для несбалансированной выборки решает.
- Catboost не до конца справился с требованиями по метрикам.

Пробуем добавить признаки. Продолжение в HW_6_feature_gen.ipynb