In [13]:
import pandas as pd
import numpy as np

# подготовка очищенного датасета на базе обработки в HW_6_baseline
from src.data.clean_data import clean_data
# подготовка датасета (Pool-ов) для обучения CatBoostClassifier аналогично HW_6_baseline
from src.data.preprocessing_catboost_more_features import preprocessing_catboost_more_features
from src.data.preprocessing_catboost_test import preprocessing_catboost_test
from src.data.preprocessing_catboost_train import preprocessing_catboost_train
from src.data.feature_combinations import feature_combinations

from catboost import CatBoostClassifier
from catboost.utils import get_roc_curve, get_confusion_matrix, select_threshold
from catboost import Pool

from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split

# Чистим данные и добавляем "секретные" признаки в датасет

In [14]:
dwarves = pd.read_csv("../data/interim/dwarves_secret_info.csv")
dwarves.head()

Unnamed: 0,Secret_dwarf_info_1,Secret_dwarf_info_2,Secret_dwarf_info_3,Successful_deals_count,Tavern,Hashed_deal_detail_1,Hashed_deal_detail_2,Hashed_deal_detail_3,Hashed_deal_detail_4,Hashed_deal_detail_5,...,Tavern_district_2,Tavern_district_3,Tavern_district_4,Tavern_district_5,Tavern_district_6,Tavern_district_7,day_before_first_defolt,Deal_day,Deal_month,is_weekend
0,3.5,-2.0,5.0,0.0,7,2.5,-3,8,2.5,-3,...,0,1,0,0,0,0,68,5,11,0
1,3.5,-2.0,5.0,2.0,7,2.5,-3,14,3.5,-3,...,0,0,1,0,0,0,222,26,8,0
2,3.5,-2.0,5.0,0.0,7,2.5,-3,8,2.5,-3,...,0,0,0,0,1,0,99,18,2,0
3,3.5,-2.0,5.0,0.0,13,2.5,-2,5,2.5,-3,...,1,0,0,0,0,0,39,30,4,1
4,3.5,-2.0,5.0,0.0,39,2.5,-3,7,2.5,-3,...,0,0,1,0,0,0,60,19,9,0


Пробуем перебор новых фич (в функции остались только эффективные)

In [15]:
feature_combinations(dwarves)

# Подготавливаем данные к catboost и заводим его

In [21]:
# Подготовим данные для catboost и оценки
train_pool, test_pool, X_test, y_test = preprocessing_catboost_more_features(dwarves, train=True, test_size=0.30)

In [22]:
clf = CatBoostClassifier(l2_leaf_reg=10000, depth=2, learning_rate=0.99, iterations=1000, random_seed=2023)
clf.fit(train_pool, verbose=False)

<catboost.core.CatBoostClassifier at 0x28c141db7c0>

In [23]:
# Переберем порог для достижения максимального ROC-AUC (наша целевая метрика)
max_roc_auc = (0, 0)
for i in range(1,30):
    clf.set_probability_threshold(i * 0.005)
    y_pred = clf.predict(test_pool)
    if roc_auc_score(y_test, y_pred) > max_roc_auc[0]:
        max_roc_auc = (max(roc_auc_score(y_test, y_pred), max_roc_auc[0]), i * 0.005)
max_roc_auc #ROC-AUC, threshold

(0.7007936507936507, 0.085)

In [24]:
# Отметим важные факторы с текущим датасетом
importances = clf.feature_importances_
names = train_pool.get_feature_names()

named_importances = [[importances[i], names[i]] for i in range(len(names))]
print(np.array(sorted(named_importances, reverse=True)))

[['34.09768036348506' 'day_before_first_defolt']
 ['24.932098481335967' 'Secret_33']
 ['13.624336692716556' 'Age']
 ['6.22098665540466' 'Tavern']
 ['4.129221615796422' 'Deal_month']
 ['2.7400423879344316' 'Gender']
 ['2.2691660901999997' 'Hashed_deal_detail_3']
 ['2.1426661588751457' 'Hashed_deal_detail_1']
 ['1.9930954426067922' 'Secret_dwarf_info_1']
 ['1.8035219695184466' 'Secret_22']
 ['1.2103099371911532' 'Tavern_district_3']
 ['1.1202483419559028' 'Deal_day']
 ['0.8985594004818683' 'Secret_dwarf_info_3']
 ['0.867387223573969' 'Tavern_district_4']
 ['0.6681752456885852' 'is_weekend']
 ['0.34932796420153817' 'Hashed_deal_detail_4']
 ['0.34515162597690924' 'Tavern_district_6']
 ['0.20947410370861466' 'Secret_12']
 ['0.17407458096555595' 'Hashed_deal_detail_2']
 ['0.12228256364197136' 'Successful_deals_count']
 ['0.0484347592933613' 'Secret_31']
 ['0.02896387751399657' 'Hashed_deal_detail_5']
 ['0.0044163560456928356' 'Secret_23']
 ['0.0003767768892794875' 'Tavern_district_2']
 ['1.3

# Готовимся к сдаче на тестовом датасете

- готовим базовый test датасет по тому же Pipeline-у,
- обучаем модель на всех тренировочных данных (test_size=0.01)
- выставляем посчитанный threshold
- считаем предсказания
- выгружаем в csv

Предсказания на test выборке

In [8]:
# преобразуем test датасет к Pool-у для предсказаний
dwarves_lms = clean_data(path="../data/raw/test.csv", secret=True)
# добавляем модифицированные фичи
feature_combinations(dwarves_lms)

In [9]:
pool_test_lms = preprocessing_catboost_more_features(dwarves_lms, train=False)
# применяем threshold с обученной модели на всех данных
clf.set_probability_threshold(max_roc_auc[0])

In [10]:
# предсказания просят предоставить в вероятностях
y_pred = clf.predict_proba(pool_test_lms)
prediction = pd.DataFrame(y_pred[:, 1], columns=['Prediction'])
prediction

Unnamed: 0,Prediction
0,0.038218
1,0.054822
2,0.147681
3,0.066625
4,0.173350
...,...
984,0.067948
985,0.191885
986,0.071079
987,0.093484


Собираем предсказания в csv

In [11]:
# датасет к сдаче в LMS
test_df = pd.read_csv("../data/raw/test.csv")
prediction_lms= test_df[['Deal_id']]
prediction_lms['Prediction'] = prediction
prediction_lms

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction_lms['Prediction'] = prediction


Unnamed: 0,Deal_id,Prediction
0,72875713,0.038218
1,75825544,0.054822
2,81809181,0.147681
3,87083256,0.066625
4,84651519,0.173350
...,...,...
984,86959667,0.067948
985,77574469,0.191885
986,73676542,0.071079
987,81633790,0.093484


In [12]:
prediction_lms.to_csv("../data/processed/prediction_CB_030_10000_6_500.csv", index=False)

In [35]:
# 0.684 - требуется не ниже 0.7, не прошло на половине датасета
# CatBoostClassifier(l2_leaf_reg=40000, depth=3, learning_rate=0.99, iterations=300, random_seed=2023)
# 0.681 - на полном датасете

Подбор гиперпараметров catboost-а не дал проходного ROC-AUC, попробуем на SVC и случайном лесе.

Продолжение (завершение) в HW_6_lin_model