In [17]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

from catboost import CatBoostClassifier, Pool, cv

In [2]:
# Load the zip-compressed, preprocessed dataset; the three columns below are
# given explicit dtypes so they are not parsed as plain numbers/strings.
full_df = pd.read_csv(
    'datasets/preprocessed_v0.1.csv',
    compression='zip',
    dtype=dict(
        employee_count_nm='category',
        bankemplstatus='bool',
        customer_age='category',
    ),
)

In [3]:
# Print a summary of the loaded frame: index range, column count, dtype breakdown and memory usage.
full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96000 entries, 0 to 95999
Columns: 487 entries, user_id to time
dtypes: bool(1), category(2), float64(475), int64(8), object(1)
memory usage: 354.8+ MB


In [4]:
# Rows with a known target are used for fitting; rows with a missing target
# are the ones a prediction has to be produced for.
has_target = full_df['target'].notna()

train = full_df[has_target]
pred = full_df[~has_target]

In [5]:
# Columns handed to CatBoost as categorical features.
cat_features = [
    'employee_count_nm',
    'customer_age',
    'bankemplstatus',
]

# Columns removed from the feature matrix before modelling
# (includes the label column itself).
cols_to_drop = [
    'user_id',
    'report',
    'target',
    'time',
]

In [6]:
# Feature matrix and label for the labelled rows.
X = train.drop(columns=cols_to_drop)
y = train['target']

# Same feature columns for the rows that still need a prediction.
X_pred = pred.drop(columns=cols_to_drop)

In [7]:
# Hold out a stratified 20% of the labelled rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# CatBoost Pools for the train split, the hold-out split and all labelled rows.
pool_train = Pool(data=X_train, label=y_train, cat_features=cat_features)
pool_test = Pool(data=X_test, label=y_test, cat_features=cat_features)
pool_full = Pool(data=X, label=y, cat_features=cat_features)

# Simple CatBoost

## First training

In [8]:
# Baseline classifier: library defaults, evaluated with AUC.
simple_catb = CatBoostClassifier(eval_metric='AUC', random_state=42)

# Fit on the train split and monitor the hold-out split; training stops once
# the hold-out metric has not improved for 100 iterations. Progress is logged
# every 100 iterations.
simple_catb.fit(
    pool_train,
    eval_set=pool_test,
    early_stopping_rounds=100,
    verbose=100,
)

Learning rate set to 0.083772
0:	test: 0.6164652	best: 0.6164652 (0)	total: 277ms	remaining: 4m 36s
100:	test: 0.7587062	best: 0.7588732 (98)	total: 7.54s	remaining: 1m 7s
200:	test: 0.7612596	best: 0.7612683 (199)	total: 14.6s	remaining: 57.9s
300:	test: 0.7616025	best: 0.7616220 (292)	total: 21.8s	remaining: 50.5s
400:	test: 0.7609998	best: 0.7621679 (344)	total: 29.1s	remaining: 43.5s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7621679039
bestIteration = 344

Shrink model to first 345 iterations.


<catboost.core.CatBoostClassifier at 0x1ef500da9b0>

## Training on most important features

In [9]:
# One row per feature of the fitted baseline model, most important first.
importances = (
    pd.DataFrame(
        {
            'feature': simple_catb.feature_names_,
            'importance': simple_catb.feature_importances_,
        }
    )
    .sort_values('importance', ascending=False)
)

In [10]:
# A feature is kept only if its importance is strictly above this value.
threshold = 0.01

important_features = importances.loc[
    importances['importance'] > threshold, 'feature'
].to_numpy()

# Number of features that pass the threshold.
len(important_features)

381

In [11]:
# Declare as categorical only the columns that survived the importance filter.
# The Pools below contain just `important_features`; if the threshold ever
# drops one of the categorical columns, passing the full `cat_features` list
# would name a column that is no longer present in the data.
retained_features = set(important_features)
cat_features_important = [col for col in cat_features if col in retained_features]

# Reduced-feature counterparts of the train / hold-out / full Pools.
pool_train_important = Pool(X_train[important_features], y_train, cat_features=cat_features_important)
pool_test_important = Pool(X_test[important_features], y_test, cat_features=cat_features_important)
pool_full_important = Pool(X[important_features], y, cat_features=cat_features_important)

In [12]:
# Same configuration as the baseline model, trained on the reduced feature set.
simple_catb_important = CatBoostClassifier(eval_metric='AUC', random_state=42)

# Monitor the reduced-feature hold-out Pool; stop after 100 iterations without
# improvement and log every 100 iterations.
simple_catb_important.fit(
    pool_train_important,
    eval_set=pool_test_important,
    early_stopping_rounds=100,
    verbose=100,
)

Learning rate set to 0.083772
0:	test: 0.6036954	best: 0.6036954 (0)	total: 112ms	remaining: 1m 51s
100:	test: 0.7592822	best: 0.7595428 (94)	total: 7.12s	remaining: 1m 3s
200:	test: 0.7608876	best: 0.7609030 (199)	total: 13.9s	remaining: 55.4s
300:	test: 0.7620376	best: 0.7620376 (300)	total: 20.9s	remaining: 48.6s
400:	test: 0.7623511	best: 0.7628809 (395)	total: 28.2s	remaining: 42.1s
500:	test: 0.7626669	best: 0.7632674 (474)	total: 35.2s	remaining: 35.1s
600:	test: 0.7628073	best: 0.7634494 (549)	total: 42.3s	remaining: 28.1s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7634494347
bestIteration = 549

Shrink model to first 550 iterations.


<catboost.core.CatBoostClassifier at 0x1ef504238b0>

## Training on full dataset

In [13]:
# Train on all labelled rows for a fixed 2000 iterations; no eval_set is
# passed, so nothing is held out for early stopping in this fit.
# NOTE: this rebinds `simple_catb`, replacing the model the feature
# importances were read from.
simple_catb = CatBoostClassifier(
    iterations=2000,
    eval_metric='AUC',
    random_state=42,
)
simple_catb.fit(pool_full, verbose=100)

Learning rate set to 0.03222
0:	total: 114ms	remaining: 3m 48s
100:	total: 8.58s	remaining: 2m 41s
200:	total: 17.4s	remaining: 2m 35s
300:	total: 25.9s	remaining: 2m 26s
400:	total: 34s	remaining: 2m 15s
500:	total: 42.3s	remaining: 2m 6s
600:	total: 50.7s	remaining: 1m 57s
700:	total: 59.1s	remaining: 1m 49s
800:	total: 1m 7s	remaining: 1m 41s
900:	total: 1m 16s	remaining: 1m 32s
1000:	total: 1m 24s	remaining: 1m 24s
1100:	total: 1m 33s	remaining: 1m 15s
1200:	total: 1m 41s	remaining: 1m 7s
1300:	total: 1m 49s	remaining: 59s
1400:	total: 1m 58s	remaining: 50.5s
1500:	total: 2m 6s	remaining: 42.1s
1600:	total: 2m 15s	remaining: 33.6s
1700:	total: 2m 23s	remaining: 25.2s
1800:	total: 2m 32s	remaining: 16.8s
1900:	total: 2m 40s	remaining: 8.35s
1999:	total: 2m 48s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1ef50423b20>

In [14]:
# Refit the reduced-feature model on the Pool built from all labelled rows.
# No eval_set or early_stopping_rounds is passed here, so this fit has no
# hold-out monitoring; progress is logged every 100 iterations.
simple_catb_important.fit(pool_full_important, verbose=100)

Learning rate set to 0.060838
0:	total: 113ms	remaining: 1m 53s
100:	total: 8.19s	remaining: 1m 12s
200:	total: 16.1s	remaining: 1m 3s
300:	total: 23.9s	remaining: 55.4s
400:	total: 32s	remaining: 47.7s
500:	total: 40.1s	remaining: 39.9s
600:	total: 48.2s	remaining: 32s
700:	total: 56.3s	remaining: 24s
800:	total: 1m 4s	remaining: 16s
900:	total: 1m 12s	remaining: 7.96s
999:	total: 1m 20s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1ef504238b0>

## Predicting

In [15]:
# Score the unlabelled rows with both models and keep column 1 of
# predict_proba (presumably the positive-class probability — confirm the
# label encoding of `target`).
predictions = simple_catb.predict_proba(X_pred)[:, 1]
predictions_important = simple_catb_important.predict_proba(
    X_pred.loc[:, important_features]
)[:, 1]

In [16]:
# Write one submission file per model: `user_id` plus the predicted score,
# without the DataFrame index.
pred[['user_id']].assign(predict=predictions).to_csv(
    'simple_catb_ds_v0.1.csv', index=False
)
pred[['user_id']].assign(predict=predictions_important).to_csv(
    'simple_catb_important_ds_v0.1.csv', index=False
)

# Наблюдения

Метрика на сайте — concordance index. Показатель базовой простой модели — 0.7671:
- отсечение неважных признаков не привело к улучшению качества модели (0.7647)
- сокращение iterations до числа итераций, на котором срабатывает early_stopping, не привело к улучшению качества (0.7621); однако при этом CatBoost сам изменил learning_rate с 0.087 до 0.14
- увеличение iterations до 2000 привело к улучшению (0.7696)

# Идеи

- обучение на важных признаках, но с большим числом итераций;
- масштабирование данных;
- приведение распределения transaction_amt к нормальному.