In [18]:
import pandas as pd

from sklearn.model_selection import train_test_split

from catboost import CatBoostClassifier, Pool, cv

In [7]:
# Column dtypes are pinned at load time instead of being inferred by pandas.
column_dtypes = {
    'employee_count_nm': 'category',
    'bankemplstatus': 'bool',
    'customer_age': 'category',
}

full_df = pd.read_csv(
    'datasets/preprocessed_v0.csv',
    compression='zip',  # the file is decompressed as a zip archive despite its .csv name
    dtype=column_dtypes,
)

In [9]:
# Rows with a known target form the labelled set; rows whose target is missing
# are the ones that still need a prediction.
has_target = full_df['target'].notna()

train = full_df[has_target]
pred = full_df[~has_target]

In [16]:
# Columns handed to CatBoost as categorical features.
cat_features = [
    'employee_count_nm',
    'customer_age',
    'bankemplstatus',
]

# Columns removed from the frame before it is used as a feature matrix
# (this includes the label column itself).
cols_to_drop = [
    'user_id',
    'report',
    'target',
    'time',
]

In [12]:
# Label and feature matrix for the labelled rows.
y = train['target']
X = train.drop(columns=cols_to_drop)

# Same feature columns for the rows that still need a prediction.
X_pred = pred.drop(columns=cols_to_drop)

In [17]:
# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# CatBoost pools: the full labelled set, plus the train / hold-out parts of the split.
pool_full = Pool(data=X, label=y, cat_features=cat_features)
pool_train = Pool(data=X_train, label=y_train, cat_features=cat_features)
pool_test = Pool(data=X_test, label=y_test, cat_features=cat_features)

# Simple CatBoost

## First training

In [20]:
# Baseline CatBoost model; AUC on the hold-out pool is the tracked metric.
simple_catb = CatBoostClassifier(eval_metric='AUC', random_state=42)

# Training stops once the hold-out metric has not improved for 100 iterations;
# progress is logged every 100 iterations.
simple_catb.fit(
    pool_train,
    eval_set=pool_test,
    early_stopping_rounds=100,
    verbose=100,
)

Learning rate set to 0.083772
0:	test: 0.6319318	best: 0.6319318 (0)	total: 85ms	remaining: 1m 24s
100:	test: 0.7555323	best: 0.7556537 (97)	total: 6.62s	remaining: 58.9s
200:	test: 0.7577043	best: 0.7581866 (151)	total: 13.1s	remaining: 52.1s
300:	test: 0.7600537	best: 0.7603048 (290)	total: 19.6s	remaining: 45.6s
400:	test: 0.7600792	best: 0.7605706 (321)	total: 26.3s	remaining: 39.3s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7605706004
bestIteration = 321

Shrink model to first 322 iterations.


<catboost.core.CatBoostClassifier at 0x1904b9db5b0>

## Training on most important features

In [21]:
# One row per model feature with its importance score.
importances = pd.DataFrame(
    {
        'feature': simple_catb.feature_names_,
        'importance': simple_catb.feature_importances_,
    }
)

# Most important features first.
importances = importances.sort_values('importance', ascending=False)

In [27]:
# Features are kept only when their importance is strictly above this cut-off.
threshold = 0.01

is_important = importances['importance'] > threshold
important_features = importances.loc[is_important, 'feature'].values

# Number of features that survive the cut-off.
len(important_features)

318

In [38]:
# Declare only the categorical columns that survived the importance filter.
# The data below is restricted to `important_features`, and Pool rejects a
# cat_features name that is not present in the data, so passing the full
# `cat_features` list would fail as soon as a categorical column falls at or
# below the threshold.
kept_features = set(important_features)
cat_features_important = [col for col in cat_features if col in kept_features]

# Same three pools as for the full feature set, restricted to the kept columns.
pool_train_important = Pool(X_train[important_features], y_train, cat_features=cat_features_important)
pool_test_important = Pool(X_test[important_features], y_test, cat_features=cat_features_important)
pool_full_important = Pool(X[important_features], y, cat_features=cat_features_important)

In [36]:
# Same configuration as the baseline model, trained on the reduced feature set.
simple_catb_important = CatBoostClassifier(eval_metric='AUC', random_state=42)

# Early stopping after 100 iterations without improvement on the hold-out pool;
# progress is logged every 100 iterations.
simple_catb_important.fit(
    pool_train_important,
    eval_set=pool_test_important,
    early_stopping_rounds=100,
    verbose=100,
)

Learning rate set to 0.083772
0:	test: 0.6451405	best: 0.6451405 (0)	total: 88.2ms	remaining: 1m 28s
100:	test: 0.7564544	best: 0.7566284 (96)	total: 7.17s	remaining: 1m 3s
200:	test: 0.7586758	best: 0.7588857 (154)	total: 14.1s	remaining: 56s
300:	test: 0.7603986	best: 0.7607122 (295)	total: 21s	remaining: 48.8s
400:	test: 0.7616806	best: 0.7618045 (397)	total: 28.1s	remaining: 41.9s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7618044541
bestIteration = 397

Shrink model to first 398 iterations.


<catboost.core.CatBoostClassifier at 0x1904b86f5e0>

## Training on full dataset

In [46]:
# Final model on all labelled rows with 2000 iterations. No eval_set is passed,
# so no early stopping is applied to this fit.
# NOTE: this rebinds `simple_catb`, replacing the model trained on the hold-out split.
simple_catb = CatBoostClassifier(
    iterations=2000,
    eval_metric='AUC',
    random_state=42,
)
simple_catb.fit(pool_full, verbose=100)

Learning rate set to 0.03222
0:	total: 91.5ms	remaining: 3m 2s
100:	total: 7.99s	remaining: 2m 30s
200:	total: 16s	remaining: 2m 23s
300:	total: 23.9s	remaining: 2m 14s
400:	total: 31.8s	remaining: 2m 6s
500:	total: 39.7s	remaining: 1m 58s
600:	total: 47.7s	remaining: 1m 51s
700:	total: 55.7s	remaining: 1m 43s
800:	total: 1m 3s	remaining: 1m 35s
900:	total: 1m 11s	remaining: 1m 27s
1000:	total: 1m 20s	remaining: 1m 20s
1100:	total: 1m 28s	remaining: 1m 12s
1200:	total: 1m 37s	remaining: 1m 4s
1300:	total: 1m 45s	remaining: 56.5s
1400:	total: 1m 53s	remaining: 48.4s
1500:	total: 2m 1s	remaining: 40.3s
1600:	total: 2m 9s	remaining: 32.2s
1700:	total: 2m 17s	remaining: 24.1s
1800:	total: 2m 25s	remaining: 16.1s
1900:	total: 2m 33s	remaining: 8s
1999:	total: 2m 41s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1904b8f9510>

In [39]:
# Refit the reduced-feature model on all labelled rows. The existing estimator
# object is reused and no eval_set is passed, so this fit has no early stopping.
simple_catb_important.fit(X=pool_full_important, verbose=100)

Learning rate set to 0.060838
0:	total: 90ms	remaining: 1m 29s
100:	total: 7.54s	remaining: 1m 7s
200:	total: 14.8s	remaining: 59s
300:	total: 22.3s	remaining: 51.7s
400:	total: 29.7s	remaining: 44.4s
500:	total: 37.4s	remaining: 37.2s
600:	total: 45.1s	remaining: 29.9s
700:	total: 52.7s	remaining: 22.5s
800:	total: 1m	remaining: 15s
900:	total: 1m 8s	remaining: 7.53s
999:	total: 1m 16s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1904b86f5e0>

## Predicting

In [47]:
# Score the unlabelled rows with both models and keep the second
# predict_proba column (index 1) from each.
proba_all_features = simple_catb.predict_proba(X_pred)
predictions = proba_all_features[:, 1]

proba_important = simple_catb_important.predict_proba(X_pred[important_features])
predictions_important = proba_important[:, 1]

In [48]:
# One submission file per model: user id plus predicted score, no index column.
submissions = {
    'simple_catb_ds_v0.csv': predictions,
    'simple_catb_important_ds_v0.csv': predictions_important,
}

for out_path, scores in submissions.items():
    submission = pd.DataFrame({'user_id': pred['user_id'], 'predict': scores})
    submission.to_csv(out_path, index=False)

# Наблюдения

Метрика на сайте — concordance index (в скобках указан результат каждого эксперимента). Показатель базовой простой модели — 0.7671:
- отсечение неважных признаков не привело к улучшению качества модели (0.7547)
- сокращение iterations до числа, когда срабатывает early_stopping, не привело к улучшению качества (однако при этом catboost сам изменил learning_rate с 0.087 до 0.14) (0.7621)
- увеличение iterations до 2000 привело к улучшению (0.7696)