In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

from catboost import CatBoostClassifier, Pool, cv, to_classifier

In [2]:
# Load the preprocessed dataset from a zip-compressed CSV, forcing two columns
# to categorical dtype and one to boolean.
full_df = pd.read_csv(
    'datasets/preprocessed_v0.1.csv',
    compression='zip',
    dtype={
        'employee_count_nm': 'category',
        'bankemplstatus': 'bool',
        'customer_age': 'category',
    },
)

In [3]:
# Summarise the loaded frame: index range, column count, dtypes, memory usage.
full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96000 entries, 0 to 95999
Columns: 487 entries, user_id to time
dtypes: bool(1), category(2), float64(475), int64(8), object(1)
memory usage: 354.8+ MB


In [4]:
# Rows with a known target form the labelled set; rows whose target is missing
# are the ones that get scored in the prediction step.
has_target = full_df['target'].notna()
train = full_df[has_target]
pred = full_df[~has_target]

In [5]:
# Columns passed to CatBoost as categorical features.
cat_features = [
    'employee_count_nm',
    'customer_age',
    'bankemplstatus',
    'report',
]

# Columns removed before building the feature matrices.
cols_to_drop = [
    'user_id',
    'target',
    'time',
]

In [6]:
# Label (as a compact integer) and feature matrix for the labelled rows.
y = train['target'].astype('int8')
X = train.drop(columns=cols_to_drop)

# Same feature columns for the rows that still need a prediction.
X_pred = pred.drop(columns=cols_to_drop)

In [7]:
# Names of the int64/float64 feature columns.
# NOTE(review): `num_cols` is not referenced again in the cells visible here.
num_cols = X.select_dtypes(include=['int64', 'float64']).columns

# Train/test split

In [8]:
class GroupFeatures(BaseEstimator, TransformerMixin):
    """Add per-group target statistics for fixed pairs of key columns.

    ``fit`` groups the labelled rows by every pair in ``feature_groups`` and
    stores the target aggregates listed in ``tendencies`` in ``self.tables``.
    ``transform`` left-joins those tables onto ``X``, adding one column named
    ``group_<feature_1>_<feature_2>_<tendency>`` per statistic.

    NOTE(review): the statistics are built from the target of the rows given
    to ``fit``. Transforming those same rows hands every row an aggregate that
    includes its own label; consider out-of-fold statistics if the result is
    used for model evaluation.
    """

    def __init__(self):
        # Pairs of columns whose joint value defines a group.
        self.feature_groups = [
            ('report', 'customer_age'),
            ('report', 'employee_count_nm'),
            ('customer_age', 'employee_count_nm')
        ]

        # Aggregations of the target computed inside every group.
        self.tendencies = ['count', 'sum', 'mean', 'std']

    def fit(self, X, y):
        """Compute the per-group target statistics.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every column named in ``feature_groups``.
        y : pandas.Series or array-like
            Target values. A Series is aligned to ``X`` by index label (its
            name does not matter); any other array-like is matched to the
            rows of ``X`` by position.

        Returns
        -------
        self
        """
        if isinstance(y, pd.Series):
            # Label alignment, skipped when the indexes already agree so that
            # frames with repeated labels keep working.
            target = y if y.index.equals(X.index) else y.reindex(X.index)
        else:
            target = pd.Series(y, index=X.index)

        # Only the key columns are needed for grouping; copying the whole
        # (wide) feature frame would be wasted work.
        key_columns = list(dict.fromkeys(
            feature for pair in self.feature_groups for feature in pair
        ))
        labelled = X[key_columns].copy()
        labelled['target'] = target.to_numpy()
        labelled = labelled[labelled['target'].notna()]

        self.tables = {}
        for feature_1, feature_2 in self.feature_groups:
            stats = (
                labelled
                .groupby([feature_1, feature_2], observed=True)['target']
                .agg(self.tendencies)
            )
            stats.columns = [
                f'group_{feature_1}_{feature_2}_{tendency}'
                for tendency in stats.columns
            ]
            self.tables[(feature_1, feature_2)] = stats.reset_index()

        return self

    def transform(self, X, y=None):
        """Append the fitted group statistics to ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every column named in ``feature_groups``.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        pandas.DataFrame
            A new frame with the rows of ``X`` in their original order and
            with their original index, plus the ``group_*`` columns. Key
            combinations that were not seen in ``fit`` get NaN statistics.
            ``X`` itself is not modified.
        """
        merged = X
        for feature_1, feature_2 in self.feature_groups:
            merged = merged.merge(
                self.tables[(feature_1, feature_2)],
                on=[feature_1, feature_2],
                how='left',
                # Each key pair occurs once in the fitted table, so the join
                # can never duplicate rows; fail loudly if that ever changes.
                validate='many_to_one',
            )

        if merged is X:
            # No merge ran: copy so the conversion below leaves X untouched.
            merged = X.copy()
        else:
            # A column-on-column merge discards the left index; restore it so
            # the result stays aligned with X (and with its y).
            merged.index = X.index

        # Any column that ends up with object dtype is converted to numeric.
        # NOTE(review): presumably this targets key columns that come back as
        # object after the merge - confirm against the data.
        for col in merged.select_dtypes(include='object').columns:
            merged[col] = pd.to_numeric(merged[col])

        return merged

In [9]:
# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [10]:
# One CatBoost Pool per data slice, all sharing the same categorical columns.
pool_train, pool_test, pool_full = (
    Pool(features, labels, cat_features=cat_features)
    for features, labels in ((X_train, y_train), (X_test, y_test), (X, y))
)

# Simple CatBoost

## First training

In [11]:
# Baseline CatBoost on all features. The hold-out pool is the evaluation set
# used for AUC tracking and early stopping.
simple_catb = CatBoostClassifier(
    iterations=1500,
    eval_metric='AUC',
    random_state=42,
)

simple_catb.fit(
    pool_train,
    eval_set=pool_test,
    early_stopping_rounds=200,
    verbose=100,
)

Learning rate set to 0.070226
0:	test: 0.6365865	best: 0.6365865 (0)	total: 232ms	remaining: 5m 48s
100:	test: 0.7609077	best: 0.7609077 (100)	total: 7.53s	remaining: 1m 44s
200:	test: 0.7647704	best: 0.7648639 (199)	total: 14.7s	remaining: 1m 34s
300:	test: 0.7658206	best: 0.7660094 (287)	total: 21.7s	remaining: 1m 26s
400:	test: 0.7671353	best: 0.7675044 (385)	total: 28.8s	remaining: 1m 18s
500:	test: 0.7668038	best: 0.7675044 (385)	total: 36s	remaining: 1m 11s
600:	test: 0.7672499	best: 0.7678320 (561)	total: 43.2s	remaining: 1m 4s
700:	test: 0.7667170	best: 0.7678320 (561)	total: 50.9s	remaining: 58s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.7678319519
bestIteration = 561

Shrink model to first 562 iterations.


<catboost.core.CatBoostClassifier at 0x16d567076a0>

## Training on most important features

In [12]:
# Feature importances of the baseline model, most important first.
importances = pd.DataFrame(
    {
        'feature': simple_catb.feature_names_,
        'importance': simple_catb.feature_importances_,
    }
)
importances = importances.sort_values('importance', ascending=False)

In [13]:
# Show the 20 highest-ranked features.
importances.head(20)

Unnamed: 0,feature,importance
1,employee_count_nm,5.820256
3,customer_age,3.226406
163,total_days_to_report_std,2.596091
162,total_days_to_report_min,1.868281
20,total_tran_sum_by_month_std,1.651018
482,neg_days_to_report_min,1.508026
483,neg_days_to_report_std,1.207138
159,total_days_to_report_mean,1.203433
130,total_tran_month_nunique,1.120772
0,report,1.106544


In [14]:
# Keep only the features whose importance exceeds the cut-off, then show how
# many remain.
threshold = 0.3

above_threshold = importances['importance'] > threshold
important_features = importances.loc[above_threshold, 'feature'].values
len(important_features)

100

In [15]:
gf = GroupFeatures()

# Hold-out variant: group statistics are fitted on the training part only and
# then applied to both parts of the split.
X_train_selected = X_train[important_features]
X_train_imp = gf.fit(X_train_selected, y_train).transform(X_train_selected)
X_test_imp = gf.transform(X_test[important_features])

# Full-data variant: refit on every labelled row, then apply to the labelled
# rows and to the rows that need a prediction.
X_selected = X[important_features]
X_imp = gf.fit(X_selected, y).transform(X_selected)
X_pred_imp = gf.transform(X_pred[important_features])

In [16]:
# Categorical columns that survived the importance filter.
cat_features_important = [name for name in cat_features if name in important_features]

# Pools over the reduced, group-augmented feature sets.
pool_train_important, pool_test_important, pool_full_important = (
    Pool(features, labels, cat_features=cat_features_important)
    for features, labels in ((X_train_imp, y_train), (X_test_imp, y_test), (X_imp, y))
)

In [17]:
# Same model settings as the baseline, trained on the reduced feature set
# plus the group statistics.
simple_catb_important = CatBoostClassifier(
    iterations=1500,
    eval_metric='AUC',
    random_state=42,
)

simple_catb_important.fit(
    pool_train_important,
    eval_set=pool_test_important,
    early_stopping_rounds=200,
    verbose=100,
)

Learning rate set to 0.070226
0:	test: 0.6847585	best: 0.6847585 (0)	total: 50.2ms	remaining: 1m 15s
100:	test: 0.7626197	best: 0.7626197 (100)	total: 4.89s	remaining: 1m 7s
200:	test: 0.7663012	best: 0.7663575 (195)	total: 9.66s	remaining: 1m 2s
300:	test: 0.7671627	best: 0.7673055 (299)	total: 14.4s	remaining: 57.5s
400:	test: 0.7669664	best: 0.7677150 (355)	total: 19.1s	remaining: 52.4s
500:	test: 0.7655297	best: 0.7677150 (355)	total: 23.8s	remaining: 47.5s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.7677150049
bestIteration = 355

Shrink model to first 356 iterations.


<catboost.core.CatBoostClassifier at 0x16da680a9b0>

In [18]:
# 10-fold stratified CV on the full labelled pool; the per-fold models are
# kept so they can be averaged at prediction time.
# NOTE(review): the group_* columns of X_imp were fitted on all labelled rows
# (gf.fit(X[important_features], y)), so each validation fold here sees
# statistics computed with its own labels - the CV AUC may be optimistic.
res_important, models_important = cv(
    pool_full_important,
    params={
        'cat_features': cat_features_important,
        'eval_metric': 'AUC',
        'loss_function': 'Logloss',
        'iterations': 1500,
        'random_state': 42,
    },
    fold_count=10,
    stratified=True,
    seed=42,
    early_stopping_rounds=200,
    return_models=True,
    verbose=200,
)

Training on fold [0/10]
0:	test: 0.6719529	best: 0.6719529 (0)	total: 56.5ms	remaining: 1m 24s
200:	test: 0.7684655	best: 0.7684655 (200)	total: 10.6s	remaining: 1m 8s
400:	test: 0.7730898	best: 0.7731081 (399)	total: 21s	remaining: 57.6s
600:	test: 0.7751679	best: 0.7753181 (561)	total: 31.4s	remaining: 47s
800:	test: 0.7761894	best: 0.7761894 (800)	total: 41.9s	remaining: 36.5s
1000:	test: 0.7760367	best: 0.7768098 (895)	total: 52.2s	remaining: 26s

bestTest = 0.776809765
bestIteration = 895

Training on fold [1/10]
0:	test: 0.6792030	best: 0.6792030 (0)	total: 54.4ms	remaining: 1m 21s
200:	test: 0.7481063	best: 0.7482490 (199)	total: 10.9s	remaining: 1m 10s
400:	test: 0.7521484	best: 0.7523663 (350)	total: 21.8s	remaining: 59.9s
600:	test: 0.7529990	best: 0.7533794 (570)	total: 32.6s	remaining: 48.8s
800:	test: 0.7546566	best: 0.7547196 (799)	total: 43.4s	remaining: 37.9s
1000:	test: 0.7557991	best: 0.7561726 (957)	total: 54.3s	remaining: 27.1s

bestTest = 0.7561725771
bestIteration

In [19]:
# Per-iteration CV metrics (mean and std across folds) returned by `cv`.
res_important

Unnamed: 0,iterations,test-AUC-mean,test-AUC-std,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std
0,0,0.676819,0.006611,0.661034,0.000166,0.661010,0.000150
1,1,0.697336,0.005350,0.631374,0.000328,0.631322,0.000329
2,2,0.711190,0.006704,0.603968,0.000420,0.603878,0.000307
3,3,0.713398,0.008483,0.578835,0.000271,0.578710,0.000165
4,4,0.714619,0.009388,0.556060,0.000470,0.555911,0.000558
...,...,...,...,...,...,...,...
1495,1495,0.769828,0.010538,0.245601,0.002991,0.206628,0.007720
1496,1496,0.769820,0.010538,0.245602,0.002990,0.206624,0.007727
1497,1497,0.769827,0.010538,0.245600,0.002992,0.206619,0.007736
1498,1498,0.769830,0.010539,0.245601,0.002991,0.206617,0.007740


threshold | cv test-AUC-mean | cv test-AUC-std | RobustScaler
--- | --- | --- | ---
0 | 0.7592 | 0.006959 | True
0.01 | 0.759484 | 0.008185 | True
0.1 | 0.760877 | 0.007068 | True
0.2 | 0.760827 | 0.007325 | True
0.3 | 0.761103 | 0.006596 | True
0.4 | 0.761035 | 0.006563 | True
0.5 | 0.759618 | 0.007116 | True
0.3 | 0.760266 | 0.007104 | False

## Predicting

In [22]:
# Start from a zero score for every row that needs a prediction.
res = pd.DataFrame({'user_id': pred['user_id'], 'predict': 0})

# Add up the positive-class probability from every CV fold model ...
for model in models_important:
    predictions = to_classifier(model).predict_proba(X_pred_imp)[:, 1]
    res['predict'] = res['predict'] + predictions

# ... and turn the sum into the mean over folds.
res['predict'] = res['predict'] / len(models_important)

# `res` already holds exactly the two output columns, in order.
res.to_csv('catb.csv', index=False)