In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report

In [27]:
import featuretools as ft
from catboost import CatBoostClassifier

## Загрузка данных

In [3]:
# Read the three CSV inputs in one pass; the relative paths are resolved
# against the notebook's current working directory.
data, bureau, bureau_balance = (
    pd.read_csv(csv_path)
    for csv_path in ('application_train.csv', 'bureau.csv', 'bureau_balance.csv')
)

## Инициализация таблиц в featuretools, генерация признаков

In [4]:
# Create an empty featuretools EntitySet with id 'clients'; the dataframes and
# relationships are attached to it in the following cells.
es = ft.EntitySet(id='clients')

In [5]:
# Register the three tables in the EntitySet, in the same order as before.
# 'bureau_balance' is added with make_index=True and a new index name
# ('bureaubalance_index'), i.e. featuretools is asked to create that index column.
dataframe_specs = [
    dict(dataframe_name='app', dataframe=data, index='SK_ID_CURR'),
    dict(dataframe_name='bureau', dataframe=bureau, index='SK_ID_BUREAU'),
    dict(dataframe_name='bureau_balance', dataframe=bureau_balance,
         make_index=True, index='bureaubalance_index'),
]
for spec in dataframe_specs:
    es = es.add_dataframe(**spec)

In [6]:
# Link app.SK_ID_CURR (parent side) to bureau.SK_ID_CURR (child side).
r_app_bureau = ft.Relationship(
    entityset=es,
    parent_dataframe_name='app',
    parent_column_name='SK_ID_CURR',
    child_dataframe_name='bureau',
    child_column_name='SK_ID_CURR',
)

In [7]:
# Link bureau.SK_ID_BUREAU (parent side) to bureau_balance.SK_ID_BUREAU (child side).
r_bureau_balance = ft.Relationship(
    entityset=es,
    parent_dataframe_name='bureau',
    parent_column_name='SK_ID_BUREAU',
    child_dataframe_name='bureau_balance',
    child_column_name='SK_ID_BUREAU',
)

In [8]:
# Attach both relationships (app -> bureau, bureau -> bureau_balance) to the EntitySet.
es = es.add_relationships(relationships=[r_app_bureau, r_bureau_balance])

In [9]:
# Aggregation primitives handed to ft.dfs (as agg_primitives) for feature generation.
agg_func = [
    'sum',
    'std',
    'max',
    'min',
    'mean',
    'count',
    'num_unique',
    'mode',
]

In [10]:
# Feature generation: run Deep Feature Synthesis with 'app' as the target
# dataframe, using the aggregation primitives listed in agg_func.
dfs_kwargs = dict(
    entityset=es,
    target_dataframe_name='app',
    agg_primitives=agg_func,
    max_depth=2,
    verbose=True,
    n_jobs=2,
)
# features_set is used as a DataFrame by the following cells; feature_names
# receives the second value returned by ft.dfs (the feature definitions,
# per the featuretools docs).
features_set, feature_names = ft.dfs(**dfs_kwargs)

Built 294 features
EntitySet scattered to 2 workers in 32 seconds
Elapsed: 03:35 | Progress: 100%|██████████


## Разбиение на тренировочный и тестовый набор, отбор признаков 

In [11]:
# Separate the TARGET column (labels) from the remaining feature columns.
y = features_set['TARGET']
X = features_set.drop(columns=['TARGET'])

In [13]:
# Columns with pandas 'category' dtype; this list is later passed to CatBoost
# as cat_features.
cat_features = X.columns[X.dtypes == 'category'].to_list()
# Label missing categories as 'None' BEFORE converting to str. The previous
# order (astype(str) first, fillna second) rendered missing values as the
# literal text 'nan', leaving fillna('None') with nothing to fill. Going through
# object dtype also sidesteps the Categorical restriction on filling with a
# value that is not an existing category.
X[cat_features] = X[cat_features].astype(object).fillna('None').astype(str)
# NOTE(review): presumably normalises pd.NA left in the remaining columns to
# None so that CatBoost accepts them — confirm against the pandas version in use.
X = X.replace({pd.NA: None})
# Restore the categorical dtype now that these columns hold only strings.
X[cat_features] = X[cat_features].astype('category')

In [14]:
# Hold out 30% of the rows as a test set; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=21,
)

In [18]:
# CatBoost classifier configuration; the model is fitted later by
# select_features, not in this cell.
cbc_params = {
    'iterations': 150,
    'cat_features': cat_features,
    'eval_metric': 'AUC',  # target metric
    'task_type': 'CPU',
    'verbose': False,
}
cbc = CatBoostClassifier(**cbc_params)

In [19]:
# Feature selection: reduce all columns of X to 25 features on the training split.
selection_kwargs = dict(
    features_for_select=X.columns,
    num_features_to_select=25,  # keep 25 features
    algorithm='RecursiveByPredictionValuesChange',
    verbose=False,
    # Requests a final model trained on the selected features; the later cells
    # call cbc.predict without any separate fit.
    train_final_model=True,
)
res = cbc.select_features(X_train, y_train, **selection_kwargs)

Learning rate set to 0.5
Step #1 out of 1
Feature #232 eliminated
Feature #228 eliminated
Feature #222 eliminated
Feature #3 eliminated
Feature #96 eliminated
Feature #218 eliminated
Feature #288 eliminated
Feature #287 eliminated
Feature #286 eliminated
Feature #264 eliminated
Feature #215 eliminated
Feature #284 eliminated
Feature #99 eliminated
Feature #13 eliminated
Feature #209 eliminated
Feature #207 eliminated
Feature #233 eliminated
Feature #201 eliminated
Feature #200 eliminated
Feature #199 eliminated
Feature #20 eliminated
Feature #198 eliminated
Feature #276 eliminated
Feature #23 eliminated
Feature #197 eliminated
Feature #25 eliminated
Feature #275 eliminated
Feature #195 eliminated
Feature #194 eliminated
Feature #191 eliminated
Feature #187 eliminated
Feature #234 eliminated
Feature #32 eliminated
Feature #102 eliminated
Feature #34 eliminated
Feature #104 eliminated
Feature #266 eliminated
Feature #170 eliminated
Feature #169 eliminated
Feature #271 eliminated
Feature 

In [26]:
# Names of the features kept by the selection step, read from the
# 'selected_features_names' entry of the select_features result.
selected_features = res['selected_features_names']
# Bare last expression so the notebook displays the list.
selected_features

['CODE_GENDER',
 'AMT_CREDIT',
 'AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'NAME_EDUCATION_TYPE',
 'REGION_POPULATION_RELATIVE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_ID_PUBLISH',
 'OWN_CAR_AGE',
 'REGION_RATING_CLIENT_W_CITY',
 'ORGANIZATION_TYPE',
 'EXT_SOURCE_1',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'DAYS_LAST_PHONE_CHANGE',
 'MAX(bureau.AMT_CREDIT_MAX_OVERDUE)',
 'MAX(bureau.AMT_CREDIT_SUM)',
 'MAX(bureau.DAYS_CREDIT)',
 'MAX(bureau.DAYS_CREDIT_ENDDATE)',
 'MEAN(bureau.AMT_CREDIT_SUM)',
 'MEAN(bureau.AMT_CREDIT_SUM_DEBT)',
 'MEAN(bureau.DAYS_CREDIT)',
 'STD(bureau.DAYS_CREDIT_UPDATE)',
 'SUM(bureau.DAYS_CREDIT_ENDDATE)']

In [22]:
# Evaluate the fitted model on the training split.
y_proba = cbc.predict_proba(X_train)[:, 1]  # column 1 of predict_proba, used as the score for ROC AUC
y_pred = cbc.predict(X_train)
print('Train set: ')
train_auc = roc_auc_score(y_train, y_proba)
print(f'ROC_AUC {train_auc}')
print(classification_report(y_train, y_pred))

Train set: 
ROC_AUC 0.8138634707691201
              precision    recall  f1-score   support

           0       0.93      1.00      0.96    197860
           1       0.81      0.08      0.15     17397

    accuracy                           0.92    215257
   macro avg       0.87      0.54      0.56    215257
weighted avg       0.92      0.92      0.89    215257



In [24]:
# Evaluate the fitted model on the held-out test split.
y_pred = cbc.predict(X_test)
y_proba = cbc.predict_proba(X_test)[:, 1]  # column 1 of predict_proba, used as the score for ROC AUC
# Header corrected: this cell scores X_test / y_test, but it used to print
# 'Train set: ', mislabelling the test metrics as training metrics.
print('Test set: ')
print(f'ROC_AUC {roc_auc_score(y_test, y_proba)}')
print(classification_report(y_test, y_pred))

Train set: 
ROC_AUC 0.75258946087249
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     84826
           1       0.45      0.04      0.08      7428

    accuracy                           0.92     92254
   macro avg       0.68      0.52      0.52     92254
weighted avg       0.88      0.92      0.89     92254



## Вывод

Из полученных метрик виден большой дисбаланс классов, вследствие которого метрики на тестовой выборке просели (ROC AUC: 0.81 → 0.75; recall для класса 1: 0.08 → 0.04). Возможно, следует обратить внимание на балансировку классов.