# ДЗ 5 Гладышев В.В.

## Базовая модель оттока

Набор данных с платформы kaggle https://www.kaggle.com/adammaus/predicting-churn-for-bank-customers по оттоку клиентов банка

In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
#from sklearn.feature_extraction.text import TfidfVectorizer
import itertools

import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
df = pd.read_csv("churn_data.csv")
df.head(3)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1


In [3]:
#разделим данные на train/test
X_train, X_test, y_train, y_test = train_test_split(df, df['Exited'], random_state=0)

In [4]:
#соберем наш простой pipeline, но нам понадобится написать класс для выбора нужного поля
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        for col_ in self.columns:
            if col_ not in test_columns:
                X[col_] = 0
        return X[self.columns]

In [5]:
df.head(3)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1


In [6]:
categorical_columns = ['Geography', 'Gender', 'Tenure', 'HasCrCard', 'IsActiveMember']
continuous_columns = ['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'EstimatedSalary']

In [7]:
final_transformers = list()

for cat_col in categorical_columns:
    cat_transformer = Pipeline([
                ('selector', FeatureSelector(column=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    final_transformers.append((cat_col, cat_transformer))
    
for cont_col in continuous_columns:
    cont_transformer = Pipeline([
                ('selector', NumberSelector(key=cont_col))
            ])
    final_transformers.append((cont_col, cont_transformer))

In [8]:
feats = FeatureUnion(final_transformers)

feature_processing = Pipeline([('feats', feats)])

In [9]:
from sklearn.ensemble import RandomForestClassifier

pipeline = Pipeline([
    ('features',feats),
    ('classifier', RandomForestClassifier(random_state = 42)),
])

In [10]:
#обучим наш пайплайн
pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Geography',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Geography')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Geography'))])),
                                                ('Gender',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Gender')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Gender'))])),
                                                ('Tenure',
                                                 Pipeline(steps=[('selector',
           

In [11]:
#наши прогнозы для тестовой выборки
preds = pipeline.predict_proba(X_test)[:, 1]
preds[:10]

array([0.37, 0.26, 0.16, 0.02, 0.02, 0.67, 0.04, 0.12, 0.15, 0.75])

Также нам нужно от вероятностей перейти к меткам классов. Для этого нужно подобрать порог, после которого мы считаем, что объект можно отнести к классу 1 (если вероятность больше порога - размечаем объект как класс 1, если нет - класс 0)

In [12]:
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix

In [13]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.380000, F-Score=0.641, Precision=0.653, Recall=0.629


### Таблица для сохранения результатов

In [14]:
rw_clissifiers = ['RandomForestClassifier', 
                  'RandomForestClassifier_GridSearch',
                  'RandomForestClassifier_FeatEng', 
                  'XGBClassifier_FeatEng', 
                  'GradientBoostingClassifier_FeatEng']

cl_met = ['Best Threshold', 'F-Score', 'Precision', 
          'Recall', 'roc_auc_s', 'log_loss_s', 'TPR', 'FPR', 'TNR', "TN", "FN", "TP", "FP"]

res_tab = pd.DataFrame(columns=cl_met)

In [15]:
from sklearn.metrics import roc_auc_score, log_loss

r_auc = roc_auc_score(y_true=y_test, y_score=preds)
l_los = log_loss(y_true=y_test, y_pred=preds)

print("roc auc score: {}".format(r_auc))
print("log loss score: {}".format(l_los))

roc auc score: 0.8635016710758334
log loss score: 0.36295529263621457


In [16]:
cnf_matrix = confusion_matrix(y_test, preds>thresholds[ix])

TN = cnf_matrix[0][0]
FN = cnf_matrix[1][0]
TP = cnf_matrix[1][1]
FP = cnf_matrix[0][1]

TPR = TP/(TP+FN)
FPR = FP/(FP+TN)
TNR = TN/(FP+TN)
TPR, FPR, TNR

(0.6168958742632613, 0.07985936715218483, 0.9201406328478152)

In [17]:
res_tab.loc['RandomForestClassifier', :] = [thresholds[ix], 
                                            fscore[ix], 
                                            precision[ix], 
                                            recall[ix], 
                                            r_auc, l_los, 
                                            TPR, FPR, TNR,
                                            TN, FN, TP, FP]

In [18]:
res_tab

Unnamed: 0,Best Threshold,F-Score,Precision,Recall,roc_auc_s,log_loss_s,TPR,FPR,TNR,TN,FN,TP,FP
RandomForestClassifier,0.38,0.640641,0.653061,0.628684,0.863502,0.362955,0.616896,0.0798594,0.920141,1832,195,314,159


### Пример с перебором параметров с помощью GridSearch

Сетка с параметрами

In [19]:
from sklearn.model_selection import GridSearchCV

params={'classifier__max_features':[0.3, 0.5],
        'classifier__min_samples_leaf':[3],
        'classifier__max_depth':[None]
        }

In [20]:
grid = GridSearchCV(pipeline,
                    param_grid=params,
                    cv=6,
                    refit=False)

search = grid.fit(X_train, y_train)
search.best_params_

{'classifier__max_depth': None,
 'classifier__max_features': 0.5,
 'classifier__min_samples_leaf': 3}

Обучаем модель уже сновыми параметрами

In [21]:
pipeline = Pipeline([
    ('features',feats),
    ('classifier', RandomForestClassifier(max_depth=None, max_features=0.5, 
                                          min_samples_leaf=3, random_state=42)),
])
pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Geography',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Geography')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Geography'))])),
                                                ('Gender',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Gender')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Gender'))])),
                                                ('Tenure',
                                                 Pipeline(steps=[('selector',
           

In [22]:
preds = pipeline.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, preds)
fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.389431, F-Score=0.653, Precision=0.661, Recall=0.646


In [23]:
r_auc = roc_auc_score(y_true=y_test, y_score=preds)
l_los = log_loss(y_true=y_test, y_pred=preds)

print("roc auc score: {}".format(r_auc))
print("log loss score: {}".format(l_los))

roc auc score: 0.8695105380893786
log loss score: 0.33158589956715095


In [24]:
cnf_matrix = confusion_matrix(y_test, preds>thresholds[ix])

TN = cnf_matrix[0][0]
FN = cnf_matrix[1][0]
TP = cnf_matrix[1][1]
FP = cnf_matrix[0][1]

TPR = TP/(TP+FN)
FPR = FP/(FP+TN)
TNR = TN/(FP+TN)
TPR, FPR, TNR

(0.6444007858546169, 0.08488196885986941, 0.9151180311401306)

In [25]:
res_tab.loc['RandomForestClassifier_GridSearch', :] = [thresholds[ix], 
                                                       fscore[ix], 
                                                       precision[ix], 
                                                       recall[ix], 
                                                       r_auc, l_los, 
                                                       TPR, FPR, TNR,
                                                       TN, FN, TP, FP]

In [26]:
res_tab

Unnamed: 0,Best Threshold,F-Score,Precision,Recall,roc_auc_s,log_loss_s,TPR,FPR,TNR,TN,FN,TP,FP
RandomForestClassifier,0.38,0.640641,0.653061,0.628684,0.863502,0.362955,0.616896,0.0798594,0.920141,1832,195,314,159
RandomForestClassifier_GridSearch,0.389431,0.653426,0.660643,0.646365,0.869511,0.331586,0.644401,0.084882,0.915118,1822,181,328,169


## Добавим feature engineering

In [27]:
continuous_columns

['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'EstimatedSalary']

In [28]:
# генерируем признаки
def transform_data(data):
    for i1, col1 in enumerate(continuous_columns):
        data[col1 + '_log'] = np.log(data[col1] + 1.1)

        for i2, col2 in enumerate(continuous_columns):
            data['%s_%s_1' % (col1, col2)] = data[col1] - data[col2]
            data['%s_%s_2' % (col1, col2)] = data[col1] + data[col2]
            data['%s_%s_3' % (col1, col2)] = data[col1] / (data[col2] + 0.1)
            data['%s_%s_4' % (col1, col2)] = data[col1] * data[col2]

            data['%s_%s_11' % (col1, col2)] = data[col1] - np.log(data[col2] + 1)
            data['%s_%s_22' % (col1, col2)] = data[col1] + np.log(data[col2] + 1)
            data['%s_%s_33' % (col1, col2)] = data[col1] / (np.log(data[col2] + 1) + 0.1)
            data['%s_%s_44' % (col1, col2)] = data[col1] * np.log(data[col2] + 1)

    return data


X_train_tr = transform_data(X_train)
X_test_tr = transform_data(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[col1 + '_log'] = np.log(data[col1] + 1.1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['%s_%s_1' % (col1, col2)] = data[col1] - data[col2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['%s_%s_2' % (col1, col2)] = data[col1] + data[col2]
A value is trying to be set on a copy of a s

In [29]:
X_train_tr.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,...,EstimatedSalary_NumOfProducts_33,EstimatedSalary_NumOfProducts_44,EstimatedSalary_EstimatedSalary_1,EstimatedSalary_EstimatedSalary_2,EstimatedSalary_EstimatedSalary_3,EstimatedSalary_EstimatedSalary_4,EstimatedSalary_EstimatedSalary_11,EstimatedSalary_EstimatedSalary_22,EstimatedSalary_EstimatedSalary_33,EstimatedSalary_EstimatedSalary_44
2967,2968,15595324,Daniels,579,Germany,Female,39,5,117833.3,3,...,3923.17979,8083.48242,0.0,11662.0,0.999983,34000560.0,5822.328885,5839.671115,664.795732,50561.27
700,701,15803457,Hao,750,France,Female,32,5,0.0,2,...,79768.471343,105039.93588,0.0,191222.94,0.999999,9141553000.0,95600.001941,95622.938059,8265.12675,1096478.0
3481,3482,15644686,Kennedy,729,Spain,Female,34,9,53299.96,2,...,35754.655951,47082.095285,0.0,85711.94,0.999998,1836634000.0,42845.304376,42866.635624,3980.816319,457085.6
1621,1622,15777797,Kovalyova,689,Spain,Male,38,5,75075.14,1,...,10908.341115,5997.053954,0.0,17303.84,0.999988,74855720.0,8642.854348,8660.985652,943.950293,78435.3
800,801,15747542,Perez,605,France,Male,52,7,0.0,2,...,145128.246761,191106.354145,0.0,347905.0,0.999999,30259470000.0,173940.433457,173964.566543,14297.610726,2099005.0


In [30]:
len(X_train_tr.columns)

219

In [31]:
cols = list(set(X_train_tr.columns.to_list()) - set(categorical_columns + ['Exited', 'Surname']))
len(cols)

212

In [32]:
final_transformers = list()

for cat_col in categorical_columns:
    cat_transformer = Pipeline([
                ('selector', FeatureSelector(column=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    final_transformers.append((cat_col, cat_transformer))
    
for cont_col in cols:
    cont_transformer = Pipeline([
                ('selector', NumberSelector(key=cont_col))
            ])
    final_transformers.append((cont_col, cont_transformer))

In [33]:
feats = FeatureUnion(final_transformers)

feature_processing = Pipeline([('feats', feats)])

In [34]:
pipeline = Pipeline([
    ('features',feats),
    ('classifier', RandomForestClassifier(random_state = 42)),
])

In [35]:
#обучим наш пайплайн
pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Geography',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Geography')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Geography'))])),
                                                ('Gender',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Gender')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Gender'))])),
                                                ('Tenure',
                                                 Pipeline(steps=[('selector',
           

In [36]:
#наши прогнозы для тестовой выборки
preds = pipeline.predict_proba(X_test)[:, 1]
preds[:10]

array([0.27, 0.21, 0.21, 0.08, 0.12, 0.62, 0.04, 0.04, 0.3 , 0.73])

In [37]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.320000, F-Score=0.603, Precision=0.547, Recall=0.672


In [38]:
r_auc = roc_auc_score(y_true=y_test, y_score=preds)
l_los = log_loss(y_true=y_test, y_pred=preds)

print("roc auc score: {}".format(r_auc))
print("log loss score: {}".format(l_los))

roc auc score: 0.8509589814282148
log loss score: 0.36640188264307444


In [39]:
cnf_matrix = confusion_matrix(y_test, preds>thresholds[ix])

TN = cnf_matrix[0][0]
FN = cnf_matrix[1][0]
TP = cnf_matrix[1][1]
FP = cnf_matrix[0][1]

TPR = TP/(TP+FN)
FPR = FP/(FP+TN)
TNR = TN/(FP+TN)
TPR, FPR, TNR

(0.6542239685658153, 0.1366147664490206, 0.8633852335509794)

In [40]:
res_tab.loc['RandomForestClassifier_FeatEng', :] = [thresholds[ix], 
                                                    fscore[ix], 
                                                    precision[ix], 
                                                    recall[ix], 
                                                    r_auc, l_los, 
                                                    TPR, FPR, TNR,
                                                    TN, FN, TP, FP]

## Построим бустинговые модели

In [41]:
# выборки для разных моделей
from sklearn.preprocessing import MinMaxScaler

X_train_gb = X_train[cols].values

scaler_reg = MinMaxScaler((-1, 1))
scaler_reg.fit(np.vstack((X_train_tr[cols], X_test_tr[cols])))
X_train_reg = scaler_reg.transform(X_train_tr[cols])
X_test_reg = scaler_reg.transform(X_test_tr[cols])

In [42]:
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier

params = {
    'silent': 1,
    'objective': 'binary:logistic',
    'max_depth': 4,
    'eta': 0.01,
    'subsample': 0.4,
    'min_child_weight': 7,
    'n': 580,
    'verbose': 1
}

dtrain = xgb.DMatrix(X_train_tr[cols], label=y_train, missing=np.NaN)


bst1 = xgb.XGBClassifier(boosting_type='gbdt', **params)
bst1.fit(X_train_tr[cols], y_train)
# ------------------------------------------------------------------
params_est = {
    'n_estimators': 300,
    'loss': 'exponential',
    'learning_rate': 0.08,
    'subsample': 0.6910000000000001,
    'min_samples_leaf': 340,
    'max_features': 53,
    'random_state': 1,
    'verbose': 1
}
bst2 = GradientBoostingClassifier(**params_est)
bst2.fit(X_train_tr[cols], y_train)



Parameters: { "boosting_type", "n", "silent", "verbose" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.7834           0.0132           32.29s
         2           0.7769           0.0132           34.27s
         3           0.7641           0.0119           32.18s
         4           0.7501           0.0102           32.78s
         5           0.7414           0.0101           33.10s
         6           0.7334           0.0072           32.93s
         7           0.7285           0.0066           32.52s
         8           0.7148           0.0068           33.03s
         9           0.7180           0.0051           32.56s
        10           0.7040           0.0066           31.84s


GradientBoostingClassifier(learning_rate=0.08, loss='exponential',
                           max_features=53, min_samples_leaf=340,
                           n_estimators=300, random_state=1,
                           subsample=0.6910000000000001, verbose=1)

In [43]:
preds1 = bst1.predict_proba(X_test_tr[cols])[:, 1]
preds2 = bst2.predict_proba(X_test_tr[cols].values)[:,1]



In [44]:
def get_metrics(probs):
    precision, recall, thresholds = precision_recall_curve(y_test, probs)

    fscore = (2 * precision * recall) / (precision + recall)
    # locate the index of the largest f score
    ix = np.argmax(fscore)
    print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f, Roc-AUC=%.3f' % (thresholds[ix], 
                                                                            fscore[ix],
                                                                            precision[ix],
                                                                            recall[ix],
                                                                            roc_auc_score(y_test, probs)))
    return thresholds[ix], fscore[ix], precision[ix], recall[ix], roc_auc_score(y_test, probs)

In [45]:
# XGBClassifier
xgb_thresholds, xgb_fscore, xgb_precision, xgb_recall, xgb_roc_auc_score = get_metrics(preds1)

Best Threshold=0.387431, F-Score=0.600, Precision=0.563, Recall=0.642, Roc-AUC=0.837


In [46]:
l_los = log_loss(y_true=y_test, y_pred=preds1)
print("log loss score: {}".format(l_los))

log loss score: 0.44430235064029694


In [47]:
cnf_matrix = confusion_matrix(y_test, preds1>xgb_thresholds)

TN = cnf_matrix[0][0]
FN = cnf_matrix[1][0]
TP = cnf_matrix[1][1]
FP = cnf_matrix[0][1]

TPR = TP/(TP+FN)
FPR = FP/(FP+TN)
TNR = TN/(FP+TN)
TPR, FPR, TNR

(0.6404715127701375, 0.12757408337518836, 0.8724259166248116)

In [48]:
res_tab.loc['XGBClassifier_FeatEng', :] = [xgb_thresholds, 
                                           xgb_fscore, 
                                           xgb_precision, 
                                           xgb_recall, 
                                           xgb_roc_auc_score, l_los, 
                                           TPR, FPR, TNR,
                                           TN, FN, TP, FP]

In [49]:
# GradientBoostingClassifier
sklgb_thresholds, sklgb_fscore, sklgb_precision, sklgb_recall, sklgb_roc_auc_score = get_metrics(preds2)

Best Threshold=0.318901, F-Score=0.581, Precision=0.554, Recall=0.611, Roc-AUC=0.831


In [50]:
l_los = log_loss(y_true=y_test, y_pred=preds2)
print("log loss score: {}".format(l_los))

log loss score: 0.37551919957998015


In [51]:
cnf_matrix = confusion_matrix(y_test, preds2>sklgb_thresholds)

TN = cnf_matrix[0][0]
FN = cnf_matrix[1][0]
TP = cnf_matrix[1][1]
FP = cnf_matrix[0][1]

TPR = TP/(TP+FN)
FPR = FP/(FP+TN)
TNR = TN/(FP+TN)
TPR, FPR, TNR

(0.6090373280943026, 0.12556504269211452, 0.8744349573078855)

In [52]:
res_tab.loc['GradientBoostingClassifier_FeatEng', :] = [sklgb_thresholds, 
                                                        sklgb_fscore, 
                                                        sklgb_precision, 
                                                        sklgb_recall, 
                                                        sklgb_roc_auc_score, l_los, 
                                                        TPR, FPR, TNR,
                                                        TN, FN, TP, FP]

## Итоговая таблица

In [53]:
res_tab

Unnamed: 0,Best Threshold,F-Score,Precision,Recall,roc_auc_s,log_loss_s,TPR,FPR,TNR,TN,FN,TP,FP
RandomForestClassifier,0.38,0.640641,0.653061,0.628684,0.863502,0.362955,0.616896,0.0798594,0.920141,1832,195,314,159
RandomForestClassifier_GridSearch,0.389431,0.653426,0.660643,0.646365,0.869511,0.331586,0.644401,0.084882,0.915118,1822,181,328,169
RandomForestClassifier_FeatEng,0.32,0.603175,0.5472,0.671906,0.850959,0.366402,0.654224,0.136615,0.863385,1719,176,333,272
XGBClassifier_FeatEng,0.387431,0.6,0.562823,0.642436,0.836658,0.444302,0.640472,0.127574,0.872426,1737,183,326,254
GradientBoostingClassifier_FeatEng,0.318901,0.581308,0.554367,0.611002,0.831472,0.375519,0.609037,0.125565,0.874435,1741,199,310,250


### Выберем основной метрикой ROC-AUC

In [54]:
res_tab.sort_values('roc_auc_s', ascending=False)

Unnamed: 0,Best Threshold,F-Score,Precision,Recall,roc_auc_s,log_loss_s,TPR,FPR,TNR,TN,FN,TP,FP
RandomForestClassifier_GridSearch,0.389431,0.653426,0.660643,0.646365,0.869511,0.331586,0.644401,0.084882,0.915118,1822,181,328,169
RandomForestClassifier,0.38,0.640641,0.653061,0.628684,0.863502,0.362955,0.616896,0.0798594,0.920141,1832,195,314,159
RandomForestClassifier_FeatEng,0.32,0.603175,0.5472,0.671906,0.850959,0.366402,0.654224,0.136615,0.863385,1719,176,333,272
XGBClassifier_FeatEng,0.387431,0.6,0.562823,0.642436,0.836658,0.444302,0.640472,0.127574,0.872426,1737,183,326,254
GradientBoostingClassifier_FeatEng,0.318901,0.581308,0.554367,0.611002,0.831472,0.375519,0.609037,0.125565,0.874435,1741,199,310,250
