In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
import itertools
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
df = pd.read_csv("churn_data.csv")
df.head(3)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1


In [3]:
X_train, X_test, y_train, y_test = train_test_split(df, df['Exited'], random_state=0)

In [4]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]
    

class NumberSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        
        for col_ in self.columns:
            if col_ not in test_columns:
                X[col_] = 0
        return X[self.columns]

In [5]:
categorical_columns = ['Geography', 'Gender', 'Tenure', 'HasCrCard', 'IsActiveMember']
continuous_columns = ['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'EstimatedSalary']

In [6]:
final_transformers = list()

for cat_col in categorical_columns:
    cat_transformer = Pipeline([
                ('selector', FeatureSelector(column=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    
    final_transformers.append((cat_col, cat_transformer))
    
for cont_col in continuous_columns:
    cont_transformer = Pipeline([
                ('selector', NumberSelector(key=cont_col))
            ])
    
    final_transformers.append((cont_col, cont_transformer))

In [7]:
feats = FeatureUnion(final_transformers)
feature_processing = Pipeline([('feats', feats)])

### №1 Случайные лес:

In [8]:
pipeline = Pipeline([
    ('features', feats),
    ('classifier', RandomForestClassifier(random_state=42)),
])

In [9]:
pipeline.fit(X_train, y_train)

In [10]:
preds = pipeline.predict_proba(X_test)[:, 1]

In [11]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

fscore = (2 * precision * recall) / (precision + recall)

ix = np.argmax(fscore)
RF_rez = (f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

cnf_matrix_RF = confusion_matrix(y_test, preds > thresholds[ix])

Best Threshold=0.38, F-Score=0.641, Precision=0.653, Recall=0.629


### №2 Градиентный бустинг:

In [12]:
pipeline = Pipeline([
    ('features', feats),
    ('classifier', XGBClassifier(random_state=42)),
])

In [13]:
pipeline.fit(X_train, y_train)

In [14]:
preds = pipeline.predict_proba(X_test)[:, 1]

In [15]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

fscore = (2 * precision * recall) / (precision + recall)
ix = np.argmax(fscore)
XB_rez = (f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

cnf_matrix_XB = confusion_matrix(y_test, preds > thresholds[ix])

### №3 Логистическая регрессия:

In [16]:
pipeline = Pipeline([
    ('features', feats),
    ('classifier', LogisticRegression(random_state=42)),
])

In [17]:
pipeline.fit(X_train, y_train)

In [18]:
preds = pipeline.predict_proba(X_test)[:, 1]

In [19]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

fscore = (2 * precision * recall) / (precision + recall)
ix = np.argmax(fscore)
LR_rez = (f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

cnf_matrix_LR = confusion_matrix(y_test, preds > thresholds[ix])

### Результаты работы трех моделей:

In [20]:
r_s = (cnf_matrix_RF[0][1] + cnf_matrix_RF[1][1]) * 1
inc = cnf_matrix_RF[1][1] * 2
econom_RF = inc - r_s
r_s = (cnf_matrix_XB[0][1] + cnf_matrix_XB[1][1]) * 1
inc = cnf_matrix_XB[1][1] * 2
econom_XB = inc - r_s
r_s = (cnf_matrix_LR[0][1] + cnf_matrix_LR[1][1]) * 1
inc = cnf_matrix_LR[1][1] * 2
econom_LR = inc - r_s

print("Confusion matrix RandomForestClassifier")
print(cnf_matrix_RF)
print(RF_rez)
print(f'Экономическая выгода: {econom_RF}')
print('*'*40)
print("Confusion matrix XGBClassifier")
print(cnf_matrix_XB)
print(XB_rez)
print(f'Экономическая выгода: {econom_XB}')
print('*'*40)
print("Confusion matrix LogisticRegression")
print(cnf_matrix_LR)
print(LR_rez)
print(f'Экономическая выгода: {econom_LR}')

Confusion matrix RandomForestClassifier
[[1832  159]
 [ 195  314]]
Best Threshold=0.38, F-Score=0.641, Precision=0.653, Recall=0.629
Экономическая выгода: 155
****************************************
Confusion matrix XGBClassifier
[[1793  198]
 [ 188  321]]
Best Threshold=0.37331774830818176, F-Score=0.626, Precision=0.619, Recall=0.633
Экономическая выгода: 123
****************************************
Confusion matrix LogisticRegression
[[1990    1]
 [ 509    0]]
Best Threshold=0.7506003062811972, F-Score=nan, Precision=0.000, Recall=0.000
Экономическая выгода: -1


### По данным видно, что RandomForest по прежнему показывает самые лучшие результаты, XGBClassifier немного от него отстает, а LogisticRegression показал совсем плохие результаты, не распознав ни одного TrueNegative результата.

### Начинаем подбор параметров XGBClassifier с GridSearch:

In [21]:
from sklearn.model_selection import GridSearchCV

params = {
    'classifier__learning_rate':[0.1, 0.5, 0.7],
    'classifier__n_estimators':[500, 1000, 1500],
    'classifier__max_depth':[1,3,7]
}

In [22]:
%%time

pipeline = Pipeline([
    ('features', feats),
    ('classifier', XGBClassifier(random_state=42)),
])

grid = GridSearchCV(pipeline,
                    param_grid=params,
                    cv=6,
                    refit=False)

search = grid.fit(X_train, y_train)
search.best_params_

CPU times: total: 1h 5min 37s
Wall time: 8min 20s


{'classifier__learning_rate': 0.1,
 'classifier__max_depth': 3,
 'classifier__n_estimators': 500}

In [23]:
pipeline = Pipeline([
    ('features', feats),
    ('classifier', XGBClassifier(learning_rate =0.1,max_depth = 3, n_estimators = 500, random_state=42)),
])

In [24]:
pipeline.fit(X_train, y_train)

In [25]:
preds = pipeline.predict_proba(X_test)[:, 1]

In [26]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

fscore = (2 * precision * recall) / (precision + recall)
ix = np.argmax(fscore)
XB_rez_2 = (f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

cnf_matrix_XB_2 = confusion_matrix(y_test, preds > thresholds[ix])

In [27]:
r_s = (cnf_matrix_XB_2[0][1] + cnf_matrix_XB_2[1][1]) * 1
inc = cnf_matrix_XB_2[1][1] * 2
econom_XB_2 = inc - r_s

print('Бустинг со стандартными параметрами:')
print(cnf_matrix_XB)
print(XB_rez)
print(f'Экономическая выгода: {econom_XB}')
print('*'*40)
print('Бустинг с подобранными параметрами:')
print(cnf_matrix_XB_2)
print(XB_rez_2)
print(f'Экономическая выгода: {econom_XB_2}')

Бустинг со стандартными параметрами:
[[1793  198]
 [ 188  321]]
Best Threshold=0.37331774830818176, F-Score=0.626, Precision=0.619, Recall=0.633
Экономическая выгода: 123
****************************************
Бустинг с подобранными параметрами:
[[1814  177]
 [ 190  319]]
Best Threshold=0.3787517845630646, F-Score=0.636, Precision=0.644, Recall=0.629
Экономическая выгода: 142


### Как видно из показателей, подбор параметров неплохо улучшил экономическую выгоду.