In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from catboost import CatBoostClassifier
#from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
#from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

import itertools

import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
import warnings

warnings.filterwarnings("ignore")

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
#from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix

In [4]:
df = pd.read_csv("churn_data.csv")
df.head(3)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1


In [5]:
df = df.drop(columns = "CustomerId")
df.head(3)

Unnamed: 0,RowNumber,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1


In [6]:
X_train, X_test, y_train, y_test = train_test_split(df, df['Exited'], random_state=0)

In [7]:
#соберем наш простой pipeline, но нам понадобится написать класс для выбора нужного поля
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        for col_ in self.columns:
            if col_ not in test_columns:
                X[col_] = 0
        return X[self.columns]

In [8]:
categorical_columns = ['Geography', 'Gender', 'Tenure', 'HasCrCard', 'IsActiveMember']
continuous_columns = ['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'EstimatedSalary']

In [9]:
gender = Pipeline([
                ('selector', FeatureSelector(column='Gender')),
                ('ohe', OHEEncoder(key='Gender'))
            ])
gender.fit(X_train)
gender.transform(X_test).head(3)

Unnamed: 0,Gender_Female,Gender_Male
9394,1,0
898,1,0
2398,1,0


In [10]:
#df[categorical_columns]

In [11]:
final_transformers = list()

for cat_col in categorical_columns:
    cat_transformer = Pipeline([
                ('selector', FeatureSelector(column=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    final_transformers.append((cat_col, cat_transformer))
    
for cont_col in continuous_columns:
    cont_transformer = Pipeline([
                ('selector', NumberSelector(key=cont_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    final_transformers.append((cont_col, cont_transformer))

In [12]:
feats = FeatureUnion(final_transformers)

feature_processing = Pipeline([('feats', feats)])

In [13]:
def find_fscore(p, r, beta):
    return ((1+beta**2)*(p*r/((beta**2)*p+r)))

# 1.1 CatBoostClassifier

In [14]:
pipeline = Pipeline([
    ('features',feats),
    ('classifier', CatBoostClassifier(iterations=40, thread_count=2, random_state=42, silent=True)),
])

In [15]:
#обучим наш пайплайн
pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Geography',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Geography')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Geography'))])),
                                                ('Gender',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Gender')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Gender'))])),
                                                ('Tenure',
                                                 Pipeline(steps=[('selector',
           

In [16]:
#наши прогнозы для тестовой выборки
preds = pipeline.predict_proba(X_test)[:, 1]
y_preds = pipeline.predict(X_test)
preds[:10]

array([0.11233987, 0.23866687, 0.0781688 , 0.03450567, 0.0202649 ,
       0.78408931, 0.03017612, 0.19910587, 0.12756418, 0.85252504])

In [17]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

#print(precision)
#print(recall)
fscore = find_fscore(precision, recall, 0.4)
print(fscore)
#locate the index of the largest f score
ix = np.nanargmax(fscore)
#ix=10
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))
roc_auc_score(y_test, preds)
precision_score(y_test, y_preds)

[0.22952528 0.22916343 0.22925258 ... 0.02780441 0.01407084 0.        ]
Best Threshold=0.650836, F-Score=0.723, Precision=0.811, Recall=0.430


0.7223719676549866

In [18]:
metrics = dict()
metrics["CatBoostClassifier"]  = [thresholds[ix], fscore[ix], precision[ix], recall[ix]]
models_preds = dict()
models_preds["CatBoostClassifier"] = preds

# 1.2 LogisticRegression

In [19]:
pipeline = Pipeline([
    ('features',feats),
    ('classifier', LogisticRegression(random_state = 42)),
])

In [20]:
#обучим наш пайплайн
pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Geography',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Geography')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Geography'))])),
                                                ('Gender',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Gender')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Gender'))])),
                                                ('Tenure',
                                                 Pipeline(steps=[('selector',
           

In [21]:
#наши прогнозы для тестовой выборки
preds = pipeline.predict_proba(X_test)[:, 1]
y_preds = pipeline.predict(X_test)
preds[:10]

array([0.22390104, 0.33707751, 0.18489566, 0.09197292, 0.18590078,
       0.40873004, 0.18821398, 0.18360246, 0.43551361, 0.70866171])

In [22]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

#print(precision)
#print(recall)
fscore = find_fscore(precision, recall, 0.4)
print(fscore)
# locate the index of the largest f score
ix = np.nanargmax(fscore)
#ix=10
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))
roc_auc_score(y_test, preds)
precision_score(y_test, y_preds)

[0.23114264 0.23077887 0.23086929 ...        nan        nan 0.        ]
Best Threshold=0.333915, F-Score=0.381, Precision=0.397, Recall=0.305


0.3625

In [23]:
def print_metrics(precision, recall, thresholds, fscore):
    print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds, 
                                                                            fscore,
                                                                            precision,
                                                                            recall))
precision_score(y_test, y_preds)

0.3625

In [24]:
metrics["LogisticRegression"]  = [thresholds[ix], fscore[ix], precision[ix], recall[ix]]
models_preds["LogisticRegression"] = preds

# 1.3 AdaBoostClassifier

In [25]:
pipeline = Pipeline([
    ('features',feats),
    ('classifier', AdaBoostClassifier(n_estimators=10, random_state=42)),
])

In [26]:
#обучим наш пайплайн
pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Geography',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Geography')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Geography'))])),
                                                ('Gender',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Gender')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Gender'))])),
                                                ('Tenure',
                                                 Pipeline(steps=[('selector',
           

In [27]:
#наши прогнозы для тестовой выборки
preds = pipeline.predict_proba(X_test)[:, 1]
y_preds = pipeline.predict(X_test)
preds[:10]

array([0.4771804 , 0.47560607, 0.4544716 , 0.44539774, 0.42673165,
       0.48049484, 0.38965248, 0.40385211, 0.4953744 , 0.48049484])

In [28]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

#print(precision)
#print(recall)
fscore = find_fscore(precision, recall, 0.4)
# locate the index of the largest f score
ix = np.nanargmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))
roc_auc_score(y_test, preds)
precision_score(y_test, y_preds)

Best Threshold=0.514758, F-Score=0.692, Precision=0.784, Recall=0.399


0.6961038961038961

In [29]:
metrics["AdaBoostClassifier"]  = [thresholds[ix], fscore[ix], precision[ix], recall[ix]]
models_preds["AdaBoostClassifier"] = preds

# 1.4 DecisionTreeClassifier

In [30]:
pipeline = Pipeline([
    ('features',feats),
    ('classifier', BaggingClassifier(n_estimators=10, random_state=42)),
])

In [31]:
#обучим наш пайплайн
pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Geography',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Geography')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Geography'))])),
                                                ('Gender',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Gender')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Gender'))])),
                                                ('Tenure',
                                                 Pipeline(steps=[('selector',
           

In [32]:
#наши прогнозы для тестовой выборки
preds = pipeline.predict_proba(X_test)[:, 1]
y_preds = pipeline.predict(X_test)
preds[:10]

array([0.4, 0.2, 0.1, 0. , 0. , 0.9, 0.1, 0. , 0.1, 0.8])

In [33]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

#print(recall)
fscore = find_fscore(precision, recall, 0.4)
# locate the index of the largest f score
ix = np.nanargmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.800000, F-Score=0.674, Precision=0.791, Recall=0.350


In [34]:
metrics["DecisionTreeClassifier"] = [thresholds[ix], fscore[ix], precision[ix], recall[ix]]
models_preds["DecisionTreeClassifier"] = preds

# 2. Сравнение моделей и выбор "лучшей"

In [35]:
#функция, определяющая модель с максимальным значением Precision
def best_model_search(_metrics):
    _max = 0
    for i in _metrics:
        if _metrics[i][2] > _max:
            _max = _metrics[i][2]
            max_i = i
    print("Best model: ", max_i, ", best precision value: ", _max)
    #return int(max_i[7])
    return max_i

In [36]:
#print("First model metrics:")
#print_metrics(metrics_1[0], metrics_1[1], metrics_1[2], metrics_1[3])
#print("Second model metrics:")
#print_metrics(metrics_2[0], metrics_2[1], metrics_2[2], metrics_2[3])
#print("Third model metrics:")
#print_metrics(metrics_3[0], metrics_3[1], metrics_3[2], metrics_3[3])
#print("Fourth model metrics:")
#print_metrics(metrics_4[0], metrics_4[1], metrics_4[2], metrics_4[3])

best_model_name = best_model_search(metrics)

Best model:  CatBoostClassifier , best precision value:  0.8111111111111111


### Так как нам необходимо отобрать только тех клиентов, которые могут уйти в отток, и по максимуму не трогать тех, кто в любом случае останется с нами (чтобы не тратить лишние деньги на удержание клиентов и чтобы не предлагать более выгодные условия тем, кто готов остаться на прежних условиях), нам нужно ориентироваться на значение метрики Precision (на точность отбора). Поэтому, в данном случае, лучше всех отработала третья модель (адаптивный бустинг) и для дальнейшей работы следует выбрать её. 

# 3. Проверка эффективности выбранной модели

In [37]:
best_model_cnf_matrix = confusion_matrix(y_test, models_preds[best_model_name] > metrics[best_model_name][0])

In [38]:
print(best_model_cnf_matrix)

[[1940   51]
 [ 291  218]]


In [39]:
def is_model_effective(_i, cnf_matrix):
    TP = cnf_matrix[1][1]
    TN = cnf_matrix[0][0]
    FP = cnf_matrix[0][1]
    FN = cnf_matrix[1][0]
    print("Proceeds: ", TP*2)
    print("Expenses: ", TP + FP)
    print("Profit: ", TP*2 - (TP + FP))
    if TP*2 - (TP + FP) > 100:
        print("Conclusion: model ", _i, " is cost-effective")
        #print(TP/(TP+FP))
    else:
        print("Conclusion: model ", _i, " is not cost-effective")
        #print(TP/(TP+FP))

#### Проверяем эффективность выбраной модели (считаем, что ээфективна, если прибыль больше 100 долларов):

In [40]:
is_model_effective(best_model_name, best_model_cnf_matrix)

Proceeds:  436
Expenses:  269
Profit:  167
Conclusion: model  CatBoostClassifier  is cost-effective


#### Проверим эффективность отвергнутых моделей:

In [41]:
#cnf_matrix_1 = confusion_matrix(y_test, preds_1>metrics_1[2])
#cnf_matrix_2 = confusion_matrix(y_test, preds_2>metrics_2[2])

#is_model_effective(cnf_matrix_1)
#is_model_effective(cnf_matrix_2)

for i in metrics:
    if i != best_model_name:
        is_model_effective(i, confusion_matrix(y_test, models_preds[i] > metrics[i][0]))

Proceeds:  308
Expenses:  389
Profit:  -81
Conclusion: model  LogisticRegression  is not cost-effective
Proceeds:  404
Expenses:  258
Profit:  146
Conclusion: model  AdaBoostClassifier  is cost-effective
Proceeds:  270
Expenses:  155
Profit:  115
Conclusion: model  DecisionTreeClassifier  is cost-effective


#### Видно, что вторая модель (LogisticRegression) с задачей не справилась (потермим убытки), а третья (AdaBoostClassifier) и четвёртая (DecisionTreeClassifier) - справились, но чуть хуже, чем первая.

In [42]:
#pip install hyperopt

# 4. Подбор лучших параметров для модели-победителя

In [43]:
#(iterations=40, thread_count=2, random_state=42, silent=True)
iterations = [20, 40, 50, 70, 90, 150]
thread_count = [1, 2, 4]
random_state = [10, 12, 25, 42]
silent = [True]
param_grid = dict(iterations=iterations,
                  thread_count=thread_count,
                  random_state=random_state,
                  silent=silent)
model = CatBoostClassifier()
#kfold = KFold()
grid = GridSearchCV(estimator=model, param_grid=param_grid)

pipeline = Pipeline([
    ('features',feats),
    ('classifier', grid),
])

grid_result = pipeline.fit(X_train, y_train)

In [49]:
preds = grid_result.predict_proba(X_test)[:, 1]
y_preds = grid_result.predict(X_test)
preds[:10]

array([0.42282659, 0.20575779, 0.14050745, 0.05084555, 0.02409063,
       0.87131717, 0.02320493, 0.16822853, 0.16029462, 0.85581979])

In [50]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

#print(recall)
fscore = find_fscore(precision, recall, 0.4)
# locate the index of the largest f score
ix = np.nanargmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.653996, F-Score=0.763, Precision=0.870, Recall=0.432


In [51]:
metrics["Best_model_with_hyperopt"] = [thresholds[ix], fscore[ix], precision[ix], recall[ix]]
models_preds["Best_model_with_hyperopt"] = preds

In [56]:
is_model_effective("Best_model_with_hyperopt", confusion_matrix(y_test, models_preds["Best_model_with_hyperopt"] > metrics["Best_model_with_hyperopt"][0]))

Proceeds:  438
Expenses:  252
Profit:  186
Conclusion: model  Best_model_with_hyperopt  is cost-effective


#### Сравним с "лучшей" моделью:

In [57]:
is_model_effective("Best_model_name", best_model_cnf_matrix)

Proceeds:  436
Expenses:  269
Profit:  167
Conclusion: model  Best_model_name  is cost-effective


#### После подбора оптимальных гиперпараметров, модель даёт более точный результат и её эффективность повышается