# Проект 7 отток клиентов

Из «Бета-Банка» стали уходить клиенты. Каждый месяц. Немного, но заметно. Банковские маркетологи посчитали: сохранять текущих клиентов дешевле, чем привлекать новых.

Нужно спрогнозировать, уйдёт клиент из банка в ближайшее время или нет. Вам предоставлены исторические данные о поведении клиентов и расторжении договоров с банком. 

Постройте модель с предельно большим значением *F1*-меры. Чтобы сдать проект успешно, нужно довести метрику до 0.59. Проверьте *F1*-меру на тестовой выборке самостоятельно.

Дополнительно измеряйте *AUC-ROC*, сравнивайте её значение с *F1*-мерой.

Источник данных: [https://www.kaggle.com/barelydedicated/bank-customer-churn-modeling](https://www.kaggle.com/barelydedicated/bank-customer-churn-modeling)

# 1. Подготовка данных

In [2]:
# Сразу загрузим пачку библиотек
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

from sklearn.utils import shuffle

In [3]:
data = pd.read_csv('/datasets/Churn.csv')

In [4]:
data

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2.0,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1.0,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8.0,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1.0,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2.0,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5.0,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10.0,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7.0,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3.0,75075.31,2,1,0,92888.52,1


Признаки:

RowNumber — индекс строки в данных

CustomerId — уникальный идентификатор клиента

Surname — фамилия

CreditScore — кредитный рейтинг

Geography — страна проживания

Gender — пол

Age — возраст

Tenure — количество недвижимости у клиента

Balance — баланс на счёте

NumOfProducts — количество продуктов банка, используемых клиентом

HasCrCard — наличие кредитной карты

IsActiveMember — активность клиента

EstimatedSalary — предполагаемая зарплата

Целевой признак
Exited — факт ухода клиента

Не значимые признаки:
'RowNumber', 'CustomerId', 'Surname'

In [5]:
#Удалим не значимые признаки
data = data.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)

In [6]:
#data.info()

Есть пропуски в столбце "Tenure" — количество недвижимости у клиента, вероятнее всего NaN - означает отсутствие недвижимости

Проверим это, если гипотиза подтвердится, NaN заменим на 0

In [7]:
data['Tenure'].value_counts()

1.0     952
2.0     950
8.0     933
3.0     928
5.0     927
7.0     925
4.0     885
9.0     882
6.0     881
10.0    446
0.0     382
Name: Tenure, dtype: int64

Гипотиза не подтвердилась, под NaN может скрываться что угодно.

Значит будем учить нашу модель по выборке строчек, где есть данные в Tenure

In [8]:
data_nan_tenure = data.loc[data['Tenure'].isna()] 
data = data.query('@data.index not in @data_nan_tenure.index')

In [9]:
data_nan_tenure['Exited'].value_counts()

0    726
1    183
Name: Exited, dtype: int64

In [10]:
imbalance_nan_tenure = data_nan_tenure['Exited'].value_counts()
print(imbalance_nan_tenure[0]/imbalance_nan_tenure[1])

3.9672131147540983


Коефициент отношения клиентов которые не ушли, к клиентам, которые ушли примерно одинаковый у выборок:
Где Tenure - есть данные, и где в Tenure -NAN,
значит с точки зрения целевого признака - это не особая отличная выборка, а скорее средняя,
и наиболее лучшим вариантом будет не портить выборку не корректным заполнением нулями, а учить модель по выборке по строкам, где есть значения в Tenure

# 2. Исследование задачи

In [11]:
data = pd.get_dummies(data, drop_first=True)

In [12]:
data

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,619,42,2.0,0.00,1,1,1,101348.88,1,0,0,0
1,608,41,1.0,83807.86,1,0,1,112542.58,0,0,1,0
2,502,42,8.0,159660.80,3,1,0,113931.57,1,0,0,0
3,699,39,1.0,0.00,2,0,0,93826.63,0,0,0,0
4,850,43,2.0,125510.82,1,1,1,79084.10,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
9994,800,29,2.0,0.00,2,0,0,167773.55,0,0,0,0
9995,771,39,5.0,0.00,2,1,0,96270.64,0,0,0,1
9996,516,35,10.0,57369.61,1,1,1,101699.77,0,0,0,1
9997,709,36,7.0,0.00,1,0,1,42085.58,1,0,0,0


In [12]:
df_train, df_valid_and_test = train_test_split(data, test_size=0.4, random_state=12345)
df_test, df_valid = train_test_split(df_valid_and_test, test_size=0.5, random_state=12345)

features_train = df_train.drop(['Exited'], axis=1) 
target_train = df_train['Exited']

features_valid = df_valid.drop(['Exited'], axis=1)
target_valid =  df_valid['Exited']

features_test = df_test.drop(['Exited'], axis=1)
answers_test =  df_test['Exited']

In [13]:
from sklearn.preprocessing import StandardScaler
numeric = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']

scaler = StandardScaler()
scaler.fit(features_train[numeric])

features_train[numeric] = scaler.transform(features_train[numeric])

features_valid[numeric] = scaler.transform(features_valid[numeric])

features_test[numeric] = scaler.transform(features_test[numeric])

In [14]:
features_test

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_Germany,Geography_Spain,Gender_Male
7445,-1.369326,0.560665,-0.336987,-1.237830,-0.908179,0.663468,-1.024127,-0.086537,0,0,0
8620,1.232367,0.090079,1.041028,-1.237830,0.809300,-1.507231,0.976442,-0.537457,0,0,0
1714,0.840048,0.560665,0.352020,1.231363,-0.908179,-1.507231,-1.024127,1.070393,1,0,1
5441,1.056856,-0.945210,-1.370498,0.951231,-0.908179,0.663468,-1.024127,-0.576279,0,0,1
9001,0.406433,-0.662858,0.352020,0.767800,-0.908179,0.663468,0.976442,0.662068,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
7505,0.107031,-1.133444,-0.336987,-1.237830,0.809300,0.663468,-1.024127,-0.016504,0,1,1
2232,0.003790,-0.286389,1.041028,-1.237830,0.809300,0.663468,-1.024127,1.056449,0,1,1
3720,-1.813265,0.090079,1.041028,1.226442,0.809300,0.663468,-1.024127,-0.468713,1,0,1
5196,0.726482,-1.604030,0.696524,0.610110,-0.908179,0.663468,0.976442,1.328951,0,0,0


In [15]:
modelRandomForestClassifier = RandomForestClassifier(random_state=12345, max_depth=10, n_estimators=100)
modelRandomForestClassifier.fit(features_train, target_train)
predictions = modelRandomForestClassifier.predict(features_valid)
print('f1_score =',f1_score(target_valid, predictions))
print('auc_score =',roc_auc_score(target_valid, predictions))

f1_score = 0.5588235294117647
auc_score = 0.7025596388675407


In [16]:
print(confusion_matrix(target_valid, predictions))

[[1427   41]
 [ 199  152]]


In [17]:
features_test

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_Germany,Geography_Spain,Gender_Male
7445,-1.369326,0.560665,-0.336987,-1.237830,-0.908179,0.663468,-1.024127,-0.086537,0,0,0
8620,1.232367,0.090079,1.041028,-1.237830,0.809300,-1.507231,0.976442,-0.537457,0,0,0
1714,0.840048,0.560665,0.352020,1.231363,-0.908179,-1.507231,-1.024127,1.070393,1,0,1
5441,1.056856,-0.945210,-1.370498,0.951231,-0.908179,0.663468,-1.024127,-0.576279,0,0,1
9001,0.406433,-0.662858,0.352020,0.767800,-0.908179,0.663468,0.976442,0.662068,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
7505,0.107031,-1.133444,-0.336987,-1.237830,0.809300,0.663468,-1.024127,-0.016504,0,1,1
2232,0.003790,-0.286389,1.041028,-1.237830,0.809300,0.663468,-1.024127,1.056449,0,1,1
3720,-1.813265,0.090079,1.041028,1.226442,0.809300,0.663468,-1.024127,-0.468713,1,0,1
5196,0.726482,-1.604030,0.696524,0.610110,-0.908179,0.663468,0.976442,1.328951,0,0,0


# 3. Борьба с дисбалансом

In [44]:
#model = RandomForestRegressor(n_estimators=20, max_depth=10, random_state=12345, class_weight='balanced')
imbalance = data['Exited'].value_counts()
imbalance

0    7237
1    1854
Name: Exited, dtype: int64

В целевом признаке не сбалансированное количество данных, исправим это:

In [45]:
coefficient = int(round(imbalance[0]/imbalance[1]))
print(imbalance[0]/imbalance[1])

3.9034519956850056


# 3.1 Увеличение выборки:

In [46]:
coefficient = int(round(imbalance[0]/imbalance[1]))
def upsample(features, target, repeat):
    features_zeros = features[target == 0]
    features_ones = features[target == 1]
    target_zeros = target[target == 0]
    target_ones = target[target == 1]

    features_upsampled = pd.concat([features_zeros] + [features_ones] * repeat)
    target_upsampled = pd.concat([target_zeros] + [target_ones] * repeat)
    
    features_upsampled, target_upsampled = shuffle(
        features_upsampled, target_upsampled, random_state=12345)
    
    return features_upsampled, target_upsampled

features_upsampled, target_upsampled = upsample(features_train, target_train, coefficient)

In [47]:
modelRandomForestClassifier = RandomForestClassifier(random_state=12345, max_depth=10, n_estimators=100)
modelRandomForestClassifier.fit(features_upsampled, target_upsampled)
predictions = modelRandomForestClassifier.predict(features_valid)
print('f1_score =',f1_score(target_valid, predictions))
print('auc_score =',roc_auc_score(target_valid, predictions))

f1_score = 0.5842696629213484
auc_score = 0.7597638510445051


# 3.2 Уменьшение выборки

In [48]:
def downsample(features, target, fraction):
    features_zeros = features[target == 0]
    features_ones = features[target == 1]
    target_zeros = target[target == 0]
    target_ones = target[target == 1]

    features_downsampled = pd.concat(
        [features_zeros.sample(frac=fraction, random_state=12345)] + [features_ones])
    target_downsampled = pd.concat(
        [target_zeros.sample(frac=fraction, random_state=12345)] + [target_ones])
    
    features_downsampled, target_downsampled = shuffle(
        features_downsampled, target_downsampled, random_state=12345)
    
    return features_downsampled, target_downsampled

features_downsampled, target_downsampled = downsample(features_train, target_train, 0.1)

In [49]:
modelRandomForestClassifier = RandomForestClassifier(random_state=12345, max_depth=10, n_estimators=100)
modelRandomForestClassifier.fit(features_downsampled, target_downsampled)
predictions = modelRandomForestClassifier.predict(features_valid)
print('f1_score =',f1_score(target_valid, predictions))
print('auc_score =',roc_auc_score(target_valid, predictions))

f1_score = 0.4414715719063545
auc_score = 0.6928375136822003


Вывод:

Метод увеличения выборки даёт более лучший результат по метрике f1_score

# 4. Тестирование модели

In [50]:
new_predicted_valid = pd.Series(1, index=target_valid.index)
print(f1_score(target_valid, new_predicted_valid))

0.3235023041474654


In [51]:
for estim in range(10, 101, 10):
    for depth in range(5, 26, 5):
        modelRandomForestClassifier = RandomForestClassifier(random_state=12345, max_depth=depth, n_estimators=estim)
        modelRandomForestClassifier.fit(features_upsampled, target_upsampled)
        predictions = modelRandomForestClassifier.predict(features_valid)
        score = f1_score(target_valid, predictions)
        print("n_estimators =", estim, 'n_depth =', depth ,":", score)

n_estimators = 10 n_depth = 5 : 0.5717488789237668
n_estimators = 10 n_depth = 10 : 0.5558312655086849
n_estimators = 10 n_depth = 15 : 0.5801749271137027
n_estimators = 10 n_depth = 20 : 0.5683563748079878
n_estimators = 10 n_depth = 25 : 0.5867895545314901
n_estimators = 20 n_depth = 5 : 0.5845464725643897
n_estimators = 20 n_depth = 10 : 0.5810473815461347
n_estimators = 20 n_depth = 15 : 0.5838150289017342
n_estimators = 20 n_depth = 20 : 0.557632398753894
n_estimators = 20 n_depth = 25 : 0.59594383775351
n_estimators = 30 n_depth = 5 : 0.5776805251641138
n_estimators = 30 n_depth = 10 : 0.5875
n_estimators = 30 n_depth = 15 : 0.5956204379562045
n_estimators = 30 n_depth = 20 : 0.5855161787365177
n_estimators = 30 n_depth = 25 : 0.594679186228482
n_estimators = 40 n_depth = 5 : 0.5802879291251385
n_estimators = 40 n_depth = 10 : 0.5989847715736041
n_estimators = 40 n_depth = 15 : 0.6066763425253991
n_estimators = 40 n_depth = 20 : 0.5916795069337443
n_estimators = 40 n_depth = 25 :

Лучший результат с гиперпараметрами n_estimators = 50 n_depth = 15

f1 = 0.6184971098265896

In [52]:
modelRandomForestClassifier = RandomForestClassifier(random_state=12345, max_depth=10, n_estimators=100)
modelRandomForestClassifier.fit(features_upsampled, target_upsampled)
predictions = modelRandomForestClassifier.predict(features_test)
print('f1_score =',f1_score(answers_test, predictions))
print('auc_score =',roc_auc_score(answers_test, predictions))

f1_score = 0.6239620403321471
auc_score = 0.7783691696563504


Вывод: удалось достичь F1-меры не менее 0.59