# План проекта

+ Преобразовать полученные данные (заполнить пропуски, привести к нужному типу и тп.) 
+ Удалить возможные утечки (у тех клиентов, которые ушли, стояла дата окончания), сгенерировать целевой признак
+ Удалить сильно скоррелированные признаки (такой нашелся)
+ Построить модель (подобрать гиперпараметры, оценивая их при помощи кросс-валидации)
    
**Вывод:** в принципе все получилось


**Трудности возникли...**

 Трудности возникали, но скорее эмоционального характера. 
 + Вначале я как то пропустил информацию о том, что данные были актуальны на 01.02.2020, заполнил пропуски текущей датой (23.11.2020), чем допустил утечку целевого признака -- все оставшиеся клиенты получили +9 месяцев, обучил небольшую нейросеть и добился f1_score в 0.999+, после чего, когда ревьюер вполне справедливо указал на мой просчет, и я исправил ошибку, оказалось, что f1_score модели не превышает 0.86. Еще обиднее оказалось, что нейросеть справлялась куда хуже (была менее стабильной), чем обычная логистическая регрессия из sklearn. Потом произошел этап принятия, и в качестве нового инструмента была выбрана библиотека CatBoost
 

**Какая ваша итоговая модель....**

Модель -- CatBoostClassifier, с подобранными гиперпараметрами:

    CatBoostClassifier(iterations=10000, 
                               learning_rate=0.67777777, 
                                boosting_type='Ordered', 
                                depth=2,
                                class_weights=[0.3, 1,],
                                per_float_feature_quantization='16:border_count=1024',
                                leaf_estimation_method='Newton',
                                bootstrap_type='Bayesian', 
                                bagging_temperature=0.45,)
roc_auc_score = 0.92

Хотел бы также отметить, что своей целью я ставил не добиться максимального roc_auc_score, но добиться максимального значения recall (я предположил, что если я пройду порог 0.88 roc_auc, вне всякого сомнения, продиктованный финансовыми ограничениями -- ведь нельзя же всех, даже самую малость подозрительных клиентов заваливать подарками, то нужно будет сконцентрироваться на том, чтобы модель пропускала как можно меньше клиентов, возможно, собирающихся уходить), и оно превышает 0.99

# Скачивание и предобработка данных

In [1]:
import pandas as pd
from functools import reduce

datas = []
for direct in ['contract.csv', 'personal.csv', 'internet.csv', 'phone.csv']:
    datas.append(pd.read_csv('/datasets/final_provider/'+direct))
    

def multiple_merge(data, data_1):
    return data.merge(data_1, how='outer', on='customerID')
data = reduce(multiple_merge, datas)

data = data.drop('customerID', axis=1)

data['BeginDate'] = data['BeginDate'].astype('datetime64')
data.loc[:, 'Customer'] = (data['EndDate']=='No').astype('int64')  #  РЕВЬЮЕР ИЗМЕНИЛ == НА !=
data.loc[data['EndDate']=='No', 'EndDate'] = '2020-02-01 00:00:00'
data['EndDate'] = data['EndDate'].astype('datetime64')
data.loc[:, 'Duration'] = data.loc[:, 'EndDate'] - data.loc[:, 'BeginDate']
data['Duration'] = data['Duration'].dt.days.astype('int64')
data = data.drop(['BeginDate', 'EndDate'], axis=1)

In [2]:
categorical = ['Type', 'PaperlessBilling', 'PaymentMethod', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'MultipleLines']
data[categorical] = data[categorical].fillna('No')

In [4]:
missed_val = []
for i in range(7043):
    try:
        float(data.loc[i, 'TotalCharges'])
    except:
        missed_val.append(i)
data.loc[missed_val, 'TotalCharges'] = (data.loc[missed_val, 'MonthlyCharges']*data.loc[missed_val, 'Duration']/30)
data['TotalCharges'] = data['TotalCharges'].astype(float)

In [5]:
data.corr()[['TotalCharges', 'MonthlyCharges', 'Duration']]

Unnamed: 0,TotalCharges,MonthlyCharges,Duration
MonthlyCharges,0.651174,1.0,0.247754
TotalCharges,1.0,0.651174,0.826109
SeniorCitizen,0.103006,0.220173,0.016514
Customer,0.198324,-0.193356,0.352673
Duration,0.826109,0.247754,1.0


In [6]:
def type_as(column):
    if column.name in ['MonthlyCharges',
       'TotalCharges', 'Duration', 'Customer']:
        return column.astype('float')
    else:
        return column.astype('object')
data = data.apply(type_as, axis=0)
data = data.drop('TotalCharges', axis=1)

# Обучение модели

In [7]:
from catboost import CatBoostClassifier, cv, Pool
from sklearn.model_selection import train_test_split

In [8]:
def get_pools(data, random_state=12345):
    features_train, features_test, target_train, target_test = train_test_split(
        data.drop('Customer', axis=1),
        data['Customer'].astype(float),
        stratify=data['Customer'].astype(float),
        test_size=0.3,
        random_state=random_state
    )
    features_valid, features_test, target_valid, target_test = train_test_split(
        features_test, target_test, test_size=0.5, stratify=target_test, random_state=12345
    )
    cf = [0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    p_train = Pool(features_train, target_train, cat_features = cf)
    p_valid = Pool(features_valid, target_valid, cat_features = cf)
    p_test = Pool(features_test, target_test, cat_features = cf)
    return p_train, p_valid, p_test, features_train, features_valid, features_test, target_train, target_valid, target_test

In [9]:
p_train, p_valid, p_test, features_train, features_valid, features_test, target_train, target_valid, target_test = get_pools(data)

In [10]:
model = CatBoostClassifier(verbose=100, class_weights=[0.30, 1,],)
model.fit(p_test, eval_set=p_valid,)

Learning rate set to 0.078484
0:	learn: 0.6014475	test: 0.6043035	best: 0.6043035 (0)	total: 56.4ms	remaining: 56.4s
100:	learn: 0.1682384	test: 0.2462809	best: 0.2449215 (44)	total: 2.72s	remaining: 24.2s
200:	learn: 0.1341582	test: 0.2385181	best: 0.2382691 (197)	total: 5.67s	remaining: 22.5s
300:	learn: 0.1112692	test: 0.2349862	best: 0.2341135 (291)	total: 8.9s	remaining: 20.7s
400:	learn: 0.0929657	test: 0.2386127	best: 0.2341135 (291)	total: 12s	remaining: 18s
500:	learn: 0.0785895	test: 0.2416489	best: 0.2341135 (291)	total: 15.2s	remaining: 15.1s
600:	learn: 0.0676796	test: 0.2459023	best: 0.2341135 (291)	total: 18.3s	remaining: 12.2s
700:	learn: 0.0609077	test: 0.2480578	best: 0.2341135 (291)	total: 21.3s	remaining: 9.07s
800:	learn: 0.0540564	test: 0.2523719	best: 0.2341135 (291)	total: 24.1s	remaining: 5.99s
900:	learn: 0.0487484	test: 0.2568006	best: 0.2341135 (291)	total: 27.3s	remaining: 3s
999:	learn: 0.0441730	test: 0.2613172	best: 0.2341135 (291)	total: 30.5s	remaining

<catboost.core.CatBoostClassifier at 0x7f7199c64d90>

In [11]:
from sklearn.metrics import roc_auc_score, confusion_matrix, recall_score
print(roc_auc_score(p_test.get_label(), pd.DataFrame(model.predict_proba(p_test))[1]))
confusion_matrix(p_test.get_label(), model.predict(p_test))

0.9704685952232455


array([[143, 138],
       [  0, 776]])

In [12]:
print(roc_auc_score(p_valid.get_label(), pd.DataFrame(model.predict_proba(p_valid))[1]))
print(confusion_matrix(p_valid.get_label(), model.predict(p_valid)))
print(roc_auc_score(p_train.get_label(), pd.DataFrame(model.predict_proba(p_train))[1]))
print(confusion_matrix(p_train.get_label(), model.predict(p_train)))

0.8448223490427099
[[ 98 182]
 [ 10 766]]
0.8582136729838213
[[ 444  864]
 [  83 3539]]


In [13]:
model.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,Duration,33.654521
1,MonthlyCharges,16.053117
2,Type,14.400521
3,InternetService,8.851476
4,PaymentMethod,8.365995
5,OnlineBackup,2.480704
6,SeniorCitizen,2.415313
7,OnlineSecurity,2.359104
8,StreamingMovies,2.151285
9,PaperlessBilling,2.094709


In [14]:
import numpy as np

params = {}

params['class_weights']=[[0.8, 1,], [0.7, 1,], [0.9, 1,]]
params['learning_rate'] = np.linspace(0.6, 0.9, 10)
params['iterations'] = [3000]
params['depth'] = [2]
params['leaf_estimation_method']=['Newton']
params['bootstrap_type']=['Bayesian']
params['bagging_temperature']=np.linspace(0.6, 0.8, 5)
params['boosting_type'] = ['Ordered']

params

{'class_weights': [[0.8, 1], [0.7, 1], [0.9, 1]],
 'learning_rate': array([0.6       , 0.63333333, 0.66666667, 0.7       , 0.73333333,
        0.76666667, 0.8       , 0.83333333, 0.86666667, 0.9       ]),
 'iterations': [3000],
 'depth': [2],
 'leaf_estimation_method': ['Newton'],
 'bootstrap_type': ['Bayesian'],
 'bagging_temperature': array([0.6 , 0.65, 0.7 , 0.75, 0.8 ]),
 'boosting_type': ['Ordered']}

In [None]:
model = CatBoostClassifier()
grid_search_result = model.grid_search(params, X=p_train, cv=5, stratified=True,)

Заранее извиняюсь за следующие несколько ячеек, но так получилось что grid_search почему-то иногда валится под конец работы с сообщением:

    CatBoostError: catboost/private/libs/hyperparameter_tuning/hyperparameter_tuning.cpp:1128: Error: option value should be bool, int, ui32, double or string
    
Когда это произошло после более трех часов перебора вариантов, я решил, что лучше будет подбирать гиперпараметры вручную, и потом проверить несколько подозрительных на оптимальные с помощью кросс-валидации. С кросс-валидацией тоже не очень получилось, пришлось написать свою, тк оптимизация по recall (а я пытался сделать именно это) невозможна, тк на самой первой итерации эта метрика равна 1.

In [None]:
roc_aucs = []
accuracys = []
conf_matrices = []
for i in np.linspace(0.3, 0.9, 15):
    model2 = CatBoostClassifier(iterations=10000, 
                            learning_rate=0.67777777, 
                            boosting_type='Ordered', 
                            depth=2,
                            class_weights=[i, 1,],
                            per_float_feature_quantization='16:border_count=1024',
                            leaf_estimation_method='Newton',
                            bootstrap_type='Bayesian', 
                            bagging_temperature=0.55,)
    model2.fit(p_train, eval_set=p_valid, verbose=1000)
    roc_aucs.append(roc_auc_score(p_test.get_label(), pd.DataFrame(model2.predict_proba(p_test))[1]))
    accuracys.append(accuracy_score(p_test.get_label(), model2.predict(p_test)))
    conf_matrices.append(confusion_matrix(p_test.get_label(), model2.predict(p_test))) 
    print(roc_auc_score(p_test.get_label(), pd.DataFrame(model2.predict_proba(p_test))[1]),
    accuracy_score(p_test.get_label(), model2.predict(p_test)))
    print(confusion_matrix(p_test.get_label(), model2.predict(p_test)))

In [None]:
weights = np.linspace(0.3, 0.9, 15)
for i in range(len(roc_aucs)):
    print(weights[i])
    print(roc_aucs[i], accuracys[i])
    print(conf_matrices[i])
    print('recall', conf_matrices[i][1][0]/(conf_matrices[i][1][0]+conf_matrices[i][0][0]))
    print('****************************\n\n')

In [None]:
roc_aucs = []
accuracys = []
conf_matrices = []
for i in np.linspace(0.1, 0.3, 8):
    model2 = CatBoostClassifier(iterations=10000, 
                            learning_rate=0.67777777, 
                            boosting_type='Ordered', 
                            depth=2,
                            class_weights=[i, 1,],
                            per_float_feature_quantization='16:border_count=1024',
                            leaf_estimation_method='Newton',
                            bootstrap_type='Bayesian', 
                            bagging_temperature=0.55,)
    model2.fit(p_train, eval_set=p_valid, verbose=1000)
    roc_aucs.append(roc_auc_score(p_test.get_label(), pd.DataFrame(model2.predict_proba(p_test))[1]))
    accuracys.append(accuracy_score(p_test.get_label(), model2.predict(p_test)))
    conf_matrices.append(confusion_matrix(p_test.get_label(), model2.predict(p_test))) 
    print(roc_auc_score(p_test.get_label(), pd.DataFrame(model2.predict_proba(p_test))[1]),
    accuracy_score(p_test.get_label(), model2.predict(p_test)))
    print(confusion_matrix(p_test.get_label(), model2.predict(p_test)))

In [None]:
weights = np.linspace(0.1, 0.3, 8)
for i in range(len(roc_aucs)):
    print(weights[i])
    print(roc_aucs[i], accuracys[i])
    print(conf_matrices[i])
    print('recall', conf_matrices[i][1][0]/(conf_matrices[i][1][0]+conf_matrices[i][0][0]))
    print('****************************\n\n')

In [None]:
roc_aucs = []
accuracys = []
conf_matrices = []
for i in np.linspace(0.3, 1, 15):
    model2 = CatBoostClassifier(iterations=10000, 
                            learning_rate=0.67777777, 
                            boosting_type='Ordered', 
                            depth=2,
                            class_weights=[0.3, 1,],
                            per_float_feature_quantization='16:border_count=1024',
                            leaf_estimation_method='Newton',
                            bootstrap_type='Bayesian', 
                            bagging_temperature=i,)
    model2.fit(p_train, eval_set=p_valid, verbose=1000)
    roc_aucs.append(roc_auc_score(p_test.get_label(), pd.DataFrame(model2.predict_proba(p_test))[1]))
    accuracys.append(accuracy_score(p_test.get_label(), model2.predict(p_test)))
    conf_matrices.append(confusion_matrix(p_test.get_label(), model2.predict(p_test))) 
    print(roc_auc_score(p_test.get_label(), pd.DataFrame(model2.predict_proba(p_test))[1]),
    accuracy_score(p_test.get_label(), model2.predict(p_test)))
    print(confusion_matrix(p_test.get_label(), model2.predict(p_test)))

In [None]:
b_temp = np.linspace(0.3, 1, 15)
for i in range(len(roc_aucs)):
    print(b_temp[i])
    print(roc_aucs[i], accuracys[i])
    print(conf_matrices[i])
    print(conf_matrices[i].ravel())
    print('recall', conf_matrices[i][1][0]/(conf_matrices[i][1][0]+conf_matrices[i][0][0]))
    print('****************************\n\n')

In [16]:
import random
rs = []
for i in range(5):
    rs.append(random.randint(0,10000))
rs

[9501, 5615, 6050, 2627, 6010]

In [17]:
models_scores = []
rocs = []
for b_temp in [0.35, 0.4, 0.55]:
    print(b_temp)
    model_score = 0
    roc=0
    for j in range(5):
        p_train, p_valid, p_test, features_train, features_valid, features_test, target_train, target_valid, target_test = get_pools(data, random_state = rs[j])
        model2 = CatBoostClassifier(iterations=10000, 
                               learning_rate=0.67777777, 
                                boosting_type='Ordered', 
                                depth=2,
                                class_weights=[0.3, 1,],
                                per_float_feature_quantization='16:border_count=1024',
                                leaf_estimation_method='Newton',
                                bootstrap_type='Bayesian', 
                                bagging_temperature=b_temp,)
        model2.fit(p_train, eval_set=p_valid, verbose=1000)
        model_score+=recall_score(target_test, model2.predict(p_test))
        roc+=roc_auc_score(p_test.get_label(), pd.DataFrame(model2.predict_proba(p_test))[1])
    print(roc/len(rs))
    print(model_score/len(rs))
    models_scores.append(model_score/len(rs))
    rocs.append(roc/len(rs))   

0.35
0:	learn: 0.3041674	test: 0.3020582	best: 0.3020582 (0)	total: 9.14ms	remaining: 1m 31s
1000:	learn: 0.1499988	test: 0.1541489	best: 0.1541489 (1000)	total: 28.2s	remaining: 4m 13s
2000:	learn: 0.1455894	test: 0.1509857	best: 0.1504827 (1098)	total: 58s	remaining: 3m 51s
3000:	learn: 0.1368748	test: 0.1425402	best: 0.1424024 (2723)	total: 1m 27s	remaining: 3m 24s
4000:	learn: 0.1323977	test: 0.1407139	best: 0.1406057 (3905)	total: 1m 57s	remaining: 2m 56s
5000:	learn: 0.1272173	test: 0.1363937	best: 0.1363937 (5000)	total: 2m 27s	remaining: 2m 27s
6000:	learn: 0.1251543	test: 0.1368562	best: 0.1358831 (5040)	total: 2m 58s	remaining: 1m 58s
7000:	learn: 0.1238452	test: 0.1365701	best: 0.1358831 (5040)	total: 3m 26s	remaining: 1m 28s
8000:	learn: 0.1233661	test: 0.1358009	best: 0.1357163 (7976)	total: 3m 54s	remaining: 58.6s
9000:	learn: 0.1232005	test: 0.1357981	best: 0.1357163 (7976)	total: 4m 23s	remaining: 29.2s
9999:	learn: 0.1224681	test: 0.1351084	best: 0.1350687 (9984)	total

In [18]:
rocs, models_scores

([0.921996643064167, 0.9199375389808123, 0.9200576916021571],
 [0.9935567010309277, 0.9948453608247423, 0.9914948453608247])

In [19]:
pd.Series(1-np.array(models_scores)).sort_values(ascending=True)

1    0.005155
0    0.006443
2    0.008505
dtype: float64

In [20]:
model2 = CatBoostClassifier(iterations=10000, 
                               learning_rate=0.67777777, 
                                boosting_type='Ordered', 
                                depth=2,
                                class_weights=[0.3, 1,],
                                per_float_feature_quantization='16:border_count=1024',
                                leaf_estimation_method='Newton',
                                bootstrap_type='Bayesian', 
                                bagging_temperature=0.45,)

p_train, p_valid, p_test, features_train, features_valid, features_test, target_train, target_valid, target_test = get_pools(data)
model2.fit(p_train, eval_set=p_valid, verbose=1000)

0:	learn: 0.3020712	test: 0.3087974	best: 0.3087974 (0)	total: 7.68ms	remaining: 1m 16s
1000:	learn: 0.1429443	test: 0.1650848	best: 0.1649266 (936)	total: 30.6s	remaining: 4m 35s
2000:	learn: 0.1306103	test: 0.1586153	best: 0.1575411 (1968)	total: 59.9s	remaining: 3m 59s
3000:	learn: 0.1272365	test: 0.1557588	best: 0.1554706 (2312)	total: 1m 28s	remaining: 3m 26s
4000:	learn: 0.1254963	test: 0.1548149	best: 0.1548077 (3964)	total: 1m 56s	remaining: 2m 55s
5000:	learn: 0.1249031	test: 0.1545061	best: 0.1544247 (4975)	total: 2m 25s	remaining: 2m 25s
6000:	learn: 0.1243841	test: 0.1546828	best: 0.1544247 (4975)	total: 2m 54s	remaining: 1m 56s
7000:	learn: 0.1199022	test: 0.1536792	best: 0.1530614 (6783)	total: 3m 23s	remaining: 1m 27s
8000:	learn: 0.1174464	test: 0.1511499	best: 0.1509265 (7846)	total: 3m 52s	remaining: 58.2s
9000:	learn: 0.1130549	test: 0.1470013	best: 0.1469871 (8990)	total: 4m 21s	remaining: 29s
9999:	learn: 0.1122316	test: 0.1466040	best: 0.1466040 (9999)	total: 4m 4

<catboost.core.CatBoostClassifier at 0x7f7199c5a510>

In [22]:
print(confusion_matrix(p_test.get_label(), model2.predict(p_test)))
print(1-recall_score(p_test.get_label(), model2.predict(p_test)))
print(roc_auc_score(p_test.get_label(), pd.DataFrame(model2.predict_proba(p_test))[1]))
model2.get_feature_importance(prettified=True)

[[154 127]
 [  5 771]]
0.006443298969072142
0.9352712147338298


Unnamed: 0,Feature Id,Importances
0,Duration,73.325033
1,MonthlyCharges,11.512865
2,InternetService,4.579675
3,Type,4.379648
4,PaymentMethod,2.190475
5,Partner,0.805423
6,SeniorCitizen,0.541902
7,StreamingTV,0.540149
8,PaperlessBilling,0.483005
9,StreamingMovies,0.389868


<font color="blue">Другое дело. Принято