### Домашнее задание
1. обучить несколько разных моделей на наборе данных ССЗ (train_case2.csv): логрег, бустинг, лес и т.д - на ваш выбор 2-3 варианта
2. при обучении моделей обязательно использовать кроссвалидацию
3. вывести сравнение полученных моделей по основным метрикам классификации: pr/rec/auc/f_score (можно в виде таблицы, где строки - модели, а столбцы - метрики)
4. сделать выводы о том, какая модель справилась с задачей лучше других
5. (опциональный вопрос) какая метрика (precision_recall_curve или roc_auc_curve) больше подходит в случае сильного дисбаланса классов? (когда объектов одного из классов намного больше чем другого).


In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import precision_recall_curve, roc_auc_score, precision_score, recall_score, roc_auc_score, f1_score

In [2]:
# Load the dataset; the file is ';'-delimited.
# The separator is passed by keyword because pandas 2.0+ accepts only the
# path as a positional argument of read_csv.
df = pd.read_csv('train_case2.csv', sep=';')
df.head(3)

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1


Анализ данных показал, что в полях 'ap_hi' и 'ap_lo' существуют выбросы — значения, не свойственные измеряемым характеристикам.

In [3]:
# Show the first rows whose 'ap_hi' reading exceeds 300.
df.loc[df['ap_hi'] > 300].head(5)

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
1876,2654,15116,1,160,60.0,902,60,1,1,0,0,1,0
2014,2845,22712,2,167,59.0,906,0,1,1,0,0,1,0
4817,6822,14425,1,168,63.0,909,60,2,1,0,0,1,0
7763,11089,21032,1,175,80.0,11500,90,1,1,0,0,1,1
8915,12710,18870,1,164,75.0,1420,80,2,1,0,0,1,1


In [38]:
# Show the first rows whose 'ap_hi' reading is below 30.
df.loc[df['ap_hi'] < 30].head(5)

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
567,815,21281,1,168,78.0,14,90,2,1,0,0,1,1
927,1294,21867,2,175,70.0,14,90,3,1,0,0,1,1
979,1360,18225,1,172,65.0,11,80,1,3,0,0,1,0
1600,2260,19201,1,165,66.0,12,80,1,1,0,0,1,0
1627,2301,22591,1,165,71.0,14,80,1,1,0,0,1,1


In [5]:
# Show the first rows whose 'ap_hi' reading is negative.
df.loc[df['ap_hi'] < 0].head(5)

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
4607,6525,15281,1,165,78.0,-100,80,2,1,0,0,1,0
16021,22881,22108,2,161,90.0,-115,70,1,1,0,0,1,0
20536,29313,15581,1,153,54.0,-100,70,1,1,0,0,1,0
23988,34295,18301,1,162,74.0,-140,90,1,1,0,0,1,1
25240,36025,14711,2,168,50.0,-120,80,2,1,0,0,0,1


In [6]:

class ColumnSelector():
    """
    Pipeline step that extracts one column from the input by its key.

    ``transform`` returns ``X[key]`` (a single indexing operation, so a
    DataFrame input yields a Series). ``fit`` learns nothing.

    NOTE(review): unlike NumberSelector and OHEEncoder in this file, this
    class does not derive from BaseEstimator / TransformerMixin.
    """

    def __init__(self, key):
        # Name of the column to extract.
        self.key = key

    def fit(self, X, y=None):
        """No-op; present so the object can be used as a pipeline step."""
        return self

    def transform(self, X):
        """Return the column of ``X`` stored under ``self.key``."""
        selected = X[self.key]
        return selected

class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Pipeline step that extracts one column and keeps it two-dimensional.

    ``transform`` indexes with a one-element list (``X[[key]]``), so a
    DataFrame input yields a single-column DataFrame rather than a Series.
    Intended for numeric columns. ``fit`` learns nothing.
    """

    def __init__(self, key):
        # Name of the column to extract.
        self.key = key

    def fit(self, X, y=None):
        """No-op; present so the object can be used as a pipeline step."""
        return self

    def transform(self, X):
        """Return ``X`` restricted to the single column ``self.key``."""
        wanted = [self.key]
        return X[wanted]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    """
    One-hot encode a single column with ``pd.get_dummies``.

    The dummy columns produced for the data passed to ``fit`` are remembered,
    and ``transform`` always returns exactly those columns, in that order.
    """

    def __init__(self, key):
        # Column name; used as the prefix of the generated dummy columns.
        self.key = key
        # Dummy column names learned in ``fit``.
        self.columns = []

    def fit(self, X, y=None):
        """Record the dummy column names produced for ``X``; return self."""
        self.columns = list(pd.get_dummies(X, prefix=self.key).columns)
        return self

    def transform(self, X):
        """
        Return the dummies of ``X`` aligned to the columns seen in ``fit``.

        Categories seen in ``fit`` but absent from ``X`` become all-zero
        columns; categories that appear only in ``X`` are dropped.
        """
        dummies = pd.get_dummies(X, prefix=self.key)
        # reindex adds the fitted columns that are missing here (filled with
        # 0) and drops unseen ones. Plain ``dummies[self.columns]`` raises
        # KeyError as soon as one fitted category is absent from X.
        return dummies.reindex(columns=self.columns, fill_value=0)
    
class ApTransfom():
    '''Rewrites outlying values of the 'ap_hi' / 'ap_lo' fields.

    Applied in order, on a copy of the input:
      1. values above 300 are floor-divided by 100;
      2. negative values have their sign flipped;
      3. values below 30 are multiplied by 10.
    Each rule sees the result of the previous one.
    '''
    def __init__(self):
        # Nothing to configure.
        pass

    def fit(self, X, y=None):
        '''No-op; present so the object can be used as a pipeline step.'''
        return self

    def transform(self, X):
        '''Return a corrected copy of X; X itself is left untouched.'''
        fixed = X.copy()

        too_large = fixed > 300
        fixed[too_large] = fixed[too_large] // 100

        negative = fixed < 0
        fixed[negative] = -fixed[negative]

        # Evaluated after the two steps above, so it also catches values
        # that step 1 shrank below 30.
        too_small = fixed < 30
        fixed[too_small] = fixed[too_small] * 10

        return fixed

    
# Column groups; each group gets its own preprocessing branch.
continuos_cols = ['age', 'height', 'weight']
transf_cols = ['ap_hi', 'ap_lo']
cat_cols = ['gender', 'cholesterol']
base_cols = ['gluc', 'smoke', 'alco', 'active']

# Blood-pressure columns: select -> rewrite outliers -> standardize.
data_transform = [
    (name, Pipeline([
        ('selector', NumberSelector(key=name)),
        ('transfdata', ApTransfom()),
        ('standard', StandardScaler())
    ]))
    for name in transf_cols
]

# Continuous columns: select -> standardize.
continuos_transformers = [
    (name, Pipeline([
        ('selector', NumberSelector(key=name)),
        ('standard', StandardScaler())
    ]))
    for name in continuos_cols
]

# Categorical columns: select -> one-hot encode.
cat_transformers = [
    (name, Pipeline([
        ('selector', ColumnSelector(key=name)),
        ('ohe', OHEEncoder(key=name))
    ]))
    for name in cat_cols
]

# Remaining columns are passed through unchanged.
base_transformers = [
    (name, Pipeline([
        ('selector', NumberSelector(key=name))
    ]))
    for name in base_cols
]

# Concatenate all branches side by side into one feature matrix.
feats = FeatureUnion(
    continuos_transformers + cat_transformers + base_transformers + data_transform
)


In [7]:
# Hold-out split: every column except the target 'cardio' is a feature.
# `columns=` replaces the positional axis argument of DataFrame.drop, which
# pandas 2.0+ no longer accepts.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='cardio'),
    df['cardio'],
    random_state=0,
)

### Классификатор LogisticRegression

In [8]:
# Shared feature union followed by a logistic-regression classifier.
classifierLiner = Pipeline(steps=[
    ('features', feats),
    ('classifierLiner', LogisticRegression(random_state=21)),
])

In [9]:
%%time
# Fit the logistic-regression pipeline on the training split.
classifierLiner.fit(X_train, y_train)

Wall time: 183 ms


Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('age',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='age')),
                                                                 ('standard',
                                                                  StandardScaler())])),
                                                ('height',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='height')),
                                                                 ('standard',
                                                                  StandardScaler())])),
                                                ('weight',
                                                 Pipeline(steps=[('selector',
                                        

In [10]:
# 5-fold cross-validated ROC AUC of the logistic-regression pipeline,
# computed on the training split only.
cv_scores = cross_val_score(classifierLiner, X_train, y_train, cv=5, scoring='roc_auc')

In [11]:
# Mean and standard deviation of the per-fold ROC AUC scores.
cv_score, cv_score_std = (stat(cv_scores) for stat in (np.mean, np.std))
print('CV score is {}+-{}'.format(cv_score, cv_score_std))

CV score is 0.7927032649941874+-0.003762115867224669


In [12]:
# Class predictions on the hold-out set, using the classifier's own decision rule.
pred_Liner = classifierLiner.predict(X_test)

In [13]:
# Start the comparison table with the logistic-regression row.
# Precision / recall / F1 come from the hold-out predictions; the last
# column is the mean cross-validated ROC AUC held in `cv_score`.
result = [[
    'Linear',
    *(metric(y_test, pred_Liner) for metric in (precision_score, recall_score, f1_score)),
    cv_score,
]]

In [14]:
# Positive-class probabilities on the hold-out set.
y_score = classifierLiner.predict_proba(X_test)[:, 1]
b = 1  # beta of the F-beta score; 1 gives F1
precision, recall, thresholds = precision_recall_curve(y_test.values, y_score)
# F-beta at every curve point. Where precision and recall are both 0 the
# ratio would be 0/0 = NaN, and np.argmax would pick that NaN as the
# "maximum". The numerator is 0 there as well, so dividing by 1 yields 0.
denominator = b**2*precision + recall
denominator[denominator == 0] = 1
fscore = (1+b**2)*(precision * recall) / denominator
# locate the index of the largest f score
# The last curve point (precision=1, recall=0) has no matching threshold,
# so it is left out of the search to keep thresholds[ix] in range.
ix = np.argmax(fscore[:-1])
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))



Best Threshold=0.381611, F-Score=0.735, Precision=0.662, Recall=0.825


In [15]:
# Row for the F-score-optimal threshold; ROC AUC here is computed on the
# hold-out set from the predicted probabilities in `y_score`.
result.append(['Linear Best_Threshold', precision[ix], recall[ix], fscore[ix], roc_auc_score(y_test, y_score)])

### Классификатор RandomForestClassifier

In [16]:
# Shared feature union followed by a 250-tree random forest.
classifierForest = Pipeline(steps=[
    ('features', feats),
    ('classifierForest', RandomForestClassifier(n_estimators=250, random_state=21)),
])

In [17]:
%%time
# Fit the random-forest pipeline on the training split.
classifierForest.fit(X_train, y_train)

Wall time: 12.6 s


Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('age',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='age')),
                                                                 ('standard',
                                                                  StandardScaler())])),
                                                ('height',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='height')),
                                                                 ('standard',
                                                                  StandardScaler())])),
                                                ('weight',
                                                 Pipeline(steps=[('selector',
                                        

In [18]:
# 5-fold cross-validated ROC AUC of the random-forest pipeline,
# computed on the training split only.
cv_scores = cross_val_score(classifierForest, X_train, y_train, cv=5, scoring='roc_auc')

In [19]:
# Mean and standard deviation of the per-fold ROC AUC scores.
cv_score, cv_score_std = (stat(cv_scores) for stat in (np.mean, np.std))
print('CV score is {}+-{}'.format(cv_score, cv_score_std))

CV score is 0.7760003176657189+-0.0032647740986071626


In [20]:
# Class predictions on the hold-out set, using the classifier's own decision rule.
pred_Forest = classifierForest.predict(X_test)

In [21]:
# Random-forest row: hold-out precision / recall / F1, then the mean
# cross-validated ROC AUC held in `cv_score`.
result.append([
    'Forest',
    *(metric(y_test, pred_Forest) for metric in (precision_score, recall_score, f1_score)),
    cv_score,
])

In [22]:
# Positive-class probabilities on the hold-out set.
y_score = classifierForest.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test.values, y_score)
b = 1  # beta of the F-beta score; 1 gives F1
# F-beta at every curve point. Where precision and recall are both 0 the
# ratio would be 0/0 = NaN, and np.argmax would pick that NaN as the
# "maximum". The numerator is 0 there as well, so dividing by 1 yields 0.
denominator = b**2*precision + recall
denominator[denominator == 0] = 1
fscore = (1+b**2)*(precision * recall) / denominator

# The last curve point (precision=1, recall=0) has no matching threshold,
# so it is left out of the search to keep thresholds[ix] in range.
ix = np.argmax(fscore[:-1])
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.368000, F-Score=0.722, Precision=0.656, Recall=0.802


In [23]:
# Row for the F-score-optimal threshold; ROC AUC here is computed on the
# hold-out set from the predicted probabilities in `y_score`.
result.append(['Forest Best_Threshold', precision[ix], recall[ix], fscore[ix], roc_auc_score(y_test, y_score)])

### Классификатор CatBoostClassifier

In [24]:
# Shared feature union followed by a 250-iteration CatBoost classifier.
classifierBoost = Pipeline(steps=[
    ('features', feats),
    ('classifierBoost', CatBoostClassifier(n_estimators=250, random_state=21)),
])

In [25]:
%%time
# Fit the CatBoost pipeline on the training split.
classifierBoost.fit(X_train, y_train)

Learning rate set to 0.199311
0:	learn: 0.6398112	total: 189ms	remaining: 46.9s
1:	learn: 0.6059081	total: 211ms	remaining: 26.2s
2:	learn: 0.5842919	total: 228ms	remaining: 18.8s
3:	learn: 0.5707659	total: 254ms	remaining: 15.6s
4:	learn: 0.5624749	total: 278ms	remaining: 13.6s
5:	learn: 0.5565750	total: 295ms	remaining: 12s
6:	learn: 0.5523680	total: 314ms	remaining: 10.9s
7:	learn: 0.5493582	total: 332ms	remaining: 10s
8:	learn: 0.5471127	total: 349ms	remaining: 9.35s
9:	learn: 0.5448282	total: 365ms	remaining: 8.75s
10:	learn: 0.5433913	total: 381ms	remaining: 8.27s
11:	learn: 0.5422479	total: 393ms	remaining: 7.79s
12:	learn: 0.5412804	total: 425ms	remaining: 7.74s
13:	learn: 0.5406642	total: 440ms	remaining: 7.42s
14:	learn: 0.5400318	total: 456ms	remaining: 7.14s
15:	learn: 0.5390409	total: 471ms	remaining: 6.89s
16:	learn: 0.5383652	total: 486ms	remaining: 6.66s
17:	learn: 0.5379787	total: 516ms	remaining: 6.66s
18:	learn: 0.5374637	total: 532ms	remaining: 6.47s
19:	learn: 0.53

167:	learn: 0.5151485	total: 3.46s	remaining: 1.69s
168:	learn: 0.5150841	total: 3.52s	remaining: 1.69s
169:	learn: 0.5149264	total: 3.54s	remaining: 1.67s
170:	learn: 0.5148180	total: 3.55s	remaining: 1.64s
171:	learn: 0.5147342	total: 3.57s	remaining: 1.62s
172:	learn: 0.5146346	total: 3.59s	remaining: 1.6s
173:	learn: 0.5145694	total: 3.6s	remaining: 1.57s
174:	learn: 0.5144605	total: 3.62s	remaining: 1.55s
175:	learn: 0.5143813	total: 3.63s	remaining: 1.53s
176:	learn: 0.5142816	total: 3.65s	remaining: 1.5s
177:	learn: 0.5140682	total: 3.67s	remaining: 1.48s
178:	learn: 0.5139183	total: 3.68s	remaining: 1.46s
179:	learn: 0.5138067	total: 3.69s	remaining: 1.44s
180:	learn: 0.5136039	total: 3.72s	remaining: 1.42s
181:	learn: 0.5134533	total: 3.74s	remaining: 1.4s
182:	learn: 0.5133233	total: 3.75s	remaining: 1.37s
183:	learn: 0.5132247	total: 3.77s	remaining: 1.35s
184:	learn: 0.5130867	total: 3.78s	remaining: 1.33s
185:	learn: 0.5129154	total: 3.8s	remaining: 1.31s
186:	learn: 0.512

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('age',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='age')),
                                                                 ('standard',
                                                                  StandardScaler())])),
                                                ('height',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='height')),
                                                                 ('standard',
                                                                  StandardScaler())])),
                                                ('weight',
                                                 Pipeline(steps=[('selector',
                                        

In [26]:
# 5-fold cross-validated ROC AUC of the CatBoost pipeline,
# computed on the training split only.
cv_scores = cross_val_score(classifierBoost, X_train, y_train, cv=5, scoring='roc_auc')

Learning rate set to 0.181197
0:	learn: 0.6441673	total: 35ms	remaining: 8.71s
1:	learn: 0.6129202	total: 44.3ms	remaining: 5.49s
2:	learn: 0.5911798	total: 55.3ms	remaining: 4.55s
3:	learn: 0.5765769	total: 67.5ms	remaining: 4.15s
4:	learn: 0.5666938	total: 87.8ms	remaining: 4.3s
5:	learn: 0.5607169	total: 99.2ms	remaining: 4.03s
6:	learn: 0.5555822	total: 109ms	remaining: 3.79s
7:	learn: 0.5516360	total: 133ms	remaining: 4.02s
8:	learn: 0.5491996	total: 144ms	remaining: 3.85s
9:	learn: 0.5469097	total: 155ms	remaining: 3.72s
10:	learn: 0.5451569	total: 173ms	remaining: 3.76s
11:	learn: 0.5438297	total: 192ms	remaining: 3.81s
12:	learn: 0.5425261	total: 207ms	remaining: 3.77s
13:	learn: 0.5415057	total: 222ms	remaining: 3.74s
14:	learn: 0.5407620	total: 238ms	remaining: 3.73s
15:	learn: 0.5400556	total: 254ms	remaining: 3.71s
16:	learn: 0.5395469	total: 267ms	remaining: 3.66s
17:	learn: 0.5391323	total: 280ms	remaining: 3.61s
18:	learn: 0.5386311	total: 292ms	remaining: 3.55s
19:	lear

167:	learn: 0.5130730	total: 2.22s	remaining: 1.08s
168:	learn: 0.5128721	total: 2.24s	remaining: 1.07s
169:	learn: 0.5128152	total: 2.25s	remaining: 1.06s
170:	learn: 0.5127170	total: 2.28s	remaining: 1.05s
171:	learn: 0.5126087	total: 2.3s	remaining: 1.04s
172:	learn: 0.5124709	total: 2.31s	remaining: 1.03s
173:	learn: 0.5123376	total: 2.35s	remaining: 1.02s
174:	learn: 0.5120868	total: 2.36s	remaining: 1.01s
175:	learn: 0.5118921	total: 2.38s	remaining: 999ms
176:	learn: 0.5118304	total: 2.39s	remaining: 984ms
177:	learn: 0.5117147	total: 2.4s	remaining: 971ms
178:	learn: 0.5115219	total: 2.42s	remaining: 961ms
179:	learn: 0.5114125	total: 2.44s	remaining: 948ms
180:	learn: 0.5112844	total: 2.45s	remaining: 934ms
181:	learn: 0.5110606	total: 2.46s	remaining: 920ms
182:	learn: 0.5108899	total: 2.47s	remaining: 905ms
183:	learn: 0.5108046	total: 2.48s	remaining: 890ms
184:	learn: 0.5106814	total: 2.49s	remaining: 875ms
185:	learn: 0.5105263	total: 2.5s	remaining: 861ms
186:	learn: 0.5

82:	learn: 0.5241106	total: 1.1s	remaining: 2.22s
83:	learn: 0.5238596	total: 1.11s	remaining: 2.2s
84:	learn: 0.5236885	total: 1.12s	remaining: 2.18s
85:	learn: 0.5234620	total: 1.15s	remaining: 2.18s
86:	learn: 0.5233311	total: 1.17s	remaining: 2.2s
87:	learn: 0.5231380	total: 1.18s	remaining: 2.18s
88:	learn: 0.5229131	total: 1.19s	remaining: 2.16s
89:	learn: 0.5226788	total: 1.22s	remaining: 2.17s
90:	learn: 0.5224878	total: 1.23s	remaining: 2.15s
91:	learn: 0.5223951	total: 1.24s	remaining: 2.13s
92:	learn: 0.5222195	total: 1.25s	remaining: 2.11s
93:	learn: 0.5220618	total: 1.26s	remaining: 2.09s
94:	learn: 0.5219248	total: 1.27s	remaining: 2.07s
95:	learn: 0.5218636	total: 1.28s	remaining: 2.05s
96:	learn: 0.5216480	total: 1.29s	remaining: 2.03s
97:	learn: 0.5214886	total: 1.3s	remaining: 2.02s
98:	learn: 0.5212462	total: 1.33s	remaining: 2.02s
99:	learn: 0.5210260	total: 1.34s	remaining: 2.01s
100:	learn: 0.5208666	total: 1.37s	remaining: 2.02s
101:	learn: 0.5206697	total: 1.38s

246:	learn: 0.5005848	total: 3.32s	remaining: 40.3ms
247:	learn: 0.5004453	total: 3.33s	remaining: 26.8ms
248:	learn: 0.5002572	total: 3.35s	remaining: 13.5ms
249:	learn: 0.5000575	total: 3.37s	remaining: 0us
Learning rate set to 0.181197
0:	learn: 0.6437700	total: 27.2ms	remaining: 6.76s
1:	learn: 0.6121432	total: 42.1ms	remaining: 5.21s
2:	learn: 0.5903050	total: 51.3ms	remaining: 4.23s
3:	learn: 0.5759426	total: 62.6ms	remaining: 3.85s
4:	learn: 0.5668986	total: 72.3ms	remaining: 3.54s
5:	learn: 0.5595791	total: 82.2ms	remaining: 3.34s
6:	learn: 0.5544895	total: 91.1ms	remaining: 3.16s
7:	learn: 0.5506775	total: 101ms	remaining: 3.06s
8:	learn: 0.5479224	total: 111ms	remaining: 2.98s
9:	learn: 0.5462371	total: 121ms	remaining: 2.9s
10:	learn: 0.5442668	total: 131ms	remaining: 2.84s
11:	learn: 0.5429289	total: 140ms	remaining: 2.77s
12:	learn: 0.5415833	total: 150ms	remaining: 2.73s
13:	learn: 0.5406351	total: 159ms	remaining: 2.69s
14:	learn: 0.5397722	total: 172ms	remaining: 2.7s
1

164:	learn: 0.5145300	total: 2.24s	remaining: 1.15s
165:	learn: 0.5144272	total: 2.25s	remaining: 1.14s
166:	learn: 0.5142663	total: 2.26s	remaining: 1.12s
167:	learn: 0.5140373	total: 2.27s	remaining: 1.11s
168:	learn: 0.5139010	total: 2.28s	remaining: 1.09s
169:	learn: 0.5137077	total: 2.29s	remaining: 1.08s
170:	learn: 0.5134978	total: 2.3s	remaining: 1.06s
171:	learn: 0.5133147	total: 2.31s	remaining: 1.05s
172:	learn: 0.5132186	total: 2.33s	remaining: 1.03s
173:	learn: 0.5130251	total: 2.34s	remaining: 1.02s
174:	learn: 0.5128723	total: 2.35s	remaining: 1.01s
175:	learn: 0.5127408	total: 2.37s	remaining: 995ms
176:	learn: 0.5126190	total: 2.38s	remaining: 981ms
177:	learn: 0.5124087	total: 2.39s	remaining: 968ms
178:	learn: 0.5122831	total: 2.41s	remaining: 954ms
179:	learn: 0.5121297	total: 2.42s	remaining: 941ms
180:	learn: 0.5119451	total: 2.44s	remaining: 929ms
181:	learn: 0.5117462	total: 2.45s	remaining: 916ms
182:	learn: 0.5116516	total: 2.46s	remaining: 902ms
183:	learn: 0

84:	learn: 0.5221701	total: 1.2s	remaining: 2.33s
85:	learn: 0.5218907	total: 1.22s	remaining: 2.32s
86:	learn: 0.5217789	total: 1.23s	remaining: 2.3s
87:	learn: 0.5216577	total: 1.24s	remaining: 2.28s
88:	learn: 0.5215550	total: 1.25s	remaining: 2.26s
89:	learn: 0.5213509	total: 1.27s	remaining: 2.26s
90:	learn: 0.5212289	total: 1.28s	remaining: 2.24s
91:	learn: 0.5210631	total: 1.29s	remaining: 2.22s
92:	learn: 0.5210458	total: 1.3s	remaining: 2.2s
93:	learn: 0.5209936	total: 1.31s	remaining: 2.17s
94:	learn: 0.5208643	total: 1.32s	remaining: 2.16s
95:	learn: 0.5206338	total: 1.34s	remaining: 2.14s
96:	learn: 0.5204100	total: 1.35s	remaining: 2.13s
97:	learn: 0.5202107	total: 1.36s	remaining: 2.11s
98:	learn: 0.5199826	total: 1.38s	remaining: 2.1s
99:	learn: 0.5197438	total: 1.39s	remaining: 2.09s
100:	learn: 0.5196821	total: 1.41s	remaining: 2.07s
101:	learn: 0.5196351	total: 1.42s	remaining: 2.06s
102:	learn: 0.5195329	total: 1.43s	remaining: 2.04s
103:	learn: 0.5191856	total: 1.45

249:	learn: 0.4999259	total: 3.38s	remaining: 0us
Learning rate set to 0.181197
0:	learn: 0.6445123	total: 22.5ms	remaining: 5.59s
1:	learn: 0.6128353	total: 37.9ms	remaining: 4.7s
2:	learn: 0.5921203	total: 49.1ms	remaining: 4.04s
3:	learn: 0.5773577	total: 58.2ms	remaining: 3.58s
4:	learn: 0.5676691	total: 67.8ms	remaining: 3.32s
5:	learn: 0.5608805	total: 77.2ms	remaining: 3.14s
6:	learn: 0.5555857	total: 86ms	remaining: 2.98s
7:	learn: 0.5524326	total: 95.7ms	remaining: 2.89s
8:	learn: 0.5495755	total: 106ms	remaining: 2.83s
9:	learn: 0.5472230	total: 115ms	remaining: 2.76s
10:	learn: 0.5457129	total: 125ms	remaining: 2.71s
11:	learn: 0.5441991	total: 133ms	remaining: 2.65s
12:	learn: 0.5428949	total: 155ms	remaining: 2.83s
13:	learn: 0.5418022	total: 165ms	remaining: 2.78s
14:	learn: 0.5409861	total: 178ms	remaining: 2.79s
15:	learn: 0.5401380	total: 188ms	remaining: 2.75s
16:	learn: 0.5395475	total: 200ms	remaining: 2.73s
17:	learn: 0.5388525	total: 210ms	remaining: 2.7s
18:	lear

172:	learn: 0.5123564	total: 2.2s	remaining: 980ms
173:	learn: 0.5122533	total: 2.22s	remaining: 969ms
174:	learn: 0.5120986	total: 2.23s	remaining: 954ms
175:	learn: 0.5120356	total: 2.26s	remaining: 950ms
176:	learn: 0.5118642	total: 2.27s	remaining: 936ms
177:	learn: 0.5117656	total: 2.29s	remaining: 926ms
178:	learn: 0.5116347	total: 2.3s	remaining: 912ms
179:	learn: 0.5115042	total: 2.32s	remaining: 902ms
180:	learn: 0.5114161	total: 2.33s	remaining: 888ms
181:	learn: 0.5112585	total: 2.34s	remaining: 874ms
182:	learn: 0.5111554	total: 2.35s	remaining: 860ms
183:	learn: 0.5111087	total: 2.36s	remaining: 846ms
184:	learn: 0.5110437	total: 2.37s	remaining: 833ms
185:	learn: 0.5108266	total: 2.38s	remaining: 819ms
186:	learn: 0.5107231	total: 2.4s	remaining: 807ms
187:	learn: 0.5106382	total: 2.41s	remaining: 795ms
188:	learn: 0.5104357	total: 2.42s	remaining: 781ms
189:	learn: 0.5103729	total: 2.43s	remaining: 767ms
190:	learn: 0.5101912	total: 2.44s	remaining: 754ms
191:	learn: 0.5

In [27]:
# Mean and standard deviation of the per-fold ROC AUC scores.
cv_score, cv_score_std = (stat(cv_scores) for stat in (np.mean, np.std))
print('CV score is {}+-{}'.format(cv_score, cv_score_std))

CV score is 0.800452523900829+-0.003910482381529339


In [28]:
# Class predictions on the hold-out set, using the classifier's own decision rule.
pred_Boost = classifierBoost.predict(X_test)

In [29]:
# CatBoost row: hold-out precision / recall / F1, then the mean
# cross-validated ROC AUC held in `cv_score`.
result.append([
    'Boost',
    *(metric(y_test, pred_Boost) for metric in (precision_score, recall_score, f1_score)),
    cv_score,
])

In [30]:
# Positive-class probabilities on the hold-out set.
y_score = classifierBoost.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test.values, y_score)
b = 1  # beta of the F-beta score; 1 gives F1
# F-beta at every curve point. Where precision and recall are both 0 the
# ratio would be 0/0 = NaN, and np.argmax would pick that NaN as the
# "maximum". The numerator is 0 there as well, so dividing by 1 yields 0.
denominator = b**2*precision + recall
denominator[denominator == 0] = 1
fscore = (1+b**2)*(precision * recall) / denominator

# The last curve point (precision=1, recall=0) has no matching threshold,
# so it is left out of the search to keep thresholds[ix] in range.
ix = np.argmax(fscore[:-1])
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.397320, F-Score=0.738, Precision=0.698, Recall=0.782


In [31]:
# Row for the F-score-optimal threshold; ROC AUC here is computed on the
# hold-out set from the predicted probabilities in `y_score`.
result.append(['Boost Best_Threshold', precision[ix], recall[ix], fscore[ix], roc_auc_score(y_test, y_score)])

### Задание3 

In [32]:
# Turn the collected metric rows into a table: one row per model variant,
# one column per metric.
result = pd.DataFrame(result, columns = ['model','pres', 'rec', 'f1', 'roc_auc'])

In [33]:
# Display the model comparison table.
result

Unnamed: 0,model,pres,rec,f1,roc_auc
0,Linear,0.755224,0.670392,0.710284,0.792703
1,Linear Best_Threshold,0.66241,0.824654,0.734681,0.792228
2,Forest,0.718153,0.696889,0.707361,0.776
3,Forest Best_Threshold,0.656235,0.802074,0.721862,0.772944
4,Boost,0.753211,0.689171,0.719769,0.800453
5,Boost Best_Threshold,0.698036,0.781912,0.737597,0.800097


### Задание4 

Выводы: 
Указать на явное преимущество определённого классификатора в данном задании не получилось. Все полученные метрики очень близки друг к другу. 
Самые лучшие результаты показал CatBoostClassifier, но в силу незначительного отличия от LogisticRegression я всё-таки отдаю предпочтение регрессии. Это связано со значительной разницей во времени исполнения.


### Задание5  

какая метрика (precision_recall_curve или roc_auc_curve) больше подходит в случае сильного дисбаланса классов? (когда объектов одного из классов намного больше чем другого).

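Графики отличаются одной осью: ROC-кривая строится в координатах TPR (recall) — FPR, а PR-кривая — в координатах precision — recall. Поэтому достаточно сравнить поведение FPR и precision.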

рассмотрим два случая 


| TP= 90  |  FP = 10   |
|:-----------:|:----------:|
| FN = 10 | TN = 99890 |


| TP= 90  |  FP = 45   |
|:-----------:|:----------:|
| FN = 10 | TN = 99855 |


$FPR_1 = \frac {10}{(10+99890)}$<br>
$precision_1 = \frac {90}{90+10}$

In [34]:
10/(10+99890)

0.0001001001001001001

In [35]:
90/(90+10)

0.9

$FPR_2 = \frac {45}{(45+99855)}$<br>
$precision_2 = \frac {90}{90+45}$

In [36]:
45/(45+99855)

0.00045045045045045046

In [37]:
90/(90+45)

0.6666666666666666

Из полученных результатов видно, что:
1. при подобном изменении поведения модели FPR меняется незначительно (с ≈0.0001 до ≈0.00045), поэтому на графике __roc_auc_curve__ мы, вероятнее всего, не увидим отличия. 
2. параметр *precision* упал с 0.90 до 0.67 (примерно на 23 процентных пункта), и график __precision_recall_curve__ явно это покажет. 

Поэтому при большом дисбалансе классов график __precision_recall_curve__ будет более информативным. 