# ДЗ 3 Гладышев В.В.

## Определение ССЗ

### Загрузка и предобработка данных

In [15]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from scipy.sparse import hstack
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import precision_recall_curve, confusion_matrix
from sklearn.preprocessing import StandardScaler

In [3]:
df = pd.read_csv('train_case2.csv', ';')
df.head(3)

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1


In [6]:
#разделим данные на train/test
X_train, X_test, y_train, y_test = train_test_split(df.drop('cardio', 1), 
                                                    df['cardio'], random_state=0)

In [7]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        for col_ in test_columns:
            if col_ not in self.columns:
                X[col_] = 0
        return X[self.columns]


continuos_cols = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']
cat_cols = ['gender', 'cholesterol']
base_cols = ['gluc', 'smoke', 'alco', 'active']

continuos_transformers = []
cat_transformers = []
base_transformers = []

for cont_col in continuos_cols:
    transfomer =  Pipeline([
                ('selector', NumberSelector(key=cont_col)),
                ('standard', StandardScaler())
            ])
    continuos_transformers.append((cont_col, transfomer))
    
for cat_col in cat_cols:
    cat_transformer = Pipeline([
                ('selector', ColumnSelector(key=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    cat_transformers.append((cat_col, cat_transformer))
    
for base_col in base_cols:
    base_transformer = Pipeline([
                ('selector', NumberSelector(key=base_col))
            ])
    base_transformers.append((base_col, base_transformer))

In [8]:
from sklearn.pipeline import FeatureUnion

feats = FeatureUnion(continuos_transformers+cat_transformers+base_transformers)
feature_processing = Pipeline([('feats', feats)])

feature_processing.fit_transform(X_train)

array([[-1.73391771,  0.6873301 ,  0.74843904, ...,  1.        ,
         0.        ,  1.        ],
       [-1.67343538,  0.07758923, -0.29640123, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.13738132,  1.17512278, -0.15708919, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 1.17775864,  1.17512278, -0.15708919, ...,  0.        ,
         0.        ,  1.        ],
       [-0.47190715, -1.38578883,  0.74843904, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.38174619,  0.56538192, -0.08743318, ...,  0.        ,
         0.        ,  1.        ]])

### Таблица для сохранения результатов

In [51]:
rw_clissifiers = ['LogisticRegression', 
                 'KNeighborsClassifier', 
                 'RandomForestClassifier', 
                 'GradientBoostingClassifier', 
                 'XGBClassifier']

cl_met = ['Best Threshold', 'F-Score', 'Precision', 
          'Recall', 'roc_auc_s', 'log_loss_s', 'TPR', 'FPR', 'TNR']

res_tab = pd.DataFrame(columns=cl_met)

### LogisticRegression

In [52]:
classifier = Pipeline([
    ('features',feats),
    ('classifier', LogisticRegression(random_state = 42)),
])


#запустим кросс-валидацию
cv_scores = cross_val_score(classifier, X_train, y_train, cv=16, scoring='roc_auc')
cv_score = np.mean(cv_scores)
cv_score_std = np.std(cv_scores)
print('CV score is {}+-{}'.format(cv_score, cv_score_std))

#обучим пайплайн на всем тренировочном датасете
classifier.fit(X_train, y_train)
y_score = classifier.predict_proba(X_test)[:, 1]

CV score is 0.7867401104915408+-0.00852135511666111


In [53]:
b=1
precision, recall, thresholds = precision_recall_curve(y_test.values, y_score)
fscore = (1+b**2)*(precision * recall) / (b**2*precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.386937, F-Score=0.730, Precision=0.647, Recall=0.838


In [54]:
from sklearn.metrics import roc_auc_score, log_loss

r_auc = roc_auc_score(y_true=y_test, y_score=classifier.predict_proba(X_test)[:,1])
l_los = log_loss(y_true=y_test, y_pred=classifier.predict_proba(X_test)[:,1])

print("roc auc score: {}".format(r_auc))
print("log loss score: {}".format(l_los))

roc auc score: 0.7840347790421852
log loss score: 0.5779604008230663


In [55]:
cnf_matrix = confusion_matrix(y_test, y_score>thresholds[ix])

TN = cnf_matrix[0][0]
FN = cnf_matrix[1][0]
TP = cnf_matrix[1][1]
FP = cnf_matrix[0][1]

TPR = TP/(TP+FN)
FPR = FP/(FP+TN)
TNR = TN/(FP+TN)
TPR, FPR, TNR

(0.837442396313364, 0.44886621315192743, 0.5511337868480726)

In [56]:
res_tab.loc['LogisticRegression', :] = [thresholds[ix], fscore[ix], precision[ix], recall[ix], r_auc, l_los, TPR, FPR, TNR]

In [57]:
res_tab

Unnamed: 0,Best Threshold,F-Score,Precision,Recall,roc_auc_s,log_loss_s,TPR,FPR,TNR
LogisticRegression,0.386937,0.730323,0.647431,0.837558,0.784035,0.57796,0.837442,0.448866,0.551134


### KNN

In [58]:
from sklearn.neighbors import KNeighborsClassifier

knn = Pipeline([
    ('features',feats),
    ('knn', KNeighborsClassifier(n_neighbors=5)),
])

#запустим кросс-валидацию
cv_scores = cross_val_score(knn, X_train, y_train, cv=16, scoring='roc_auc')
cv_score = np.mean(cv_scores)
cv_score_std = np.std(cv_scores)
print('CV score is {}+-{}'.format(cv_score, cv_score_std))

#обучим пайплайн на всем тренировочном датасете
knn.fit(X_train, y_train)
y_score = knn.predict_proba(X_test)[:, 1]

CV score is 0.6919218138274713+-0.007083802929940452


In [59]:
b=1
precision, recall, thresholds = precision_recall_curve(y_test.values, y_score)
fscore = (1+b**2)*(precision * recall) / (b**2*precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

r_auc = roc_auc_score(y_true=y_test, y_score=knn.predict_proba(X_test)[:,1])
l_los = log_loss(y_true=y_test, y_pred=knn.predict_proba(X_test)[:,1])

print("roc auc score: {}".format(r_auc))
print("log loss score: {}".format(l_los))

Best Threshold=0.400000, F-Score=0.686, Precision=0.588, Recall=0.822
roc auc score: 0.6940883204280176
log loss score: 2.0495594233752303


In [60]:
cnf_matrix = confusion_matrix(y_test, y_score>thresholds[ix])

TN = cnf_matrix[0][0]
FN = cnf_matrix[1][0]
TP = cnf_matrix[1][1]
FP = cnf_matrix[0][1]

TPR = TP/(TP+FN)
FPR = FP/(FP+TN)
TNR = TN/(FP+TN)
TPR, FPR, TNR

(0.6185483870967742, 0.322108843537415, 0.677891156462585)

In [61]:
res_tab.loc['KNeighborsClassifier', :] = [thresholds[ix], fscore[ix], precision[ix], recall[ix], r_auc, l_los, TPR, FPR, TNR]

In [62]:
res_tab

Unnamed: 0,Best Threshold,F-Score,Precision,Recall,roc_auc_s,log_loss_s,TPR,FPR,TNR
LogisticRegression,0.386937,0.730323,0.647431,0.837558,0.784035,0.57796,0.837442,0.448866,0.551134
KNeighborsClassifier,0.4,0.685863,0.588468,0.821889,0.694088,2.04956,0.618548,0.322109,0.677891


### Random Forest

In [63]:
from sklearn.ensemble import RandomForestClassifier

r_forest = Pipeline([
    ('features',feats),
    ('r_forest', RandomForestClassifier(random_state=42)),
])

#запустим кросс-валидацию
cv_scores = cross_val_score(r_forest, X_train, y_train, cv=16, scoring='roc_auc')
cv_score = np.mean(cv_scores)
cv_score_std = np.std(cv_scores)
print('CV score is {}+-{}'.format(cv_score, cv_score_std))

#обучим пайплайн на всем тренировочном датасете
r_forest.fit(X_train, y_train)
y_score = r_forest.predict_proba(X_test)[:, 1]

CV score is 0.7734501681056019+-0.007171140345435727


In [64]:
b=1
precision, recall, thresholds = precision_recall_curve(y_test.values, y_score)
fscore = (1+b**2)*(precision * recall) / (b**2*precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

r_auc = roc_auc_score(y_true=y_test, y_score=r_forest.predict_proba(X_test)[:,1])
l_los = log_loss(y_true=y_test, y_pred=r_forest.predict_proba(X_test)[:,1])

print("roc auc score: {}".format(r_auc))
print("log loss score: {}".format(l_los))

Best Threshold=0.350000, F-Score=0.719, Precision=0.643, Recall=0.816
roc auc score: 0.7710366181802983
log loss score: 0.5992984853728378


In [65]:
cnf_matrix = confusion_matrix(y_test, y_score>thresholds[ix])

TN = cnf_matrix[0][0]
FN = cnf_matrix[1][0]
TP = cnf_matrix[1][1]
FP = cnf_matrix[0][1]

TPR = TP/(TP+FN)
FPR = FP/(FP+TN)
TNR = TN/(FP+TN)
TPR, FPR, TNR

(0.8084101382488479, 0.43412698412698414, 0.5658730158730159)

In [66]:
res_tab.loc['RandomForestClassifier', :] = [thresholds[ix], fscore[ix], precision[ix], recall[ix], 
                                            r_auc, l_los, TPR, FPR, TNR]
res_tab

Unnamed: 0,Best Threshold,F-Score,Precision,Recall,roc_auc_s,log_loss_s,TPR,FPR,TNR
LogisticRegression,0.386937,0.730323,0.647431,0.837558,0.784035,0.57796,0.837442,0.448866,0.551134
KNeighborsClassifier,0.4,0.685863,0.588468,0.821889,0.694088,2.04956,0.618548,0.322109,0.677891
RandomForestClassifier,0.35,0.718863,0.642669,0.815553,0.771037,0.599298,0.80841,0.434127,0.565873


### GradientBoostingClassifier

In [67]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = Pipeline([
    ('features',feats),
    ('gbc', GradientBoostingClassifier(random_state=42)),
])

#запустим кросс-валидацию
cv_scores = cross_val_score(gbc, X_train, y_train, cv=16, scoring='roc_auc')
cv_score = np.mean(cv_scores)
cv_score_std = np.std(cv_scores)
print('CV score is {}+-{}'.format(cv_score, cv_score_std))

#обучим пайплайн на всем тренировочном датасете
gbc.fit(X_train, y_train)
y_score = gbc.predict_proba(X_test)[:, 1]

CV score is 0.8025125910838183+-0.00707472977074522


In [68]:
b=1
precision, recall, thresholds = precision_recall_curve(y_test.values, y_score)
fscore = (1+b**2)*(precision * recall) / (b**2*precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

r_auc = roc_auc_score(y_true=y_test, y_score=gbc.predict_proba(X_test)[:,1])
l_los = log_loss(y_true=y_test, y_pred=gbc.predict_proba(X_test)[:,1])

print("roc auc score: {}".format(r_auc))
print("log loss score: {}".format(l_los))

Best Threshold=0.394947, F-Score=0.740, Precision=0.698, Recall=0.788
roc auc score: 0.8026153641179974
log loss score: 0.5397460438742135


In [69]:
cnf_matrix = confusion_matrix(y_test, y_score>thresholds[ix])

TN = cnf_matrix[0][0]
FN = cnf_matrix[1][0]
TP = cnf_matrix[1][1]
FP = cnf_matrix[0][1]

TPR = TP/(TP+FN)
FPR = FP/(FP+TN)
TNR = TN/(FP+TN)
TPR, FPR, TNR

(0.7880184331797235, 0.33582766439909295, 0.664172335600907)

In [70]:
res_tab.loc['GradientBoostingClassifier', :] = [thresholds[ix], fscore[ix], precision[ix], recall[ix], 
                                                r_auc, l_los, TPR, FPR, TNR]
res_tab

Unnamed: 0,Best Threshold,F-Score,Precision,Recall,roc_auc_s,log_loss_s,TPR,FPR,TNR
LogisticRegression,0.386937,0.730323,0.647431,0.837558,0.784035,0.57796,0.837442,0.448866,0.551134
KNeighborsClassifier,0.4,0.685863,0.588468,0.821889,0.694088,2.04956,0.618548,0.322109,0.677891
RandomForestClassifier,0.35,0.718863,0.642669,0.815553,0.771037,0.599298,0.80841,0.434127,0.565873
GradientBoostingClassifier,0.394947,0.740248,0.697848,0.788134,0.802615,0.539746,0.788018,0.335828,0.664172


### XGBClassifier

In [71]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [72]:
from xgboost import XGBClassifier

xgbc = Pipeline([
    ('features',feats),
    ('xgbc', XGBClassifier(objective='binary:logistic', silent=True, random_state=42)),
])

#запустим кросс-валидацию
cv_scores = cross_val_score(xgbc, X_train, y_train, cv=16, scoring='roc_auc')
cv_score = np.mean(cv_scores)
cv_score_std = np.std(cv_scores)
print('CV score is {}+-{}'.format(cv_score, cv_score_std))

#обучим пайплайн на всем тренировочном датасете
xgbc.fit(X_train, y_train)
y_score = xgbc.predict_proba(X_test)[:, 1]



Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.






Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.






Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.






Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.






Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.






Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.






Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.






Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.






Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.






Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.






Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.






Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.






Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.






Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.






Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.






Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


CV score is 0.7974879407860951+-0.006447877492510205
Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.








In [73]:
b=1
precision, recall, thresholds = precision_recall_curve(y_test.values, y_score)
fscore = (1+b**2)*(precision * recall) / (b**2*precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

r_auc = roc_auc_score(y_true=y_test, y_score=xgbc.predict_proba(X_test)[:,1])
l_los = log_loss(y_true=y_test, y_pred=xgbc.predict_proba(X_test)[:,1])

print("roc auc score: {}".format(r_auc))
print("log loss score: {}".format(l_los))

Best Threshold=0.347103, F-Score=0.738, Precision=0.665, Recall=0.828
roc auc score: 0.797227760535858
log loss score: 0.5471106075466078


In [74]:
cnf_matrix = confusion_matrix(y_test, y_score>thresholds[ix])

TN = cnf_matrix[0][0]
FN = cnf_matrix[1][0]
TP = cnf_matrix[1][1]
FP = cnf_matrix[0][1]

TPR = TP/(TP+FN)
FPR = FP/(FP+TN)
TNR = TN/(FP+TN)
TPR, FPR, TNR

(0.8282258064516129, 0.41020408163265304, 0.5897959183673469)

In [75]:
res_tab.loc['XGBClassifier', :] = [thresholds[ix], fscore[ix], precision[ix], recall[ix], 
                                                r_auc, l_los, TPR, FPR, TNR]
res_tab

Unnamed: 0,Best Threshold,F-Score,Precision,Recall,roc_auc_s,log_loss_s,TPR,FPR,TNR
LogisticRegression,0.386937,0.730323,0.647431,0.837558,0.784035,0.57796,0.837442,0.448866,0.551134
KNeighborsClassifier,0.4,0.685863,0.588468,0.821889,0.694088,2.04956,0.618548,0.322109,0.677891
RandomForestClassifier,0.35,0.718863,0.642669,0.815553,0.771037,0.599298,0.80841,0.434127,0.565873
GradientBoostingClassifier,0.394947,0.740248,0.697848,0.788134,0.802615,0.539746,0.788018,0.335828,0.664172
XGBClassifier,0.347103,0.73789,0.665248,0.828341,0.797228,0.547111,0.828226,0.410204,0.589796


### Итоговая таблица

In [77]:
res_tab.sort_values('roc_auc_s', ascending=False)

Unnamed: 0,Best Threshold,F-Score,Precision,Recall,roc_auc_s,log_loss_s,TPR,FPR,TNR
GradientBoostingClassifier,0.394947,0.740248,0.697848,0.788134,0.802615,0.539746,0.788018,0.335828,0.664172
XGBClassifier,0.347103,0.73789,0.665248,0.828341,0.797228,0.547111,0.828226,0.410204,0.589796
LogisticRegression,0.386937,0.730323,0.647431,0.837558,0.784035,0.57796,0.837442,0.448866,0.551134
RandomForestClassifier,0.35,0.718863,0.642669,0.815553,0.771037,0.599298,0.80841,0.434127,0.565873
KNeighborsClassifier,0.4,0.685863,0.588468,0.821889,0.694088,2.04956,0.618548,0.322109,0.677891


С гиперпараметрами по умолчанию лучший результат показал GradientBoostingClassifier. Однако для каждого метода желательно произвести подбор гиперпараметов