In [90]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, make_scorer, accuracy_score, confusion_matrix, f1_score, classification_report
from sklearn.preprocessing import Normalizer, OneHotEncoder
import seaborn as sns

%matplotlib inline

In [163]:
df =  pd.read_csv('train.csv', sep=';')

In [164]:
df['age'] = df['age']//365
df['weight'] = df['weight'].astype('int')
df.loc[df['gender'] == 2, 'gender'] = 0

In [109]:
df.loc[df['ap_lo'] > 150, 'ap_lo'] = int(df['ap_lo'].mean())
df.loc[df['ap_lo'] < 50, 'ap_lo'] = int(df['ap_lo'].mean())
df.loc[df['ap_hi'] < 80, 'ap_hi'] = int(df['ap_hi'].mean())
df.loc[df['ap_hi'] > 300, 'ap_hi'] = int(df['ap_hi'].mean())

In [165]:
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,50,0,168,62,110,80,1,1,0,0,1,0
1,1,55,1,156,85,140,90,3,1,0,0,1,1
2,2,51,1,165,64,130,70,3,1,0,0,0,1
3,3,48,0,169,82,150,100,1,1,0,0,1,1
4,4,47,1,156,56,100,60,1,1,0,0,0,0


In [166]:
df_new_3 = df[['gender', 'alco', 'smoke', 'active']]

In [167]:
transformer = Normalizer().fit(df[['age', 'height', 'weight', 'ap_hi', 'ap_lo']])

In [168]:
Normalizer()

Normalizer(copy=True, norm='l2')

In [169]:
new_df = transformer.transform(df[['age', 'height', 'weight', 'ap_hi', 'ap_lo']])

In [170]:
new_df = pd.DataFrame(new_df, columns=['age', 'height', 'weight', 'ap_hi', 'ap_lo'])

In [171]:
new_df

Unnamed: 0,age,height,weight,ap_hi,ap_lo
0,0.217047,0.729278,0.269138,0.477503,0.347275
1,0.220378,0.625071,0.340584,0.560961,0.360618
2,0.216051,0.698989,0.271123,0.550719,0.296541
3,0.181308,0.638354,0.309734,0.566587,0.377724
4,0.225917,0.749853,0.269178,0.480675,0.288405
...,...,...,...,...,...
69995,0.216847,0.700584,0.316931,0.500417,0.333611
69996,0.226923,0.587767,0.468725,0.520806,0.334804
69997,0.175574,0.617884,0.354523,0.607754,0.303877
69998,0.248826,0.664896,0.293697,0.550681,0.326330


In [172]:
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(df[['cholesterol','gluc']])

OneHotEncoder(categorical_features=None, categories=None, drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='ignore',
              n_values=None, sparse=True)

In [173]:
df_new_2 = encoder.transform(df[['cholesterol','gluc']]).toarray()

In [174]:
df_new_2 = pd.DataFrame(df_new_2, columns=['cholesterol_1','cholesterol_2','cholesterol_3','gluc_1','gluc_2','gluc_3'])

In [175]:
df_new_2.head()

Unnamed: 0,cholesterol_1,cholesterol_2,cholesterol_3,gluc_1,gluc_2,gluc_3
0,1.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,1.0,1.0,0.0,0.0
2,0.0,0.0,1.0,1.0,0.0,0.0
3,1.0,0.0,0.0,1.0,0.0,0.0
4,1.0,0.0,0.0,1.0,0.0,0.0


In [176]:
data = pd.concat([new_df, df_new_2, df_new_3], sort=False, axis=1)

In [36]:
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('cardio', axis=1), df['cardio'], test_size=0.33, random_state=42)
#train = df.drop('cardio', axis=1)

In [177]:
X_train, X_test, y_train, y_test = train_test_split(
    data, df['cardio'], test_size=0.33, random_state=42)

In [178]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46900 entries, 64334 to 15795
Data columns (total 15 columns):
age              46900 non-null float64
height           46900 non-null float64
weight           46900 non-null float64
ap_hi            46900 non-null float64
ap_lo            46900 non-null float64
cholesterol_1    46900 non-null float64
cholesterol_2    46900 non-null float64
cholesterol_3    46900 non-null float64
gluc_1           46900 non-null float64
gluc_2           46900 non-null float64
gluc_3           46900 non-null float64
gender           46900 non-null int64
alco             46900 non-null int64
smoke            46900 non-null int64
active           46900 non-null int64
dtypes: float64(11), int64(4)
memory usage: 5.7 MB


In [179]:
params = {'C': np.logspace(-5, 5, 11)}
clf = LogisticRegression()
LogLoss = make_scorer(log_loss, greater_is_better=False, needs_proba=True)
cv = GridSearchCV(clf, params, n_jobs=-1, scoring=LogLoss,cv=5)
cv.fit(X_train, y_train)
print(cv.best_score_, cv.best_estimator_)

-0.5834298366128414 LogisticRegression(C=10.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)




In [40]:
cv.cv_results_

{'mean_fit_time': array([0.20550022, 0.25873008, 0.33291063, 0.48191042, 0.83229752,
        0.8613605 , 0.91067185, 0.90915484, 0.92047186, 0.9161314 ,
        0.6583395 ]),
 'std_fit_time': array([0.0095608 , 0.01129911, 0.01067457, 0.01172234, 0.04612255,
        0.07090533, 0.02036115, 0.00945246, 0.01039523, 0.02531091,
        0.13324856]),
 'mean_score_time': array([0.01534491, 0.00837369, 0.00877604, 0.00897646, 0.00716696,
        0.00808401, 0.0075829 , 0.00898757, 0.00699959, 0.00737705,
        0.00487876]),
 'std_score_time': array([0.00374586, 0.00135513, 0.00116223, 0.00302496, 0.00040687,
        0.00079772, 0.00049081, 0.00353448, 0.00063144, 0.00048444,
        0.00161803]),
 'param_C': masked_array(data=[1e-05, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0,
                    1000.0, 10000.0, 100000.0],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False],
        fill_value='?',
             dtype=object)

In [201]:
clf = LogisticRegression(C=10.0, class_weight='balanced', dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [202]:
clf.score(X_test, y_test)

0.7187012987012987

In [203]:
cm = confusion_matrix(y_test, y_pred)
print('Confusion matrix\n\n', cm)
print('\nTrue Positives(TP) = ', cm[0,0])
print('\nTrue Negatives(TN) = ', cm[1,1])
print('\nFalse Positives(FP) = ', cm[0,1])
print('\nFalse Negatives(FN) = ', cm[1,0])

Confusion matrix

 [[8690 2816]
 [3682 7912]]

True Positives(TP) =  8690

True Negatives(TN) =  7912

False Positives(FP) =  2816

False Negatives(FN) =  3682


In [197]:
y_pred_prob = clf.predict_proba(X_test)

In [198]:
log_loss(y_test, y_pred_prob)

0.5795252127880653

In [199]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.70      0.75      0.73     11506
           1       0.74      0.68      0.71     11594

    accuracy                           0.72     23100
   macro avg       0.72      0.72      0.72     23100
weighted avg       0.72      0.72      0.72     23100



## Случайные леса

In [79]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV

In [80]:
clf = RandomForestClassifier(n_estimators=25)
clf.fit(X_train, y_train)
clf_probs = clf.predict_proba(X_test)

0.7788541552447172


In [89]:
log_loss(y_test, clf_probs[:, 1])

0.7788559552043003

# lightGBM

In [54]:
import lightgbm as lgb

ModuleNotFoundError: No module named 'lightgbm'

In [147]:
clf = lgb.LGBMClassifier()
clf.fit(X_train, y_train)

LGBMClassifier()

In [148]:
y_pred=clf.predict(X_test)

In [149]:
from sklearn.metrics import accuracy_score
accuracy=accuracy_score(y_pred, y_test)
print('LightGBM Model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred)))

LightGBM Model accuracy score: 0.7401


In [137]:
cm = confusion_matrix(y_test, y_pred)
print('Confusion matrix\n\n', cm)
print('\nTrue Positives(TP) = ', cm[0,0])
print('\nTrue Negatives(TN) = ', cm[1,1])
print('\nFalse Positives(FP) = ', cm[0,1])
print('\nFalse Negatives(FN) = ', cm[1,0])

Confusion matrix

 [[9024 2482]
 [3522 8072]]

True Positives(TP) =  9024

True Negatives(TN) =  8072

False Positives(FP) =  2482

False Negatives(FN) =  3522


In [167]:
classification_report(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.68      0.73      0.71     11506
           1       0.71      0.67      0.69     11594

    accuracy                           0.70     23100
   macro avg       0.70      0.70      0.70     23100
weighted avg       0.70      0.70      0.70     23100

