In [151]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, make_scorer, accuracy_score, confusion_matrix
import seaborn as sns

%matplotlib inline

In [152]:
df =  pd.read_csv('train.csv', sep=';')

In [153]:
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [154]:
df['age'] = df['age']//365

In [155]:
df['weight'] = df['weight'].astype('int')

In [156]:
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,50,2,168,62,110,80,1,1,0,0,1,0
1,1,55,1,156,85,140,90,3,1,0,0,1,1
2,2,51,1,165,64,130,70,3,1,0,0,0,1
3,3,48,2,169,82,150,100,1,1,0,0,1,1
4,4,47,1,156,56,100,60,1,1,0,0,0,0


In [157]:
df. info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   id           70000 non-null  int64
 1   age          70000 non-null  int64
 2   gender       70000 non-null  int64
 3   height       70000 non-null  int64
 4   weight       70000 non-null  int64
 5   ap_hi        70000 non-null  int64
 6   ap_lo        70000 non-null  int64
 7   cholesterol  70000 non-null  int64
 8   gluc         70000 non-null  int64
 9   smoke        70000 non-null  int64
 10  alco         70000 non-null  int64
 11  active       70000 non-null  int64
 12  cardio       70000 non-null  int64
dtypes: int64(13)
memory usage: 6.9 MB


In [158]:
df.describe()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,49972.4199,52.840671,1.349571,164.359229,74.204329,128.817286,96.630414,1.366871,1.226457,0.088129,0.053771,0.803729,0.4997
std,28851.302323,6.766774,0.476838,8.210126,14.395953,154.011419,188.47253,0.68025,0.57227,0.283484,0.225568,0.397179,0.500003
min,0.0,29.0,1.0,55.0,10.0,-150.0,-70.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,25006.75,48.0,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
50%,50001.5,53.0,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,74889.25,58.0,2.0,170.0,82.0,140.0,90.0,2.0,1.0,0.0,0.0,1.0,1.0
max,99999.0,64.0,2.0,250.0,200.0,16020.0,11000.0,3.0,3.0,1.0,1.0,1.0,1.0


In [159]:
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('cardio', axis=1), df['cardio'], test_size=0.33, random_state=42)
#train = df.drop('cardio', axis=1)

In [160]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46900 entries, 64334 to 15795
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   id           46900 non-null  int64
 1   age          46900 non-null  int64
 2   gender       46900 non-null  int64
 3   height       46900 non-null  int64
 4   weight       46900 non-null  int64
 5   ap_hi        46900 non-null  int64
 6   ap_lo        46900 non-null  int64
 7   cholesterol  46900 non-null  int64
 8   gluc         46900 non-null  int64
 9   smoke        46900 non-null  int64
 10  alco         46900 non-null  int64
 11  active       46900 non-null  int64
dtypes: int64(12)
memory usage: 4.7 MB


In [162]:
params = {'C': np.logspace(-5, 5, 11)}
clf = LogisticRegression()
LogLoss = make_scorer(log_loss, greater_is_better=False, needs_proba=True)
cv = GridSearchCV(clf, params, n_jobs=-1, scoring=LogLoss,cv=5)
cv.fit(X_train, y_train)
print(cv.best_score_, cv.best_estimator_)

-0.6056456322269851 LogisticRegression(C=0.0001)


In [164]:
clf = LogisticRegression(C=0.0001)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [165]:
print('LogisticRegression Model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred)))

LogisticRegression Model accuracy score: 0.6974


In [166]:
cm = confusion_matrix(y_test, y_pred)
print('Confusion matrix\n\n', cm)
print('\nTrue Positives(TP) = ', cm[0,0])
print('\nTrue Negatives(TN) = ', cm[1,1])
print('\nFalse Positives(FP) = ', cm[0,1])
print('\nFalse Negatives(FN) = ', cm[1,0])

Confusion matrix

 [[8393 3113]
 [3877 7717]]

True Positives(TP) =  8393

True Negatives(TN) =  7717

False Positives(FP) =  3113

False Negatives(FN) =  3877


## Случайные леса

In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV

In [89]:
clf = RandomForestClassifier(n_estimators=25)
clf.fit(X_train, y_train)
clf_probs = clf.predict_proba(X_test)
score = log_loss(y_test, clf_probs)
print(score)

0.6950670325614652


In [73]:
lr_gs = cv.fit(X_train, y_train)

In [74]:
y_pred = lr_gs.best_estimator_.predict(X_test)

In [86]:
print(confusion_matrix(y_test, clf_probs))

ValueError: Classification metrics can't handle a mix of binary and continuous-multioutput targets

In [96]:
list(zip(clf.classes_, clf_probs))

[(0, array([0.12, 0.88])), (1, array([0.16, 0.84]))]

In [91]:
print(clf_probs)

[[0.12 0.88]
 [0.16 0.84]
 [0.32 0.68]
 ...
 [0.6  0.4 ]
 [0.24 0.76]
 [0.76 0.24]]


# lightGBM

In [111]:
import lightgbm as lgb

In [147]:
clf = lgb.LGBMClassifier()
clf.fit(X_train, y_train)

LGBMClassifier()

In [148]:
y_pred=clf.predict(X_test)

In [149]:
from sklearn.metrics import accuracy_score
accuracy=accuracy_score(y_pred, y_test)
print('LightGBM Model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred)))

LightGBM Model accuracy score: 0.7401


In [137]:
cm = confusion_matrix(y_test, y_pred)
print('Confusion matrix\n\n', cm)
print('\nTrue Positives(TP) = ', cm[0,0])
print('\nTrue Negatives(TN) = ', cm[1,1])
print('\nFalse Positives(FP) = ', cm[0,1])
print('\nFalse Negatives(FN) = ', cm[1,0])

Confusion matrix

 [[9024 2482]
 [3522 8072]]

True Positives(TP) =  9024

True Negatives(TN) =  8072

False Positives(FP) =  2482

False Negatives(FN) =  3522


In [167]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.68      0.73      0.71     11506
           1       0.71      0.67      0.69     11594

    accuracy                           0.70     23100
   macro avg       0.70      0.70      0.70     23100
weighted avg       0.70      0.70      0.70     23100

