In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import *

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo every file found. `path` is left
# pointing at the last file visited; the loading cell reads the CSV from it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        path = os.path.join(root, name)
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the CSV located by the directory walk and preview the first rows.
dataset = pd.read_csv(filepath_or_buffer=path)
dataset.head(n=5)

***sex***
Woman or man (binary)

***age***
Age

***anemia***
Decrease in red blood cells or hemoglobin (boolean)

***creatinine_phosphokinase***
The level of the enzyme CPK in the blood (μg / l)

***diabetes***
If the patient has diabetes (boolean)

***ejection_fraction***
Percentage of blood leaving the heart with each beat (percentage)

***high_blood_pressure***
If the patient has hypertension (boolean)

***platelets***
Platelets in the blood (kiloplatelets / mL)

***serum_creatinine***
Serum creatinine level in the blood (mg / dl)

***serum_sodium***
Serum sodium level in blood (mEq / L)

***sex***
Woman or man (binary)

***smoking***
Does the patient smoke or not (boolean)

***time***
Observation period (days)

***DEATH_EVENT***
If the patient died during the observation period (boolean)

In [None]:
# Column names of the continuous features; used to select columns from
# `dataset` for the scatter matrix and for the KNN model input.
contin_title = [
    'age',
    'creatinine_phosphokinase',
    'ejection_fraction',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
]

In [None]:
# Pairwise scatter plots of the continuous features, with each point coloured
# by its DEATH_EVENT value and 20-bin histograms on the diagonal.
scatter_options = dict(figsize=(10, 10), hist_kwds={'bins': 20}, s=50)
pdplot = pd.plotting.scatter_matrix(
    dataset[contin_title],
    c=dataset['DEATH_EVENT'],
    **scatter_options
)

The scatter matrix shows that the two outcome classes are not linearly separable in the continuous features, so a linear model is not suitable here. At the same time, patterns in the distribution of the data are clearly visible.

Using the k-nearest neighbors method on the continuous data

In [None]:
# Keep only the continuous feature columns as the model input.
contin_data = dataset.loc[:, contin_title]

In [None]:
# Standardise the continuous features (fit() returns the fitted scaler itself).
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the scaling - consider fitting on the
# training part only.
scaler = StandardScaler().fit(contin_data)
X_contin_data_sc = scaler.transform(contin_data)

In [None]:
# Hold out a test set (default test size). `random_state` pins the shuffle so
# that a fresh "Run All" reproduces the same split and the same metrics;
# `stratify` keeps the DEATH_EVENT class ratio the same in both parts, which
# matters because the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X_contin_data_sc,
    dataset['DEATH_EVENT'],
    random_state=42,
    stratify=dataset['DEATH_EVENT'],
)

In [None]:
# k-nearest-neighbours classifier; 5 neighbours is the library default, spelled
# out here so the setting is visible to the reader.
KNN = KNeighborsClassifier(n_neighbors=5)

In [None]:
# Fit the classifier on the training part of the split.
KNN.fit(X=X_train, y=y_train)

To evaluate the model, we use the following metrics
1. **Accuracy** - percentage of correct answers of the algorithm <br>
$ accuracy = \frac {TP + TN} {TP + TN + FP + FN} $
<br> This metric is misleading in problems with imbalanced classes
2. **Precision** - the proportion of objects labelled positive by the classifier that are really positive <br>
$ precision = \frac {TP} {TP + FP} $
<br> Precision - the ability to distinguish one class from other classes.
3. **Recall** (completeness) - shows what proportion of all objects of the positive class was found by the algorithm <br>
$ recall = \frac {TP} {TP + FN} $
<br> Recall demonstrates the algorithm's ability to detect a given class at all
4. **F1-measure** - harmonic mean of precision and recall <br>
$ F_1 = 2 \frac {Precision \times Recall} {Precision + Recall} $
<br> It tends to zero if precision or recall tends to zero

In [None]:
# Predicted DEATH_EVENT labels for the held-out test rows.
y_pred = KNN.predict(X=X_test)

In [None]:
# F1 score of the predictions on the test set.
f1_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Precision of the predictions on the test set.
precision_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Recall of the predictions on the test set.
recall_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Confusion matrix of the fitted classifier on the test set.
class_names = ['did not die', 'died']

# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer the replacement API when the installed version has it, and fall back
# to the old helper on older installations so the cell runs in both cases.
if hasattr(ConfusionMatrixDisplay, 'from_estimator'):
    disp = ConfusionMatrixDisplay.from_estimator(KNN, X_test, y_test,
                                                 display_labels=class_names)
else:
    disp = plot_confusion_matrix(KNN, X_test, y_test,
                                 display_labels=class_names)
disp.ax_.set_title('Confusion matrix')
# Raw confusion-matrix counts behind the plot.
confplot = disp.confusion_matrix

The recall and precision metrics, together with the confusion matrix, indicate that the trained model fails to recognize patients who died during the observation period, while its predictions for the opposite outcome are accurate. That is, the model is not able to predict death: it confidently asserts that the patient will live even when he dies.

Let us examine how mortality depends on smoking, diabetes and high blood pressure (binary values in the table).

In [None]:
# Number of patients who did NOT die (DEATH_EVENT == 0) in each group, in the
# order: smoking yes/no, diabetes yes/no, high blood pressure yes/no.
# Counting with a boolean sum instead of `value_counts()[0]` yields 0 for a
# group without survivors rather than raising KeyError, and the comprehension
# replaces six copy-pasted lookups.
no_death = [
    int((dataset.loc[dataset[column] == flag, 'DEATH_EVENT'] == 0).sum())
    for column in ('smoking', 'diabetes', 'high_blood_pressure')
    for flag in (1, 0)
]

In [None]:
# Number of patients who died (DEATH_EVENT == 1) in each group, in the order:
# smoking yes/no, diabetes yes/no, high blood pressure yes/no.
# Counting with a boolean sum instead of `value_counts()[1]` yields 0 for a
# group without deaths rather than raising KeyError, and the comprehension
# replaces six copy-pasted lookups.
yes_death = [
    int((dataset.loc[dataset[column] == flag, 'DEATH_EVENT'] == 1).sum())
    for column in ('smoking', 'diabetes', 'high_blood_pressure')
    for flag in (1, 0)
]

In [None]:
# Outcome counts per risk-factor group, shown as a stacked bar chart.
# Row labels follow the order of `no_death` / `yes_death`
# (the 'Diabets' spelling in the tick labels is corrected to 'Diabetes').
index = ['Smoking', 'No smoking', 'Diabetes', 'No diabetes', 'High blood pressure', 'No high blood pressure']
bin_data = pd.DataFrame({"Didn't die": no_death,
                        'Died': yes_death}, index=index)

# Title and y-label so the figure can be read on its own.
ax = bin_data.plot.bar(stacked=True)
ax.set(title='Outcome by risk factor', ylabel='Number of patients')
plt.show()

In [None]:
# Share of each outcome within every group, in percent (one decimal place).
pct_no = [round(n*100/(n+y), 1) for n, y in zip(no_death, yes_death)]
pct_yes = [round(y*100/(n+y), 1) for n, y in zip(no_death, yes_death)]

# `DataFrame.insert` raises ValueError when the column already exists, so the
# original cell failed on a second run. `assign` overwrites existing columns,
# and the explicit column list reproduces the original column order, which
# makes this cell safe to re-run.
bin_data = bin_data.assign(**{'% no': pct_no, '% yes': pct_yes})[
    ["Didn't die", '% no', 'Died', '% yes']
]

bin_data

In [None]:
# Overall share of patients who died during the observation period.
death = dataset.DEATH_EVENT.value_counts()
# The denominator is the number of patients; the former `count() + 1` skewed
# the percentage downwards.
print('The number of deaths during the observation period: ', round(death[1]*100/dataset.DEATH_EVENT.count(), 1), '%')

Due to the lack of data, it is impossible to draw conclusions about the dependence of mortality on parameters such as high blood pressure, diabetes and smoking.

Consider the dependence of other health indicators on smoking

In [None]:
# Overlaid histograms of each continuous indicator for non-smokers and smokers.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))
titles = ['age', 'creatinine_phosphokinase',
       'ejection_fraction', 'platelets',
       'serum_creatinine', 'serum_sodium']

# Split once instead of filtering the frame inside the loop.
non_smokers = dataset[dataset.smoking == 0]
smokers = dataset[dataset.smoking == 1]

# zip() pairs every axis with its column, replacing the manual counter that
# was named `iter` and shadowed the builtin for the rest of the session.
for ax, title in zip(axes.flat, titles):
    ax.set(title=title)
    ax.hist(non_smokers[title], alpha=0.3, bins=6, label='Non-smokers')
    ax.hist(smokers[title], alpha=0.3, bins=6, label='Smokers')
    # Without a legend the two overlaid distributions cannot be told apart.
    ax.legend()

plt.show()

**TOTAL**
<br>
Based on the data obtained, the relationships between various health indicators were studied. As a result of this study, it was not possible to draw unambiguous conclusions due to the small amount of data.