In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file below the Kaggle input directory, then print them one per line.
input_files = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]
for input_file in input_files:
    print(input_file)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Heart Failure Prediction

![](https://i.imgur.com/f1Kast5.png)

In [None]:
# Path of the heart-failure clinical records CSV inside the Kaggle input directory.
file_path='/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'

In [None]:
# Read the CSV at file_path into the DataFrame `datah`, which the rest of the notebook works on.
datah=pd.read_csv(file_path)

# Exploring and Understanding Data

In [None]:
# Preview the first few rows to see which columns are available and what the values look like.
datah.head()

**Understanding Terms**
* Anemia happens when the number of healthy red blood cells in your body is too low. Red blood cells carry oxygen to all of the body’s tissues, so a low red blood cell count indicates that the amount of oxygen in your blood is lower than it should be.

* When the total CPK level is very high, it most often means there has been injury or stress to muscle tissue, the heart, or the brain. Muscle tissue injury is most likely. When a muscle is damaged, CPK leaks into the bloodstream. Finding which specific form of CPK is high helps determine which tissue has been damaged.

* Ejection fraction (EF) is a measurement, expressed as a percentage, of how much blood the left ventricle pumps out with each contraction. An ejection fraction of 60 percent means that 60 percent of the total amount of blood in the left ventricle is pushed out with each heartbeat.

* Platelets are tiny blood cells that help your body form clots to stop bleeding. Too many platelets can lead to certain conditions, including stroke, heart attack, or a clot in the blood vessels. There are two types of thrombocytosis: primary and secondary.

* Creatinine is a waste product that comes from the normal wear and tear on muscles of the body. Everyone has creatinine in their bloodstream. A high serum creatinine level means that your kidneys aren't working well.

In [None]:
# Feature selection: rank every predictor by its importance in an extra-trees ensemble.
from sklearn.ensemble import ExtraTreesClassifier

plt.rcParams['figure.figsize']=15,6
sns.set_style("darkgrid")

# Every column except the last is a predictor; the last column is used as the target.
x = datah.iloc[:, :-1]
y = datah.iloc[:,-1]

# Extra-trees is randomised: without a fixed seed the importances (and the
# ranking in the plot) change on every run, so pin random_state.
model = ExtraTreesClassifier(random_state=0)
model.fit(x,y)
print(model.feature_importances_)

# Label the importances with their column names and plot the (up to) 12 largest.
feat_importances = pd.Series(model.feature_importances_, index=x.columns)
feat_importances.nlargest(12).plot(kind='barh')
plt.show()

The "time" feature is the number of days the patient spent in hospital before being released or dying. Because it is only known once the outcome is known, it cannot be used for modelling and will be ignored.

In [None]:
# Summary of the frame: column names, dtypes and non-null counts.
datah.info()

Finding outliers

In [None]:
# Box plot of ejection_fraction; points beyond the whiskers are outlier candidates.
sns.boxplot(x=datah['ejection_fraction'], color='teal')
plt.show()

As per the plot above, there are 2 outliers (values equal to or above 70). We should remove them.

In [None]:
# Rows treated as outliers: ejection_fraction of 70 or more.
datah[datah['ejection_fraction']>=70]

In [None]:
# Rows that remain once the outliers (ejection_fraction >= 70) are excluded.
# The comparison is strict so a value of exactly 70 is not shown both as an
# outlier and as a kept row.
# NOTE: the filtered frame is only displayed; `datah` itself is not modified.
datah[datah['ejection_fraction']<70]

In [None]:
# Box plot of creatinine_phosphokinase.
sns.boxplot(x=datah['creatinine_phosphokinase'], color='teal')


In [None]:
# Box plot of ejection_fraction.
sns.boxplot(x=datah['ejection_fraction'], color='teal')


In [None]:
# Box plot of platelets.
sns.boxplot(x=datah['platelets'], color='teal')


In [None]:
# Box plot of serum_creatinine.
sns.boxplot(x=datah['serum_creatinine'], color='teal')


In [None]:
# Box plot of serum_sodium.
sns.boxplot(x=datah['serum_sodium'], color='teal')

In [None]:
# For every sex / smoking / diabetes combination: the number of patients in the
# group and the min / max of DEATH_EVENT within it.
# Aggregations are given by name through named aggregation, so the output
# columns are still called len, min and max, but pandas no longer receives the
# builtin min/max callables (newer pandas versions emit a FutureWarning for those).
datah.groupby(['sex','smoking','diabetes']).DEATH_EVENT.agg(len='size', min='min', max='max')

**Interesting observations:**

Males, smokers with diabetes - 28/
Females, smokers with diabetes - 53 

Males, non-smokers without diabetes - 60/
Females, non-smokers without diabetes - 48


In [None]:
# Column labels of the dataset.
datah.columns

In [None]:
from pandas.plotting import scatter_matrix

# Columns to compare pairwise in the scatter-plot matrix.
attributes = [
    'age',
    'creatinine_phosphokinase',
    'ejection_fraction',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'sex',
]
scatter_matrix(frame=datah[attributes], figsize=(12, 8))

In [None]:
# Pairwise Pearson correlation of all columns, then each column's correlation
# with DEATH_EVENT listed from most positive to most negative.
corr_matrix = datah.corr(method='pearson')
corr_matrix.loc[:, 'DEATH_EVENT'].sort_values(ascending=False)

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for each numeric column.
datah.describe()

As per above statistics:
* data provided with 299 patients
* the youngest patient was at 40 years old and oldest was at 95 years old
* the observed minimum ejection fraction was 14% (which is very low), the maximum was 80%, the mean was 38%, and most patients were at or below 45%.

In [None]:
# Boolean mask of the rows where DEATH_EVENT equals 1, then the matching rows.
death = datah['DEATH_EVENT'].eq(1)
datah.loc[death]

In [None]:
# Number of rows for each distinct DEATH_EVENT value.
datah['DEATH_EVENT'].value_counts()

In [None]:

# Histogram of the DEATH_EVENT column.
# The original call passed the Series positionally after a keyword argument,
# which is a SyntaxError; plotting from the Series itself is the valid form.
datah['DEATH_EVENT'].plot(kind='hist', title='norm')
plt.show()

In [None]:
# Share of rows for each distinct age (normalize=True turns counts into fractions).
datah['age'].value_counts(normalize=True)

In [None]:
# Grouped summary: mean of DEATH_EVENT per age (the death rate per age, assuming
# DEATH_EVENT is a 0/1 flag).
# DEATH_EVENT is deliberately not a grouping key: averaging a column within
# groups of its own value only ever returns that value back.
datah.groupby("age")['DEATH_EVENT'].mean()

In [None]:
# Line plot of the age and time columns, both drawn against the row index.
plt.figure(figsize=(14,6))
plt.title('Heart failure')
sns.lineplot(data=datah['age'], label='age')
sns.lineplot(data=datah['time'], label='time')
# A Series passed as `data` is plotted against its index, so the x-axis is the
# row position, not the age; the y-axis carries both plotted columns.
plt.xlabel('patient (row index)')
plt.ylabel('value')

CPK test. Creatine phosphokinase (CPK) is an enzyme in the body. It is found mainly in the heart, brain, and skeletal muscle. This article discusses the test to measure the amount of CPK in the blood. Blood is drawn from a vein (venipuncture), usually from the inside of the elbow or the back of the hand

Hypertension, also known as high or raised blood pressure, is a condition in which the blood vessels have persistently raised pressure. Blood is carried from the heart to all parts of the body in the vessels. Each time the heart beats, it pumps blood into the vessels.

In [None]:
# Iterating a DataFrame yields its column labels, so this lists the column names.
list(datah)

In [None]:
# Heatmap of the pairwise correlations between all columns.
# The raw frame (one annotated cell per patient per column, on very different
# scales) is unreadable as a heatmap, and its x-axis is not a single variable,
# so the correlation matrix is plotted and no fixed x label is set.
plt.title ('Heatmap')
sns.heatmap(data=datah.corr(), annot=True, fmt='.2f')

In [None]:
# Scatter of age against platelets: blue circle markers with no connecting line.
plt.plot(datah['platelets'], datah['age'], 'bo')
plt.show()

In [None]:
# Scatter plot with a fitted regression line: age against platelets.
sns.regplot(data=datah, x='platelets', y='age')

In [None]:
# Scatter plot with a fitted regression line: DEATH_EVENT against serum_creatinine.
sns.regplot(data=datah, x='serum_creatinine', y='DEATH_EVENT')

In [None]:
# Swarm plot of serum_creatinine for each DEATH_EVENT value.
sns.swarmplot(data=datah, x='DEATH_EVENT', y='serum_creatinine')

In [None]:
# Swarm plot of ejection_fraction for each DEATH_EVENT value.
sns.swarmplot(data=datah, x='DEATH_EVENT', y='ejection_fraction')

In [None]:
# Swarm plot of serum_sodium for each DEATH_EVENT value.
sns.swarmplot(data=datah, x='DEATH_EVENT', y='serum_sodium')

In [None]:
# Histogram of patient age.
# sns.distplot is deprecated and scheduled for removal; histplot is its
# replacement and draws no KDE curve by default, matching kde=False.
sns.histplot(datah['age'])

In [None]:
# Overlaid histograms of age and serum_sodium on one set of axes, with a legend.
# sns.distplot is deprecated and scheduled for removal; histplot is its
# replacement and draws no KDE curve by default, matching kde=False.
sns.histplot(datah['age'], label='Age')
sns.histplot(datah['serum_sodium'], label='serum_sodium')
plt.title("heart failure")
plt.legend()

In [None]:
# Column labels again, as a reference for the plots that follow.
datah.columns

In [None]:
import plotly.graph_objects as go

# Interactive age histogram: bins 2 wide, spanning 40 to 95, on a dark theme.
fig = go.Figure(
    data=[
        go.Histogram(
            x=datah['age'],
            xbins={'start': 40, 'end': 95, 'size': 2},
            marker_color='#e8ab60',
            opacity=1,
        )
    ]
)
fig.update_layout(
    title_text='AGE Distribution',
    xaxis_title_text='Age',
    yaxis_title_text='Count',
    bargap=0.05,
    xaxis=dict(showgrid=False),
    yaxis=dict(showgrid=False),
    template='plotly_dark',
)
fig.show()


In [None]:
import plotly.graph_objects as go

# Interactive ejection-fraction histogram: bins 2 wide, spanning 14 to 80, on a dark theme.
fig = go.Figure(
    data=[
        go.Histogram(
            x=datah['ejection_fraction'],
            xbins={'start': 14, 'end': 80, 'size': 2},
            marker_color="#A7F432",
            opacity=0.8,
        )
    ]
)
fig.update_layout(
    title_text='Ejection Fraction Distribution',
    xaxis_title_text='Ejection Fraction',
    yaxis_title_text='count',
    bargap=0.05,
    xaxis=dict(showgrid=False),
    yaxis=dict(showgrid=False),
    template='plotly_dark',
)
fig.show()

# **Machine Learning**

In [None]:
# Drop every row that contains a missing value.
# NOTE: this rebinds `datah`, so all later cells see the filtered frame.
datah=datah.dropna(axis=0)


In [None]:
# Prediction target: the DEATH_EVENT column.
y = datah['DEATH_EVENT']

In [None]:
# Model inputs: every clinical column except `time` and the target itself.
death_features = [
    'age',
    'anaemia',
    'creatinine_phosphokinase',
    'diabetes',
    'ejection_fraction',
    'high_blood_pressure',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'sex',
    'smoking',
]
X = datah.loc[:, death_features]

In [None]:
# Preview the feature frame to confirm that the selected columns are all present.
X.head()

In [None]:
# Build the baseline model: a decision tree with a fixed seed.
# NOTE(review): this is a regressor, while the target selected earlier is the
# DEATH_EVENT column, which looks like a binary outcome -- confirm that a
# regressor (rather than a classifier) is intended.
from sklearn.tree import DecisionTreeRegressor 
# Define the model; random_state=1 fixes the tree's internal randomness.
heartf_model=DecisionTreeRegressor(random_state=1)
# The model is not fitted here: fitting happens after the train/validation
# split, on the training part only.

In [None]:
# Split the data into a training part and a held-out validation part.
# No test_size is given, so scikit-learn's default split proportions apply;
# random_state=0 makes the split reproducible.

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
train_X, val_X, train_y, val_y=train_test_split(X,y, random_state=0)

# Fit the baseline tree on the training part only.
heartf_model.fit(train_X,train_y)

# Predict on the validation part and report the mean absolute error.
val_predictions=heartf_model.predict(val_X)
print(mean_absolute_error(val_y,val_predictions))


In [None]:
# Helper that scores one tree size by its validation MAE.
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation mean absolute error of a size-limited decision tree.

    A DecisionTreeRegressor capped at ``max_leaf_nodes`` leaves (random_state=0)
    is fitted on ``train_X`` / ``train_y``; its predictions for ``val_X`` are
    compared with ``val_y`` using mean_absolute_error.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))
# Candidate values for max_leaf_nodes.
heartfailure_max_leaf_nodes = [2, 3, 4, 10, 50, 100]
# Validation MAE of each candidate, keyed by its max_leaf_nodes value.
scores = dict(
    (n_leaves, get_mae(n_leaves, train_X, val_X, train_y, val_y))
    for n_leaves in heartfailure_max_leaf_nodes
)
# The best size is the candidate with the lowest validation MAE.
best_tree_size = min(scores, key=lambda n_leaves: scores[n_leaves])
print(best_tree_size)
print(scores)

In [None]:
# Final model: a tree limited to the best max_leaf_nodes found by the search
# (in the author's run that was 4 leaves, with a validation MAE of about 0.331).
# It is fitted on the training split only. Fitting on the full X, y would put
# the validation rows into the training data, and any score computed on
# val_X / val_y afterwards would be optimistically biased.
final_model=DecisionTreeRegressor(max_leaf_nodes=best_tree_size,random_state=1)
final_model.fit(train_X,train_y)

In [None]:
# Predict the validation rows with the final model and report the mean absolute error.
# NOTE(review): this is only an honest estimate if final_model was not trained
# on the val_X rows -- confirm what data the final model was fitted on.
val_predictions=final_model.predict(val_X)
print(mean_absolute_error(val_y,val_predictions))