In [40]:
#Data preparation

In [41]:
#importing necessary packages
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib
%matplotlib inline
import seaborn as sns

In [42]:
# Load the heart-failure clinical records.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0, where it
# raises a TypeError. `on_bad_lines="warn"` (pandas >= 1.3) is its replacement and
# matches the old `error_bad_lines=False` behaviour: malformed rows are dropped and a
# warning is emitted for each one, so silently lost rows stay visible.
clinical_records = pd.read_csv("heart_failure_clinical_records_dataset.csv", on_bad_lines="warn")

In [43]:
# Preview the first five rows to confirm the columns were parsed as expected
clinical_records.head(5)

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [44]:
# Size of the loaded table as (rows, columns)
n_rows, n_cols = clinical_records.shape
(n_rows, n_cols)

(299, 13)

In [45]:
# Data type of every column; all fields are expected to be numeric already
column_types = clinical_records.dtypes
column_types

age                         float64
anaemia                       int64
creatinine_phosphokinase      int64
diabetes                      int64
ejection_fraction             int64
high_blood_pressure           int64
platelets                   float64
serum_creatinine            float64
serum_sodium                  int64
sex                           int64
smoking                       int64
time                          int64
DEATH_EVENT                   int64
dtype: object

In [46]:
# Summary statistics for every column.
# creatinine_phosphokinase, platelets, serum_creatinine and serum_sodium have extreme
# values lying far from their quartiles, so each one is checked for outliers further down.
# age (40 to 95) and time (4 to 285) span plausible ranges and need no follow-up.
summary_stats = clinical_records.describe()
summary_stats

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
count,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0
mean,60.833893,0.431438,581.839465,0.41806,38.083612,0.351171,263358.029264,1.39388,136.625418,0.648829,0.32107,130.26087,0.32107
std,11.894809,0.496107,970.287881,0.494067,11.834841,0.478136,97804.236869,1.03451,4.412477,0.478136,0.46767,77.614208,0.46767
min,40.0,0.0,23.0,0.0,14.0,0.0,25100.0,0.5,113.0,0.0,0.0,4.0,0.0
25%,51.0,0.0,116.5,0.0,30.0,0.0,212500.0,0.9,134.0,0.0,0.0,73.0,0.0
50%,60.0,0.0,250.0,0.0,38.0,0.0,262000.0,1.1,137.0,1.0,0.0,115.0,0.0
75%,70.0,1.0,582.0,1.0,45.0,1.0,303500.0,1.4,140.0,1.0,1.0,203.0,1.0
max,95.0,1.0,7861.0,1.0,80.0,1.0,850000.0,9.4,148.0,1.0,1.0,285.0,1.0


In [47]:
# Confirm that no column contains missing values (every entry should be False)
clinical_records.isna().any(axis=0)

age                         False
anaemia                     False
creatinine_phosphokinase    False
diabetes                    False
ejection_fraction           False
high_blood_pressure         False
platelets                   False
serum_creatinine            False
serum_sodium                False
sex                         False
smoking                     False
time                        False
DEATH_EVENT                 False
dtype: bool

In [48]:
# Tally the distinct values of the DEATH_EVENT outcome to confirm it only holds 0 and 1
clinical_records.loc[:, 'DEATH_EVENT'].value_counts()

0    203
1     96
Name: DEATH_EVENT, dtype: int64

In [49]:
# Distinct values of the sex flag and how often each occurs (only 0 and 1 expected)
clinical_records.loc[:, 'sex'].value_counts()

1    194
0    105
Name: sex, dtype: int64

In [50]:
# Distinct values of the anaemia flag and how often each occurs (only 0 and 1 expected)
clinical_records.loc[:, 'anaemia'].value_counts()

0    170
1    129
Name: anaemia, dtype: int64

In [51]:
# Distinct values of the diabetes flag and how often each occurs (only 0 and 1 expected)
clinical_records.loc[:, 'diabetes'].value_counts()

0    174
1    125
Name: diabetes, dtype: int64

In [52]:
# Distinct values of the high_blood_pressure flag and how often each occurs (only 0 and 1 expected)
clinical_records.loc[:, 'high_blood_pressure'].value_counts()

0    194
1    105
Name: high_blood_pressure, dtype: int64

In [53]:
# Distinct values of the smoking flag and how often each occurs (only 0 and 1 expected)
clinical_records.loc[:, 'smoking'].value_counts()

0    203
1     96
Name: smoking, dtype: int64

In [54]:
# Creatinine phosphokinase: list the values lying beyond the 1.5 * IQR box-plot whiskers.
# These cannot be treated as impossible or invalid readings: according to
# https://www.mountsinai.org/health-library/tests/creatine-phosphokinase-test
# abnormal levels do occur, so there is no justification for removing them and
# they are kept for the analysis.
CP_data = clinical_records['creatinine_phosphokinase']
Q1, Q3 = CP_data.quantile(0.25), CP_data.quantile(0.75)
IQR = Q3 - Q1
min_whisker, max_whisker = Q1 - (IQR * 1.5), Q3 + (IQR * 1.5)
outliers = CP_data.loc[CP_data.lt(min_whisker) | CP_data.gt(max_whisker)]
outliers.to_numpy()

array([7861, 2656, 1380, 3964, 7702, 5882, 5209, 1876, 1808, 4540, 1548,
       1610, 2261, 1846, 2334, 2442, 3966, 1419, 1896, 1767, 2281, 2794,
       2017, 2522, 2695, 1688, 1820, 2060, 2413], dtype=int64)

In [55]:
# Platelets: list the values lying beyond the 1.5 * IQR box-plot whiskers.
# As with creatinine phosphokinase, platelet counts above the normal range do occur
# in humans (see https://www.infobloom.com/what-is-a-high-platelet-count.htm),
# so there is no strong reason to remove these values.
P_data = clinical_records['platelets']
Q1, Q3 = P_data.quantile(0.25), P_data.quantile(0.75)
IQR = Q3 - Q1
min_whisker, max_whisker = Q1 - (IQR * 1.5), Q3 + (IQR * 1.5)
outliers = P_data.loc[P_data.lt(min_whisker) | P_data.gt(max_whisker)]
outliers.to_numpy()

array([454000.,  47000., 451000., 461000., 497000., 621000., 850000.,
       507000., 448000.,  75000.,  70000.,  73000., 481000., 504000.,
        62000., 533000.,  25100., 451000.,  51000., 543000., 742000.])

In [56]:
# Serum creatinine: list the values lying beyond the 1.5 * IQR box-plot whiskers.
# According to https://www.emedicinehealth.com/creatinine_blood_tests/article_em.htm
# an elevated serum creatinine signals health problems in the patient, so these
# readings are valid outliers and are kept.
SC_data = clinical_records['serum_creatinine']
Q1, Q3 = SC_data.quantile(0.25), SC_data.quantile(0.75)
IQR = Q3 - Q1
min_whisker, max_whisker = Q1 - (IQR * 1.5), Q3 + (IQR * 1.5)
outliers = SC_data.loc[SC_data.lt(min_whisker) | SC_data.gt(max_whisker)]
outliers.to_numpy()

array([2.7, 9.4, 4. , 5.8, 3. , 3.5, 2.3, 3. , 4.4, 6.8, 2.2, 2.7, 2.3,
       2.9, 2.5, 2.3, 3.2, 3.7, 3.4, 6.1, 2.5, 2.4, 2.5, 3.5, 9. , 5. ,
       2.4, 2.7, 3.8])

In [57]:
# Serum sodium: list the values lying beyond the 1.5 * IQR box-plot whiskers.
# A patient can have a serum sodium level below the normal 135 mEq/L, so these
# low readings are not invalid outliers and are kept.
SS_data = clinical_records['serum_sodium']
Q1, Q3 = SS_data.quantile(0.25), SS_data.quantile(0.75)
IQR = Q3 - Q1
min_whisker, max_whisker = Q1 - (IQR * 1.5), Q3 + (IQR * 1.5)
outliers = SS_data.loc[SS_data.lt(min_whisker) | SS_data.gt(max_whisker)]
outliers.to_numpy()

array([116, 121, 124, 113], dtype=int64)

In [58]:
#No missing values were found and the extreme values flagged above are clinically plausible, so the data is used as-is with no further cleaning.

In [59]:
#Data Exploration