### Project Objective : 
+ Create a machine learning model for predicting mortality caused by Heart Failure.  

### Importing Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Importing datasets

+ Cardiovascular diseases (CVDs) are the number 1 cause of death globally, taking an estimated 17.9 million lives each year, which accounts for 31% of all deaths worldwide. Heart failure is a common event caused by CVDs and this dataset contains 12 features that can be used to predict mortality by heart failure.
+ Most cardiovascular diseases can be prevented by addressing behavioural risk factors such as tobacco use, unhealthy diet and obesity, physical inactivity and harmful use of alcohol using population-wide strategies.

+ People with cardiovascular disease or who are at high cardiovascular risk (due to the presence of one or more risk factors such as hypertension, diabetes, hyperlipidaemia or already established disease) need early detection and management wherein a machine learning model can be of great help.
+ https://archive.ics.uci.edu/ml/datasets/Heart+failure+clinical+records

In [2]:
# Raw string so the Windows path separator is taken literally: in a normal
# string literal '\h' is an invalid escape sequence, which newer Python
# versions warn about. The path value itself is unchanged.
# NOTE(review): this absolute D: location is machine-specific; point it at
# wherever the CSV lives before re-running the notebook.
dataset = pd.read_csv(r'D:\heart_failure_clinical_records_dataset.csv')
dataset

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.00,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.00,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.00,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.00,2.7,116,0,0,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,62.0,0,61,1,38,1,155000.00,1.1,143,1,1,270,0
295,55.0,0,1820,0,38,0,270000.00,1.2,139,0,0,271,0
296,45.0,0,2060,1,60,0,742000.00,0.8,138,0,0,278,0
297,45.0,0,2413,0,38,0,140000.00,1.4,140,1,1,280,0


In [3]:
# List the column labels of the loaded table (same Index that keys() returns)
dataset.columns

Index(['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
       'ejection_fraction', 'high_blood_pressure', 'platelets',
       'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time',
       'DEATH_EVENT'],
      dtype='object')

### Separating features

In [4]:
# Every column except the last one is a predictor; the last column
# (DEATH_EVENT) is the label. Both are converted to NumPy arrays.
X, y = dataset.iloc[:, :-1].to_numpy(), dataset.iloc[:, -1].to_numpy()

### Features of the datasets needed for training the Model

In [5]:
# Display the feature matrix (NumPy abbreviates the middle rows/columns)
X

array([[7.500e+01, 0.000e+00, 5.820e+02, ..., 1.000e+00, 0.000e+00,
        4.000e+00],
       [5.500e+01, 0.000e+00, 7.861e+03, ..., 1.000e+00, 0.000e+00,
        6.000e+00],
       [6.500e+01, 0.000e+00, 1.460e+02, ..., 1.000e+00, 1.000e+00,
        7.000e+00],
       ...,
       [4.500e+01, 0.000e+00, 2.060e+03, ..., 0.000e+00, 0.000e+00,
        2.780e+02],
       [4.500e+01, 0.000e+00, 2.413e+03, ..., 1.000e+00, 1.000e+00,
        2.800e+02],
       [5.000e+01, 0.000e+00, 1.960e+02, ..., 1.000e+00, 1.000e+00,
        2.850e+02]])

In [6]:
# (number of samples, number of feature columns)
X.shape

(299, 12)

### Dependent variable 

+ 1: death (the patient died during the follow-up period)
+ 0: alive (the patient survived the follow-up period)

In [7]:
# Display the label vector taken from the DEATH_EVENT column
y

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtyp

### Splitting the dataset into a training set and a test set

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed random_state makes
# the shuffle (and therefore the split) reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

# Creating Classification model

### Checking performance of different classification models

### 1. Naive Bayes Classification Model

In [9]:
# Gaussian Naive Bayes classifier from scikit-learn
from sklearn.naive_bayes import GaussianNB

# Create the classifier and fit it on the training split
# (fit() returns the estimator itself, so both steps can be chained)
classifier1 = GaussianNB().fit(X_train, y_train)

# Predict the labels of the held-out test samples
y_pred = classifier1.predict(X_test)

# Put the predictions next to the true labels for a row-by-row comparison
Df_1 = pd.DataFrame({'Predicted_Events': y_pred, 'Actual_Events': y_test})
Df_1

Unnamed: 0,Predicted_Events,Actual_Events
0,0,0
1,0,0
2,1,1
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,1,1
9,0,0


In [10]:
# Importing library for creating Confusion matrix and accuracy
from sklearn.metrics import confusion_matrix, accuracy_score

In [11]:
# Confusion matrix: rows are the true classes, columns the predicted classes
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print('Confusion Matrix :\n\n', cm)

# accuracy_score is the fraction of test samples predicted correctly
# (the printed label says "R score", but the value is plain accuracy)
R = accuracy_score(y_true=y_test, y_pred=y_pred)
print('\nR score = ', R)

Confusion Matrix :

 [[36  1]
 [13 10]]

R score =  0.7666666666666667


### 2. The Decision Tree Classification model

In [12]:
# Decision tree classifier from scikit-learn
from sklearn.tree import DecisionTreeClassifier

# Create the tree (entropy as split criterion, fixed seed for repeatability)
# and fit it on the training split; fit() returns the estimator itself
classifier2 = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)

# Predict the labels of the held-out test samples
y_pred = classifier2.predict(X_test)

# Put the predictions next to the true labels for a row-by-row comparison
Df_2 = pd.DataFrame({'Predicted_Events': y_pred, 'Actual_Events': y_test})
Df_2

Unnamed: 0,Predicted_Events,Actual_Events
0,0,0
1,1,0
2,1,1
3,0,0
4,0,0
5,0,0
6,1,0
7,0,0
8,1,1
9,0,0


In [13]:
# Creating Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix :\n\n',cm)

# Finding R_score
R = accuracy_score(y_test, y_pred)
print('\nR score = ',R)

Confusion Matrix :

 [[34  3]
 [ 6 17]]

R score =  0.85


### 3. The KNN Classification model

In [14]:
# K-nearest-neighbours classifier from scikit-learn
from sklearn.neighbors import KNeighborsClassifier

# 5 neighbours with the Minkowski metric at p = 2 (i.e. Euclidean distance),
# fitted on the training split; fit() returns the estimator itself.
# NOTE(review): the features are used unscaled here; distance-based models
# are usually sensitive to feature scale, so standardising first may be worth
# trying (it would change the recorded results).
classifier3 = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
).fit(X_train, y_train)

# Predict the labels of the held-out test samples
y_pred = classifier3.predict(X_test)

# Put the predictions next to the true labels for a row-by-row comparison
Df_3 = pd.DataFrame({'Predicted_Events': y_pred, 'Actual_Events': y_test})
Df_3

Unnamed: 0,Predicted_Events,Actual_Events
0,1,0
1,0,0
2,0,1
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,1
9,0,0


In [15]:
# Creating Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix :\n\n',cm)

# Finding R_score
R = accuracy_score(y_test, y_pred)
print('\nR score = ',R)

Confusion Matrix :

 [[25 12]
 [19  4]]

R score =  0.48333333333333334


### 4. The Kernel SVM classification model

In [16]:
# Support vector classifier from scikit-learn
from sklearn.svm import SVC

# RBF-kernel SVM fitted on the training split; fit() returns the estimator.
# NOTE(review): the features are used unscaled here; kernel SVMs are usually
# sensitive to feature scale, so standardising first may be worth trying
# (it would change the recorded results).
classifier4 = SVC(kernel='rbf', random_state=0).fit(X_train, y_train)

# Predict the labels of the held-out test samples
y_pred = classifier4.predict(X_test)

# Put the predictions next to the true labels for a row-by-row comparison
Df_4 = pd.DataFrame({'Predicted_Events': y_pred, 'Actual_Events': y_test})
Df_4

Unnamed: 0,Predicted_Events,Actual_Events
0,0,0
1,0,0
2,0,1
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,1
9,0,0


In [17]:
# Creating Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix :\n\n',cm)

# Finding R_score
R = accuracy_score(y_test, y_pred)
print('\nR score = ',R)

Confusion Matrix :

 [[37  0]
 [23  0]]

R score =  0.6166666666666667


### 5. The Logistic Regression model

In [18]:
# Logistic regression classifier from scikit-learn
from sklearn.linear_model import LogisticRegression

# Create the logistic regression model (fixed seed) and fit it on the
# training split; fit() returns the estimator itself
classifier5 = LogisticRegression(random_state=0).fit(X_train, y_train)

# Predict the labels of the held-out test samples
y_pred = classifier5.predict(X_test)

# Put the predictions next to the true labels for a row-by-row comparison
Df_5 = pd.DataFrame({'Predicted_Events': y_pred, 'Actual_Events': y_test})
Df_5

Unnamed: 0,Predicted_Events,Actual_Events
0,0,0
1,0,0
2,1,1
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,1
9,0,0


In [19]:
# Creating Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix :\n\n',cm)

# Finding R_score
R = accuracy_score(y_test, y_pred)
print('\nR score = ',R)

Confusion Matrix :

 [[35  2]
 [14  9]]

R score =  0.7333333333333333


### 6. The Random Forest Classification model

In [20]:
# Random forest classifier from scikit-learn
from sklearn.ensemble import RandomForestClassifier

# Forest of 10 entropy-based trees (fixed seed), fitted on the training
# split; fit() returns the estimator itself
classifier6 = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)

# Predict the labels of the held-out test samples
y_pred = classifier6.predict(X_test)

# Put the predictions next to the true labels for a row-by-row comparison
Df_6 = pd.DataFrame({'Predicted_Events': y_pred, 'Actual_Events': y_test})
Df_6

Unnamed: 0,Predicted_Events,Actual_Events
0,0,0
1,0,0
2,0,1
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,1
9,0,0


In [21]:
# Creating Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix :\n\n',cm)

# Finding R_score
R = accuracy_score(y_test, y_pred)
print('\nR score = ',R)

Confusion Matrix :

 [[36  1]
 [10 13]]

R score =  0.8166666666666667


### 7. The Linear SVM Classification model

In [22]:
# Support vector classifier from scikit-learn
from sklearn.svm import SVC

# Linear-kernel SVM (fixed seed) fitted on the training split;
# fit() returns the estimator itself
classifier7 = SVC(kernel='linear', random_state=0).fit(X_train, y_train)

# Predict the labels of the held-out test samples
y_pred = classifier7.predict(X_test)

# Put the predictions next to the true labels for a row-by-row comparison
Df_7 = pd.DataFrame({'Predicted_Events': y_pred, 'Actual_Events': y_test})
Df_7

Unnamed: 0,Predicted_Events,Actual_Events
0,0,0
1,0,0
2,1,1
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,1,1
9,0,0


In [23]:
# Creating Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix :\n\n',cm)

# Finding R_score
R = accuracy_score(y_test, y_pred)
print('\nR score = ',R)

Confusion Matrix :

 [[36  1]
 [13 10]]

R score =  0.7666666666666667


# Finalising Model : The Decision Tree Classification(85% Accuracy)

+ As seen above, the Decision Tree Classification model achieved the highest test accuracy (85%) of the models compared, so we use that trained model (`classifier2`) for the prediction below.

In [24]:
def Prediction_Single_Feature(X):
    """Print the Decision Tree model's prediction for the first sample in ``X``.

    ``X`` is handed unchanged to ``classifier2.predict`` (the module-level
    decision tree classifier), so it must be a collection of feature rows,
    e.g. ``[[...feature values...]]``. Only the prediction for the first row
    is printed; the function returns ``None``.
    """
    first_prediction = classifier2.predict(X)[0]
    print("Predicted value by Decision Tree Classification Model: ", first_prediction)

### Predicting 

+ A woman having features as:
    - Age = 20
    - Anaemia = 1
    - creatinine_phosphokinase = 7000
    - diabetes = 1
    - ejection_fraction = 20
    - high_blood_pressure = 0
    - platelets = 260000
    - serum_creatinine = 2.0
    - serum_sodium = 115
    - sex = 0
    - smoking = 0
    - time = 40

In [25]:
# One hypothetical patient, with the values listed in the same order as the
# feature columns of the dataset
X1 = [[
    20,      # age
    1,       # anaemia
    7000,    # creatinine_phosphokinase
    1,       # diabetes
    20,      # ejection_fraction
    0,       # high_blood_pressure
    260000,  # platelets
    2,       # serum_creatinine
    115,     # serum_sodium
    0,       # sex
    0,       # smoking
    40,      # time
]]
Prediction_Single_Feature(X1)

Predicted value by Decision Tree Classification Model:  1
