Logistic Regression for Heart Disease Prediction

In [2]:
import numpy as np
import pandas as pd
import sklearn

heartDisease = pd.read_csv('heart.csv')

In [3]:
heartDisease.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [4]:
heartDisease.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,918.0,53.510893,9.432617,28.0,47.0,54.0,60.0,77.0
RestingBP,918.0,132.396514,18.514154,0.0,120.0,130.0,140.0,200.0
Cholesterol,918.0,198.799564,109.384145,0.0,173.25,223.0,267.0,603.0
FastingBS,918.0,0.233115,0.423046,0.0,0.0,0.0,0.0,1.0
MaxHR,918.0,136.809368,25.460334,60.0,120.0,138.0,156.0,202.0
Oldpeak,918.0,0.887364,1.06657,-2.6,0.0,0.6,1.5,6.2
HeartDisease,918.0,0.553377,0.497414,0.0,0.0,1.0,1.0,1.0


Encoding categorical values

In [5]:
encoding = {    
                "Sex" :   {"M": 1, "F": 0},
                "ChestPainType" : {"ASY":4, "NAP":3, "ATA":2, "TA":1},
                "RestingECG" : {"Normal":3, "LVH":2, "ST":1},
                "ExerciseAngina" : {"N":0, "Y":1},
                "ST_Slope" : {"Flat":1,"Up":3,"Down":2}
            }

In [6]:
heartDisease = heartDisease.replace(encoding)
heartDisease.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,2,140,289,0,3,172,0,0.0,3,0
1,49,0,3,160,180,0,3,156,0,1.0,1,1
2,37,1,2,130,283,0,1,98,0,0.0,3,0
3,48,0,4,138,214,0,3,108,1,1.5,1,1
4,54,1,3,150,195,0,3,122,0,0.0,3,0


Preparing independent and dependent variables

In [7]:
X = heartDisease.drop('HeartDisease', axis = 1)
X.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,40,1,2,140,289,0,3,172,0,0.0,3
1,49,0,3,160,180,0,3,156,0,1.0,1
2,37,1,2,130,283,0,1,98,0,0.0,3
3,48,0,4,138,214,0,3,108,1,1.5,1
4,54,1,3,150,195,0,3,122,0,0.0,3


In [8]:
y = heartDisease['HeartDisease']
y.head()

0    0
1    1
2    0
3    1
4    0
Name: HeartDisease, dtype: int64

Modelling

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train,X_test,y_train,y_test = train_test_split(X,y,stratify = y, random_state = 42, train_size = 0.7)

In [11]:
from sklearn.linear_model import LogisticRegression

In [12]:
classifier = LogisticRegression()

In [13]:
classifier.max_iter = 100000

In [14]:
model = classifier.fit(X_train,y_train)

Analysing Accuracy

In [15]:
print("Accuracy of Regression model : ",model.score(X_test,y_test))

Accuracy of Regression model :  0.8985507246376812


In [16]:
from sklearn.metrics import confusion_matrix

In [17]:
matrix = confusion_matrix(y_test, model.predict(X_test))
print(matrix)

print("Actual Value : True, Predicted Value : True = " + str(matrix[0][0]))
print("Actual Value : False, Predicted Value : True = " + str(matrix[0][1]))
print("Actual Value : True, Predicted Value : False  = " + str(matrix[1][0]))
print("Actual Value : False, Predicted Value : False  = " + str(matrix[1][1]))

[[108  15]
 [ 13 140]]
Actual Value : True, Predicted Value : True = 108
Actual Value : False, Predicted Value : True = 15
Actual Value : True, Predicted Value : False  = 13
Actual Value : False, Predicted Value : False  = 140


Dumping the model and other objects needed to make predictions with 

Dumped objects :

    Model - It can then be imported into the heart disease prediction program
    Storage Dataframe - Data must be passed to the model as a DataFrame with the appropriate column headers
    Input Series - Input can be recorded in this series, and be made a record of Storage Dataframe and then passed to model

In [18]:
import pickle

with open('LogModel.pickle', 'wb') as outfile:
    pickle.dump(model,outfile)

In [19]:
Storage = X.loc[10:11]

Storage

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
10,37,0,3,130,211,0,3,142,0,0.0,3
11,58,1,2,136,164,0,1,99,1,2.0,1


In [20]:
Input = X.loc[10]

Input

Age                37.0
Sex                 0.0
ChestPainType       3.0
RestingBP         130.0
Cholesterol       211.0
FastingBS           0.0
RestingECG          3.0
MaxHR             142.0
ExerciseAngina      0.0
Oldpeak             0.0
ST_Slope            3.0
Name: 10, dtype: float64

In [21]:
with open('Input_Series.pickle','wb') as outfile:
    pickle.dump(Input,outfile)

with open('Storage_DF.pickle','wb') as outfile:
    pickle.dump(Storage,outfile)