# Innovative Assignment Name : Heart Disease Prediction Model

Course Code & Name :- 2CS404, Programming for Scientific Computing

Group Members Roll no. :- 21BCE248, 21BCE259, 21BCE296

Faculty Name :- Malaram Kumhar

Importing the Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# libraries to handle the null values
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

Important links to explore above libraries

Pandas :
https://pandas.pydata.org/docs/user_guide/index.html

Numpy :
https://numpy.org/doc/stable/user/index.html

Logistic Regression :
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression

RandomForestRegressor :
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn-ensemble-randomforestregressor

Random Forest :
https://scikit-learn.org/stable/modules/ensemble.html#random-forests

Preprocessing Data :
https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing-data

Handling null values :
https://towardsdatascience.com/6-tips-for-dealing-with-null-values-e16d1d1a1b33

Data Collection and Processing

In [2]:
# loading the csv data to a Pandas DataFrame
heart_data = pd.read_csv('heart_disease_data.csv')

In [3]:
# print first 5 rows of the dataset
heart_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52.0,1.0,0,125,212.0,0,1,168.0,0,1.0,2,2,3,0.0
1,53.0,1.0,0,140,203.0,1,0,155.0,1,3.1,0,0,3,0.0
2,70.0,1.0,0,145,174.0,0,1,125.0,1,2.6,0,0,3,0.0
3,61.0,1.0,0,148,203.0,0,1,161.0,0,0.0,2,1,3,0.0
4,62.0,0.0,0,138,294.0,1,1,106.0,0,1.9,1,3,2,0.0


In [4]:
# print last 5 rows of the dataset
heart_data.tail()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
1023,50.0,0.0,0,110,254.0,0,0,159.0,0,0.0,2,0,2,1.0
1024,54.0,1.0,0,120,188.0,0,1,113.0,0,1.4,1,1,3,0.0
1025,61.0,,0,148,203.0,0,1,161.0,0,0.0,2,1,3,
1026,,1.0,0,112,204.0,0,1,143.0,0,0.1,2,0,2,
1027,47.0,1.0,2,108,,0,1,,0,0.0,2,0,2,0.0


In [5]:
# number of rows and columns in the dataset
heart_data.shape

(1028, 14)

In [6]:
# getting some info about the data
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1028 entries, 0 to 1027
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1027 non-null   float64
 1   sex       1027 non-null   float64
 2   cp        1028 non-null   int64  
 3   trestbps  1028 non-null   int64  
 4   chol      1027 non-null   float64
 5   fbs       1028 non-null   int64  
 6   restecg   1028 non-null   int64  
 7   thalach   1027 non-null   float64
 8   exang     1028 non-null   int64  
 9   oldpeak   1028 non-null   float64
 10  slope     1028 non-null   int64  
 11  ca        1028 non-null   int64  
 12  thal      1028 non-null   int64  
 13  target    1026 non-null   float64
dtypes: float64(6), int64(8)
memory usage: 112.6 KB


In [7]:
# checking for missing values
heart_data.isnull().sum()

age         1
sex         1
cp          0
trestbps    0
chol        1
fbs         0
restecg     0
thalach     1
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      2
dtype: int64

Handling the null values.

In [8]:
# Here we are using RandomForestRegressor. We can use any other regression model also.
# we will do imputation and will save imputed dataset as heart_data2

heart_data1 = heart_data[['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']]
imptr = IterativeImputer(RandomForestRegressor(), max_iter=10, random_state=0)
heart_data2 = pd.DataFrame(imptr.fit_transform(heart_data1), columns = heart_data1.columns)
heart_data2 # unimputed data, the data that handled the null values

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52.00,1.00,0.0,125.0,212.00,0.0,1.0,168.00,0.0,1.0,2.0,2.0,3.0,0.0
1,53.00,1.00,0.0,140.0,203.00,1.0,0.0,155.00,1.0,3.1,0.0,0.0,3.0,0.0
2,70.00,1.00,0.0,145.0,174.00,0.0,1.0,125.00,1.0,2.6,0.0,0.0,3.0,0.0
3,61.00,1.00,0.0,148.0,203.00,0.0,1.0,161.00,0.0,0.0,2.0,1.0,3.0,0.0
4,62.00,0.00,0.0,138.0,294.00,1.0,1.0,106.00,0.0,1.9,1.0,3.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1023,50.00,0.00,0.0,110.0,254.00,0.0,0.0,159.00,0.0,0.0,2.0,0.0,2.0,1.0
1024,54.00,1.00,0.0,120.0,188.00,0.0,1.0,113.00,0.0,1.4,1.0,1.0,3.0,0.0
1025,61.00,0.99,0.0,148.0,203.00,0.0,1.0,161.00,0.0,0.0,2.0,1.0,3.0,0.0
1026,47.02,1.00,0.0,112.0,204.00,0.0,1.0,143.00,0.0,0.1,2.0,0.0,2.0,1.0


In [9]:
# checking for missing values
heart_data2.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [10]:
# Replacing imputed data with new data. / Imputation
heart_data = heart_data2
heart_data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52.00,1.00,0.0,125.0,212.00,0.0,1.0,168.00,0.0,1.0,2.0,2.0,3.0,0.0
1,53.00,1.00,0.0,140.0,203.00,1.0,0.0,155.00,1.0,3.1,0.0,0.0,3.0,0.0
2,70.00,1.00,0.0,145.0,174.00,0.0,1.0,125.00,1.0,2.6,0.0,0.0,3.0,0.0
3,61.00,1.00,0.0,148.0,203.00,0.0,1.0,161.00,0.0,0.0,2.0,1.0,3.0,0.0
4,62.00,0.00,0.0,138.0,294.00,1.0,1.0,106.00,0.0,1.9,1.0,3.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1023,50.00,0.00,0.0,110.0,254.00,0.0,0.0,159.00,0.0,0.0,2.0,0.0,2.0,1.0
1024,54.00,1.00,0.0,120.0,188.00,0.0,1.0,113.00,0.0,1.4,1.0,1.0,3.0,0.0
1025,61.00,0.99,0.0,148.0,203.00,0.0,1.0,161.00,0.0,0.0,2.0,1.0,3.0,0.0
1026,47.02,1.00,0.0,112.0,204.00,0.0,1.0,143.00,0.0,0.1,2.0,0.0,2.0,1.0


In [11]:
# statistical measures about the data
heart_data.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,1028.0,1028.0,1028.0,1028.0,1028.0,1028.0,1028.0,1028.0,1028.0,1028.0,1028.0,1028.0,1028.0,1028.0
mean,54.426089,0.696488,0.941634,131.585603,245.914018,0.148833,0.531128,149.122802,0.335603,1.068482,1.38716,0.752918,2.32393,0.512646
std,9.067264,0.459988,1.029505,17.524762,51.55127,0.356097,0.527717,22.976089,0.472431,1.174676,0.617744,1.029857,0.620277,0.500083
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,132.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,275.0,0.0,1.0,166.0,1.0,1.8,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [12]:
# checking the distribution of Target Variable
heart_data['target'].value_counts()

1.0    527
0.0    501
Name: target, dtype: int64

1 --> Defective Heart

0 --> Healthy Heart

Splitting the Features and Target

In [13]:
X = heart_data.drop(columns='target', axis=1)
Y = heart_data['target']

In [14]:
print(X)

        age   sex   cp  trestbps    chol  fbs  restecg  thalach  exang  \
0     52.00  1.00  0.0     125.0  212.00  0.0      1.0   168.00    0.0   
1     53.00  1.00  0.0     140.0  203.00  1.0      0.0   155.00    1.0   
2     70.00  1.00  0.0     145.0  174.00  0.0      1.0   125.00    1.0   
3     61.00  1.00  0.0     148.0  203.00  0.0      1.0   161.00    0.0   
4     62.00  0.00  0.0     138.0  294.00  1.0      1.0   106.00    0.0   
...     ...   ...  ...       ...     ...  ...      ...      ...    ...   
1023  50.00  0.00  0.0     110.0  254.00  0.0      0.0   159.00    0.0   
1024  54.00  1.00  0.0     120.0  188.00  0.0      1.0   113.00    0.0   
1025  61.00  0.99  0.0     148.0  203.00  0.0      1.0   161.00    0.0   
1026  47.02  1.00  0.0     112.0  204.00  0.0      1.0   143.00    0.0   
1027  47.00  1.00  2.0     108.0  242.61  0.0      1.0   152.24    0.0   

      oldpeak  slope   ca  thal  
0         1.0    2.0  2.0   3.0  
1         3.1    0.0  0.0   3.0  
2        

In [15]:
print(Y)

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
1023    1.0
1024    0.0
1025    0.0
1026    1.0
1027    0.0
Name: target, Length: 1028, dtype: float64


Splitting the Data into Training data & Test Data

In [16]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [17]:
print(X.shape, X_train.shape, X_test.shape)

(1028, 13) (822, 13) (206, 13)


Model Training

Logistic Regression

In [18]:
model = LogisticRegression()

In [19]:
# training the LogisticRegression model with Training data
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

Model Evaluation

Accuracy Score

In [20]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [21]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.8479318734793188


In [22]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [23]:
print('Accuracy on Test data : ', test_data_accuracy)

Accuracy on Test data :  0.8009708737864077


Building a Predictive System

In [24]:
print("Enter the number of persons : ",end='')
n = int(input())
result = []

print("\nHow you want to enter the data (1)Manually or (2)In string format? \n Enter the number of your choice : ", end='')
choice = int(input())

if(choice==2):
    print("Input the data in string format (values separated by ',' comma.)")

for i in range(n):
    
    if(choice==1):
        print("\nEnter the details of person no.({})".format(i+1))
        age = int(input("\nAge (between 0 to 100 years): "))
        sex = int(input("Sex (1 = male, 0 = female) : "))
        cp = int(input("Chest Pain type (0 to 4) : "))
        trestbps = int(input("Resting blood pressure (in mm Hg) : "))
        chol = int(input("Serum cholestoral (in mg/dl) : "))
        fbs = int(input("Fasting blood sugar & gt (in mg/dl) (1 = true, 0 = false) : "))
        restecg = int(input("Resting electrocardiographic : "))
        thalach = int(input("Maximum heart rate achieved : "))
        exang = int(input("Exercise induced angina (1 = yes, 0 = no) : "))
        oldpeak = float(input("ST depression induced by exercise relative to rest : "))
        slope = int(input("The slope of exercise ST segment (0 to 2) : "))
        ca = int(input("Number of major vessels (0-3 colored by flourosopy (0 to 3) : )"))
        thal = int(input("Thalassemia (1 = normal, 2 = fixed defect, 3 = reversable defect) : "))
        
        input_data = (age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal)
        
    elif(choice==2):
        print("\nEnter the data of person no.({}) : ".format(i+1), end='')
        input_str = input()
        input_split = input_str.split(",")
        
        age = int(input_split[0])
        sex = int(input_split[1])
        cp = int(input_split[2])
        trestbps = int(input_split[3])
        chol = int(input_split[4])
        fbs = int(input_split[5])
        restecg = int(input_split[6])
        thalach = int(input_split[7])
        exang = int(input_split[8])
        oldpeak = float(input_split[9])
        slope = int(input_split[10])
        ca = int(input_split[11])
        thal = int(input_split[12])
        
        input_data = (age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal)
    
#     input_data = (62,0,0,140,268,0,0,160,0,3.6,0,2,2)
#     input_data = (64,1,3,110,211,0,0,144,1,1.8,1,0,2)

    # change the input data to a numpy array
    input_data_as_numpy_array = np.asarray(input_data)

    # reshape the numpy array as we are predicting for only on instance
    input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

    prediction = model.predict(input_data_reshaped)
#     print(prediction)

    if (prediction[0]== 0):
#       print('The Person does not have a Heart Disease')
        result.append(0)
    else:
#       print('The Person has Heart Disease')
        result.append(1)
    
for i in range(n):
    if(result[i]==0):
        print("\nThe pateint no.({}) does not have a Heart Disease.".format(i+1))
    else:
        print("\nThe pateint no.({}) has a Heart Disease.".format(i+1))

Enter the number of persons : 2

How you want to enter the data (1)Manually or (2)In string format? 
 Enter the number of your choice : 2
Input the data in string format (values separated by ',' comma.)

Enter the data of person no.(1) : 42,1,0,140,226,0,1,178,0,0,2,0,2





Enter the data of person no.(2) : 39,1,0,118,219,0,1,140,0,1.2,1,0,3

The pateint no.(1) has a Heart Disease.

The pateint no.(2) does not have a Heart Disease.




# Thank You.