In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
from sklearn.linear_model import LogisticRegression

In [5]:
from sklearn.metrics import accuracy_score

# Data Collection and Processing

Load the CSV data into a pandas DataFrame.

In [18]:
# Read the heart-disease dataset into a DataFrame.
# The path is relative, so the CSV must sit in the notebook's working directory.
heart_data = pd.read_csv('HeartDiseaseTrain-Test.csv')

In [19]:
# Preview the first 5 rows to get a feel for the columns and their values.
heart_data.head(n=5)

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,cholestoral,fasting_blood_sugar,rest_ecg,Max_heart_rate,exercise_induced_angina,oldpeak,slope,vessels_colored_by_flourosopy,thalassemia,target
0,52,Male,Typical angina,125,212,Lower than 120 mg/ml,ST-T wave abnormality,168,No,1.0,Downsloping,Two,Reversable Defect,0
1,53,Male,Typical angina,140,203,Greater than 120 mg/ml,Normal,155,Yes,3.1,Upsloping,Zero,Reversable Defect,0
2,70,Male,Typical angina,145,174,Lower than 120 mg/ml,ST-T wave abnormality,125,Yes,2.6,Upsloping,Zero,Reversable Defect,0
3,61,Male,Typical angina,148,203,Lower than 120 mg/ml,ST-T wave abnormality,161,No,0.0,Downsloping,One,Reversable Defect,0
4,62,Female,Typical angina,138,294,Greater than 120 mg/ml,ST-T wave abnormality,106,No,1.9,Flat,Three,Fixed Defect,0


In [20]:
# Number of rows and columns in the dataset, returned as a (rows, columns) tuple.
heart_data.shape

(1025, 14)

In [21]:
# Summarize every column: name, non-null count and dtype.
# Columns with dtype `object` hold text categories rather than numbers.
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   age                            1025 non-null   int64  
 1   sex                            1025 non-null   object 
 2   chest_pain_type                1025 non-null   object 
 3   resting_blood_pressure         1025 non-null   int64  
 4   cholestoral                    1025 non-null   int64  
 5   fasting_blood_sugar            1025 non-null   object 
 6   rest_ecg                       1025 non-null   object 
 7   Max_heart_rate                 1025 non-null   int64  
 8   exercise_induced_angina        1025 non-null   object 
 9   oldpeak                        1025 non-null   float64
 10  slope                          1025 non-null   object 
 11  vessels_colored_by_flourosopy  1025 non-null   object 
 12  thalassemia                    1025 non-null   o

In [22]:
# Count the missing values in each column (all zeros means nothing needs imputing).
heart_data.isna().sum()

age                              0
sex                              0
chest_pain_type                  0
resting_blood_pressure           0
cholestoral                      0
fasting_blood_sugar              0
rest_ecg                         0
Max_heart_rate                   0
exercise_induced_angina          0
oldpeak                          0
slope                            0
vessels_colored_by_flourosopy    0
thalassemia                      0
target                           0
dtype: int64

In [23]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
heart_data.describe()

Unnamed: 0,age,resting_blood_pressure,cholestoral,Max_heart_rate,oldpeak,target
count,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0
mean,54.434146,131.611707,246.0,149.114146,1.071512,0.513171
std,9.07229,17.516718,51.59251,23.005724,1.175053,0.50007
min,29.0,94.0,126.0,71.0,0.0,0.0
25%,48.0,120.0,211.0,132.0,0.0,0.0
50%,56.0,130.0,240.0,152.0,0.8,1.0
75%,61.0,140.0,275.0,166.0,1.8,1.0
max,77.0,200.0,564.0,202.0,6.2,1.0


In [29]:
# Check the distribution of the target variable (how balanced the two classes are).
heart_data['target'].value_counts()

1    526
0    499
Name: target, dtype: int64

# Splitting the Features and Target


In [48]:
# Separate the predictors (X) from the label (y).
# Only the five numeric columns are used as features; the object-dtype
# (categorical) columns are left out and would need encoding before use.
feature_columns = [
    'age',
    'resting_blood_pressure',
    'cholestoral',
    'Max_heart_rate',
    'oldpeak',
]
X = heart_data[feature_columns]
y = heart_data['target']

In [58]:
# Hold out 30% of the rows as a test set.
# stratify=y keeps the class proportions the same in both partitions;
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=2,
)

In [59]:
# Sanity check: shapes of the full feature matrix, the training split and the test split.
print(X.shape,X_train.shape,X_test.shape)

(1025, 5) (717, 5) (308, 5)


# Model training 

In [60]:
# Logistic regression is used because this is a binary classification problem:
# the target takes only two values (0 = no disease, 1 = disease).

In [61]:
# Create the classifier with scikit-learn's default hyperparameters.
# NOTE(review): the features are passed in unscaled; if fitting emits a
# ConvergenceWarning, consider scaling the inputs or raising max_iter.
model = LogisticRegression()


In [62]:
# Train the model on the training split only; the test split stays unseen
# so it can be used for an unbiased evaluation afterwards.
model.fit(X_train,y_train)

LogisticRegression()

# Model Evaluation

In [63]:
# Accuracy on the training data.
# accuracy_score's signature is (y_true, y_pred): ground-truth labels first,
# model predictions second.
# Note: despite its name, `training_data_prediction` holds the accuracy score
# (a fraction between 0 and 1), not the predictions themselves.
X_train_predictions = model.predict(X_train)
training_data_prediction = accuracy_score(y_train, X_train_predictions)

In [68]:
# Report the fraction of training samples the model classifies correctly.
print('Accuracy on training data: ',training_data_prediction)

Accuracy on training data:  0.7447698744769874


In [69]:
#accuracy on test data
X_test_predictions=model.predict(X_test)
test_data_prediction=accuracy_score(X_test_predictions , y_test)

In [70]:
# Report the fraction of held-out test samples the model classifies correctly.
print('Accuracy on test data: ',test_data_prediction)

Accuracy on test data:  0.6915584415584416


# Building a Predictive System

In [71]:
# One sample to classify. The values must follow the order of the training features:
# (age, resting_blood_pressure, cholestoral, Max_heart_rate, oldpeak).
input_data = (70, 145, 174, 125, 2.6)

In [75]:
# Convert the tuple to a NumPy array, then reshape it to (1, n_features):
# scikit-learn estimators expect 2-D input with one row per sample.
input_data_numpy = np.array(input_data)
input_data_reshape = np.reshape(input_data_numpy, (1, -1))

In [77]:
# Predict the class of the single sample.
# The model was fitted on a DataFrame, so it remembers the feature names.
# Passing a bare ndarray makes scikit-learn warn that "X does not have valid
# feature names", so the sample is wrapped in a DataFrame with the same
# columns as the training features.
prediction = model.predict(pd.DataFrame(input_data_reshape, columns=X.columns))




In [78]:
# Turn the predicted label into a readable message:
# label 0 means no disease, any other label means disease.
predicted_label = prediction[0]
message = 'Person does not have disease' if predicted_label == 0 else 'Person have disease'
print(message)

Person does not have disease
