# Heart Disease Prediction 

### 

#### To predict whether a person has heart disease or not

Let us start by importing the necessary libraries

In [1]:
import numpy as np    # used for making numpy arrays   
import pandas as pd    # used for creating structured panda dataframes
from sklearn.model_selection import train_test_split    # for splitting data into training and test data
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score    # for evaluation

#### Importing the data

In [2]:
heart_data = pd.read_csv(r'C:\Users\Shrudin\Desktop\Hub\Coding\project\Heart Disease\archive\heart.csv')

# Heart Disease UCI dataset from kaggle

# pd.read_csv is used for loading the csv data into Pandas DataFrame

In [3]:
heart_data.shape # to find the number of rows and columns

(303, 14)

In [4]:
# printing the first 5 rows of the dataset 
heart_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [5]:
# printing the last 5 rows of the dataset 
heart_data.tail()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0
302,57,0,1,130,236,0,0,174,0,0.0,1,1,2,0


In [6]:
table = pd.DataFrame (heart_data) 
table

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [8]:
# for checking if there are any missing entries in the dataset
heart_data.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [9]:
# we can see that there are no missing entries


# Now let us see the statistical overview of the entire data
heart_data.describe().round(decimals=2)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.37,0.68,0.97,131.62,246.26,0.15,0.53,149.65,0.33,1.04,1.4,0.73,2.31,0.54
std,9.08,0.47,1.03,17.54,51.83,0.36,0.53,22.91,0.47,1.16,0.62,1.02,0.61,0.5
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [10]:
# to find the distribution of people affected by heart disease
heart_data['target'].value_counts()

1    165
0    138
Name: target, dtype: int64

### Here,

#### 0 represents that the person is not affected by heart disease
#### 1 represents that the person is affected by heart disease

Since the 'target' column is what needed to be predicted, we have to split the target column from rest of the features.

### Splitting the features and target

In [11]:
Y = heart_data['target']
X = heart_data.drop(columns='target',axis=1)

In [12]:
X


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3


In [13]:
Y

0      1
1      1
2      1
3      1
4      1
      ..
298    0
299    0
300    0
301    0
302    0
Name: target, Length: 303, dtype: int64

### Splitting the data into Training data and Test data

Let us use the train_test_split function that we have imported

In [14]:
xtrain, xtest, ytrain, ytest= train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=3)

# X is the features
# Y is the target
# test size 0.2 represents that 20% of the data is test data
# stratify is used so that both training and test data contains equal proportions of target, in this case 0,1
# random state for randomizing the output

In [15]:
print(X.shape, xtrain.shape, xtest.shape)

# to see the rows and columns of X, xtrain, xtest

(303, 13) (242, 13) (61, 13)


### Model Training

We are using LogisticRegression model as this is a binary classification problem

In [16]:
model =  LogisticRegression()

#### Training the LogisticRegressio model using training data 

In [17]:
model.fit( xtrain, ytrain )

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

### Model Evaluation

In [18]:
# Let us find the accuracy on the training data

xtrain_prediction = model.predict(xtrain)
training_data_accuracy = accuracy_score(xtrain_prediction, ytrain)

print ("Accuracy on training data is ", training_data_accuracy)

Accuracy on training data is  0.8636363636363636


In [19]:
# Let us find the accuracy on the test data

xtest_prediction = model.predict(xtest)
test_data_accuracy = accuracy_score(xtest_prediction, ytest)

print ("Accuracy on the test data is ", test_data_accuracy)

Accuracy on the test data is  0.8032786885245902


### Building a predictive system

In [21]:
input_data = (48,1,0,124,274,0,0,166,0,0.5,1,0,3)

In [22]:
# to change the input data from tuple into numpy array for easier reshaping
nparray = np.asarray(input_data)

# reshaping the numpy array as we want the prediction for only one instance
reshaped_array = nparray.reshape(1,-1)
prediction = model.predict(reshaped_array)

Let us print the final prediction

In [23]:
print (prediction)

if (prediction==1):
    print("The person has heart disease.")
else: 
    print("The person does not have any heart disease.")

[0]
The person does not have any heart disease.
