In [68]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder  ## Used to convert strings to numeric value for using t 
from sklearn.metrics import accuracy_score
%matplotlib inline

In [69]:
heart_data = pd.read_csv('heart_disease.csv')
# First 5 rows of datset
heart_data.head() 

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [70]:
# Last 5 rows of dataset
heart_data.tail() 

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
915,916,54,Female,VA Long Beach,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
916,917,62,Male,VA Long Beach,typical angina,,139.0,False,st-t abnormality,,,,,,,0
917,918,55,Male,VA Long Beach,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2
918,919,58,Male,VA Long Beach,asymptomatic,,385.0,True,lv hypertrophy,,,,,,,0
919,920,62,Male,VA Long Beach,atypical angina,120.0,254.0,False,lv hypertrophy,93.0,True,0.0,,,,1


In [71]:
# No.of rows and columns in the dataset
heart_data.shape 

(920, 16)

In [72]:
# Information about the data
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


In [73]:
# Checking for missing values
heart_data.isnull().sum()

id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64

In [74]:
# Fill missing values in numeric columns with the mean
# Here we used mean because these columns have continous datapoints
heart_data['trestbps'] = heart_data['trestbps'].fillna(heart_data['trestbps'].mean())
heart_data['chol'] = heart_data['chol'].fillna(heart_data['chol'].mean())
heart_data['thalch'] = heart_data['thalch'].fillna(heart_data['thalch'].mean())
heart_data['oldpeak'] = heart_data['oldpeak'].fillna(heart_data['oldpeak'].mean())

# Fill missing values in other columns with the most common value (mode)
## Here we used mode because these columns have repeated values such as true false
heart_data['fbs'] = heart_data['fbs'].fillna(heart_data['fbs'].mode()[0])
heart_data['restecg'] = heart_data['restecg'].fillna(heart_data['restecg'].mode()[0])
heart_data['exang'] = heart_data['exang'].fillna(heart_data['exang'].mode()[0])
heart_data['slope'] = heart_data['slope'].fillna(heart_data['slope'].mode()[0])
heart_data['ca'] = heart_data['ca'].fillna(heart_data['ca'].mode()[0])
heart_data['thal'] = heart_data['thal'].fillna(heart_data['thal'].mode()[0])

  heart_data['fbs'] = heart_data['fbs'].fillna(heart_data['fbs'].mode()[0])
  heart_data['exang'] = heart_data['exang'].fillna(heart_data['exang'].mode()[0])


In [75]:
heart_data.isnull().sum()

id          0
age         0
sex         0
dataset     0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalch      0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
num         0
dtype: int64

In [76]:
# Statistical measure of data
heart_data.describe()

Unnamed: 0,id,age,trestbps,chol,thalch,oldpeak,ca,num
count,920.0,920.0,920.0,920.0,920.0,920.0,920.0,920.0
mean,460.5,53.51087,132.132404,199.130337,137.545665,0.878788,0.227174,0.995652
std,265.725422,9.424685,18.443895,108.957634,25.138494,1.053774,0.628936,1.142693
min,1.0,28.0,0.0,0.0,60.0,-2.6,0.0,0.0
25%,230.75,47.0,120.0,177.75,120.0,0.0,0.0,0.0
50%,460.5,54.0,130.0,221.0,138.0,0.8,0.0,1.0
75%,690.25,60.0,140.0,267.0,156.0,1.5,0.0,2.0
max,920.0,77.0,200.0,603.0,202.0,6.2,3.0,4.0


In [77]:
# Checking the distribution of num variable
heart_data['num'].value_counts()

num
0    411
1    265
2    109
3    107
4     28
Name: count, dtype: int64

In [78]:
# Convert to binary: 0 = no disease, 1 = disease present
heart_data['num'] = heart_data['num'].apply(lambda x: 1 if x > 0 else 0)
heart_data['num'].value_counts()

num
1    509
0    411
Name: count, dtype: int64

In [79]:
# Distribution is balanced !

In [80]:
# Convert each string column to integer so that we can use algorithms
le = LabelEncoder()
heart_data['sex'] = le.fit_transform(heart_data['sex'])
heart_data['dataset'] = le.fit_transform(heart_data['dataset'])
heart_data['cp'] = le.fit_transform(heart_data['cp'])
heart_data['fbs'] = le.fit_transform(heart_data['fbs'])
heart_data['restecg'] = le.fit_transform(heart_data['restecg'])
heart_data['exang'] = le.fit_transform(heart_data['exang'])
heart_data['slope'] = le.fit_transform(heart_data['slope'])
heart_data['thal'] = le.fit_transform(heart_data['thal'])

In [81]:
heart_data.dtypes

id            int64
age           int64
sex           int32
dataset       int32
cp            int32
trestbps    float64
chol        float64
fbs           int64
restecg       int32
thalch      float64
exang         int64
oldpeak     float64
slope         int32
ca          float64
thal          int32
num           int64
dtype: object

In [82]:
## Dividing the dataset into dependent and independent features
x=heart_data.drop(columns='num',axis=1)
y=heart_data['num']

In [83]:
x

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
0,1,63,1,0,3,145.000000,233.0,1,0,150.000000,0,2.300000,0,0.0,0
1,2,67,1,0,0,160.000000,286.0,0,0,108.000000,1,1.500000,1,3.0,1
2,3,67,1,0,0,120.000000,229.0,0,0,129.000000,1,2.600000,1,2.0,2
3,4,37,1,0,2,130.000000,250.0,0,1,187.000000,0,3.500000,0,0.0,1
4,5,41,0,0,1,130.000000,204.0,0,0,172.000000,0,1.400000,2,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,916,54,0,3,0,127.000000,333.0,1,2,154.000000,0,0.000000,1,0.0,1
916,917,62,1,3,3,132.132404,139.0,0,2,137.545665,0,0.878788,1,0.0,1
917,918,55,1,3,0,122.000000,223.0,1,2,100.000000,0,0.000000,1,0.0,0
918,919,58,1,3,0,132.132404,385.0,1,0,137.545665,0,0.878788,1,0.0,1


In [84]:
y

0      0
1      1
2      1
3      0
4      0
      ..
915    1
916    0
917    1
918    0
919    1
Name: num, Length: 920, dtype: int64

In [85]:
## Splitting the data into train and test
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25,random_state=42,stratify=y)

In [86]:
x.shape

(920, 15)

In [87]:
x_train.shape

(690, 15)

In [88]:
x_test.shape

(230, 15)

In [89]:
# Logistic Regression 
model = LogisticRegression()
model.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [91]:
# Model accuracy
x_train_prediction = model.predict(x_train)
training_accuracy = accuracy_score(x_train_prediction , y_train)
training_accuracy

0.8333333333333334

In [92]:
# Accuracy on test data
x_test_prediction = model.predict(x_test)
test_accuracy = accuracy_score(x_test_prediction , y_test)
test_accuracy

0.8304347826086956

In [99]:
# Prediction
input_data = (876,44,1,0,1,120,263,0,1,173,0,0,2,0,2)
# Change the input data to array
array = np.asarray(input_data)
# Reshape the array
reshaped = array.reshape(1,-1)
prediction = model.predict(reshaped)
prediction
if (prediction[0]==0):
    print('The person does not have heart disease')
else:
    print('The person has heart disease')

The person has heart disease


