## Import Dependencies

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


## Import dataset

In [2]:
data = pd.read_csv("heart.csv")
data.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [3]:
data["ST_Slope"].value_counts()

Flat    460
Up      395
Down     63
Name: ST_Slope, dtype: int64

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [5]:
data.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


## Detecting the outliers in a datasets

In [6]:
data[data.duplicated()]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease


In [7]:
from scipy import stats 

In [8]:
Numerical_columns = data.select_dtypes(include = [np.number])
z_scores = np.abs(stats.zscore(Numerical_columns))

data[Numerical_columns.columns] = np.where(z_scores > 3, np.nan, data[Numerical_columns.columns])
data[Numerical_columns.columns]

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
0,40.0,140.0,289.0,0.0,172.0,0.0,0.0
1,49.0,160.0,180.0,0.0,156.0,1.0,1.0
2,37.0,130.0,283.0,0.0,98.0,0.0,0.0
3,48.0,138.0,214.0,0.0,108.0,1.5,1.0
4,54.0,150.0,195.0,0.0,122.0,0.0,0.0
...,...,...,...,...,...,...,...
913,45.0,110.0,264.0,0.0,132.0,1.2,1.0
914,68.0,144.0,193.0,1.0,141.0,3.4,1.0
915,57.0,130.0,131.0,0.0,115.0,1.2,1.0
916,57.0,130.0,236.0,0.0,174.0,0.0,1.0


In [9]:
data.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         8
Cholesterol       3
FastingBS         0
RestingECG        0
MaxHR             1
ExerciseAngina    0
Oldpeak           7
ST_Slope          0
HeartDisease      0
dtype: int64

In [10]:
data["RestingBP"] = data["RestingBP"].fillna(data["RestingBP"].mean())
data["Cholesterol"] = data["Cholesterol"].fillna(data["Cholesterol"].mean())
data["MaxHR"] = data["MaxHR"].fillna(data["MaxHR"].mean())
data["Oldpeak"] = data["Oldpeak"].fillna(data["Oldpeak"].mean())

In [11]:
data.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [12]:
data.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40.0,M,ATA,140.0,289.0,0.0,Normal,172.0,N,0.0,Up,0.0
1,49.0,F,NAP,160.0,180.0,0.0,Normal,156.0,N,1.0,Flat,1.0
2,37.0,M,ATA,130.0,283.0,0.0,ST,98.0,N,0.0,Up,0.0
3,48.0,F,ASY,138.0,214.0,0.0,Normal,108.0,Y,1.5,Flat,1.0
4,54.0,M,NAP,150.0,195.0,0.0,Normal,122.0,N,0.0,Up,0.0


In [13]:
data["ST_Slope"].value_counts()

Flat    460
Up      395
Down     63
Name: ST_Slope, dtype: int64

In [14]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
Numerical_columns = data.iloc[:,:-1].select_dtypes(include = [np.number])
data[Numerical_columns.columns] = scaler.fit_transform(Numerical_columns)
data


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,-1.433140,M,ATA,0.464938,0.852019,-0.551341,Normal,1.386542,N,-0.859464,Up,0.0
1,-0.478484,F,NAP,1.634997,-0.164041,-0.551341,Normal,0.754624,N,0.134660,Flat,1.0
2,-1.751359,M,ATA,-0.120092,0.796089,-0.551341,ST,-1.536080,N,-0.859464,Up,0.0
3,-0.584556,F,ASY,0.347932,0.152895,-0.551341,Normal,-1.141131,Y,0.631721,Flat,1.0
4,0.051881,M,NAP,1.049967,-0.024216,-0.551341,Normal,-0.588203,N,-0.859464,Up,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,-0.902775,M,TA,-1.290151,0.618978,-0.551341,Normal,-0.193254,N,0.333484,Flat,1.0
914,1.536902,M,ASY,0.698950,-0.042859,1.813758,Normal,0.162200,N,2.520556,Flat,1.0
915,0.370100,M,ASY,-0.120092,-0.620801,-0.551341,Normal,-0.864667,Y,0.333484,Flat,1.0
916,0.370100,F,ATA,-0.120092,0.357972,-0.551341,LVH,1.465532,N,-0.859464,Flat,1.0


In [15]:
from sklearn.preprocessing import LabelEncoder

In [16]:
encoder = LabelEncoder()
data["Sex"] = encoder.fit_transform(data["Sex"])
data["ExerciseAngina"] = encoder.fit_transform(data["ExerciseAngina"])
data["ST_Slope"] = encoder.fit_transform(data["ST_Slope"])
data["RestingECG"] = encoder.fit_transform(data["RestingECG"])


## Using one hot encoding

In [17]:
encoded = pd.get_dummies(data["ChestPainType"])
data = pd.concat([data,encoded], axis = 1)
data


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,ASY,ATA,NAP,TA
0,-1.433140,1,ATA,0.464938,0.852019,-0.551341,1,1.386542,0,-0.859464,2,0.0,0,1,0,0
1,-0.478484,0,NAP,1.634997,-0.164041,-0.551341,1,0.754624,0,0.134660,1,1.0,0,0,1,0
2,-1.751359,1,ATA,-0.120092,0.796089,-0.551341,2,-1.536080,0,-0.859464,2,0.0,0,1,0,0
3,-0.584556,0,ASY,0.347932,0.152895,-0.551341,1,-1.141131,1,0.631721,1,1.0,1,0,0,0
4,0.051881,1,NAP,1.049967,-0.024216,-0.551341,1,-0.588203,0,-0.859464,2,0.0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,-0.902775,1,TA,-1.290151,0.618978,-0.551341,1,-0.193254,0,0.333484,1,1.0,0,0,0,1
914,1.536902,1,ASY,0.698950,-0.042859,1.813758,1,0.162200,0,2.520556,1,1.0,1,0,0,0
915,0.370100,1,ASY,-0.120092,-0.620801,-0.551341,1,-0.864667,1,0.333484,1,1.0,1,0,0,0
916,0.370100,0,ATA,-0.120092,0.357972,-0.551341,0,1.465532,0,-0.859464,1,1.0,0,1,0,0


In [18]:
data.drop(columns= "ChestPainType", inplace = True)

In [19]:
data.shape

(918, 15)

In [20]:
data

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,ASY,ATA,NAP,TA
0,-1.433140,1,0.464938,0.852019,-0.551341,1,1.386542,0,-0.859464,2,0.0,0,1,0,0
1,-0.478484,0,1.634997,-0.164041,-0.551341,1,0.754624,0,0.134660,1,1.0,0,0,1,0
2,-1.751359,1,-0.120092,0.796089,-0.551341,2,-1.536080,0,-0.859464,2,0.0,0,1,0,0
3,-0.584556,0,0.347932,0.152895,-0.551341,1,-1.141131,1,0.631721,1,1.0,1,0,0,0
4,0.051881,1,1.049967,-0.024216,-0.551341,1,-0.588203,0,-0.859464,2,0.0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,-0.902775,1,-1.290151,0.618978,-0.551341,1,-0.193254,0,0.333484,1,1.0,0,0,0,1
914,1.536902,1,0.698950,-0.042859,1.813758,1,0.162200,0,2.520556,1,1.0,1,0,0,0
915,0.370100,1,-0.120092,-0.620801,-0.551341,1,-0.864667,1,0.333484,1,1.0,1,0,0,0
916,0.370100,0,-0.120092,0.357972,-0.551341,0,1.465532,0,-0.859464,1,1.0,0,1,0,0


In [21]:
x = data[["Age", "Sex", "RestingBP", "Cholesterol", "FastingBS", "RestingECG", "MaxHR", "ExerciseAngina", "Oldpeak", "ST_Slope", "ASY", "ATA", "NAP", "TA"]]
y = data["HeartDisease"]


## Carrying out dimensionality Reduction on our datasets

In [22]:
from sklearn.decomposition import PCA

In [23]:
pca = PCA(0.95)
x_pca = pca.fit_transform(x)

In [24]:
x_pca_train, x_pca_test, y_train, y_test = train_test_split(x_pca, y, test_size = 0.2, random_state = 33)

In [25]:
model = LogisticRegression()
model.fit(x_pca_train, y_train)
pred = model.predict(x_pca_test)
pred

array([1., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0.,
       1., 1., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0., 1., 0., 1., 1.,
       1., 0., 1., 0., 1., 0., 1., 1., 1., 0., 1., 0., 1., 0., 1., 1., 0.,
       1., 0., 0., 1., 1., 1., 0., 1., 1., 1., 1., 0., 1., 0., 1., 1., 1.,
       1., 0., 1., 0., 1., 0., 1., 0., 1., 1., 1., 0., 1., 0., 0., 0., 1.,
       1., 0., 0., 1., 1., 0., 1., 0., 0., 1., 0., 1., 1., 0., 1., 0., 0.,
       0., 0., 1., 0., 1., 0., 0., 1., 1., 1., 1., 0., 0., 0., 1., 0., 0.,
       1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 0., 0., 1., 1.,
       0., 1., 0., 0., 1., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0.,
       0., 0., 1., 0., 1., 0., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1.,
       0., 1., 1., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0.])

In [26]:
from sklearn.metrics import confusion_matrix, classification_report

In [27]:
classification_report(y_test, pred)

'              precision    recall  f1-score   support\n\n         0.0       0.76      0.88      0.81        74\n         1.0       0.91      0.81      0.86       110\n\n    accuracy                           0.84       184\n   macro avg       0.83      0.84      0.83       184\nweighted avg       0.85      0.84      0.84       184\n'