# Heart Disease Prediction
By: Harneet Kaur(102117052), Ridhi Thakur(102117032)

### Attributes

1. Age: age in years
2. Sex: 0=male, 1=female
3. chestPaintype: chest Pain
        (type of chest pain due to reduced blood flow to heart)
        -1: typical angina
        -2: atypical angina
        -3: non-anginal pain
        -4: asymptomatic
4. RestingBP: resting blood pressure ( in mm Hg)
5. Cholestrol: serum cholestrol im mg/dl
6. FastingBS: (fasting blood sugar >120mg/dl)(1=true, 0=false)
7. RestingECG: resting electrocardiographic results
        -0: normal
        -1: having ST-T wave abnormality
        -2: showing probable or definite left ventricular hypertrophy by Estes' criteria
8. MaxHR: maximum heart rate acieved
9. ExerciseAngina: exercise-induced angina (1=yes, 0=no)
10. Oldpeak: ST depression induces by exercise relative to rest
11. ST_Slope: the slope of peak exercise ST segment
        -0: upsloping
        -1:flat
        -2:downsloping
14. HeartDisease: target value 
        -0: no
        -1:yes


### Import Libraries

In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix
from sklearn.metrics import classification_report

### Load Dataset

In [3]:
df = pd.read_csv("/Users/Dell/Documents/ML_project/heart.csv")

In [4]:
df.head(6)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
5,39,M,NAP,120,339,0,Normal,170,N,0.0,Up,0


In [5]:
df.tail(6)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
912,57,F,ASY,140,241,0,Normal,123,Y,0.2,Flat,1
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1
917,38,M,NAP,138,175,0,Normal,173,N,0.0,Up,0


### Information about Dataset 

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [7]:
df.shape

(918, 12)

In [8]:
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


### Handling Null Values


In [9]:
df.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [10]:
percentage_missing = df.isnull().sum()*100/len(df)
print(percentage_missing)

Age               0.0
Sex               0.0
ChestPainType     0.0
RestingBP         0.0
Cholesterol       0.0
FastingBS         0.0
RestingECG        0.0
MaxHR             0.0
ExerciseAngina    0.0
Oldpeak           0.0
ST_Slope          0.0
HeartDisease      0.0
dtype: float64


In [11]:
df.dropna()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [12]:
col1= ['Age','RestingBP','Cholesterol','MaxHR']
for col in col1:
    df[col].replace(to_replace=0, value=np.nan, inplace=True)
    
df.shape



(918, 12)

In [13]:
df['Cholesterol'].fillna(df['Cholesterol'].mean(numeric_only=True), inplace=True)


In [14]:
nan_mask = df.isna()
nan_count_rows = nan_mask.sum(axis=1)
nan_count_columns = nan_mask.sum(axis=0)
print("\nNaN count along columns:")
print(nan_count_columns)


NaN count along columns:
Age               0
Sex               0
ChestPainType     0
RestingBP         1
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64


In [15]:
from sklearn.impute import SimpleImputer
import numpy as np
my_imputer=SimpleImputer(strategy='median')
df['RestingBP'] = my_imputer.fit_transform(df[['RestingBP']])

### Converting Categorical data to Numerical data

In [16]:
df['Sex'].unique()

array(['M', 'F'], dtype=object)

In [17]:
df['Sex'] = df['Sex'].apply(lambda x: 1 if x == 'F' else 0)
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,0,ATA,140.0,289.0,0,Normal,172,N,0.0,Up,0
1,49,1,NAP,160.0,180.0,0,Normal,156,N,1.0,Flat,1
2,37,0,ATA,130.0,283.0,0,ST,98,N,0.0,Up,0
3,48,1,ASY,138.0,214.0,0,Normal,108,Y,1.5,Flat,1
4,54,0,NAP,150.0,195.0,0,Normal,122,N,0.0,Up,0


In [18]:
df['ChestPainType'].unique()

array(['ATA', 'NAP', 'ASY', 'TA'], dtype=object)

In [19]:
df['ChestPainType'] = df['ChestPainType'].apply(lambda x: {'TA': 1, 'ATA': 2, 'NAP': 3, 'ASY':4}[x])
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,0,2,140.0,289.0,0,Normal,172,N,0.0,Up,0
1,49,1,3,160.0,180.0,0,Normal,156,N,1.0,Flat,1
2,37,0,2,130.0,283.0,0,ST,98,N,0.0,Up,0
3,48,1,4,138.0,214.0,0,Normal,108,Y,1.5,Flat,1
4,54,0,3,150.0,195.0,0,Normal,122,N,0.0,Up,0


In [20]:
df['RestingECG'].unique()

array(['Normal', 'ST', 'LVH'], dtype=object)

In [21]:
df['RestingECG'] = df['RestingECG'].apply(lambda x: {'Normal': 0, 'ST': 1, 'LVH': 2}[x])
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,0,2,140.0,289.0,0,0,172,N,0.0,Up,0
1,49,1,3,160.0,180.0,0,0,156,N,1.0,Flat,1
2,37,0,2,130.0,283.0,0,1,98,N,0.0,Up,0
3,48,1,4,138.0,214.0,0,0,108,Y,1.5,Flat,1
4,54,0,3,150.0,195.0,0,0,122,N,0.0,Up,0


In [22]:
df['ExerciseAngina'].unique()

array(['N', 'Y'], dtype=object)

In [23]:
df['ExerciseAngina'] = df['ExerciseAngina'].apply(lambda x: {'N': 0, 'Y': 1}[x])
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,0,2,140.0,289.0,0,0,172,0,0.0,Up,0
1,49,1,3,160.0,180.0,0,0,156,0,1.0,Flat,1
2,37,0,2,130.0,283.0,0,1,98,0,0.0,Up,0
3,48,1,4,138.0,214.0,0,0,108,1,1.5,Flat,1
4,54,0,3,150.0,195.0,0,0,122,0,0.0,Up,0


In [24]:
df['ST_Slope'].unique()

array(['Up', 'Flat', 'Down'], dtype=object)

In [25]:
df['ST_Slope'] = df['ST_Slope'].apply(lambda x: {'Up': 0, 'Flat': 1, 'Down':2}[x])
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,0,2,140.0,289.0,0,0,172,0,0.0,0,0
1,49,1,3,160.0,180.0,0,0,156,0,1.0,1,1
2,37,0,2,130.0,283.0,0,1,98,0,0.0,0,0
3,48,1,4,138.0,214.0,0,0,108,1,1.5,1,1
4,54,0,3,150.0,195.0,0,0,122,0,0.0,0,0


## Checking the value counts for all the features

In [26]:
for col in df.columns:
    print(f"{col}: \n{df[col].value_counts().sort_values(ascending=True)}\n\n")

Age: 
73     1
28     1
30     1
31     2
76     2
77     2
33     2
75     3
29     3
72     4
32     5
71     5
36     6
34     7
74     7
70     7
68    10
37    11
35    11
40    13
69    13
66    13
39    15
67    15
38    16
45    18
42    18
44    19
47    19
49    21
65    21
64    22
43    24
41    24
46    24
50    25
63    30
48    31
61    31
60    32
53    33
62    35
59    35
51    35
52    36
57    38
56    38
55    41
58    42
54    51
Name: Age, dtype: int64


Sex: 
1    193
0    725
Name: Sex, dtype: int64


ChestPainType: 
1     46
2    173
3    203
4    496
Name: ChestPainType, dtype: int64


RestingBP: 
164.0      1
117.0      1
101.0      1
192.0      1
129.0      1
        ... 
150.0     55
110.0     58
140.0    107
130.0    119
120.0    132
Name: RestingBP, Length: 66, dtype: int64


Cholesterol: 
131.000000      1
132.000000      1
337.000000      1
313.000000      1
141.000000      1
             ... 
230.000000      9
220.000000     10
223.000000     10
254.0

In [27]:
df.shape

(918, 12)

## One Hot Encoding 

In [28]:
one_hot_encoders_cols = ['ChestPainType','RestingECG','ST_Slope']

df = pd.get_dummies(df, columns=one_hot_encoders_cols)

In [29]:
df.head()

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ChestPainType_1,ChestPainType_2,ChestPainType_3,ChestPainType_4,RestingECG_0,RestingECG_1,RestingECG_2,ST_Slope_0,ST_Slope_1,ST_Slope_2
0,40,0,140.0,289.0,0,172,0,0.0,0,0,1,0,0,1,0,0,1,0,0
1,49,1,160.0,180.0,0,156,0,1.0,1,0,0,1,0,1,0,0,0,1,0
2,37,0,130.0,283.0,0,98,0,0.0,0,0,1,0,0,0,1,0,1,0,0
3,48,1,138.0,214.0,0,108,1,1.5,1,0,0,0,1,1,0,0,0,1,0
4,54,0,150.0,195.0,0,122,0,0.0,0,0,0,1,0,1,0,0,1,0,0


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Age              918 non-null    int64  
 1   Sex              918 non-null    int64  
 2   RestingBP        918 non-null    float64
 3   Cholesterol      918 non-null    float64
 4   FastingBS        918 non-null    int64  
 5   MaxHR            918 non-null    int64  
 6   ExerciseAngina   918 non-null    int64  
 7   Oldpeak          918 non-null    float64
 8   HeartDisease     918 non-null    int64  
 9   ChestPainType_1  918 non-null    uint8  
 10  ChestPainType_2  918 non-null    uint8  
 11  ChestPainType_3  918 non-null    uint8  
 12  ChestPainType_4  918 non-null    uint8  
 13  RestingECG_0     918 non-null    uint8  
 14  RestingECG_1     918 non-null    uint8  
 15  RestingECG_2     918 non-null    uint8  
 16  ST_Slope_0       918 non-null    uint8  
 17  ST_Slope_1      

In [31]:
numerical_cols = ['Age','RestingBP', 'Cholesterol', 'Oldpeak', 'MaxHR']
target_col = ['HeartDisease'] 
categorical_cols = list(set(df.columns) - set(numerical_cols) - set(target_col))


In [32]:
numerical_cols

['Age', 'RestingBP', 'Cholesterol', 'Oldpeak', 'MaxHR']

In [33]:
categorical_cols

['ST_Slope_0',
 'Sex',
 'ST_Slope_2',
 'RestingECG_1',
 'ChestPainType_3',
 'FastingBS',
 'ST_Slope_1',
 'RestingECG_0',
 'ChestPainType_4',
 'RestingECG_2',
 'ChestPainType_2',
 'ChestPainType_1',
 'ExerciseAngina']

## Splitting Testing and Training data 

In [34]:
df_train , df_test = train_test_split(df, test_size = 0.2, random_state = 45)

In [35]:
len(df_train)

734

In [36]:
len(df_test)

184

In [37]:
len(df)

918

## Standardization 

In [38]:
scaler = StandardScaler()

def get_features_and_target_arrays(df, numerical_columns, categorical_columns, scaler):
    x_numeric_scaled = scaler.fit_transform(df[numerical_columns])
    x_categorical = df[categorical_columns].to_numpy()
    x = np.hstack((x_numeric_scaled,x_categorical))
    y = df["HeartDisease"]
    
    return x, y

In [39]:
x_train , y_train = get_features_and_target_arrays(df_train, numerical_cols, categorical_cols, scaler)

In [40]:
x_test , y_test = get_features_and_target_arrays(df_test, numerical_cols, categorical_cols, scaler)

In [41]:
x_train

array([[-0.17426818,  0.41071752, -0.37663428, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.99437074, -1.35192329,  0.4769565 , ...,  0.        ,
         0.        ,  1.        ],
       [-1.02418739,  0.96154278,  0.84555252, ...,  1.        ,
         0.        ,  1.        ],
       ...,
       [-1.55538689,  0.30055247, -0.47363323, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.46317123, -1.13159319, -0.27963533, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.03821163, -0.14010773,  0.00428815, ...,  0.        ,
         0.        ,  1.        ]])

In [42]:
x_test

array([[-0.5274778 , -1.01439676, -0.01476328, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.52862574,  0.25965787, -0.22631702, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.31740503, -0.43528102, -0.01476328, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-1.58358133, -0.72483889,  0.61083072, ...,  0.        ,
         0.        ,  0.        ],
       [-0.94991921, -0.14572315, -0.61140499, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.0566775 ,  3.03941344, -0.01476328, ...,  0.        ,
         0.        ,  1.        ]])

In [43]:
y_train

164    0
745    1
23     1
808    1
622    1
      ..
607    1
544    0
892    0
643    1
414    1
Name: HeartDisease, Length: 734, dtype: int64

In [44]:
y_test

347    1
558    1
412    1
395    1
363    1
      ..
876    1
191    0
116    1
417    0
372    1
Name: HeartDisease, Length: 184, dtype: int64

## Feature Selection - PCA 

In [856]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

pca = PCA(n_components = 3) 
x_train = pca.fit_transform(x_train)

pca = PCA(n_components = 3) 
x_test = pca.fit_transform(x_test)

## Logistic Regression Model


In [46]:
logReg = LogisticRegression()

logReg.fit(x_train, y_train)

In [47]:
test_pred = logReg.predict(x_test)

In [48]:
y_test

347    1
558    1
412    1
395    1
363    1
      ..
876    1
191    0
116    1
417    0
372    1
Name: HeartDisease, Length: 184, dtype: int64

In [49]:
test_pred

array([1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 0, 1])

### Accuracy

In [50]:
mean_squared_error(y_test, test_pred)

0.1358695652173913

In [51]:
accuracy_score(y_test, test_pred)

0.8641304347826086

In [52]:
confusion_matrix(y_test, test_pred)

array([[73, 18],
       [ 7, 86]])

In [53]:
print(classification_report(y_test, test_pred))

              precision    recall  f1-score   support

           0       0.91      0.80      0.85        91
           1       0.83      0.92      0.87        93

    accuracy                           0.86       184
   macro avg       0.87      0.86      0.86       184
weighted avg       0.87      0.86      0.86       184



## Random Forest Model

In [63]:
#cart algo used
#split info based normalization(ig/slit info)
#ID3 not used as biased towards features having more having classes


from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

rf_classifier = RandomForestClassifier(n_estimators=15, random_state=42)

rf_classifier.fit(x_train, y_train)

predictions = rf_classifier.predict(x_test)

accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy*100}%")



Accuracy: 79.8913043478261%


In [58]:
from sklearn.model_selection import GridSearchCV
#GridSearch is used for hyperparameter tuning which need to be run m
param_grid = { #dictionary 
    'n_estimators': [10, 50, 100, 200],
    'max_depth': range(2,19),
    'min_samples_split': range(2,10),
    'min_samples_leaf': range(2,5)
}

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=5) #number of folds for cross-validation

# Fit the GridSearchCV object
grid_search.fit(x_train, y_train)

# Get the best model and parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Evaluate the best model on the testing set
y_pred = best_model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("best parameters: ",best_params)

Accuracy: 0.8532608695652174
best parameters:  {'max_depth': 3, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 10}


## KNN 

In [60]:
from sklearn.neighbors import KNeighborsClassifier

knn_classifier = KNeighborsClassifier(n_neighbors=27)

knn_classifier.fit(x_train, y_train)

y_pred = knn_classifier.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8641304347826086


In [61]:
param_grid = {
    'n_neighbors': range(2, 28)
}

grid_search = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=param_grid, cv=5)

grid_search.fit(x_train, y_train)

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

y_pred = best_model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Best parameters:", best_params)

Accuracy: 0.8586956521739131
Best parameters: {'n_neighbors': 20}


In [62]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.86      0.71      0.78        91
           1       0.76      0.88      0.82        93

    accuracy                           0.80       184
   macro avg       0.81      0.80      0.80       184
weighted avg       0.81      0.80      0.80       184

