In [104]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

In [105]:
# https://www.kaggle.com/fedesoriano/heart-failure-prediction
df = pd.read_csv('heart.csv')

In [106]:
df.head(5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [107]:
df.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [108]:
df.shape

(918, 12)

In [109]:
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [110]:
# Function to find outliers based on Z-score
def find_outliers_zscore(df, threshold=3):
    z_scores = stats.zscore(df.select_dtypes(include='number'))
    abs_z_scores = abs(z_scores)
    outliers = (abs_z_scores > threshold).any(axis=1)
    return df[outliers]



In [111]:
df_outliers = find_outliers_zscore(df)

In [112]:
df_outliers

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
76,32,M,ASY,118,529,0,Normal,130,N,0.0,Flat,1
109,39,M,ATA,190,241,0,Normal,106,N,0.0,Up,0
149,54,M,ASY,130,603,1,Normal,125,Y,1.0,Flat,1
166,50,M,ASY,140,231,0,ST,140,Y,5.0,Flat,1
241,54,M,ASY,200,198,0,Normal,142,Y,2.0,Flat,1
324,46,M,ASY,100,0,1,ST,133,N,-2.6,Flat,1
365,64,F,ASY,200,0,0,Normal,140,Y,1.0,Flat,1
390,51,M,ASY,140,0,0,Normal,60,N,0.0,Flat,1
399,61,M,NAP,200,0,1,ST,70,N,0.0,Flat,1
449,55,M,NAP,0,0,0,Normal,155,N,1.5,Flat,1


In [113]:
# Function to remove outliers based on Z-score
def remove_outliers_zscore(df, threshold=3):
    z_scores = stats.zscore(df.select_dtypes(include='number'))
    abs_z_scores = abs(z_scores)
    filtered_entries = (abs_z_scores < threshold).all(axis=1)
    return df[filtered_entries]

In [114]:
# Apply the function
df_cleaned = remove_outliers_zscore(df)

In [115]:
df_cleaned

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [116]:
df_cleaned['ExerciseAngina'].unique()

array(['N', 'Y'], dtype=object)

In [117]:
df_cleaned['ST_Slope'].unique()

array(['Up', 'Flat', 'Down'], dtype=object)

In [118]:
df_cleaned['ChestPainType'].unique()

array(['ATA', 'NAP', 'ASY', 'TA'], dtype=object)

In [119]:
df_cleaned['RestingECG'].unique()

array(['Normal', 'ST', 'LVH'], dtype=object)

In [120]:
df_dummies = df_cleaned.copy()
df_dummies.ExerciseAngina.replace(
    {
        'N': 0,
        'Y': 1
    },
    inplace=True)

df_dummies.ST_Slope.replace(
    {
        'Down': 1,
        'Flat': 2,
        'Up': 3
    },
    inplace=True
)

df_dummies.RestingECG.replace(
    {
        'Normal': 1,
        'ST': 2,
        'LVH': 3
    },
    inplace=True)

df_dummies.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,1,172,0,0.0,3,0
1,49,F,NAP,160,180,0,1,156,0,1.0,2,1
2,37,M,ATA,130,283,0,2,98,0,0.0,3,0
3,48,F,ASY,138,214,0,1,108,1,1.5,2,1
4,54,M,NAP,150,195,0,1,122,0,0.0,3,0


In [121]:
df2 = pd.get_dummies(df_dummies, drop_first=True)
df2.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA
0,40,140,289,0,1,172,0,0.0,3,0,1,1,0,0
1,49,160,180,0,1,156,0,1.0,2,1,0,0,1,0
2,37,130,283,0,2,98,0,0.0,3,0,1,1,0,0
3,48,138,214,0,1,108,1,1.5,2,1,0,0,0,0
4,54,150,195,0,1,122,0,0.0,3,0,1,0,1,0


In [122]:
X = df2.drop("HeartDisease", axis='columns')
y = df2.HeartDisease

X.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA
0,40,140,289,0,1,172,0,0.0,3,1,1,0,0
1,49,160,180,0,1,156,0,1.0,2,0,0,1,0
2,37,130,283,0,2,98,0,0.0,3,1,1,0,0
3,48,138,214,0,1,108,1,1.5,2,0,0,0,0
4,54,150,195,0,1,122,0,0.0,3,1,0,1,0


In [123]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled

array([[-1.42815446,  0.46590022,  0.84963584, ...,  2.06332497,
        -0.5349047 , -0.22955001],
       [-0.47585532,  1.63471366, -0.16812204, ..., -0.48465463,
         1.86949191, -0.22955001],
       [-1.7455875 , -0.1185065 ,  0.79361247, ...,  2.06332497,
        -0.5349047 , -0.22955001],
       ...,
       [ 0.3706328 , -0.1185065 , -0.62564622, ..., -0.48465463,
        -0.5349047 , -0.22955001],
       [ 0.3706328 , -0.1185065 ,  0.35476274, ...,  2.06332497,
        -0.5349047 , -0.22955001],
       [-1.63977649,  0.34901888, -0.21480818, ..., -0.48465463,
         1.86949191, -0.22955001]])

In [124]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=30)

In [125]:
X_train.shape

(719, 13)

In [126]:
X_test.shape

(180, 13)

In [127]:
model_rf = RandomForestClassifier()
model_rf.fit(X_train, y_train)
model_rf.score(X_test, y_test)

0.8555555555555555

In [128]:
model_lr = LogisticRegression()
model_lr.fit(X_train, y_train)
model_lr.score(X_test, y_test)

0.8611111111111112

### PCA to Reduce Dimensions

In [129]:
pca = PCA(0.95)
X_pca = pca.fit_transform(X)
X_pca

array([[ 93.12912839, -29.67670735],
       [-16.33895199, -14.80374789],
       [ 82.67026321,  38.91313153],
       ...,
       [-68.22650773,  17.69545401],
       [ 40.0272494 , -33.46953106],
       [-20.61297776, -37.61461313]])

In [130]:
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, y, test_size=0.2, random_state=30)

In [131]:
model_rf_pca = RandomForestClassifier(random_state=42)
model_rf_pca.fit(X_train_pca, y_train_pca)
model_rf_pca.score(X_test_pca, y_test_pca)

0.6333333333333333

In [132]:
model_lr_pca = LogisticRegression()
model_lr_pca.fit(X_train_pca, y_train_pca)
model_lr_pca.score(X_test_pca, y_test_pca)

0.6666666666666666

In [133]:
pca.explained_variance_ratio_

array([0.92111815, 0.05064593])

In [139]:
# using two components only
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X)
X_pca

array([[ 93.12912839, -29.67670735,  10.94787007,  -9.17497093,
         -0.66779798],
       [-16.33895199, -14.80374789,  31.08552568,  -6.20680876,
          0.16331994],
       [ 82.67026321,  38.91313153, -14.78753752, -21.14554194,
         -0.9740044 ],
       ...,
       [-68.22650773,  17.69545401,  -4.33549331,   0.38471846,
          0.31165274],
       [ 40.0272494 , -33.46953106,   5.36266107,   9.21011634,
         -0.54794126],
       [-20.61297776, -37.61461313,  12.15514148, -10.99166521,
         -0.47067986]])

In [140]:
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, y, test_size=0.2, random_state=30)

In [141]:
model_rf_pca = RandomForestClassifier(random_state=42)
model_rf_pca.fit(X_train_pca, y_train_pca)
model_rf_pca.score(X_test_pca, y_test_pca)

0.8222222222222222

In [142]:
model_lr_pca = LogisticRegression()
model_lr_pca.fit(X_train_pca, y_train_pca)
model_lr_pca.score(X_test_pca, y_test_pca)

0.7888888888888889

In [143]:
pca.explained_variance_ratio_

array([9.21118153e-01, 5.06459321e-02, 2.25811626e-02, 5.43828894e-03,
       8.56075944e-05])