Use heart disease dataset heart.csv and do following, (credits of dataset:  https://www.kaggle.com/fedesoriano/heart-failure-prediction)

1. Load heart disease dataset in pandas dataframe
1. Remove outliers using Z score. Usual guideline is to remove anything that has Z score > 3 formula or Z score < -3
1. Convert text columns to numbers using label encoding and one hot encoding
1. Apply scaling
1. Build a classification model using various methods (SVM, logistic regression, random forest) and check which model gives you the best accuracy
1. Now use PCA to reduce dimensions, retrain your model and see what impact it has on your model in terms of accuracy. Keep in mind that many times doing PCA reduces the accuracy but computation is much lighter and that's the trade off you need to consider while building models in real life


In [83]:
import pandas as pd

df = pd.read_csv("heart.csv")

print(df.shape)
df.head()

(918, 12)


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [84]:
# Check for N/A values

df.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

### No N/A values

### For each numeric column, find all outliers (Z score <-3 or >3)

In [85]:
df_numeric = df[["Age","RestingBP","Cholesterol","FastingBS","MaxHR","Oldpeak"]]

df_numeric.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak
0,40,140,289,0,172,0.0
1,49,160,180,0,156,1.0
2,37,130,283,0,98,0.0
3,48,138,214,0,108,1.5
4,54,150,195,0,122,0.0


In [86]:
# Now, add a z score column

for i in df_numeric.columns:
    df[i+'_z_score'] = df[i].apply(lambda x: (x - df[i].mean()) / df[i].std())

df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Age_z_score,RestingBP_z_score,Cholesterol_z_score,FastingBS_z_score,MaxHR_z_score,Oldpeak_z_score
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0,-1.432359,0.410685,0.824621,-0.551041,1.382175,-0.831979
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1,-0.478223,1.49094,-0.171867,-0.551041,0.753746,0.105606
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0,-1.750404,-0.129442,0.769768,-0.551041,-1.524307,-0.831979
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1,-0.584238,0.30266,0.138964,-0.551041,-1.131539,0.574398
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0,0.051853,0.950812,-0.034736,-0.551041,-0.581664,-0.831979


In [87]:
# Sanity check

print(df["Age"].mean())
print(df["Age"].std())

53.510893246187365
9.43261650673201


In [88]:
(40-df["Age"].mean())/df["Age"].std()

-1.4323590105189488

This matches our column

In [89]:
# Sanity check #2

print(df["MaxHR"].mean())
print(df["MaxHR"].std())

136.80936819172112
25.4603341382503


In [90]:
(108-df["MaxHR"].mean())/df["MaxHR"].std()

-1.1315392812712306

This also matches

In [91]:
df_outliers = df[(df["Age_z_score"]< -3) | (df["Age_z_score"]> 3)]

print(df.shape)

df

(918, 18)


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Age_z_score,RestingBP_z_score,Cholesterol_z_score,FastingBS_z_score,MaxHR_z_score,Oldpeak_z_score
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0,-1.432359,0.410685,0.824621,-0.551041,1.382175,-0.831979
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1,-0.478223,1.490940,-0.171867,-0.551041,0.753746,0.105606
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0,-1.750404,-0.129442,0.769768,-0.551041,-1.524307,-0.831979
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1,-0.584238,0.302660,0.138964,-0.551041,-1.131539,0.574398
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0,0.051853,0.950812,-0.034736,-0.551041,-0.581664,-0.831979
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1,-0.902283,-1.209697,0.596068,-0.551041,-0.188897,0.293123
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1,1.536064,0.626736,-0.053020,1.812770,0.164595,2.355810
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1,0.369898,-0.129442,-0.619830,-0.551041,-0.856602,0.293123
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1,0.369898,-0.129442,0.340090,-0.551041,1.460728,-0.831979


In [92]:
df_outliers = df[(df["RestingBP_z_score"]< -3) | (df["RestingBP_z_score"]> 3)]

print(df_outliers.shape)

df_outliers

(8, 18)


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Age_z_score,RestingBP_z_score,Cholesterol_z_score,FastingBS_z_score,MaxHR_z_score,Oldpeak_z_score
109,39,M,ATA,190,241,0,Normal,106,N,0.0,Up,0,-1.538374,3.111322,0.3858,-0.551041,-1.210093,-0.831979
241,54,M,ASY,200,198,0,Normal,142,Y,2.0,Flat,1,0.051853,3.651449,-0.00731,-0.551041,0.203871,1.043191
365,64,F,ASY,200,0,0,Normal,140,Y,1.0,Flat,1,1.112004,3.651449,-1.817444,-0.551041,0.125318,0.105606
399,61,M,NAP,200,0,1,ST,70,N,0.0,Flat,1,0.793959,3.651449,-1.817444,1.81277,-2.624057,-0.831979
449,55,M,NAP,0,0,0,Normal,155,N,1.5,Flat,1,0.157868,-7.151097,-1.817444,-0.551041,0.714469,0.574398
592,61,M,ASY,190,287,1,LVH,150,Y,2.0,Down,1,0.793959,3.111322,0.806337,1.81277,0.518086,1.043191
732,56,F,ASY,200,288,1,LVH,133,Y,4.0,Down,1,0.263883,3.651449,0.815479,1.81277,-0.14962,2.91836
759,54,M,ATA,192,283,0,LVH,195,N,0.0,Up,1,0.051853,3.219347,0.769768,-0.551041,2.285541,-0.831979


In [93]:
df_outliers = df[(df["Cholesterol_z_score"]< -3) | (df["Cholesterol_z_score"]> 3)]

print(df_outliers.shape)

df_outliers

(3, 18)


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Age_z_score,RestingBP_z_score,Cholesterol_z_score,FastingBS_z_score,MaxHR_z_score,Oldpeak_z_score
76,32,M,ASY,118,529,0,Normal,130,N,0.0,Flat,1,-2.28048,-0.777595,3.018723,-0.551041,-0.26745,-0.831979
149,54,M,ASY,130,603,1,Normal,125,Y,1.0,Flat,1,0.051853,-0.129442,3.695238,1.81277,-0.463834,0.105606
616,67,F,NAP,115,564,0,LVH,160,N,1.6,Flat,0,1.430049,-0.939633,3.338696,-0.551041,0.910853,0.668157


In [94]:
df_outliers = df[(df["FastingBS_z_score"]< -3) | (df["FastingBS_z_score"]> 3)]

print(df_outliers.shape)

df_outliers

(0, 18)


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Age_z_score,RestingBP_z_score,Cholesterol_z_score,FastingBS_z_score,MaxHR_z_score,Oldpeak_z_score


In [95]:
df_outliers = df[(df["MaxHR_z_score"]< -3) | (df["MaxHR_z_score"]> 3)]

print(df_outliers.shape)

df_outliers

(1, 18)


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Age_z_score,RestingBP_z_score,Cholesterol_z_score,FastingBS_z_score,MaxHR_z_score,Oldpeak_z_score
390,51,M,ASY,140,0,0,Normal,60,N,0.0,Flat,1,-0.266193,0.410685,-1.817444,-0.551041,-3.016825,-0.831979


In [96]:
df_outliers = df[(df["Oldpeak_z_score"]< -3) | (df["Oldpeak_z_score"]> 3)]

print(df_outliers.shape)

df_outliers

(7, 18)


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Age_z_score,RestingBP_z_score,Cholesterol_z_score,FastingBS_z_score,MaxHR_z_score,Oldpeak_z_score
166,50,M,ASY,140,231,0,ST,140,Y,5.0,Flat,1,-0.372208,0.410685,0.294379,-0.551041,0.125318,3.855945
324,46,M,ASY,100,0,1,ST,133,N,-2.6,Flat,1,-0.796268,-1.749824,-1.817444,1.81277,-0.14962,-3.269699
702,59,M,TA,178,270,0,LVH,145,N,4.2,Down,0,0.581928,2.463169,0.650921,-0.551041,0.321702,3.105877
771,55,M,ASY,140,217,0,Normal,111,Y,5.6,Down,1,0.157868,0.410685,0.16639,-0.551041,-1.013709,4.418496
791,51,M,ASY,140,298,0,Normal,122,Y,4.2,Flat,1,-0.266193,0.410685,0.9069,-0.551041,-0.581664,3.105877
850,62,F,ASY,160,164,0,LVH,145,N,6.2,Down,1,0.899974,1.49094,-0.318141,-0.551041,0.321702,4.981047
900,58,M,ASY,114,318,0,ST,140,N,4.4,Down,1,0.475913,-0.993646,1.089741,-0.551041,0.125318,3.293394


We found all the outliers, time to remove

In [97]:
# shape to start

df.shape

(918, 18)

In [98]:
df_no_outliers = df

for column in df_no_outliers.columns:
    if column.endswith('_z_score'):
        df_no_outliers = df_no_outliers[(df_no_outliers[column]>= -3) & (df_no_outliers[column] <= 3)]

df_no_outliers.shape

(899, 18)

We removed all 19 outliers!!

In [99]:
df_no_outliers.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Age_z_score,RestingBP_z_score,Cholesterol_z_score,FastingBS_z_score,MaxHR_z_score,Oldpeak_z_score
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0,-1.432359,0.410685,0.824621,-0.551041,1.382175,-0.831979
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1,-0.478223,1.49094,-0.171867,-0.551041,0.753746,0.105606
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0,-1.750404,-0.129442,0.769768,-0.551041,-1.524307,-0.831979
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1,-0.584238,0.30266,0.138964,-0.551041,-1.131539,0.574398
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0,0.051853,0.950812,-0.034736,-0.551041,-0.581664,-0.831979


In [100]:
# Drop the specified columns from df_no_outliers in place
df_no_outliers.drop(["Age_z_score","RestingBP_z_score","Cholesterol_z_score",
                     "FastingBS_z_score","MaxHR_z_score","Oldpeak_z_score"], axis=1, inplace=True)

# Display the first few rows of the updated DataFrame
df_no_outliers.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


##### Convert text columns to numbers using label encoding and one hot encoding

In [102]:
df_categorical = df_no_outliers[["Sex","ChestPainType","RestingECG","ExerciseAngina","ST_Slope"]]

for i in df_categorical.columns:
    print(df_categorical[i].unique())

['M' 'F']
['ATA' 'NAP' 'ASY' 'TA']
['Normal' 'ST' 'LVH']
['N' 'Y']
['Up' 'Flat' 'Down']


In [103]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [107]:
dfle = df_no_outliers
dfle.Sex = le.fit_transform(dfle.Sex)
dfle.ChestPainType = le.fit_transform(dfle.ChestPainType)
dfle.RestingECG = le.fit_transform(dfle.RestingECG)
dfle.ExerciseAngina = le.fit_transform(dfle.ExerciseAngina)
dfle.ST_Slope = le.fit_transform(dfle.ST_Slope)

dfle.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,1,172,0,0.0,2,0
1,49,0,2,160,180,0,1,156,0,1.0,1,1
2,37,1,1,130,283,0,2,98,0,0.0,2,0
3,48,0,0,138,214,0,1,108,1,1.5,1,1
4,54,1,2,150,195,0,1,122,0,0.0,2,0


In [108]:
X = dfle.drop("HeartDisease",axis='columns')
y = dfle.HeartDisease

X.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,40,1,1,140,289,0,1,172,0,0.0,2
1,49,0,2,160,180,0,1,156,0,1.0,1
2,37,1,1,130,283,0,2,98,0,0.0,2
3,48,0,0,138,214,0,1,108,1,1.5,1
4,54,1,2,150,195,0,1,122,0,0.0,2


In [109]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled

array([[-1.42815446,  0.515943  ,  0.2245723 , ..., -0.8229452 ,
        -0.85546862,  1.04249607],
       [-0.47585532, -1.93819859,  1.27063705, ..., -0.8229452 ,
         0.13751561, -0.62216462],
       [-1.7455875 ,  0.515943  ,  0.2245723 , ..., -0.8229452 ,
        -0.85546862,  1.04249607],
       ...,
       [ 0.3706328 ,  0.515943  , -0.82149245, ...,  1.21514774,
         0.33611246, -0.62216462],
       [ 0.3706328 , -1.93819859,  0.2245723 , ..., -0.8229452 ,
        -0.85546862, -0.62216462],
       [-1.63977649,  0.515943  ,  1.27063705, ..., -0.8229452 ,
        -0.85546862,  1.04249607]])

In [111]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2)

In [112]:
print(X_train.shape)
print(X_test.shape)

(719, 11)
(180, 11)


In [113]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [114]:
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params' : {
            'C': [1,10,20],
            'kernel': ['rbf','linear']
        }  
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params' : {
            'n_estimators': [1,5,10]
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'C': [1,5,10]
        }
    }
}

In [117]:
from sklearn.model_selection import GridSearchCV

scores = []

for model_name, mp in model_params.items():
    print(model_name, mp)  # This will print the model name and parameters
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(X_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

    
df_results = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df_results

svm {'model': SVC(gamma='auto'), 'params': {'C': [1, 10, 20], 'kernel': ['rbf', 'linear']}}
random_forest {'model': RandomForestClassifier(), 'params': {'n_estimators': [1, 5, 10]}}
logistic_regression {'model': LogisticRegression(solver='liblinear'), 'params': {'C': [1, 5, 10]}}


Unnamed: 0,model,best_score,best_params
0,svm,0.862286,"{'C': 1, 'kernel': 'rbf'}"
1,random_forest,0.83177,{'n_estimators': 5}
2,logistic_regression,0.848397,{'C': 1}


In [120]:
from sklearn.decomposition import PCA

pca = PCA(0.95)
X_pca = pca.fit_transform(X)
X_pca

array([[ 93.12926348, -29.67413245],
       [-16.33750689, -14.81536427],
       [ 82.66842478,  38.91589868],
       ...,
       [-68.22644416,  17.7012641 ],
       [ 40.02690223, -33.47134474],
       [-20.61151816, -37.62451392]])

In [121]:
pca.n_components_

2

In [122]:
X_train_pca, X_test_pca, y_train, y_test = train_test_split(X_pca, y, test_size=0.2)

In [123]:
scores = []

for model_name, mp in model_params.items():
    print(model_name, mp)  # This will print the model name and parameters
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(X_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

    
df_results = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df_results

svm {'model': SVC(gamma='auto'), 'params': {'C': [1, 10, 20], 'kernel': ['rbf', 'linear']}}
random_forest {'model': RandomForestClassifier(), 'params': {'n_estimators': [1, 5, 10]}}
logistic_regression {'model': LogisticRegression(solver='liblinear'), 'params': {'C': [1, 5, 10]}}


Unnamed: 0,model,best_score,best_params
0,svm,0.549378,"{'C': 1, 'kernel': 'linear'}"
1,random_forest,0.548019,{'n_estimators': 1}
2,logistic_regression,0.52432,{'C': 1}


With only 2 variables, accuary was around 52-54%