In [2]:
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.svm import SVC

In [3]:
# Read the heart-disease CSV (path is relative to the working directory) and
# preview its first five rows.
df = pd.read_csv(filepath_or_buffer='heart.csv')
df.head(n=5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [4]:
# Outlier removal: keep only rows where every numeric column is strictly less
# than 3 standard deviations away from that column's mean.
# NOTE: df is overwritten, so re-running this cell filters the already-filtered
# frame again; reload the CSV first if a clean re-run is needed.
z = stats.zscore(df.select_dtypes(include='number'))
# .copy() makes the result an independent frame, so the column assignments done
# on df in later cells do not raise pandas' SettingWithCopyWarning.
df = df[(abs(z) < 3).all(axis=1)].copy()

In [5]:
# Columns that get integer codes via LabelEncoder.
label_encode_cols = ['Sex', 'ExerciseAngina']
# Columns that are one-hot encoded in a later cell.
cols_to_encode = ['ChestPainType', 'RestingECG', 'ST_Slope']

# Label encode with one encoder per column, so every column's fitted mapping
# (classes_) stays available for inverse_transform. Refitting a single shared
# encoder would keep only the mapping of the last column.
label_encoders = {}
for col in label_encode_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
# After the loop `le` is the encoder fitted on the last column, as before.

In [6]:
# One-hot encode the remaining categorical columns; drop='first' leaves out
# one indicator column per feature.
ohe = OneHotEncoder(
    drop='first',
    handle_unknown='ignore',
    sparse_output=False,
)
categorical_part = df[cols_to_encode]
encoded = ohe.fit(categorical_part).transform(categorical_part)
encoded_cols = ohe.get_feature_names_out(cols_to_encode)
# Give the indicator frame df's index so its rows line up with df's rows.
encoded_df = pd.DataFrame(data=encoded, index=df.index, columns=encoded_cols)
# Swap the original categorical columns for their indicator columns.
df_encoded = df.drop(columns=cols_to_encode).join(encoded_df)

In [7]:
# Show the frame after label and one-hot encoding (pandas elides the middle rows).
df_encoded

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ST_Slope_Flat,ST_Slope_Up
0,40,1,140,289,0,172,0,0.0,0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,49,0,160,180,0,156,0,1.0,1,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,37,1,130,283,0,98,0,0.0,0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
3,48,0,138,214,0,108,1,1.5,1,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,54,1,150,195,0,122,0,0.0,0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,110,264,0,132,0,1.2,1,0.0,0.0,1.0,1.0,0.0,1.0,0.0
914,68,1,144,193,1,141,0,3.4,1,0.0,0.0,0.0,1.0,0.0,1.0,0.0
915,57,1,130,131,0,115,1,1.2,1,0.0,0.0,0.0,1.0,0.0,1.0,0.0
916,57,0,130,236,0,174,0,0.0,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [8]:
# Separate the HeartDisease target from the remaining feature columns.
y = df_encoded.loc[:, 'HeartDisease']
X = df_encoded.drop(columns='HeartDisease')

In [9]:
scaler = StandardScaler()

# NOTE(review): the scaler is fitted on every row before the train/test split
# is made, so the test rows contribute to the mean/std used for scaling.
# Consider fitting on the training split only.
#
# index=X.index keeps the original row labels. X's index has gaps after the
# outlier filter, and without it X_scaled would get a fresh 0..n-1 index that
# no longer aligns by label with y.
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
X_scaled.head()

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ST_Slope_Flat,ST_Slope_Up
0,-1.428154,0.515943,0.4659,0.849636,-0.550362,1.38432,-0.822945,-0.855469,2.063325,-0.534905,-0.22955,0.809702,-0.489898,-0.998888,1.134695
1,-0.475855,-1.938199,1.634714,-0.168122,-0.550362,0.752973,-0.822945,0.137516,-0.484655,1.869492,-0.22955,0.809702,-0.489898,1.001113,-0.881294
2,-1.745588,0.515943,-0.118507,0.793612,-0.550362,-1.535661,-0.822945,-0.855469,2.063325,-0.534905,-0.22955,-1.235023,2.041241,-0.998888,1.134695
3,-0.581666,-1.938199,0.349019,0.149344,-0.550362,-1.141069,1.215148,0.634008,-0.484655,-0.534905,-0.22955,0.809702,-0.489898,1.001113,-0.881294
4,0.0532,0.515943,1.050307,-0.028064,-0.550362,-0.58864,-0.822945,-0.855469,-0.484655,1.869492,-0.22955,0.809702,-0.489898,-0.998888,1.134695


In [10]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=30,
)


In [11]:
def three_model_calculation(X_train, y_train, X_test=None, y_test=None):
    """Fit three classifiers and print each one's score on held-out data.

    The models are LogisticRegression, RandomForestClassifier (random_state=42)
    and SVC, each otherwise constructed with default settings.

    Parameters
    ----------
    X_train, y_train
        Features and labels the models are fitted on.
    X_test, y_test : optional
        Features and labels the models are scored on. When omitted, the
        notebook-level ``X_test`` / ``y_test`` variables are looked up at call
        time, which is how the function behaved before these parameters
        existed.

    Returns
    -------
    None
        Scores are printed, not returned.

    Raises
    ------
    KeyError
        If an evaluation argument is omitted and no global of that name exists.
    """
    # Resolve the evaluation set explicitly; passing it in avoids depending on
    # whichever globals happen to be bound when the cell runs.
    if X_test is None:
        X_test = globals()["X_test"]
    if y_test is None:
        y_test = globals()["y_test"]

    # (printed label, unfitted estimator) pairs, in the order they are reported.
    candidates = [
        ("Logistical Regression Score: ", LogisticRegression()),
        ("Random Forest Score: ", RandomForestClassifier(random_state=42)),
        ("SVM Score: ", SVC()),
    ]
    for label, model in candidates:
        model.fit(X_train, y_train)
        print(label, model.score(X_test, y_test))

In [12]:
# Baseline: fit and score the three models on the scaled features.
three_model_calculation(X_train, y_train)

Logistical Regression Score:  0.8611111111111112
Random Forest Score:  0.8611111111111112
SVM Score:  0.85


In [13]:
# A float n_components asks PCA to keep as many components as are needed to
# explain that fraction (here 95%) of the variance.
pca = PCA(0.95)

# NOTE(review): PCA is fitted on all rows before the split below, so the test
# rows influence the projection. Consider fitting on the training split only.
X_pca = pca.fit_transform(X_scaled)
# A bare `X_pca.shape` in the middle of a cell is never displayed; print it so
# the number of retained components is visible.
print(X_pca.shape)
# Same test_size and random_state as the split of X_scaled. This rebinds
# X_train / X_test / y_train / y_test, so the scaled-feature split is no longer
# available after this cell runs.
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=30)

In [14]:
# Re-run the same three models, now on the PCA-reduced split.
three_model_calculation(X_train, y_train)

Logistical Regression Score:  0.8666666666666667
Random Forest Score:  0.8666666666666667
SVM Score:  0.85
