In [4]:
import pandas as pd

In [5]:
from google.colab import files
uploaded = files.upload()


Saving heart.csv to heart.csv


You can upload the `heart.csv` file using the following code. After running this cell, a file upload dialog will appear. Select the `heart.csv` file from your local machine.

After uploading the file, you can now read it into a pandas DataFrame using the following code. Make sure the filename in `pd.read_csv()` matches the name of the file you uploaded.

In [6]:
df = pd.read_csv('heart.csv')
df.head()


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [7]:
df.shape

(918, 12)

In [8]:
from scipy.stats import zscore
import numpy as np


z_scores = np.abs(zscore(df.select_dtypes(include=[np.number])))


df_clean = df[(z_scores < 3).all(axis=1)]
df_clean.shape


(899, 12)

In [15]:
print(df.columns.tolist())


['Age', 'Sex', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'HeartDisease', 'ChestPainType_ATA', 'ChestPainType_NAP', 'ChestPainType_TA', 'RestingECG_Normal', 'RestingECG_ST', 'ST_Slope_Flat', 'ST_Slope_Up']


In [16]:
from sklearn.preprocessing import LabelEncoder


df['Sex'] = LabelEncoder().fit_transform(df['Sex'])
df['FastingBS'] = LabelEncoder().fit_transform(df['FastingBS'])
df['ExerciseAngina'] = LabelEncoder().fit_transform(df['ExerciseAngina'])

In [19]:
from sklearn.preprocessing import StandardScaler


df_scaled = df.copy()


num_cols = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']


scaler = StandardScaler()
df_scaled[num_cols] = scaler.fit_transform(df_scaled[num_cols])


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [21]:
X = df_scaled.drop('HeartDisease', axis=1)
y = df_scaled['HeartDisease']

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)
acc_logreg = accuracy_score(y_test, y_pred_logreg)

In [24]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
acc_rf = accuracy_score(y_test, y_pred_rf)

print(f"Logistic Regression Accuracy: {acc_logreg:.4f}")
print(f"Random Forest Accuracy: {acc_rf:.4f}")

Logistic Regression Accuracy: 0.8533
Random Forest Accuracy: 0.8641


In [25]:
from sklearn.decomposition import PCA

In [26]:
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X)

In [27]:
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, y, test_size=0.2, random_state=42)

In [28]:
logreg_pca = LogisticRegression()
logreg_pca.fit(X_train_pca, y_train_pca)
y_pred_logreg_pca = logreg_pca.predict(X_test_pca)
acc_logreg_pca = accuracy_score(y_test_pca, y_pred_logreg_pca)

In [29]:
rf_pca = RandomForestClassifier()
rf_pca.fit(X_train_pca, y_train_pca)
y_pred_rf_pca = rf_pca.predict(X_test_pca)
acc_rf_pca = accuracy_score(y_test_pca, y_pred_rf_pca)

In [30]:
print(f"LogReg Accuracy with PCA: {acc_logreg_pca:.4f}")
print(f"Random Forest Accuracy with PCA: {acc_rf_pca:.4f}")

LogReg Accuracy with PCA: 0.8315
Random Forest Accuracy with PCA: 0.8750
