In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Load dataset
df = pd.read_csv('heart.csv')

# Step 1: Encode categorical features
df_encoded = df.copy()
label_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

# Apply label encoding for binary columns, one-hot encoding for others
for col in label_cols:
    if df_encoded[col].nunique() == 2:
        df_encoded[col] = LabelEncoder().fit_transform(df_encoded[col])
    else:
        df_encoded = pd.get_dummies(df_encoded, columns=[col], drop_first=True)

# Step 2: Split features and target
X = df_encoded.drop(columns='HeartDisease')
y = df_encoded['HeartDisease']

# Step 3: Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 4: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Step 5: Train classifiers
models = {
    'SVM': SVC(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier()
}

print("Accuracy Before PCA:")
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name}: {acc:.4f}")

# Step 6: Apply PCA to retain 95% variance
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

print("\nAccuracy After PCA:")
for name, model in models.items():
    model.fit(X_train_pca, y_train)
    y_pred_pca = model.predict(X_test_pca)
    acc_pca = accuracy_score(y_test, y_pred_pca)
    print(f"{name} (PCA): {acc_pca:.4f}")

# Optional: Display PCA variance ratios
print("\nPCA Explained Variance Ratios:")
print(np.round(pca.explained_variance_ratio_ * 100, 2))


Accuracy Before PCA:
SVM: 0.8750
Logistic Regression: 0.8533
Random Forest: 0.8696

Accuracy After PCA:
SVM (PCA): 0.8750
Logistic Regression (PCA): 0.8533
Random Forest (PCA): 0.8587

PCA Explained Variance Ratios:
[23.61 10.82  9.48  8.34  7.38  6.66  6.24  5.27  5.17  4.29  4.09  3.04
  2.81]
