# ML Assignment 2 - Heart Disease Classification
## Student: Raji
## Date: February 15, 2026

## 1. Import Libraries

In [None]:
import pandas as pdimport numpy as npfrom sklearn.model_selection import train_test_splitfrom sklearn.preprocessing import StandardScaler, LabelEncoderfrom sklearn.impute import SimpleImputerfrom sklearn.metrics import (    accuracy_score, roc_auc_score, precision_score,    recall_score, f1_score, matthews_corrcoef,    confusion_matrix, classification_report)from sklearn.linear_model import LogisticRegressionfrom sklearn.tree import DecisionTreeClassifierfrom sklearn.neighbors import KNeighborsClassifierfrom sklearn.naive_bayes import GaussianNBfrom sklearn.ensemble import RandomForestClassifierfrom xgboost import XGBClassifierimport joblibimport osimport warningswarnings.filterwarnings('ignore')

## 2. Load Dataset

In [None]:
# Read the raw heart-disease records from the CSV (path is relative to the
# working directory of the kernel).
df = pd.read_csv(filepath_or_buffer="heart_disease_uci.csv")

# Report the frame's shape, then leave the preview as the cell's rich output.
print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
df.head()

## 3. Data Exploration

In [None]:
# Structural overview of the raw frame: column names and dtypes first.
column_names = df.columns.tolist()
print("Columns:", column_names)
print("\nData types:\n", df.dtypes)

# Number of null entries in each column.
missing_per_column = df.isnull().sum()
print("\nMissing values:\n", missing_per_column)

# Frequency of each raw value found in the "num" target column.
target_counts = df["num"].value_counts()
print("\nTarget distribution:\n", target_counts)

## 4. Data Preprocessing
### 4.1 Separate Features and Target

In [None]:
# Binary target: 1 where the raw "num" value is above 0, otherwise 0
# (0: No disease, 1: Disease).
y = df["num"].gt(0).astype(int)

# Features are every column except the target.
# NOTE(review): nothing else is removed here - if the CSV carries a row
# identifier or another non-predictive column it stays in X; confirm.
X = df.drop(columns="num")

# Sanity-check the result of the separation.
print("Feature shape:", X.shape)
print("Target shape:", y.shape)
print("Target classes:", y.unique())
print("Target distribution:\n", y.value_counts())

### 4.2 Encode Categorical Features

In [None]:
# Label-encode every object-dtype feature column into numeric codes.
#
# Missing entries are deliberately left as NaN instead of being passed to
# LabelEncoder: fitting the encoder on a column that still contains NaN either
# turns "missing" into an ordinary class code (so the imputer applied later in
# the notebook never sees it) or raises on mixed str/float input, depending on
# the scikit-learn version.
#
# Produces:
#   X_encoded      - copy of X whose object columns hold float codes (NaN kept)
#   label_encoders - {column name: LabelEncoder fitted on the observed values}
X_encoded = X.copy()
label_encoders = {}

for col in X_encoded.columns:
    if X_encoded[col].dtype != "object":
        continue

    raw_values = X_encoded[col].to_numpy()
    observed = X_encoded[col].notna().to_numpy()

    le = LabelEncoder()
    # Float buffer so that NaN can mark the rows that had no value.
    codes = np.full(len(raw_values), np.nan)
    codes[observed] = le.fit_transform(raw_values[observed])

    X_encoded[col] = codes
    label_encoders[col] = le
    print(f"Encoded column: {col}")

print(f"\nTotal categorical columns encoded: {len(label_encoders)}")

### 4.3 Train-Test Split

In [None]:
# Hold out 20% of the rows for testing. The fixed random_state makes the split
# repeatable, and stratify=y requests a split stratified on the binary target.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, **split_options)

# Report the partition sizes and the class balance inside each partition.
print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)
print("\nTrain target distribution:\n", y_train.value_counts())
print("\nTest target distribution:\n", y_test.value_counts())

### 4.4 Feature Scaling

In [None]:
# Standardise the features. The scaler's statistics are learned from the
# training rows only and then applied unchanged to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Training data scaled")
print("Test data transformed using training scaler")

### 4.5 Handle Missing Values

In [None]:
# Fill remaining gaps with per-column medians. The medians are learned from the
# (already scaled) training matrix and reused for the test matrix.
imputer = SimpleImputer(strategy="median")
imputer.fit(X_train_scaled)

X_train_final = imputer.transform(X_train_scaled)
X_test_final = imputer.transform(X_test_scaled)

# Count the NaN cells left in each matrix after imputation.
train_nan_count = np.isnan(X_train_final).sum()
test_nan_count = np.isnan(X_test_final).sum()
print("NaNs in training data:", train_nan_count)
print("NaNs in test data:", test_nan_count)

### 4.6 Save Preprocessing Objects

In [None]:
# Persist every fitted preprocessing object needed to turn raw rows into model
# input at deployment time.
os.makedirs("model", exist_ok=True)

joblib.dump(imputer, "model/imputer.pkl")
joblib.dump(scaler, "model/scaler.pkl")
# The scaler and imputer were fitted on label-encoded features, so the fitted
# encoders must ship with them; without this file the saved objects cannot be
# applied to raw categorical input.
joblib.dump(label_encoders, "model/label_encoders.pkl")

print("✓ Imputer saved: model/imputer.pkl")
print("✓ Scaler saved: model/scaler.pkl")
print("✓ Label encoders saved: model/label_encoders.pkl")

## 5. Model Training and Evaluation
### 5.1 Define Models

In [None]:
# Registry of the classifiers to compare, keyed by display name. The display
# name is also used later to derive each model's .pkl filename.
# random_state is pinned wherever the estimator accepts it so runs are repeatable.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    # `use_label_encoder` is intentionally not passed: it is a deprecated
    # XGBoost argument that current releases no longer use, and the target is
    # already encoded as 0/1.
    "XGBoost": XGBClassifier(
        eval_metric="logloss",
        random_state=42,
    ),
}

print("Models defined:")
for name in models:
    print(f"  • {name}")

### 5.2 Train Models and Calculate Metrics

In [None]:
# Fit each registered model on the preprocessed training matrix, score it on the
# held-out test matrix, and persist it. One metrics dict per model is collected
# in `results`.
results = []

for name, model in models.items():
    print(f"\nTraining {name}...")

    model.fit(X_train_final, y_train)

    # Hard labels feed the threshold-based metrics; the second predict_proba
    # column feeds AUC.
    y_pred = model.predict(X_test_final)
    y_prob = model.predict_proba(X_test_final)[:, 1]

    # (column label, scoring function, model output it is computed from)
    scorers = [
        ("Accuracy", accuracy_score, y_pred),
        ("AUC", roc_auc_score, y_prob),
        ("Precision", precision_score, y_pred),
        ("Recall", recall_score, y_pred),
        ("F1", f1_score, y_pred),
        ("MCC", matthews_corrcoef, y_pred),
    ]
    metrics = {"Model": name}
    metrics.update(
        (label, round(score_fn(y_test, estimate), 4))
        for label, score_fn, estimate in scorers
    )
    results.append(metrics)

    # Filename is the display name with spaces replaced by underscores.
    model_filename = f"model/{name.replace(' ', '_')}.pkl"
    joblib.dump(model, model_filename)
    print(f"✓ Model saved: {model_filename}")

    # Short per-model progress summary.
    print(f"  Accuracy: {metrics['Accuracy']:.4f}")
    print(f"  AUC: {metrics['AUC']:.4f}")
    print(f"  F1 Score: {metrics['F1']:.4f}")

print("\n" + "="*50)
print("All models trained and saved successfully!")
print("="*50)

### 5.3 Results Comparison

In [None]:
# Tabulate the per-model metric dicts: one row per model.
results_df = pd.DataFrame(results)

banner = "=" * 80
print("\nMODEL COMPARISON TABLE")
print(banner)
print(results_df.to_string(index=False))
print(banner)

# For each metric, report the row holding the highest value
# (idxmax returns the first such row when several tie).
print("\nBEST MODELS:")
for metric in ["Accuracy", "AUC", "Precision", "Recall", "F1", "MCC"]:
    top_row = results_df.loc[results_df[metric].idxmax()]
    best_model, best_score = top_row["Model"], top_row[metric]
    print(f"  {metric:12s}: {best_model:20s} ({best_score:.4f})")

# Leave the frame as the cell's rich output.
results_df

### 5.4 Save Metrics

In [None]:
# Write the comparison table to disk (without the index column) so it can be
# referenced in the documentation.
results_df.to_csv(path_or_buf="model/model_metrics.csv", index=False)

print("✓ Metrics saved: model/model_metrics.csv")

## 6. Prepare Test Data for Streamlit
### 6.1 Save Preprocessed Test Data

In [None]:
# Rebuild a labelled frame from the preprocessed test matrix and append the
# binary target as a trailing "num" column. y_test.values is used so the labels
# are attached by position, not by index.
X_test_df = pd.DataFrame(X_test_final, columns=X_encoded.columns).assign(
    num=y_test.values
)

# Persist the frame (no index column) for the Streamlit app.
X_test_df.to_csv("test_data.csv", index=False)

print("✓ Test data saved: test_data.csv")
print(f"  Shape: {X_test_df.shape}")
print(f"\nFirst few rows:")
print(X_test_df.head())

## 7. Verification
### 7.1 Load and Test Saved Models

In [None]:
# Round-trip check for every persisted model: reload it from disk and confirm
# that it predicts exactly what the in-memory model predicts on the test matrix.
# Raises AssertionError on the first mismatch, so the success banner is only
# printed when every model really round-trips.
print("VERIFYING SAVED MODELS\n" + "="*50)

for name, trained_model in models.items():
    model_filename = f"model/{name.replace(' ', '_')}.pkl"

    # NOTE: joblib.load unpickles the file, which can execute arbitrary code.
    # It is used here only on files this notebook has just written; never point
    # it at untrusted input.
    loaded_model = joblib.load(model_filename)

    y_pred = loaded_model.predict(X_test_final)
    expected_pred = trained_model.predict(X_test_final)
    if not np.array_equal(y_pred, expected_pred):
        raise AssertionError(
            f"{name}: predictions from {model_filename} differ from the in-memory model"
        )

    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name:20s}: Accuracy = {accuracy:.4f} ✓")

print("\n" + "="*50)
print("All models loaded and verified successfully!")

## 8. Summary

### Files Created:
- **model/Logistic_Regression.pkl** - Trained Logistic Regression model
- **model/Decision_Tree.pkl** - Trained Decision Tree model
- **model/KNN.pkl** - Trained KNN model
- **model/Naive_Bayes.pkl** - Trained Naive Bayes model
- **model/Random_Forest.pkl** - Trained Random Forest model
- **model/XGBoost.pkl** - Trained XGBoost model
- **model/imputer.pkl** - Fitted SimpleImputer
- **model/scaler.pkl** - Fitted StandardScaler
- **model/model_metrics.csv** - All model metrics
- **test_data.csv** - Preprocessed test data for Streamlit

### Next Steps:
1. Upload all pkl files to GitHub in a `model/` directory
2. Upload test_data.csv to GitHub
3. Deploy Streamlit app using app.py
4. Test the app with test_data.csv