# ML Assignment 2 - Heart Disease Classification
## Model Training and Evaluation

This notebook implements 6 classification models and calculates 6 evaluation metrics for each.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score, 
    recall_score, f1_score, matthews_corrcoef,
    confusion_matrix, classification_report
)
import joblib
import warnings
warnings.filterwarnings('ignore')

# Fix SSL certificate verification issue for downloading dataset
# NOTE(review): the assignment below replaces the ssl module's default HTTPS
# context factory (a private, underscore-prefixed hook) with the factory that
# builds an *unverified* context, i.e. server certificates are not checked.
# It is applied at module level rather than scoped to the dataset download,
# so presumably every later standard-library HTTPS request in this kernel is
# affected as well — confirm this is acceptable, or install proper CA
# certificates on the machine and drop the override.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

print("All libraries imported successfully!")

All libraries imported successfully!


## Step 1: Load and Explore Dataset

In [None]:
# Load the Heart Disease dataset straight from the UCI repository.
# Only the processed Cleveland file is read here; it ships without a header
# row, so the column names are supplied explicitly.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data'

# Column names based on UCI documentation (13 features followed by the label)
column_names = (
    'age sex cp trestbps chol fbs restecg '
    'thalach exang oldpeak slope ca thal target'
).split()

# '?' is the file's missing-value marker and is parsed as NaN
df = pd.read_csv(url, names=column_names, na_values='?')

print("Dataset shape: {}".format(df.shape))
print("\nFirst few rows:")
df.head()

URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1032)>

In [None]:
# Check for missing values and data types
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()
print("\nMissing values:")
print(df.isnull().sum())
print("\nBasic statistics:")
# Last expression of the cell: rendered by the notebook as a rich table.
df.describe()

In [None]:
# Collapse the original severity score into a binary label:
#   0    -> 0 (no disease)
#   1-4  -> 1 (disease present, any level)
has_disease = df['target'].gt(0)
df['target'] = has_disease.astype(int)

# Absolute and relative class frequencies of the new binary label
target_counts = df['target'].value_counts()
target_share = df['target'].value_counts(normalize=True)

print("Target distribution:")
print(target_counts)
print(f"\nTarget balance: {target_share}")

## Step 2: Data Preprocessing

In [None]:
# Handle missing values: discard every row that has at least one NaN
df_clean = df.dropna(axis=0, how='any')
n_clean_rows = len(df_clean)
print("Dataset shape after removing missing values: {}".format(df_clean.shape))

# This check only reports a shortfall; nothing is merged here. Reaching 500
# instances would require the other UCI heart disease files
# (Hungary, Switzerland, VA Long Beach).
if n_clean_rows < 500:
    print(
        "\nWarning: Dataset has fewer than 500 instances.\n"
        "We'll need to combine with other UCI heart disease datasets."
    )

In [None]:
# Split the cleaned frame into the feature matrix and the label vector
target_col = 'target'
X = df_clean.drop(columns=[target_col])
y = df_clean[target_col]

n_features = X.shape[1]
print("Features shape:", X.shape)
print("Target shape:", y.shape)
print(f"\nNumber of features: {n_features}")

In [None]:
# Hold out 20% of the rows for testing. The split is seeded for
# reproducibility and stratified on y so both parts keep the class ratio.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

for part_label, part in (("Training", X_train), ("Test", X_test)):
    print(f"{part_label} set size: {part.shape}")

In [None]:
# Feature scaling (important for KNN and Logistic Regression)
import os

# Fit the scaler on the training rows only, then apply the same
# transformation to both splits so no test statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler for later use in the Streamlit app
model_dir = '../model'
os.makedirs(model_dir, exist_ok=True)
joblib.dump(scaler, model_dir + '/scaler.pkl')
print("Scaler saved!")

In [None]:
# Export the held-out rows (features plus label column) as a CSV so they can
# be uploaded through the Streamlit app
import os

data_dir = '../data'
os.makedirs(data_dir, exist_ok=True)

test_data = pd.concat((X_test, y_test), axis='columns')
test_data.to_csv(f'{data_dir}/test_data.csv', index=False)
print("Test data saved to data/test_data.csv")

## Step 3: Train Classification Models

In [None]:
# Build the six classifiers to compare. Insertion order is the order in
# which they are later trained, evaluated and listed in the metrics table.
seed = 42
models = dict([
    ('Logistic Regression', LogisticRegression(random_state=seed, max_iter=1000)),
    ('Decision Tree', DecisionTreeClassifier(random_state=seed)),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('Naive Bayes', GaussianNB()),
    ('Random Forest', RandomForestClassifier(random_state=seed, n_estimators=100)),
    ('XGBoost', XGBClassifier(random_state=seed, eval_metric='logloss')),
])

print("Models initialized:")
print("\n".join(f"  - {name}" for name in models))

In [None]:
# Fit every model and collect the fitted estimators by name
scaled_input_models = {'Logistic Regression', 'K-Nearest Neighbors'}
trained_models = {}

for name, model in models.items():
    print(f"Training {name}...")

    # Logistic Regression and KNN are fitted on the standardized features;
    # every other model is fitted on the raw feature values.
    train_features = X_train_scaled if name in scaled_input_models else X_train
    model.fit(train_features, y_train)

    trained_models[name] = model
    print(f"  ✓ {name} trained successfully")

print("\nAll models trained!")

In [None]:
# Save all trained models
import os

# Store the models in the same directory the scaler was written to
# ('../model'), so every artifact the app needs lives in one place instead of
# being scattered into whatever the current working directory happens to be.
model_dir = '../model'
os.makedirs(model_dir, exist_ok=True)

for name, model in trained_models.items():
    # 'K-Nearest Neighbors' -> 'k_nearest_neighbors.pkl'
    filename = name.lower().replace(' ', '_').replace('-', '_') + '.pkl'
    model_path = os.path.join(model_dir, filename)
    joblib.dump(model, model_path)
    print(f"Saved: {model_path}")

print("\nAll models saved!")

## Step 4: Calculate Evaluation Metrics

In [None]:
# Function to calculate all metrics for a model
def calculate_metrics(model, X_test_data, y_test_data, model_name):
    """
    Score one fitted classifier on a held-out set.

    Parameters:
        model: fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test_data: feature matrix to predict on.
        y_test_data: true labels matching ``X_test_data``.
        model_name: label stored under the 'Model' key of the result.

    Returns:
        dict with the keys 'Model', 'Accuracy', 'AUC', 'Precision',
        'Recall', 'F1' and 'MCC', in that order.
    """
    predicted_labels = model.predict(X_test_data)
    # AUC is computed from the second probability column (index 1)
    positive_scores = model.predict_proba(X_test_data)[:, 1]

    metrics = {'Model': model_name}
    metrics['Accuracy'] = accuracy_score(y_test_data, predicted_labels)
    metrics['AUC'] = roc_auc_score(y_test_data, positive_scores)

    # These three share a signature; zero_division=0 yields 0 instead of a
    # warning when a denominator is empty.
    label_scorers = (
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1', f1_score),
    )
    for metric_name, scorer in label_scorers:
        metrics[metric_name] = scorer(y_test_data, predicted_labels, zero_division=0)

    metrics['MCC'] = matthews_corrcoef(y_test_data, predicted_labels)
    return metrics

In [None]:
# Score every trained model on the test split
scaled_input_models = ('Logistic Regression', 'K-Nearest Neighbors')
all_metrics = []

for name, model in trained_models.items():
    print(f"Evaluating {name}...")

    # Feed each model the same feature representation it was trained on
    eval_features = X_test_scaled if name in scaled_input_models else X_test
    metrics = calculate_metrics(model, eval_features, y_test, name)

    all_metrics.append(metrics)
    print(f"  ✓ {name} evaluated")

print("\nAll models evaluated!")

In [None]:
# Tabulate the per-model metric dicts, rounded to four decimals
metrics_df = pd.DataFrame(all_metrics).round(4)

banner = "=" * 80
print("\n" + banner)
print("EVALUATION METRICS FOR ALL MODELS")
print(banner)
print(metrics_df.to_string(index=False))
print(banner)

In [None]:
# Write the metrics table to a CSV file in the current working directory
metrics_csv = 'metrics_comparison.csv'
metrics_df.to_csv(metrics_csv, index=False)
print(f"Metrics saved to {metrics_csv}")

## Step 5: Visualize Results

In [None]:
# One horizontal bar chart per metric, laid out on a 2x3 grid
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 10))
fig.suptitle('Model Performance Comparison', fontsize=16, fontweight='bold')

metrics_to_plot = ['Accuracy', 'AUC', 'Precision', 'Recall', 'F1', 'MCC']
axes = axes.flatten()

for ax, metric in zip(axes, metrics_to_plot):
    # Models ordered by this metric, highest value first
    ranked = metrics_df.sort_values(metric, ascending=False)
    bar_colors = plt.cm.viridis(np.linspace(0.3, 0.9, len(ranked)))

    ax.barh(ranked['Model'], ranked[metric], color=bar_colors)
    ax.set_xlabel(metric, fontweight='bold')
    ax.set_title(f'{metric} Comparison', fontweight='bold')
    ax.grid(axis='x', alpha=0.3)

    # Print each value just to the right of the end of its bar
    for bar_position, value in enumerate(ranked[metric]):
        ax.text(value + 0.01, bar_position, f'{value:.3f}', va='center')

plt.tight_layout()
plt.savefig('metrics_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("Visualization saved as metrics_comparison.png")

## Step 6: Model Performance Observations

In [None]:
# Report, for every metric, which model achieved the highest value
rule = "=" * 60
print("Best Models by Metric:")
print(rule)
for metric in ['Accuracy', 'AUC', 'Precision', 'Recall', 'F1', 'MCC']:
    scores = metrics_df[metric]
    best_model = metrics_df.loc[scores.idxmax(), 'Model']
    best_value = scores.max()
    print(f"{metric:12s}: {best_model:25s} ({best_value:.4f})")
print(rule)

In [None]:
# Generate a one-line, data-driven observation for each model
observations = []

for _, row in metrics_df.iterrows():
    model_name = row['Model']

    # Overall rating from the mean of accuracy, F1 and MCC
    avg_score = row[['Accuracy', 'F1', 'MCC']].mean()

    if avg_score >= 0.85:
        performance = "Excellent"
    elif avg_score >= 0.80:
        performance = "Very Good"
    elif avg_score >= 0.75:
        performance = "Good"
    else:
        performance = "Moderate"

    obs_text = f"{performance} performance with {row['Accuracy']:.2%} accuracy. "

    # precision = TP/(TP+FP) and recall = TP/(TP+FN), so precision > recall
    # holds exactly when FP < FN (and vice versa). The comparison therefore
    # tells which error type is rarer; it does not show that either metric is
    # "high", so the wording states only what the comparison proves.
    precision, recall = row['Precision'], row['Recall']
    if precision > recall:
        obs_text += (
            f"Precision ({precision:.2%}) exceeds recall ({recall:.2%}): "
            "fewer false positives than missed positive cases. "
        )
    elif recall > precision:
        obs_text += (
            f"Recall ({recall:.2%}) exceeds precision ({precision:.2%}): "
            "fewer missed positive cases than false positives. "
        )
    else:
        obs_text += f"Precision and recall are balanced at {precision:.2%}. "

    # Rule-of-thumb tiers for MCC; values at or below 0.3 (including zero or
    # negative MCC) are reported as weak rather than moderate.
    mcc = row['MCC']
    if mcc > 0.6:
        mcc_strength = 'strong'
    elif mcc > 0.3:
        mcc_strength = 'moderate'
    else:
        mcc_strength = 'weak'
    obs_text += f"MCC of {mcc:.3f} shows {mcc_strength} correlation."

    observations.append({
        'Model': model_name,
        'Observation': obs_text
    })

observations_df = pd.DataFrame(observations)
print("\nModel Observations:")
print("="*100)
for _, row in observations_df.iterrows():
    print(f"\n{row['Model']}:")
    print(f"  {row['Observation']}")
print("="*100)

In [None]:
# Write the observations next to the notebook and confirm the file landed
import os

working_dir = os.getcwd()
print(f"Current working directory: {working_dir}")

output_path = os.path.join(working_dir, 'observations.csv')
observations_df.to_csv(output_path, index=False)
print(f"Observations saved to: {output_path}")

# Sanity check: report whether the file exists and, if so, its size
file_written = os.path.exists(output_path)
print(f"File exists: {file_written}")
if file_written:
    print(f"File size: {os.path.getsize(output_path)} bytes")

## Summary

✅ All 6 models trained successfully  
✅ All 6 metrics calculated for each model  
✅ Models saved as .pkl files  
✅ Test data saved for Streamlit app  
✅ Metrics comparison table created  
✅ Performance observations generated  

Next step: Build the Streamlit web application!