# 🤖 CardioDetect - Model Training & Evaluation

## Milestone 2: Comprehensive Model Comparison

In this notebook, I train and evaluate **8 different machine learning models** on my unified cardiovascular risk dataset. My goal is to find the model that best balances accuracy and recall for predicting heart disease.

### Models I'm Testing:
1. **Logistic Regression** - Linear baseline
2. **Random Forest** - Ensemble of decision trees
3. **XGBoost** - Gradient boosted trees
4. **LightGBM** - Microsoft's fast gradient boosting framework
5. **SVM (RBF)** - Support vector machine with radial basis function
6. **Gradient Boosting** - Sklearn gradient boosting
7. **MLP** - Multi-layer perceptron (neural network)
8. **Ensemble** - Soft-voting combination of RF + XGB + LGBM + MLP

All models use class weighting (where applicable) to handle class imbalance in the dataset. I am using the unified dataset with ~16k records.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import warnings
# Notebook-wide setup: suppress every warning category for the rest of the
# session and let pandas show all columns of wide frames instead of eliding
# the middle ones.
warnings.simplefilter('ignore')
pd.options.display.max_columns = None
print("âœ… Libraries Loaded")

In [None]:
# Read the processed dataset from disk and report its dimensions.
data_path = '../data/processed/combined_processed.csv'
df = pd.read_csv(data_path)
print(f"Dataset Shape: {df.shape}")

# Separate the label column ('target') from the predictor columns.
y = df['target']
X = df.drop(columns='target')

# Hold out 20% of the rows for testing. The split is stratified on the label
# so both partitions keep the same class proportions, and the seed is fixed
# so the partition is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print(f"Train: {X_train.shape}, Test: {X_test.shape}")

In [None]:
# Define Preprocessor
#
# Column groups are derived from the dtypes of X. Numeric columns are selected
# with the generic 'number' selector rather than the exact 'int64'/'float64'
# names, so columns stored with another numeric width (e.g. int32, float32)
# are not missed. ColumnTransformer drops every column that is not listed in
# one of its transformers (remainder='drop' is its default), so a column that
# falls into neither group would silently vanish from the feature matrix.
numeric_features = X.select_dtypes(include='number').columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Numeric branch: fill gaps with the column median, then standardise.
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen during fit are ignored at transform time instead
# of raising.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_features),
        ('cat', categorical_pipeline, categorical_features),
    ])

print("âœ… Preprocessor Ready")

In [None]:
# Model zoo, keyed by the display name used in the results table.
#
# Class weighting is applied wherever the estimator supports it, matching the
# intent stated in the notebook introduction:
#   * Logistic Regression, Random Forest, SVM and LightGBM accept
#     class_weight='balanced', which reweights classes inversely to their
#     frequency in the training labels.
#   * XGBoost has no class_weight option; its scale_pos_weight is set to the
#     negative/positive ratio of the training labels instead.
#   * GradientBoostingClassifier and MLPClassifier expose no class-weight
#     constructor option and are left unweighted.
#
# NOTE(review): the ratio below assumes the target is encoded as 0/1, which is
# what the binary recall / ROC-AUC evaluation in the training cell implies.
n_positive = int((y_train == 1).sum())
n_negative = int((y_train == 0).sum())
# Fall back to XGBoost's neutral weight of 1.0 if there are no positives.
xgb_pos_weight = n_negative / n_positive if n_positive else 1.0

models = {
    'Logistic Regression': LogisticRegression(
        max_iter=1000, class_weight='balanced', random_state=42),
    'Random Forest': RandomForestClassifier(
        n_estimators=100, class_weight='balanced', random_state=42),
    'XGBoost': XGBClassifier(
        eval_metric='logloss', scale_pos_weight=xgb_pos_weight, random_state=42),
    'LightGBM': LGBMClassifier(
        class_weight='balanced', random_state=42, verbose=-1),
    # probability=True is required so the SVM exposes predict_proba.
    'SVM (RBF)': SVC(probability=True, class_weight='balanced', random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'MLP': MLPClassifier(max_iter=500, random_state=42)
}

print("âœ… Models Initialized")

In [None]:
def _score_row(name, y_true, y_hat, y_score):
    """Build one results-table row for a fitted model.

    Args:
        name: Display name stored under the 'Model' key.
        y_true: Ground-truth labels of the evaluation set.
        y_hat: Hard class predictions for the same rows.
        y_score: Predicted probability of the positive class (column 1 of
            predict_proba) for the same rows.

    Returns:
        A dict with the keys 'Model', 'Accuracy', 'Recall' and 'ROC-AUC'.
    """
    return {
        'Model': name,
        'Accuracy': accuracy_score(y_true, y_hat),
        'Recall': recall_score(y_true, y_hat),
        'ROC-AUC': roc_auc_score(y_true, y_score),
    }


results = []
trained_models = {}

print("ðŸš€ Starting Training Loop...")

for name, model in models.items():
    print(f"Training {name}...")
    # The same `preprocessor` object is placed in every pipeline, so each fit
    # refits it on X_train. It is deliberately not cloned, so the module-level
    # `preprocessor` stays fitted for any later cell that uses it directly.
    clf = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
    clf.fit(X_train, y_train)

    # Evaluate on the held-out test split.
    y_pred = clf.predict(X_test)
    y_prob = clf.predict_proba(X_test)[:, 1]

    results.append(_score_row(name, y_test, y_pred, y_prob))
    trained_models[name] = clf

# Create Ensemble (Voting): soft voting averages the predicted probabilities
# of the four member models.
print("Training Ensemble...")
estimators = [
    ('rf', models['Random Forest']),
    ('xgb', models['XGBoost']),
    ('lgbm', models['LightGBM']),
    ('mlp', models['MLP'])
]
voting = VotingClassifier(estimators=estimators, voting='soft')
clf_voting = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', voting)])
clf_voting.fit(X_train, y_train)

y_pred = clf_voting.predict(X_test)
y_prob = clf_voting.predict_proba(X_test)[:, 1]
results.append(_score_row('Voting Ensemble', y_test, y_pred, y_prob))
# Keep the fitted ensemble next to the individual models so that every name in
# the results table can be looked up in trained_models.
trained_models['Voting Ensemble'] = clf_voting

print("âœ… Training Complete")

In [None]:
# Rank the models by test accuracy, best first, and print the full table.
results_df = pd.DataFrame(results).sort_values(by='Accuracy', ascending=False)
print(results_df)

# Plot
plt.figure(figsize=(10, 6))
sns.barplot(x='Accuracy', y='Model', data=results_df, palette='viridis')
plt.title('Model Comparison (Accuracy)')
# The axis is zoomed to [0.8, 1.0] to make small differences visible. With a
# fixed lower limit, a model scoring below 0.8 would have its whole bar outside
# the visible range, so the limit is widened only in that case.
lowest_accuracy = results_df['Accuracy'].min()
x_lower = 0.8 if lowest_accuracy >= 0.8 else max(0.0, lowest_accuracy - 0.05)
plt.xlim(x_lower, 1.0)
plt.show()