# Heart Disease Prediction - Model Training

This notebook demonstrates the training process for various machine learning models to predict heart disease.

In [None]:
import os
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

warnings.filterwarnings('ignore')

# Plot appearance shared by every figure in this notebook
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("Loading dataset...")

# Candidate CSV locations, tried in order: the data directory first, then the project root.
# Only a missing file moves on to the next candidate; any other read error propagates.
for csv_path in ('../data/raw/heart.csv', '../../heart.csv'):
    try:
        df = pd.read_csv(csv_path)
        break
    except FileNotFoundError:
        continue
else:
    # Neither file exists: build a seeded random stand-in so the remaining cells can still run.
    # Every column (including the target) is drawn independently at random.
    print("Dataset not found. Creating sample data for demonstration.")
    np.random.seed(42)
    n_samples = 1000

    def _random_ints(low, high):
        """Draw n_samples integers from [low, high)."""
        return np.random.randint(low, high, n_samples)

    def _random_choice(options):
        """Draw n_samples values uniformly from the given options."""
        return np.random.choice(options, n_samples)

    # The columns are generated in this exact order so the seeded draws are reproducible.
    df = pd.DataFrame({
        'age': _random_ints(25, 80),
        'sex': _random_choice([0, 1]),
        'cp': _random_choice([0, 1, 2, 3]),
        'trestbps': _random_ints(90, 200),
        'chol': _random_ints(120, 400),
        'fbs': _random_choice([0, 1]),
        'restecg': _random_choice([0, 1, 2]),
        'thalach': _random_ints(70, 200),
        'exang': _random_choice([0, 1]),
        'oldpeak': np.random.uniform(0, 6, n_samples),
        'slope': _random_choice([0, 1, 2]),
        'ca': _random_choice([0, 1, 2, 3]),
        'thal': _random_choice([0, 1, 2]),
        'target': _random_choice([0, 1])
    })

print(f"Dataset loaded with shape: {df.shape}")

In [None]:
# Separate the label column from the predictor columns
y = df['target']
X = df.drop(columns='target')

# Hold out 20% of the rows for testing; the split is seeded and stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Learn the scaling statistics from the training rows only, then apply them to both parts
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

In [None]:
# Candidate classifiers, keyed by the display name used in every report below
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100),
    'SVM': SVC(random_state=42, probability=True),
    'XGBoost': XGBClassifier(random_state=42),
    'Neural Network': MLPClassifier(random_state=42, max_iter=1000)
}

# Models that are trained on standardized features; all others use the raw columns
scaled_model_names = {'Logistic Regression', 'SVM', 'Neural Network'}

# Per-model results: fitted estimator, test-set metrics, CV summary and test-set predictions
model_results = {}

print("Training and evaluating models...")
for name, model in models.items():
    print(f"\nTraining {name}...")

    # Use scaled data for models that benefit from it
    needs_scaling = name in scaled_model_names
    if needs_scaling:
        X_train_model = X_train_scaled
        X_test_model = X_test_scaled
    else:
        X_train_model = X_train
        X_test_model = X_test

    # Train the model
    model.fit(X_train_model, y_train)

    # Hard predictions plus the predicted probability of the positive class (column 1)
    y_pred = model.predict(X_test_model)
    y_pred_proba = model.predict_proba(X_test_model)[:, 1]

    # Test-set metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    # Cross-validation on the training set.
    # For scale-sensitive models the scaler is part of the cross-validated pipeline, so it is
    # refit on the training portion of every fold. Cross-validating on X_train_scaled instead
    # would let each fold's validation rows influence the scaling statistics.
    # cross_val_score clones its estimator, so the fitted `model` above is left untouched.
    if needs_scaling:
        cv_estimator = make_pipeline(StandardScaler(), model)
    else:
        cv_estimator = model
    cv_scores = cross_val_score(cv_estimator, X_train, y_train, cv=5, scoring='accuracy')

    # Store results
    model_results[name] = {
        'model': model,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'roc_auc': roc_auc,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std(),
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba
    }

    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  F1 Score: {f1:.4f}")
    print(f"  ROC AUC: {roc_auc:.4f}")
    print(f"  CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

In [None]:
# Display label -> key under which the metric is stored in model_results
metric_labels = {
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'Recall': 'recall',
    'F1 Score': 'f1_score',
    'ROC AUC': 'roc_auc',
    'CV Accuracy': 'cv_mean',
}

# Collect the metrics per model (one column per model) ...
scores_by_model = {
    model_label: {label: scores[key] for label, key in metric_labels.items()}
    for model_label, scores in model_results.items()
}

# ... then transpose so each model becomes a row, best F1 score first
comparison_df = pd.DataFrame(scores_by_model).T.sort_values('F1 Score', ascending=False)

print("Model Comparison (sorted by F1 Score):")
comparison_df

In [None]:
# Grouped bar chart: one group per model, one bar per test-set metric
plotted_metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']

fig, ax = plt.subplots(figsize=(12, 8))
comparison_df[plotted_metrics].plot(kind='bar', ax=ax)
ax.set_title('Model Performance Comparison')
ax.set_ylabel('Score')
ax.set_xlabel('Model')
# Tilt the model names so they stay readable
plt.setp(ax.get_xticklabels(), rotation=45)
# Anchor the legend just outside the right edge of the axes
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

In [None]:
# comparison_df is sorted by F1 score in descending order, so its first row is the best model
best_model_name = comparison_df.index[0]
best_model_results = model_results[best_model_name]
best_model = best_model_results['model']

print(
    f"Best Model: {best_model_name}",
    f"F1 Score: {best_model_results['f1_score']:.4f}",
    f"ROC AUC: {best_model_results['roc_auc']:.4f}",
    sep="\n",
)

In [None]:
# Confusion matrix of the best model's stored test-set predictions
best_predictions = best_model_results['y_pred']
cm = confusion_matrix(y_test, best_predictions)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
ax.set_title(f'Confusion Matrix - {best_model_name}')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

# Text report for the same predictions
print(f"\nClassification Report - {best_model_name}:\n")
print(classification_report(y_test, best_predictions))

In [None]:
# Runs only when the best model exposes `feature_importances_`; otherwise this cell does nothing
if hasattr(best_model, 'feature_importances_'):
    importance_table = pd.DataFrame({
        'feature': X.columns,
        'importance': best_model.feature_importances_
    })
    # Most important feature first
    feature_importance = importance_table.sort_values('importance', ascending=False)
    top_features = feature_importance.head(10)

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.barplot(data=top_features, x='importance', y='feature', ax=ax)
    ax.set_title(f'Top 10 Feature Importances - {best_model_name}')
    ax.set_xlabel('Importance')
    fig.tight_layout()
    plt.show()

    print("Top 10 Most Important Features:")
    print(top_features)

In [None]:
# Hyperparameter tuning for top models
print("Performing hyperparameter tuning for top models...")

# Select top 3 models for hyperparameter tuning (comparison_df is sorted by F1 score)
top_models = comparison_df.head(3).index.tolist()
print(f"Top 3 models for tuning: {top_models}")

# Search space per model. Only models listed here can be tuned.
param_grids = {
    'Random Forest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7, None],
        'min_samples_split': [2, 5, 10]
    },
    'XGBoost': {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2]
    },
    'Logistic Regression': {
        'C': [0.1, 1, 10],
        'penalty': ['l1', 'l2']
    }
}

# For each tunable model: a factory for a fresh, unfitted estimator and whether it is
# trained on the standardized features.
# Logistic Regression uses the liblinear solver because the grid contains the 'l1' penalty,
# which the default lbfgs solver does not support; with lbfgs every 'l1' candidate fails to
# fit and is scored as NaN.
tuning_setup = {
    'Random Forest': (lambda: RandomForestClassifier(random_state=42), False),
    'XGBoost': (lambda: XGBClassifier(random_state=42), False),
    'Logistic Regression': (
        lambda: LogisticRegression(random_state=42, max_iter=1000, solver='liblinear'),
        True,
    ),
}

# Tuned results per model: best estimator, best parameters and test-set metrics
tuned_results = {}

for model_name in top_models:
    if model_name not in param_grids:
        # Make the omission visible instead of skipping the model silently
        print(f"\nSkipping {model_name}: no parameter grid defined.")
        continue

    print(f"\nTuning {model_name}...")

    # Select appropriate model and data
    make_estimator, uses_scaled_features = tuning_setup[model_name]
    model = make_estimator()
    if uses_scaled_features:
        X_train_tune, X_test_tune = X_train_scaled, X_test_scaled
    else:
        X_train_tune, X_test_tune = X_train, X_test

    # Exhaustive grid search with 5-fold cross-validation, optimizing F1
    grid_search = GridSearchCV(
        model,
        param_grids[model_name],
        cv=5,
        scoring='f1',
        n_jobs=-1,
        verbose=1
    )

    grid_search.fit(X_train_tune, y_train)

    # Evaluate the best estimator found by the search on the held-out test set
    best_tuned_model = grid_search.best_estimator_
    y_pred_tuned = best_tuned_model.predict(X_test_tune)
    y_pred_proba_tuned = best_tuned_model.predict_proba(X_test_tune)[:, 1]

    tuned_accuracy = accuracy_score(y_test, y_pred_tuned)
    tuned_f1 = f1_score(y_test, y_pred_tuned)
    tuned_roc_auc = roc_auc_score(y_test, y_pred_proba_tuned)

    tuned_results[model_name] = {
        'model': best_tuned_model,
        'best_params': grid_search.best_params_,
        'accuracy': tuned_accuracy,
        'f1_score': tuned_f1,
        'roc_auc': tuned_roc_auc
    }

    print(f"  Best Parameters: {grid_search.best_params_}")
    # Neutral wording: the tuned score is not guaranteed to be higher than the untuned one
    print(f"  F1 Score: {tuned_f1:.4f} (untuned: {model_results[model_name]['f1_score']:.4f})")

In [None]:
# Side-by-side view of each tuned model against its untuned counterpart.
# Nothing is shown when no model was tuned.
if tuned_results:
    # Two rows per tuned model: the original scores first, then the tuned scores
    comparison_data = [
        {
            'Model': tuned_name,
            'Type': variant,
            'F1 Score': scores[tuned_name]['f1_score'],
            'ROC AUC': scores[tuned_name]['roc_auc'],
        }
        for tuned_name in tuned_results
        for variant, scores in (('Original', model_results), ('Tuned', tuned_results))
    ]
    comparison_tuned_df = pd.DataFrame(comparison_data)

    # Rows = models, columns = Original / Tuned F1 score
    f1_by_type = comparison_tuned_df.set_index(['Model', 'Type'])['F1 Score'].unstack()

    fig, ax = plt.subplots(figsize=(12, 6))
    f1_by_type.plot(kind='bar', ax=ax)
    ax.set_title('Original vs Tuned Model Performance')
    ax.set_ylabel('F1 Score')
    ax.set_xlabel('Model')
    plt.setp(ax.get_xticklabels(), rotation=45)
    ax.legend(title='Type')
    fig.tight_layout()
    plt.show()

    print("Performance Comparison (Original vs Tuned):")
    print(comparison_tuned_df.pivot(index='Model', columns='Type', values=['F1 Score', 'ROC AUC']))

In [None]:
# Select final best model.
# Candidates are (display name, fitted estimator, F1 score) tuples, with tuned models listed
# first. max() returns the first maximum, so on an F1 tie the tuned variant wins and an
# untuned model is chosen only when its score is strictly higher.
# Unlike a running "score > 0" comparison, this still selects a model when every F1 score
# is 0, so a None model is never handed to the save step.
candidates = [
    (f"{model_name} (Tuned)", results['model'], results['f1_score'])
    for model_name, results in tuned_results.items()
]
candidates += [
    (model_name, results['model'], results['f1_score'])
    for model_name, results in model_results.items()
]

if not candidates:
    raise ValueError("No trained models available to select a final model from.")

final_model_name, final_model, final_f1_score = max(
    candidates, key=lambda candidate: candidate[2]
)

print(f"\nFinal Best Model: {final_model_name}")
print(f"Final F1 Score: {final_f1_score:.4f}")

In [None]:
# Save the best model, the scaler, the feature names and a small metadata record.
# `os` and `pickle` are imported in the notebook's import cell.

# Refuse to write a model file that contains no model
if final_model is None:
    raise ValueError("No final model was selected; nothing to save.")

# Create models directory if it doesn't exist
models_dir = '../models/trained_models'
os.makedirs(models_dir, exist_ok=True)

model_path = os.path.join(models_dir, 'best_model.pkl')
scaler_path = os.path.join(models_dir, 'scaler.pkl')
features_path = os.path.join(models_dir, 'feature_names.pkl')
metadata_path = os.path.join(models_dir, 'model_metadata.pkl')

feature_names = X.columns.tolist()

# Only Logistic Regression, SVM and Neural Network are trained on standardized features in
# this notebook; the tree-based models are trained on the raw columns. The scaler is saved
# either way, so record whether it has to be applied before calling the saved model.
base_model_name = final_model_name.replace(' (Tuned)', '')
model_metadata = {
    'model_name': final_model_name,
    'requires_scaling': base_model_name in ('Logistic Regression', 'SVM', 'Neural Network'),
    'f1_score': float(final_f1_score),
}

# Write every artifact to its own pickle file
artifacts = (
    (model_path, final_model),
    (scaler_path, scaler),
    (features_path, feature_names),
    (metadata_path, model_metadata),
)
for artifact_path, artifact in artifacts:
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

print(f"\nModel saved to: {model_path}")
print(f"Scaler saved to: {scaler_path}")
print(f"Feature names saved to: {features_path}")
print(f"Model metadata saved to: {metadata_path}")

## Model Training Summary

### Models Trained:
1. Logistic Regression
2. Decision Tree
3. Random Forest
4. Support Vector Machine
5. XGBoost
6. Neural Network

### Best Performing Model:
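See the output of the "Select final best model" cell above, which prints the selected model's name (`final_model_name`) and its F1 score (`final_f1_score`). Markdown cells do not interpolate Python variables, so the values are not repeated here.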

### Key Metrics:
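- **Accuracy**: see the model comparison table (`comparison_df`) for untuned models and `tuned_results` for tuned models
- **Precision**: see the model comparison table (`comparison_df`); not computed for tuned models
- **Recall**: see the model comparison table (`comparison_df`); not computed for tuned models
- **F1 Score**: printed by the final model selection cell (`final_f1_score`)
- **ROC AUC**: see the model comparison table (`comparison_df`) for untuned models and `tuned_results` for tuned models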

### Model Saved:
The best model has been saved to `../models/trained_models/best_model.pkl` along with the scaler and feature names.

### Next Steps:
1. Model evaluation and interpretation
2. Feature importance analysis
3. Integration with the web application
4. Deployment preparation