# Heart Disease Prediction - Model Training Pipeline

This notebook demonstrates the complete machine learning pipeline for heart disease prediction, including:
- Data loading and exploration
- Preprocessing and feature engineering
- Training multiple classification models
- Hyperparameter tuning
- Model evaluation and comparison
- Saving the best model for deployment

## 1. Import Required Libraries

In [None]:
# Standard library
import os
import sys
import time
import warnings
warnings.filterwarnings('ignore')

# Data manipulation and analysis
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Environment reporting (versions are printed in the final cell)
import scipy
import sklearn

# Machine Learning - Preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler

# Machine Learning - Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Machine Learning - Evaluation
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                            f1_score, roc_auc_score, confusion_matrix, 
                            classification_report, roc_curve, auc)

# Model persistence
import joblib

# Set random seed for reproducibility.
# RANDOM_STATE is also passed as `random_state` to the train/test split, the
# estimators and the cross-validation splitter further down the notebook.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Configure visualization style: matplotlib style sheet first, then the seaborn palette.
# NOTE(review): the order may matter if the style sheet also sets a colour cycle — confirm before reordering.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("✓ All libraries imported successfully!")

## 2. Load and Explore the Dataset

In [None]:
# Read the heart-disease CSV (path is relative to the notebook's working directory)
df = pd.read_csv('heart_disease_dataset.csv')

# Report the frame's dimensions and column names, then preview the first rows
print(
    "Dataset loaded successfully!",
    f"Shape: {df.shape}",
    f"\nColumns: {list(df.columns)}",
    "\nFirst few rows:",
    sep="\n",
)
df.head()

In [None]:
# Structural overview of the frame, printed under a banner
banner = "=" * 60
print(banner, "DATASET INFORMATION", banner, sep="\n")
df.info()

# Descriptive statistics; the trailing expression is rendered as the cell output
print("\n" + banner, "STATISTICAL SUMMARY", banner, sep="\n")
df.describe()

In [None]:
# Count nulls per column and list only the columns that actually contain any
rule = "=" * 60
print(rule, "MISSING VALUES", rule, sep="\n")
missing_values = df.isnull().sum()
if missing_values.sum() > 0:
    print(missing_values[missing_values > 0])
else:
    print("No missing values found!")

# Absolute and percentage class frequencies of the target column
print("\n" + rule, "TARGET VARIABLE DISTRIBUTION", rule, sep="\n")
class_counts = df['target'].value_counts()
print(class_counts)
class_share = df['target'].value_counts(normalize=True) * 100
print(f"\nClass Balance: {class_share}")

In [None]:
# Visualize target class distribution.
# value_counts() orders classes by frequency, not by class value, so labels and
# colours are looked up per class instead of being assumed from position.
class_labels = {0: 'No Disease (0)', 1: 'Disease (1)'}
class_colors = {0: '#3498db', 1: '#e74c3c'}

# Sort by class value so both panels use the same, predictable order
target_counts = df['target'].value_counts().sort_index()
slice_labels = [class_labels.get(cls, str(cls)) for cls in target_counts.index]
slice_colors = [class_colors.get(cls, '#95a5a6') for cls in target_counts.index]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Bar plot
target_counts.plot(kind='bar', ax=axes[0], color=slice_colors)
axes[0].set_title('Target Variable Distribution (Count)', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Target (0=No Disease, 1=Disease)', fontsize=11)
axes[0].set_ylabel('Count', fontsize=11)
axes[0].set_xticklabels(slice_labels, rotation=0)
axes[0].grid(axis='y', alpha=0.3)

# Pie chart (the "Disease" wedge is pulled out slightly for emphasis)
axes[1].pie(target_counts, labels=slice_labels,
            autopct='%1.1f%%', startangle=90, colors=slice_colors,
            explode=[0.05 if cls == 1 else 0 for cls in target_counts.index])
axes[1].set_title('Target Variable Distribution (%)', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

# Textual summary; .get() keeps the cell running if one class is absent
print("\nClass Distribution:")
for cls, label in class_labels.items():
    n_cls = int(target_counts.get(cls, 0))
    print(f"  {label}: {n_cls} ({n_cls / len(df) * 100:.1f}%)")

In [None]:
# Visualize target distribution and correlation matrix
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Target distribution.
# value_counts() orders by frequency, so sort by class value and look each wedge's
# label/colour up from its class; positional labels would swap when class 1 is the majority.
pie_counts = df['target'].value_counts().sort_index()
pie_labels = {0: 'No Disease (0)', 1: 'Disease (1)'}
pie_colors = {0: '#2ecc71', 1: '#e74c3c'}
axes[0].pie(pie_counts,
            labels=[pie_labels.get(cls, str(cls)) for cls in pie_counts.index],
            autopct='%1.1f%%', startangle=90,
            colors=[pie_colors.get(cls, '#95a5a6') for cls in pie_counts.index])
axes[0].set_title('Target Variable Distribution', fontsize=14, fontweight='bold')

# Correlation heatmap over every column of the frame (target included)
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', center=0, 
           linewidths=0.5, ax=axes[1], cbar_kws={'label': 'Correlation'})
axes[1].set_title('Feature Correlation Matrix', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Column groups used by the exploratory plots below
continuous_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

print("Continuous Features:", continuous_features)
print("\nCategorical Features:", categorical_features)

# Show the cardinality and the distinct levels of each categorical column
print("\nUnique values in categorical features:")
for col in categorical_features:
    n_levels = df[col].nunique()
    levels = sorted(df[col].unique())
    print(f"  {col}: {n_levels} unique values - {levels}")

In [None]:
# One bar chart per categorical feature, laid out on a 2x4 grid
fig, axes = plt.subplots(2, 4, figsize=(18, 10))
axes = axes.ravel()

for idx, col in enumerate(categorical_features):
    ax = axes[idx]
    # Frequencies ordered by level value rather than by count
    level_counts = df[col].value_counts().sort_index()
    level_counts.plot(kind='bar', ax=ax, color='steelblue')
    ax.set_title(f'{col.upper()} Distribution', fontsize=12, fontweight='bold')
    ax.set_xlabel(col, fontsize=10)
    ax.set_ylabel('Frequency', fontsize=10)
    ax.tick_params(axis='x', rotation=0)
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Histogram + KDE per continuous feature on a 2x3 grid (five panels used)
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.ravel()

for idx, col in enumerate(continuous_features):
    ax = axes[idx]
    values = df[col]

    sns.histplot(values, kde=True, ax=ax, color='coral', bins=30)
    ax.set_title(f'{col.upper()} Distribution', fontsize=12, fontweight='bold')
    ax.set_xlabel(col, fontsize=10)
    ax.set_ylabel('Frequency', fontsize=10)
    ax.grid(axis='y', alpha=0.3)

    # Dashed vertical marker at the feature mean, reported in the legend
    mean_val = values.mean()
    ax.axvline(mean_val, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_val:.1f}')
    ax.legend()

# The sixth panel has no feature to show, so hide it
axes[-1].axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Age distribution by target: overlaid histograms (left) and per-class boxplot (right).
# This cell uses the pyplot state machine, so statement order determines which
# subplot each call draws on.
plt.figure(figsize=(12, 5))

# Left panel: one semi-transparent histogram per class.
# No `ax` is passed to .hist(), so both calls rely on the subplot selected just above.
plt.subplot(1, 2, 1)
df[df['target'] == 0]['age'].hist(bins=20, alpha=0.7, label='No Disease', color='blue', edgecolor='black')
df[df['target'] == 1]['age'].hist(bins=20, alpha=0.7, label='Disease', color='red', edgecolor='black')
plt.xlabel('Age', fontsize=11, fontweight='bold')
plt.ylabel('Frequency', fontsize=11, fontweight='bold')
plt.title('Age Distribution by Heart Disease', fontsize=13, fontweight='bold')
plt.legend()
plt.grid(alpha=0.3)

# Right panel: age boxplot grouped by target, drawn on the current axes
plt.subplot(1, 2, 2)
df.boxplot(column='age', by='target', ax=plt.gca(), patch_artist=True)
plt.xlabel('Target (0=No Disease, 1=Disease)', fontsize=11, fontweight='bold')
plt.ylabel('Age', fontsize=11, fontweight='bold')
plt.title('Age Distribution by Target', fontsize=13, fontweight='bold')
plt.suptitle('')  # Remove default title
# (the suptitle being cleared is the one added by the grouped boxplot)

plt.tight_layout()
plt.show()

In [None]:
# Skewness of each continuous feature, bucketed by absolute magnitude
divider = "=" * 60
print(divider, "SKEWNESS ANALYSIS", divider, sep="\n")
skewness = df[continuous_features].skew()
for feature, skew_value in skewness.items():
    magnitude = abs(skew_value)
    if magnitude > 1:
        skew_type = "Highly Skewed"
    elif magnitude > 0.5:
        skew_type = "Moderately Skewed"
    else:
        skew_type = "Approximately Normal"
    print(f"{feature:15} : {skew_value:6.3f}  ({skew_type})")

# Bar chart of the skewness values with reference lines at 0 and at +/-1
plt.figure(figsize=(10, 5))
skewness.plot(kind='bar', color='teal', edgecolor='black')
plt.axhline(y=0, color='red', linestyle='--', linewidth=2)
for threshold in (1, -1):
    plt.axhline(y=threshold, color='orange', linestyle='--', linewidth=1, alpha=0.7)
plt.title('Skewness of Continuous Features', fontsize=14, fontweight='bold')
plt.xlabel('Features', fontsize=11)
plt.ylabel('Skewness Value', fontsize=11)
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Boxplot per continuous feature, annotated with the number of IQR-rule outliers
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.ravel()

for idx, col in enumerate(continuous_features):
    ax = axes[idx]
    df.boxplot(
        column=col,
        ax=ax,
        patch_artist=True,
        boxprops={'facecolor': 'lightblue'},
        medianprops={'color': 'red', 'linewidth': 2},
    )
    ax.set_title(f'{col.upper()} - Outlier Detection', fontsize=12, fontweight='bold')
    ax.set_ylabel(col, fontsize=10)
    ax.grid(axis='y', alpha=0.3)

    # Tukey fences: values beyond 1.5 * IQR from the quartiles count as outliers
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_fence = Q1 - 1.5 * IQR
    upper_fence = Q3 + 1.5 * IQR
    is_outlier = (df[col] < lower_fence) | (df[col] > upper_fence)
    outliers = df.loc[is_outlier, col]

    # Show the count in a highlighted box at the top centre of the panel
    ax.text(
        0.5, 0.95, f'Outliers: {len(outliers)}',
        transform=ax.transAxes, ha='center', va='top',
        bbox={'boxstyle': 'round', 'facecolor': 'yellow', 'alpha': 0.5},
    )

# The sixth panel has no feature to show, so hide it
axes[-1].axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Correlation of every feature with the target, strongest positive first
correlations = (
    df.corr()['target']
    .drop('target')
    .sort_values(ascending=False)
)
# Green bars for strictly positive correlations, red for everything else
colors = np.where(correlations > 0, 'green', 'red').tolist()

plt.figure(figsize=(10, 8))
correlations.plot(kind='barh', color=colors, edgecolor='black')
plt.title('Feature Correlation with Target Variable', fontsize=14, fontweight='bold')
plt.xlabel('Correlation Coefficient', fontsize=11)
plt.ylabel('Features', fontsize=11)
plt.axvline(x=0, color='black', linestyle='-', linewidth=1)
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

# head()/tail() of the descending-sorted series give the two ends of the ranking
top_positive = correlations.head()
top_negative = correlations.tail()
print("\nTop 5 Positively Correlated Features:")
print(top_positive)
print("\nTop 5 Negatively Correlated Features:")
print(top_negative)

In [None]:
# Pairwise scatter/KDE grid of the continuous features, coloured by class
key_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'target']
class_palette = {0: 'blue', 1: 'red'}
sns.pairplot(
    df[key_features],
    hue='target',
    palette=class_palette,
    diag_kind='kde',
    plot_kws={'alpha': 0.6},
    height=2.5,
)
# Title sits slightly above the grid (y > 1) so it does not overlap the top row
plt.suptitle('Pairplot of Continuous Features by Target', y=1.02, fontsize=16, fontweight='bold')
plt.show()

## 3. Preprocess the Data

In [None]:
# Target vector is the 'target' column; the feature matrix is every other column
y = df['target']
X = df.drop(columns='target')

print("Features shape:", X.shape)
print("Target shape:", y.shape)
print("\nFeature columns:")
print(list(X.columns))

## 4. Split Data into Training and Testing Sets and Scale Features

In [None]:
# Hold out 20% of the rows for testing; stratify so both splits keep the class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)

# Class counts per split, to confirm the stratification
for split_name, split_target in (("Training", y_train), ("Testing", y_test)):
    print(f"\n{split_name} set target distribution:")
    print(split_target.value_counts())

In [None]:
# Standardise the features: statistics are learned on the training split only
# and then re-used to transform the test split.
scaler = StandardScaler()

# Wrap the scaled arrays back into DataFrames so column names and row indices survive
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X.columns,
    index=X_train.index,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=X.columns,
    index=X_test.index,
)

print("✓ Features scaled successfully!")
print("\nScaled training data sample:")
X_train_scaled.head()

## 5. Initialize and Train Multiple Models

In [None]:
# Baseline estimators, keyed by display name; all share RANDOM_STATE.
# SVC needs probability=True to expose predict_proba for the ROC-AUC computation.
models = {
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(n_jobs=-1, random_state=RANDOM_STATE),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
    'SVM': SVC(probability=True, random_state=RANDOM_STATE),
}

print("Models initialized:")
for name in models:
    print(f"  • {name}")

In [None]:
# Train every baseline model and collect its test-set metrics.
# Outputs:
#   trained_models - dict of fitted estimators keyed by display name
#   results        - list of per-model metric dicts (turned into a DataFrame below)
trained_models = {}
results = []

print("Training models...\n")
for name, model in models.items():
    print(f"Training {name}...")
    
    # Fit on the scaled training split and keep the fitted estimator for later cells
    model.fit(X_train_scaled, y_train)
    trained_models[name] = model
    
    # Hard predictions, plus positive-class probabilities when the model exposes them
    y_pred = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1] if hasattr(model, 'predict_proba') else None
    
    # Test-set metrics; ROC-AUC needs probabilities, so it stays None without them
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba) if y_pred_proba is not None else None
    
    results.append({
        'Model': name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'ROC-AUC': roc_auc
    })
    
    # A conditional expression cannot be placed inside a format spec (it raises
    # "Invalid format specifier"), so the score is rendered to text first.
    # `is not None` keeps a legitimate score of 0.0 from being shown as N/A.
    roc_auc_text = f"{roc_auc:.4f}" if roc_auc is not None else "N/A"
    print(f"  ✓ {name} trained - Accuracy: {accuracy:.4f}, ROC-AUC: {roc_auc_text}")

print("\n✓ All models trained successfully!")

## 6. Evaluate Model Performance

In [None]:
# Tabulate the per-model metrics, best ROC-AUC first
results_df = pd.DataFrame(results).sort_values('ROC-AUC', ascending=False)

wide_rule = "=" * 80
print(wide_rule, "MODEL COMPARISON", wide_rule, sep="\n")
results_df

In [None]:
# Four-panel comparison of the baseline models
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
ax_acc, ax_auc = axes[0]
ax_prf, ax_heat = axes[1]

# Plots 1 and 2: horizontal bars for Accuracy and ROC-AUC
for panel, column, bar_color, x_label, panel_title in [
    (ax_acc, 'Accuracy', 'skyblue', 'Accuracy', 'Model Accuracy Comparison'),
    (ax_auc, 'ROC-AUC', 'lightcoral', 'ROC-AUC Score', 'Model ROC-AUC Comparison'),
]:
    panel.barh(results_df['Model'], results_df[column], color=bar_color)
    panel.set_xlabel(x_label, fontweight='bold')
    panel.set_title(panel_title, fontsize=12, fontweight='bold')
    panel.set_xlim([0, 1])

# Plot 3: grouped bars, one group per model and one bar per metric
metrics_to_plot = ['Precision', 'Recall', 'F1-Score']
x = np.arange(len(results_df['Model']))
width = 0.25
for i, metric in enumerate(metrics_to_plot):
    bar_positions = x + i*width
    ax_prf.bar(bar_positions, results_df[metric], width, label=metric)
ax_prf.set_xlabel('Models', fontweight='bold')
ax_prf.set_ylabel('Score', fontweight='bold')
ax_prf.set_title('Precision, Recall, and F1-Score Comparison', fontsize=12, fontweight='bold')
# Ticks sit under the middle bar of each three-bar group
ax_prf.set_xticks(x + width)
ax_prf.set_xticklabels(results_df['Model'], rotation=45, ha='right')
ax_prf.legend()
ax_prf.set_ylim([0, 1])

# Plot 4: heatmap of every metric, models as rows
metric_columns = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC']
metrics_heatmap = results_df.set_index('Model')[metric_columns]
sns.heatmap(
    metrics_heatmap,
    annot=True,
    fmt='.3f',
    cmap='YlGnBu',
    ax=ax_heat,
    cbar_kws={'label': 'Score'},
    vmin=0,
    vmax=1,
)
ax_heat.set_title('Performance Metrics Heatmap', fontsize=12, fontweight='bold')
ax_heat.set_ylabel('')

plt.tight_layout()
plt.show()

In [None]:
# Confusion matrix per fitted baseline model on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(14, 12))
axes = axes.ravel()
tick_names = ['No Disease', 'Disease']

for idx, (name, model) in enumerate(trained_models.items()):
    ax = axes[idx]
    y_pred = model.predict(X_test_scaled)
    cm = confusion_matrix(y_test, y_pred)

    # Rows are actual classes, columns are predicted classes
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        ax=ax,
        xticklabels=tick_names,
        yticklabels=tick_names,
    )
    ax.set_title(f'{name} - Confusion Matrix', fontsize=12, fontweight='bold')
    ax.set_xlabel('Predicted', fontweight='bold')
    ax.set_ylabel('Actual', fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Overlay the ROC curve of every model that can output class probabilities
plt.figure(figsize=(10, 8))

for name, model in trained_models.items():
    if not hasattr(model, 'predict_proba'):
        continue
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=2, label=f'{name} (AUC = {roc_auc:.3f})')

# Diagonal reference line: performance of a random classifier
plt.plot([0, 1], [0, 1], 'k--', lw=2, label='Random Classifier (AUC = 0.500)')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=12, fontweight='bold')
plt.ylabel('True Positive Rate', fontsize=12, fontweight='bold')
plt.title('ROC Curves - Model Comparison', fontsize=14, fontweight='bold')
plt.legend(loc='lower right', fontsize=10)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

## 7. Tune Hyperparameters (GridSearchCV)

In [None]:
# results_df is sorted by ROC-AUC (descending), so its first row is the best baseline.
# Tuning below is carried out on Random Forest regardless of which model ranks first.
top_row = results_df.iloc[0]
best_baseline_model = top_row['Model']
print(f"Best baseline model: {best_baseline_model}")
print(f"Baseline ROC-AUC: {top_row['ROC-AUC']:.4f}")
print("\nPerforming hyperparameter tuning on Random Forest...")

In [None]:
# Search space for the Random Forest (every combination is evaluated)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False],
}

# Estimator to tune.
# NOTE(review): both the forest and the search below request n_jobs=-1; consider
# parallelising at only one level if the machine becomes oversubscribed.
rf = RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1)

# Exhaustive search scored by ROC-AUC with shuffled, stratified 5-fold CV
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=cv_splitter,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1,
)

# Grid size is the product of the number of candidates for each parameter
n_combinations = 1
for candidates in param_grid.values():
    n_combinations *= len(candidates)
print(f"Testing {n_combinations} parameter combinations...")
print("This may take a few minutes...\n")

In [None]:
# Fit Grid Search and report how long it took.
# `time` comes from the notebook's import cell. perf_counter() is a monotonic
# clock, so the measured duration cannot be distorted by system clock
# adjustments during a multi-minute search.
start_time = time.perf_counter()

grid_search.fit(X_train_scaled, y_train)

end_time = time.perf_counter()
elapsed_time = end_time - start_time

print(f"\n✓ Grid Search completed in {elapsed_time:.2f} seconds!")
print(f"\nBest parameters found:")
for param, value in grid_search.best_params_.items():
    print(f"  • {param}: {value}")
# best_score_ is the mean cross-validated ROC-AUC of the winning combination
print(f"\nBest cross-validation ROC-AUC score: {grid_search.best_score_:.4f}")

In [None]:
# Score the refitted best estimator on the held-out test split
best_model = grid_search.best_estimator_
y_pred_tuned = best_model.predict(X_test_scaled)
y_pred_proba_tuned = best_model.predict_proba(X_test_scaled)[:, 1]

# Same metric set as the baselines, kept in tuned_* variables for later cells
tuned_accuracy = accuracy_score(y_test, y_pred_tuned)
tuned_precision = precision_score(y_test, y_pred_tuned)
tuned_recall = recall_score(y_test, y_pred_tuned)
tuned_f1 = f1_score(y_test, y_pred_tuned)
tuned_roc_auc = roc_auc_score(y_test, y_pred_proba_tuned)

divider = "=" * 60
print(divider, "TUNED MODEL PERFORMANCE ON TEST SET", divider, sep="\n")
# Labels are padded so the values line up in one column
for metric_label, metric_value in [
    ("Accuracy:  ", tuned_accuracy),
    ("Precision: ", tuned_precision),
    ("Recall:    ", tuned_recall),
    ("F1-Score:  ", tuned_f1),
    ("ROC-AUC:   ", tuned_roc_auc),
]:
    print(f"{metric_label}{metric_value:.4f}")

report_text = classification_report(y_test, y_pred_tuned, target_names=['No Disease', 'Disease'])
print("\n" + report_text)

In [None]:
# Side-by-side metrics of the untuned and the tuned Random Forest
rf_baseline = results_df[results_df['Model'] == 'Random Forest']
tuned_scores = {
    'Accuracy': tuned_accuracy,
    'Precision': tuned_precision,
    'Recall': tuned_recall,
    'F1-Score': tuned_f1,
    'ROC-AUC': tuned_roc_auc,
}

# First row: baseline value taken from results_df; second row: tuned value
comparison = pd.DataFrame({
    'Model': ['Baseline Random Forest', 'Tuned Random Forest'],
    **{
        metric: [rf_baseline[metric].values[0], tuned_value]
        for metric, tuned_value in tuned_scores.items()
    },
})

wide_rule = "=" * 80
print(wide_rule, "BASELINE vs TUNED MODEL COMPARISON", wide_rule, sep="\n")
comparison

In [None]:
# Rank the features by the tuned forest's importance scores
feature_importance = (
    pd.DataFrame({
        'Feature': X.columns,
        'Importance': best_model.feature_importances_,
    })
    .sort_values('Importance', ascending=False)
)

plt.figure(figsize=(10, 8))
plt.barh(feature_importance['Feature'], feature_importance['Importance'], color='steelblue')
plt.xlabel('Feature Importance', fontsize=12, fontweight='bold')
plt.ylabel('Features', fontsize=12, fontweight='bold')
plt.title('Feature Importance - Tuned Random Forest Model', fontsize=14, fontweight='bold')
# Flip the y-axis so the most important feature appears at the top
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

print("\nTop 5 Most Important Features:")
top_five = feature_importance.head()
for feature_name, importance in zip(top_five['Feature'], top_five['Importance']):
    print(f"  {feature_name}: {importance:.4f}")

In [None]:
# Comprehensive performance comparison visualization.
# Layout: a 3x3 grid with three small panels on the top row, then a full-width
# heatmap and a full-width ranking chart.
fig = plt.figure(figsize=(18, 12))
gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)

# 1. Accuracy Comparison
ax1 = fig.add_subplot(gs[0, 0])
results_df.plot(x='Model', y=['Accuracy'], kind='bar', ax=ax1, color='skyblue', legend=False)
ax1.set_title('Model Accuracy Comparison', fontsize=12, fontweight='bold')
ax1.set_ylabel('Accuracy', fontweight='bold')
ax1.set_ylim([0, 1])
ax1.grid(axis='y', alpha=0.3)
ax1.tick_params(axis='x', rotation=45)

# 2. ROC-AUC Comparison
ax2 = fig.add_subplot(gs[0, 1])
results_df.plot(x='Model', y=['ROC-AUC'], kind='bar', ax=ax2, color='lightcoral', legend=False)
ax2.set_title('ROC-AUC Score Comparison', fontsize=12, fontweight='bold')
ax2.set_ylabel('ROC-AUC', fontweight='bold')
ax2.set_ylim([0, 1])
ax2.grid(axis='y', alpha=0.3)
ax2.tick_params(axis='x', rotation=45)

# 3. Precision-Recall-F1 Comparison (grouped bars centred on each model's tick)
ax3 = fig.add_subplot(gs[0, 2])
x_pos = np.arange(len(results_df))
width = 0.25
ax3.bar(x_pos - width, results_df['Precision'], width, label='Precision', color='#3498db')
ax3.bar(x_pos, results_df['Recall'], width, label='Recall', color='#e74c3c')
ax3.bar(x_pos + width, results_df['F1-Score'], width, label='F1-Score', color='#2ecc71')
ax3.set_title('Precision, Recall, F1-Score', fontsize=12, fontweight='bold')
ax3.set_ylabel('Score', fontweight='bold')
ax3.set_xticks(x_pos)
ax3.set_xticklabels(results_df['Model'], rotation=45, ha='right')
ax3.set_ylim([0, 1])
ax3.legend(loc='lower right')
ax3.grid(axis='y', alpha=0.3)

# 4. Heatmap of all metrics (transposed: metrics as rows, models as columns)
ax4 = fig.add_subplot(gs[1, :])
metrics_heatmap = results_df.set_index('Model')[['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC']]
sns.heatmap(metrics_heatmap.T, annot=True, fmt='.3f', cmap='YlGnBu', ax=ax4, 
           cbar_kws={'label': 'Score'}, vmin=0, vmax=1, linewidths=0.5)
ax4.set_title('Performance Metrics Heatmap (All Models)', fontsize=13, fontweight='bold')
ax4.set_xlabel('Models', fontweight='bold')
ax4.set_ylabel('Metrics', fontweight='bold')

# 5. Best model ranking (ascending sort puts the best model on the top bar)
ax5 = fig.add_subplot(gs[2, :])
results_sorted = results_df.sort_values('ROC-AUC', ascending=True)
colors_rank = plt.cm.RdYlGn(np.linspace(0.3, 0.9, len(results_sorted)))
ax5.barh(results_sorted['Model'], results_sorted['ROC-AUC'], color=colors_rank, edgecolor='black')
ax5.set_title('Model Ranking by ROC-AUC Score', fontsize=13, fontweight='bold')
ax5.set_xlabel('ROC-AUC Score', fontweight='bold')
ax5.set_xlim([0, 1])
ax5.grid(axis='x', alpha=0.3)

# Add value labels on bars.
# Only the bar position and the score are needed, so no variable named `model`
# is bound here; that would overwrite the estimator left in `model` by the
# training loop.
for bar_pos, score in enumerate(results_sorted['ROC-AUC']):
    ax5.text(score + 0.01, bar_pos, f'{score:.3f}', va='center', fontweight='bold')

plt.suptitle('Comprehensive Model Performance Analysis', fontsize=16, fontweight='bold', y=0.995)
plt.show()

## 8. Save the Trained Model

In [None]:
# Output locations for the persisted artifacts
model_path = 'models/best_model.pkl'
scaler_path = 'data/processed/scaler.pkl'
metadata_path = 'models/model_metadata.pkl'

# Make sure both target directories exist before writing
for output_dir in ('models', 'data/processed'):
    os.makedirs(output_dir, exist_ok=True)

# Persist the tuned estimator
joblib.dump(best_model, model_path)
print(f"✓ Best model saved to: {model_path}")

# Persist the fitted scaler so new inputs can be transformed the same way
joblib.dump(scaler, scaler_path)
print(f"✓ Scaler saved to: {scaler_path}")

# Persist a description of the model: parameters, test metrics, feature order, timestamp
test_metrics = {
    'accuracy': tuned_accuracy,
    'precision': tuned_precision,
    'recall': tuned_recall,
    'f1_score': tuned_f1,
    'roc_auc': tuned_roc_auc,
}
metadata = {
    'model_type': 'RandomForestClassifier',
    'best_params': grid_search.best_params_,
    'test_metrics': test_metrics,
    'feature_names': list(X.columns),
    'training_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
}
joblib.dump(metadata, metadata_path)
print(f"✓ Model metadata saved to: {metadata_path}")

divider = "=" * 60
print("\n" + divider, "MODEL SAVED SUCCESSFULLY!", divider, sep="\n")
print(f"Model file: {model_path}")
print(f"Scaler file: {scaler_path}")
print(f"Metadata file: {metadata_path}")

In [None]:
# Test loading the saved model: reload both artifacts and predict one held-out row
print("Testing model loading...")
loaded_model = joblib.load(model_path)
loaded_scaler = joblib.load(scaler_path)

# Make a test prediction.
# The forest was fitted on a DataFrame with named columns, so the scaled sample
# is wrapped back into a DataFrame; passing the bare array returned by
# transform() would trigger scikit-learn's feature-name mismatch warning.
test_sample = X_test.iloc[0:1]
test_sample_scaled = pd.DataFrame(
    loaded_scaler.transform(test_sample),
    columns=test_sample.columns,
    index=test_sample.index,
)
test_prediction = loaded_model.predict(test_sample_scaled)
test_probability = loaded_model.predict_proba(test_sample_scaled)[:, 1]

print(f"\n✓ Model loaded successfully!")
print(f"\nTest prediction:")
print(f"  Input features: {test_sample.values[0][:5]}... (first 5 features)")
print(f"  Prediction: {'Disease' if test_prediction[0] == 1 else 'No Disease'}")
print(f"  Probability: {test_probability[0]:.4f}")
print(f"  Actual: {'Disease' if y_test.iloc[0] == 1 else 'No Disease'}")

## Summary

### Key Findings:
1. **Dataset**: Heart disease prediction with 400 patients and 13 features
2. **Models Evaluated**: Decision Tree, Random Forest, Logistic Regression, SVM
3. **Best Model**: Random Forest (after hyperparameter tuning)
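4. **Performance**: Test-set accuracy, precision, recall, F1-score and ROC-AUC of the tuned model are reported in Section 7 and in the project summary cell below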
5. **Most Important Features**: Identified through feature importance analysis

### Next Steps:
- Deploy the model using the FastAPI backend (api/main.py)
- Test the model through the web interface (static/index.html)
- Monitor model performance in production
- Consider collecting more data for improved accuracy
- Implement model versioning and A/B testing

### Files Generated:
- `models/best_model.pkl` - Trained Random Forest model
- `data/processed/scaler.pkl` - StandardScaler for feature normalization
- `models/model_metadata.pkl` - Model parameters and performance metrics

In [None]:
# Display comprehensive project summary.
# The best *baseline* model and the *tuned* Random Forest are reported separately:
# the artifact saved as models/best_model.pkl is the tuned forest, which is not
# necessarily the model that ranks first among the baselines.
print("=" * 80)
print("HEART DISEASE PREDICTION - PROJECT SUMMARY")
print("=" * 80)
print(f"\n📊 DATASET STATISTICS:")
print(f"   • Total Records: {df.shape[0]}")
print(f"   • Total Features: {df.shape[1] - 1}")
print(f"   • Continuous Features: {len(continuous_features)}")
print(f"   • Categorical Features: {len(categorical_features)}")
print(f"   • Missing Values: {df.isnull().sum().sum()}")
print(f"   • Class Distribution: {dict(df['target'].value_counts())}")

print(f"\n🔬 DATA SPLIT:")
print(f"   • Training Samples: {X_train.shape[0]}")
print(f"   • Testing Samples: {X_test.shape[0]}")
print(f"   • Train/Test Ratio: {X_train.shape[0]/X_test.shape[0]:.1f}:1")

print(f"\n🤖 MODELS EVALUATED:")
for idx, model_name in enumerate(results_df['Model'], 1):
    print(f"   {idx}. {model_name}")

# Best of the untuned baselines, selected by test-set ROC-AUC
print(f"\n🏆 BEST BASELINE MODEL:")
best_idx = results_df['ROC-AUC'].idxmax()
best_model_name = results_df.loc[best_idx, 'Model']
print(f"   • Model: {best_model_name}")
print(f"   • Test Accuracy: {results_df.loc[best_idx, 'Accuracy']:.4f}")
print(f"   • Test Precision: {results_df.loc[best_idx, 'Precision']:.4f}")
print(f"   • Test Recall: {results_df.loc[best_idx, 'Recall']:.4f}")
print(f"   • Test F1-Score: {results_df.loc[best_idx, 'F1-Score']:.4f}")
print(f"   • ROC-AUC Score: {results_df.loc[best_idx, 'ROC-AUC']:.4f}")

# Metrics of the model that was actually saved to disk
print(f"\n🌲 TUNED RANDOM FOREST (SAVED MODEL):")
print(f"   • Test Accuracy: {tuned_accuracy:.4f}")
print(f"   • Test Precision: {tuned_precision:.4f}")
print(f"   • Test Recall: {tuned_recall:.4f}")
print(f"   • Test F1-Score: {tuned_f1:.4f}")
print(f"   • ROC-AUC Score: {tuned_roc_auc:.4f}")

print(f"\n📈 HYPERPARAMETER TUNING:")
print(f"   • Best Parameters:")
for param, value in grid_search.best_params_.items():
    print(f"      - {param}: {value}")
print(f"   • CV Score: {grid_search.best_score_:.4f}")

print(f"\n💾 SAVED ARTIFACTS:")
print(f"   • Best Model: models/best_model.pkl")
print(f"   • Scaler: data/processed/scaler.pkl")
print(f"   • Model Metadata: models/model_metadata.pkl")

print(f"\n🎯 KEY INSIGHTS:")
print(f"   • Top 3 Important Features:")
for idx, (feat, imp) in enumerate(feature_importance.head(3).values, 1):
    print(f"      {idx}. {feat}: {imp:.4f}")

print(f"\n✅ PROJECT STATUS: COMPLETED")
print("=" * 80)

## Environment Information

In [None]:
# Display Python and library versions.
# sys, sklearn and scipy are imported in the notebook's import cell, so this
# cell carries no imports of its own.
print("=" * 60)
print("ENVIRONMENT INFORMATION")
print("=" * 60)
print(f"\nPython Version: {sys.version.split()[0]}")
print(f"\nLibrary Versions:")
print(f"  • pandas:      {pd.__version__}")
print(f"  • numpy:       {np.__version__}")
# pyplot exposes its parent package as `plt.matplotlib`
print(f"  • matplotlib:  {plt.matplotlib.__version__}")
print(f"  • seaborn:     {sns.__version__}")
print(f"  • scikit-learn: {sklearn.__version__}")
print(f"  • scipy:       {scipy.__version__}")
print(f"  • joblib:      {joblib.__version__}")

print(f"\nWorkspace: {os.getcwd()}")
print(f"Random State: {RANDOM_STATE}")
print("=" * 60)