# Heart Disease Prediction - Model Evaluation

This notebook evaluates the performance of the trained heart disease prediction model using various metrics and visualizations.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    roc_auc_score, confusion_matrix, classification_report, 
    roc_curve, precision_recall_curve, average_precision_score
)
import pickle
import warnings
# Silence every warning category for the rest of the session so cell output
# stays compact.
# NOTE(review): this is a blanket filter, so library warnings raised while
# computing metrics are hidden as well — consider narrowing it.
warnings.simplefilter('ignore')

# Plot appearance: matplotlib style sheet, then the seaborn colour palette
# (kept in the original order).
plt.style.use(['seaborn-v0_8'])
sns.set_palette(palette="husl")

print("Loading trained model and test data...")

In [None]:
# Load the dataset.
# Probe the known CSV locations in priority order: the project's data
# directory first, then two levels up. The first file that exists wins.
df = None
for csv_path in ('../data/raw/heart.csv', '../../heart.csv'):
    try:
        df = pd.read_csv(csv_path)
    except FileNotFoundError:
        continue
    break

if df is None:
    # Neither file exists: build a random stand-in so the notebook still runs.
    # The labels are random, so metrics computed on this data are meaningless.
    print("Dataset not found. Creating sample data for demonstration.")
    np.random.seed(42)
    n_samples = 1000

    # Columns are drawn one after another from the seeded global RNG; the
    # order below fixes which random numbers each column receives.
    sample_columns = {}
    sample_columns['age'] = np.random.randint(25, 80, n_samples)
    sample_columns['sex'] = np.random.choice([0, 1], n_samples)
    sample_columns['cp'] = np.random.choice([0, 1, 2, 3], n_samples)
    sample_columns['trestbps'] = np.random.randint(90, 200, n_samples)
    sample_columns['chol'] = np.random.randint(120, 400, n_samples)
    sample_columns['fbs'] = np.random.choice([0, 1], n_samples)
    sample_columns['restecg'] = np.random.choice([0, 1, 2], n_samples)
    sample_columns['thalach'] = np.random.randint(70, 200, n_samples)
    sample_columns['exang'] = np.random.choice([0, 1], n_samples)
    sample_columns['oldpeak'] = np.random.uniform(0, 6, n_samples)
    sample_columns['slope'] = np.random.choice([0, 1, 2], n_samples)
    sample_columns['ca'] = np.random.choice([0, 1, 2, 3], n_samples)
    sample_columns['thal'] = np.random.choice([0, 1, 2], n_samples)
    sample_columns['target'] = np.random.choice([0, 1], n_samples)
    df = pd.DataFrame(sample_columns)

print(f"Dataset loaded with shape: {df.shape}")

In [None]:
# Prepare features and target: 'target' is the label, every other column is a predictor
from sklearn.model_selection import train_test_split

y = df['target']
X = df.drop(columns=['target'])

# For demonstration, hold out 20% of the rows as the evaluation set.
# The split is stratified on the label and uses a fixed seed so it is repeatable.
# NOTE(review): a model loaded from disk may have seen some of these rows during
# training unless the training run used this exact split — confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"Test set size: {X_test.shape}")

In [None]:
# Load the trained model and scaler
try:
    # Try to load from models directory
    with open('../models/trained_models/best_model.pkl', 'rb') as f:
        model = pickle.load(f)
    
    with open('../models/trained_models/scaler.pkl', 'rb') as f:
        scaler = pickle.load(f)
        
    print("Model and scaler loaded successfully.")
except FileNotFoundError:
    # If files not found, create a simple model for demonstration
    print("Trained model not found. Creating a simple model for demonstration.")
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    scaler.fit(X_train)
    
    print("Demonstration model created and trained.")

In [None]:
# Make predictions
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

print("Predictions made successfully.")

In [None]:
# Calculate evaluation metrics
# Scores computed from the hard class predictions
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)
# Scores computed from the predicted positive-class probabilities
roc_auc, avg_precision = (
    scorer(y_test, y_pred_proba)
    for scorer in (roc_auc_score, average_precision_score)
)

print("Model Evaluation Metrics:")
for metric_label, metric_value in [
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
    ("ROC AUC", roc_auc),
    ("Average Precision", avg_precision),
]:
    print(f"  {metric_label}: {metric_value:.4f}")

In [None]:
# Confusion Matrix
# Pin the label order to [0, 1] so the matrix is always 2x2. Without it, a test
# split or prediction vector that contains a single class yields a 1x1 matrix
# and the four-way unpack below raises ValueError.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Calculate metrics from confusion matrix.
# Rows are actual classes and columns are predicted classes, so the flattened
# 2x2 matrix reads TN, FP, FN, TP.
tn, fp, fn, tp = cm.ravel()
print("\nConfusion Matrix Values:")
print(f"  True Negatives: {tn}")
print(f"  False Positives: {fp}")
print(f"  False Negatives: {fn}")
print(f"  True Positives: {tp}")

In [None]:
# Classification Report
# Build the report text once, then print the heading and the report on separate lines
report_text = classification_report(y_test, y_pred)
print("\nClassification Report:", report_text, sep="\n")

In [None]:
# ROC Curve
# False/true positive rates derived from the predicted probabilities
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
# Dashed diagonal drawn as the reference line for a random classifier
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random classifier')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc="lower right")
ax.grid(True)
plt.show()

In [None]:
# Precision-Recall Curve
# Precision and recall values derived from the predicted probabilities
precision_vals, recall_vals, _ = precision_recall_curve(y_test, y_pred_proba)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(
    recall_vals,
    precision_vals,
    color='blue',
    lw=2,
    label=f'Precision-Recall curve (AP = {avg_precision:.2f})',
)
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve')
ax.legend(loc="lower left")
ax.grid(True)
plt.show()

In [None]:
# Feature Importance (if available)
# Only models that expose `feature_importances_` can be ranked this way.
if not hasattr(model, 'feature_importances_'):
    print("Model does not support feature importance analysis.")
else:
    # Pair each feature column with its importance, highest first
    importance_table = pd.DataFrame({
        'feature': X.columns,
        'importance': model.feature_importances_
    })
    feature_importance = importance_table.sort_values('importance', ascending=False)
    top_features = feature_importance.head(10)

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.barplot(data=top_features, x='importance', y='feature', ax=ax)
    ax.set_title('Top 10 Feature Importances')
    ax.set_xlabel('Importance')
    fig.tight_layout()
    plt.show()

    print("Top 10 Most Important Features:")
    print(top_features)

In [None]:
# Prediction probability distribution
plt.figure(figsize=(10, 6))

# One histogram per true class, overlaid on the same axes.
# range=(0, 1) pins both histograms to the same 30 bin edges. Without it each
# class is binned over its own min-max, so the overlaid bars have different
# widths and positions and cannot be compared.
class_styles = [
    (0, 'No Heart Disease', 'blue'),
    (1, 'Heart Disease', 'red'),
]
for class_value, class_label, class_color in class_styles:
    plt.hist(
        y_pred_proba[y_test == class_value],
        bins=30,
        range=(0, 1),
        alpha=0.7,
        label=class_label,
        color=class_color,
    )

plt.xlabel('Predicted Probability')
plt.ylabel('Frequency')
plt.title('Distribution of Prediction Probabilities')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
# Calibration plot
from sklearn.calibration import calibration_curve

# Bin the predicted probabilities into 10 bins and compare each bin's mean
# prediction with the observed fraction of positives
fraction_of_positives, mean_predicted_value = calibration_curve(
    y_test, y_pred_proba, n_bins=10
)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(mean_predicted_value, fraction_of_positives, "s-", label="Model")
# Dotted diagonal: predicted probability equals observed frequency
ax.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
ax.set_xlabel("Mean Predicted Probability")
ax.set_ylabel("Fraction of Positives")
ax.set_title("Calibration Plot")
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# Model performance by age groups
# Work on a copy of the test features with the labels, predictions and
# probabilities attached, so X_test itself is left untouched.
df_test = X_test.copy()
df_test['target'] = y_test
df_test['predicted'] = y_pred
df_test['probability'] = y_pred_proba

# Create age groups.
# right=False makes the bins left-closed so they match their labels:
# '<40' = [0, 40), '40-50' = [40, 50), '50-60' = [50, 60), '60+' = [60, inf).
# With the default right-closed bins, age 40 was filed under '<40', 50 under
# '40-50' and 60 under '50-60'. The open-ended last bin also keeps ages above
# 100 in '60+' instead of turning them into NaN.
df_test['age_group'] = pd.cut(
    df_test['age'],
    bins=[0, 40, 50, 60, np.inf],
    labels=['<40', '40-50', '50-60', '60+'],
    right=False,
)


def _score_group(group):
    """Return accuracy, precision, recall, F1 and row count for one age group."""
    return pd.Series({
        'accuracy': accuracy_score(group['target'], group['predicted']),
        'precision': precision_score(group['target'], group['predicted'], zero_division=0),
        'recall': recall_score(group['target'], group['predicted'], zero_division=0),
        'f1': f1_score(group['target'], group['predicted'], zero_division=0),
        'count': len(group)
    })


# Calculate performance by age group.
# observed=True skips age groups that have no rows in the test set, and
# selecting the two needed columns keeps the grouping column out of apply().
performance_by_age = (
    df_test
    .groupby('age_group', observed=True)[['target', 'predicted']]
    .apply(_score_group)
)

print("Model Performance by Age Group:")
print(performance_by_age)

In [None]:
# Visualization of performance by age group
# One panel per metric, laid out on a 2x2 grid and flattened for iteration
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.ravel()

metrics = ['accuracy', 'precision', 'recall', 'f1']
titles = ['Accuracy', 'Precision', 'Recall', 'F1 Score']

for panel, metric, title in zip(axes, metrics, titles):
    scores = performance_by_age[metric]
    panel.bar(performance_by_age.index, scores, color='skyblue')
    panel.set_title(f'{title} by Age Group')
    panel.set_xlabel('Age Group')
    panel.set_ylabel(title)
    panel.set_ylim(0, 1)

    # Write each bar's value just above it
    for bar_pos, score in enumerate(scores):
        panel.text(bar_pos, score + 0.02, f'{score:.2f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

In [None]:
# Error analysis
# Flag each test row: 1 where the prediction differs from the true label, else 0
df_test['error'] = y_test.ne(y_pred).astype(int)

# Analyze errors by feature distributions: mean and standard deviation of each
# listed feature, split by correct (0) versus incorrect (1) predictions
summarised_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
summary_spec = {feature: ['mean', 'std'] for feature in summarised_features}
error_analysis = df_test.groupby('error').agg(summary_spec).round(2)

print("Error Analysis - Feature Statistics by Correct/Incorrect Predictions:")
print(error_analysis)

## Model Evaluation Summary

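### Key Performance Metrics:
Markdown cells do not evaluate Python format fields, so copy the values printed by the "Calculate evaluation metrics" cell into the placeholders below.
- **Accuracy**: [accuracy]
- **Precision**: [precision]
- **Recall**: [recall]
- **F1 Score**: [F1 score]
- **ROC AUC**: [ROC AUC]
- **Average Precision**: [average precision]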

### Model Insights:
1. **Overall Performance**: The model shows [good/moderate/poor] performance, with an F1 score of [F1 score].
2. **Class Balance**: The model [handles/struggles with] the class imbalance in the dataset.
3. **Calibration**: The model is [well-calibrated/poorly-calibrated] based on the calibration plot.
4. **Feature Importance**: The [most/least] important features were [feature names].
5. **Age Group Performance**: The model performs [consistently/variably] across different age groups.

### Recommendations:
1. [Recommendation 1]
2. [Recommendation 2]
3. [Recommendation 3]

### Next Steps:
1. Model deployment and integration
2. Monitoring and maintenance
3. Continuous improvement
4. User feedback collection