# Heart Disease Prediction - Model Comparison

This notebook demonstrates multiple machine learning models for heart disease prediction and compares their performance.

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from xgboost import XGBClassifier
import joblib
import warnings
# Suppress every library warning so the cell outputs stay uncluttered.
warnings.filterwarnings(action='ignore')

# Plot defaults: seaborn white-grid theme and a 10x6 inch default figure.
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

## 2. Load and Explore Data

In [None]:
# Read the heart-disease records from disk into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../data/heart.csv')

# Report the (rows, columns) shape, then let the notebook render the first
# five rows as the cell's output.
print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
df.head(n=5)

In [None]:
# Dataset information
# df.info() prints its summary directly; df.describe() is the cell's final
# expression, so the notebook renders the returned summary table as output.
print("Dataset Info:")
df.info()
print("\nStatistical Summary:")
df.describe()

In [None]:
# Count the missing (NaN/None) entries in each column.
print("Missing values:")
print(df.isna().sum())

In [None]:
# Class balance of the label column: absolute counts first, then the same
# counts expressed as fractions of all rows.
print("Target Distribution:")
print(df['target'].value_counts(normalize=False))
print("\nTarget Proportions:")
print(df['target'].value_counts(normalize=True))

# Bar chart of the number of rows per target class.
plt.figure(figsize=(8, 5))
sns.countplot(data=df, x='target')
plt.gca().set(
    title='Distribution of Heart Disease',
    xlabel='Target (0: No Disease, 1: Disease)',
    ylabel='Count',
)
plt.show()

## 3. Exploratory Data Analysis

In [None]:
# Heatmap of the pairwise correlations between all columns, annotated to two
# decimals with the colour scale centred on zero.
plt.figure(figsize=(14, 10))
sns.heatmap(
    data=df.corr(),
    cmap='coolwarm',
    center=0,
    annot=True,
    fmt='.2f',
)
plt.gca().set_title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

In [None]:
# Box plot comparing the age distribution of the two target classes.
plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x='target', y='age')
plt.gca().set(
    title='Age Distribution by Heart Disease Status',
    xlabel='Target (0: No Disease, 1: Disease)',
    ylabel='Age',
)
plt.show()

In [None]:
# Chest pain type distribution
# DataFrame.plot opens its own figure unless it is handed an Axes, so the
# figure/axes pair is created first and passed in. Calling plt.figure() and
# then .plot() without `ax` leaves an empty figure behind and the chart
# ignores the requested size.
fig, ax = plt.subplots(figsize=(10, 6))
pd.crosstab(df['cp'], df['target']).plot(kind='bar', ax=ax)
ax.set_title('Chest Pain Type vs Heart Disease')
ax.set_xlabel('Chest Pain Type')
ax.set_ylabel('Count')
# crosstab orders its columns by target value, so 0 maps to the first label.
ax.legend(['No Disease', 'Disease'])
fig.tight_layout()
plt.show()

## 4. Data Preparation

In [None]:
# Feature matrix = every column except the label; label vector = 'target'.
X = df.drop(columns='target')
y = df['target']

print("Features shape:", X.shape)
print("Target shape:", y.shape)

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on the label
# and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)

In [None]:
# Feature scaling
import os

# Fit the scaler on the training split only, then apply that same fitted
# transform to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save the scaler
# Create the output directory first so the dump does not fail when
# '../models' does not exist yet (e.g. on a fresh checkout).
os.makedirs('../models', exist_ok=True)
joblib.dump(scaler, '../models/scaler.pkl')
print("Scaler saved successfully!")

## 5. Model Training and Evaluation

We will train and evaluate the following models:
1. Logistic Regression
2. Random Forest
3. XGBoost
4. HRLFM (a soft-voting hybrid of Random Forest and Logistic Regression)

In [None]:
# Model evaluation: two private plotting helpers plus the evaluate_model entry point
def _plot_confusion_matrix(y_true, y_pred, model_name):
    """Show the confusion matrix of y_pred against y_true as an annotated heatmap."""
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'{model_name} - Confusion Matrix')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()


def _plot_roc_curve(y_true, y_score, roc_auc, model_name):
    """Show the ROC curve for y_score next to the chance diagonal."""
    fpr, tpr, _ = roc_curve(y_true, y_score)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'{model_name} (AUC = {roc_auc:.4f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'{model_name} - ROC Curve')
    plt.legend()
    plt.grid(True)
    plt.show()


def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Train a classifier, report its test-set metrics, and return them.

    The model is fitted in place on the training data and scored on the test
    data. A text report is printed, followed by a confusion-matrix heatmap
    and, when the model exposes ``predict_proba``, a ROC curve.

    Args:
        model: Estimator providing ``fit`` and ``predict``; ``predict_proba``
            is optional and enables the ROC-AUC score and ROC plot.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        model_name: Display name used in the printed report and plot titles.

    Returns:
        dict with keys ``'model'`` (the fitted estimator that was passed in),
        ``'accuracy'``, ``'precision'``, ``'recall'``, ``'f1'``,
        ``'cv_mean'``, ``'cv_std'`` and ``'roc_auc'`` (``None`` when the
        model has no ``predict_proba``).
    """
    # Train the model
    model.fit(X_train, y_train)

    # Predictions; the score is column 1 of predict_proba
    # (presumably the positive class, label 1 - confirm against the label encoding).
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None

    # Test-set metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Bound unconditionally so the plot and the returned dict can rely on it.
    roc_auc = roc_auc_score(y_test, y_pred_proba) if y_pred_proba is not None else None

    # 5-fold cross-validated accuracy, computed on the training data only.
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')

    print(f"\n{'='*60}")
    print(f"{model_name} Results")
    print(f"{'='*60}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"Cross-Validation Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

    if roc_auc is not None:
        print(f"ROC-AUC Score: {roc_auc:.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    _plot_confusion_matrix(y_test, y_pred, model_name)

    if y_pred_proba is not None:
        _plot_roc_curve(y_test, y_pred_proba, roc_auc, model_name)

    return {
        'model': model,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std(),
        'roc_auc': roc_auc
    }

### 5.1 Logistic Regression

In [None]:
# Logistic Regression is trained and scored on the standardized features.
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_results = evaluate_model(
    model=lr_model,
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
    model_name="Logistic Regression",
)

# Persist the fitted estimator returned by the evaluation.
joblib.dump(value=lr_results['model'], filename='../models/logistic_regression_model.pkl')
print("\nLogistic Regression model saved!")

### 5.2 Random Forest

In [None]:
# Random Forest is trained and scored on the raw (unscaled) features.
rf_model = RandomForestClassifier(max_depth=10, n_estimators=100, random_state=42)
rf_results = evaluate_model(
    model=rf_model,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    model_name="Random Forest",
)

# Persist the fitted estimator returned by the evaluation.
joblib.dump(value=rf_results['model'], filename='../models/random_forest_model.pkl')
print("\nRandom Forest model saved!")

In [None]:
# Pair each feature name with the Random Forest's importance score, then
# order the table from most to least important.
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_results['model'].feature_importances_,
})
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

# Horizontal bar chart of the ranked importances.
plt.figure(figsize=(10, 8))
sns.barplot(data=feature_importance, x='importance', y='feature')
plt.gca().set(
    title='Feature Importance (Random Forest)',
    xlabel='Importance',
    ylabel='Feature',
)
plt.tight_layout()
plt.show()

print("\nTop 5 Important Features:")
print(feature_importance.head(5))

### 5.3 XGBoost

In [None]:
# XGBoost is trained and scored on the raw (unscaled) features.
xgb_model = XGBClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    eval_metric='logloss',
    random_state=42,
)
xgb_results = evaluate_model(
    model=xgb_model,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    model_name="XGBoost",
)

# Persist the fitted estimator returned by the evaluation.
joblib.dump(value=xgb_results['model'], filename='../models/xgboost_model.pkl')
print("\nXGBoost model saved!")

In [None]:
# Pair each feature name with XGBoost's importance score, then order the
# table from most to least important.
feature_importance_xgb = pd.DataFrame({
    'feature': X.columns,
    'importance': xgb_results['model'].feature_importances_,
})
feature_importance_xgb = feature_importance_xgb.sort_values(by='importance', ascending=False)

# Horizontal bar chart of the ranked importances.
plt.figure(figsize=(10, 8))
sns.barplot(data=feature_importance_xgb, x='importance', y='feature')
plt.gca().set(
    title='Feature Importance (XGBoost)',
    xlabel='Importance',
    ylabel='Feature',
)
plt.tight_layout()
plt.show()

print("\nTop 5 Important Features (XGBoost):")
print(feature_importance_xgb.head(5))

### 5.4 HRLFM (Hybrid Random Linear Forest Model)

HRLFM is a soft-voting ensemble: it combines the class probabilities predicted by a Random Forest and a Logistic Regression model to produce a hybrid prediction.

In [None]:
# HRLFM (Hybrid Random Linear Forest Model)
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import make_pipeline

# Logistic Regression is sensitive to feature scale, so its branch
# standardizes the features inside a pipeline. The Random Forest branch keeps
# using the raw values. Because the scaler lives inside the estimator, it is
# re-fitted on each training fold during cross-validation, and the saved
# ensemble takes raw, unscaled features at prediction time.
hrlfm_model = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10)),
        ('lr', make_pipeline(StandardScaler(), LogisticRegression(random_state=42, max_iter=1000)))
    ],
    voting='soft'
)

# Raw features go in; scaling for the LR component happens inside the ensemble.
hrlfm_results = evaluate_model(hrlfm_model, X_train, X_test, y_train, y_test, "HRLFM")

# Save the model
joblib.dump(hrlfm_results['model'], '../models/hrlfm_model.pkl')
print("\nHRLFM model saved!")

## 6. Model Comparison

In [None]:
# Build the comparison table: one row per model, one column per metric.
# Each metric column is read from the four result dicts in the same order as
# the 'Model' column.
models_comparison = pd.DataFrame({
    'Model': ['Logistic Regression', 'Random Forest', 'XGBoost', 'HRLFM'],
    **{
        column: [result[key] for result in (lr_results, rf_results, xgb_results, hrlfm_results)]
        for column, key in [
            ('Accuracy', 'accuracy'),
            ('Precision', 'precision'),
            ('Recall', 'recall'),
            ('F1-Score', 'f1'),
            ('ROC-AUC', 'roc_auc'),
            ('CV Mean', 'cv_mean'),
        ]
    },
})

# Print the table between 80-character rules, without the index column.
print("\n" + "="*80)
print("MODEL COMPARISON SUMMARY")
print("="*80)
print(models_comparison.to_string(index=False))
print("="*80)

In [None]:
# Reshape the comparison table to long form (one row per model/metric pair)
# so the metrics can be drawn as grouped bars, one bar per model.
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC']
models_comparison_melted = pd.melt(
    models_comparison,
    id_vars='Model',
    value_vars=metrics,
    var_name='Metric',
    value_name='Score',
)

plt.figure(figsize=(12, 6))
sns.barplot(data=models_comparison_melted, x='Metric', y='Score', hue='Model')
plt.gca().set(title='Model Performance Comparison', ylabel='Score', xlabel='Metric')
plt.legend(title='Model')
# The y-axis is fixed to the 0-1 range.
plt.ylim(0, 1)
plt.tight_layout()
plt.show()

## 7. Best Model Selection

In [None]:
# Pick the row with the highest F1-score and read its name and score.
best_model_idx = models_comparison['F1-Score'].idxmax()
best_model_name = models_comparison.at[best_model_idx, 'Model']
best_f1_score = models_comparison.at[best_model_idx, 'F1-Score']

print(f"\n{'='*60}")
print(f"BEST MODEL: {best_model_name}")
print(f"F1-Score: {best_f1_score:.4f}")
print(f"{'='*60}")

# Map the winning name to its fitted estimator; any name not listed here
# falls through to HRLFM.
best_model = {
    'Logistic Regression': lr_results,
    'Random Forest': rf_results,
    'XGBoost': xgb_results,
}.get(best_model_name, hrlfm_results)['model']

# Persist the winner under a fixed file name.
joblib.dump(value=best_model, filename='../models/best_model.pkl')
print(f"\nBest model ({best_model_name}) saved as 'best_model.pkl'")

## 8. Sample Prediction

In [None]:
# Use the first test row as a one-row DataFrame for a sample prediction.
sample_patient = X_test.iloc[:1]
print("Sample Patient Data:")
print(sample_patient)

# Only the standalone Logistic Regression was trained on standardized
# features; every other model receives the raw row.
if best_model_name != 'Logistic Regression':
    prediction = best_model.predict(sample_patient)
    probability = best_model.predict_proba(sample_patient)[0]
else:
    sample_scaled = scaler.transform(sample_patient)
    prediction = best_model.predict(sample_scaled)
    probability = best_model.predict_proba(sample_scaled)[0]

# Report the predicted label, both class probabilities, and the true label.
print(f"\nPrediction: {'No Heart Disease' if prediction[0] != 1 else 'Heart Disease'}")
print(f"Probability - No Disease: {probability[0]:.4f}, Disease: {probability[1]:.4f}")
print(f"\nActual: {'No Heart Disease' if y_test.iloc[0] != 1 else 'Heart Disease'}")

## 9. Conclusion

In this notebook, we:
1. Loaded and explored the heart disease dataset
2. Performed exploratory data analysis
3. Trained and evaluated multiple machine learning models:
   - Logistic Regression
   - Random Forest
   - XGBoost
   - HRLFM (Random Forest + Logistic Regression ensemble)
4. Compared model performances
5. Selected the best model based on F1-score
6. Saved all models for deployment

The models are now ready to be used in the Streamlit application for real-time predictions!