# Heart Disease Prediction Model

This notebook demonstrates training a machine learning model to predict heart disease.

**Features:** age, sex, chest pain type, blood pressure, cholesterol, fasting blood sugar, ECG results, max heart rate, exercise angina, ST depression, slope, ca, thal  
**Target:** Heart disease presence (0 = no, 1 = yes)  
**Model:** Classification (SVM, Random Forest, Logistic Regression, etc.)

**Note:** This is a template notebook. You'll need to provide the heart disease dataset to run this completely.

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix, 
    roc_auc_score, roc_curve, precision_score, recall_score, f1_score
)
import joblib
import warnings
# Suppress every warning category for the rest of the session so cell output
# stays uncluttered.
warnings.filterwarnings(action='ignore')

# Plot defaults: seaborn's whitegrid theme, then a 10x6 default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

print("✓ Libraries imported!")

## 2. Load Dataset

**Dataset Source:** Heart disease dataset (UCI ML Repository / Kaggle)  
**Columns:** age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpeak, slope, ca, thal, target

In [None]:
# TODO: Replace with actual dataset path
# Common dataset: https://www.kaggle.com/datasets/johnsmith88/heart-disease-dataset

# Example loading (uncomment and modify path):
# df = pd.read_csv('path/to/heart_disease.csv')

# As written, this cell loads nothing and creates no DataFrame: it only lists
# the column names the dataset is expected to have.
columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal',
    'target',
]
print("Expected columns:")
print(columns)

# Load your dataset here:
# df = pd.read_csv('your_heart_disease_data.csv')
# display(df.head())
# print(f"Dataset shape: {df.shape}")

## 3. Exploratory Data Analysis

In [None]:
# Uncomment when you have data loaded:
# print("Dataset Info:")
# print(df.info())
# print("\nStatistical Summary:")
# display(df.describe())
# print("\nMissing values:")
# print(df.isnull().sum())
# print("\nTarget distribution:")
# print(df['target'].value_counts())

In [None]:
# Visualizations
# Uncomment when data is loaded:
# fig, axes = plt.subplots(2, 2, figsize=(14, 10))
# 
# # Target distribution
# df['target'].value_counts().plot(kind='bar', ax=axes[0,0], color=['green', 'red'])
# axes[0,0].set_title('Heart Disease Distribution')
# axes[0,0].set_xlabel('0=No Disease, 1=Disease')
# 
# # Age distribution
# df['age'].hist(bins=20, ax=axes[0,1], color='skyblue')
# axes[0,1].set_title('Age Distribution')
# 
# # Correlation heatmap
# sns.heatmap(df.corr(), annot=False, cmap='coolwarm', ax=axes[1,0])
# axes[1,0].set_title('Feature Correlation')
# 
# # Box plot for age by target
# df.boxplot(column='age', by='target', ax=axes[1,1])
# axes[1,1].set_title('Age by Heart Disease Status')
# 
# plt.tight_layout()
# plt.show()

## 4. Data Preprocessing

In [None]:
# Uncomment and adapt when data is loaded:
# # Separate features and target
# X = df.drop('target', axis=1)
# y = df['target']
# 
# # Handle missing values if any
# X = X.fillna(X.median())
# 
# print(f"Features shape: {X.shape}")
# print(f"Target shape: {y.shape}")

In [None]:
# Split data
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42, stratify=y
# )
# 
# print(f"Training set: {X_train.shape}")
# print(f"Test set: {X_test.shape}")

In [None]:
# Feature scaling
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)
# 
# print("✓ Features scaled!")

## 5. Model Training

In [None]:
# Train multiple models and compare
# models = {
#     'Logistic Regression': LogisticRegression(max_iter=1000),
#     'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
#     'SVM': SVC(probability=True, kernel='rbf', random_state=42),
#     'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42)
# }
# 
# results = {}
# for name, model in models.items():
#     model.fit(X_train_scaled, y_train)
#     y_pred = model.predict(X_test_scaled)
#     acc = accuracy_score(y_test, y_pred)
#     results[name] = acc
#     print(f"{name:25s} Accuracy: {acc:.4f}")
# 
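# # Select best model
# # NOTE: the winner is picked by accuracy on the test set, so the Section 6
# # metrics reported for it on that same test set are optimistically biased.
# # For an unbiased comparison, rank the models with cross_val_score on
# # (X_train_scaled, y_train) and keep the test set for the final evaluation.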
# best_model_name = max(results, key=results.get)
# best_model = models[best_model_name]
# print(f"\n🏆 Best Model: {best_model_name} ({results[best_model_name]:.4f})")

## 6. Model Evaluation

In [None]:
# Detailed evaluation of best model
# y_pred = best_model.predict(X_test_scaled)
# y_proba = best_model.predict_proba(X_test_scaled)[:, 1]
# 
# print("Classification Report:")
# print("="*60)
# print(classification_report(y_test, y_pred, target_names=['No Disease', 'Disease']))
# 
# print(f"\nROC-AUC Score: {roc_auc_score(y_test, y_proba):.4f}")

In [None]:
# Confusion Matrix
# cm = confusion_matrix(y_test, y_pred)
# plt.figure(figsize=(8, 6))
# sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
#             xticklabels=['No Disease', 'Disease'],
#             yticklabels=['No Disease', 'Disease'])
# plt.title('Confusion Matrix - Heart Disease Prediction')
# plt.ylabel('Actual')
# plt.xlabel('Predicted')
# plt.show()

In [None]:
# Enhanced Confusion Matrix Visualization
# fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
# 
# # Standard confusion matrix
# sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
#             xticklabels=['No Disease', 'Disease'],
#             yticklabels=['No Disease', 'Disease'],
#             ax=ax1, cbar_kws={'label': 'Count'})
# ax1.set_title('Confusion Matrix (Counts)', fontsize=14, fontweight='bold')
# ax1.set_ylabel('Actual', fontsize=12)
# ax1.set_xlabel('Predicted', fontsize=12)
# 
# # Normalized confusion matrix
# cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
# sns.heatmap(cm_norm, annot=True, fmt='.2%', cmap='Blues',
#             xticklabels=['No Disease', 'Disease'],
#             yticklabels=['No Disease', 'Disease'],
#             ax=ax2, cbar_kws={'label': 'Percentage'})
# ax2.set_title('Normalized Confusion Matrix (%)', fontsize=14, fontweight='bold')
# ax2.set_ylabel('Actual', fontsize=12)
# ax2.set_xlabel('Predicted', fontsize=12)
# 
# plt.tight_layout()
# plt.show()
# 
# # Print detailed metrics from confusion matrix
# tn, fp, fn, tp = cm.ravel()
# print(f"\nDetailed Breakdown:")
# print(f"  True Negatives:  {tn} - Correctly identified healthy patients")
# print(f"  False Positives: {fp} - Healthy patients incorrectly flagged")
# print(f"  False Negatives: {fn} - Heart disease cases missed ⚠️")
# print(f"  True Positives:  {tp} - Correctly identified heart disease")
# print(f"\nSensitivity/Recall: {tp/(tp+fn):.2%} - Detection rate")
# print(f"Specificity:        {tn/(tn+fp):.2%} - True negative rate")

### Enhanced Confusion Matrix with Metrics

In [None]:
# ROC Curve
# fpr, tpr, thresholds = roc_curve(y_test, y_proba)
# plt.figure(figsize=(8, 6))
# plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc_score(y_test, y_proba):.3f})')
# plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('ROC Curve - Heart Disease Prediction')
# plt.legend()
# plt.grid(alpha=0.3)
# plt.show()

In [None]:
# Feature importance (for tree-based models)
# if hasattr(best_model, 'feature_importances_'):
#     importance_df = pd.DataFrame({
#         'feature': X.columns,
#         'importance': best_model.feature_importances_
#     }).sort_values('importance', ascending=False)
#     
#     plt.figure(figsize=(10, 6))
#     plt.barh(importance_df['feature'], importance_df['importance'], color='coral')
#     plt.xlabel('Importance Score', fontsize=12)
#     plt.title('Feature Importance - Heart Disease Prediction', fontsize=14, fontweight='bold')
#     plt.gca().invert_yaxis()
#     plt.tight_layout()
#     plt.show()
#     
#     print("\nTop 5 Most Important Features:")
#     print(importance_df.head().to_string(index=False))

### Feature Importance Analysis

In [None]:
# Comprehensive metrics visualization
# metrics = {
#     'Accuracy': accuracy_score(y_test, y_pred),
#     'Precision': precision_score(y_test, y_pred),
#     'Recall': recall_score(y_test, y_pred),
#     'F1-Score': f1_score(y_test, y_pred),
#     'ROC-AUC': roc_auc_score(y_test, y_proba)
# }
# 
# fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
# 
# # Metrics bar plot
# colors = ['#3498db', '#e74c3c', '#2ecc71', '#f39c12', '#9b59b6']
# bars = ax1.bar(metrics.keys(), metrics.values(), color=colors)
# ax1.set_ylim([0, 1])
# ax1.set_ylabel('Score', fontsize=12)
# ax1.set_title('Performance Metrics', fontsize=14, fontweight='bold')
# ax1.axhline(y=0.8, color='gray', linestyle='--', alpha=0.5, label='80% threshold')
# ax1.grid(axis='y', alpha=0.3)
# ax1.legend()
# 
# for i, (key, value) in enumerate(metrics.items()):
#     ax1.text(i, value + 0.02, f'{value:.3f}', ha='center', fontweight='bold')
# 
# ax1.set_xticklabels(metrics.keys(), rotation=15, ha='right')
# 
# # Model comparison
# ax2.bar(results.keys(), results.values(), color='steelblue')
# ax2.set_ylim([0, 1])
# ax2.set_ylabel('Accuracy', fontsize=12)
# ax2.set_title('Model Comparison', fontsize=14, fontweight='bold')
# ax2.grid(axis='y', alpha=0.3)
# ax2.set_xticklabels(results.keys(), rotation=45, ha='right')
# 
# for i, (key, value) in enumerate(results.items()):
#     ax2.text(i, value + 0.02, f'{value:.3f}', ha='center', fontweight='bold', fontsize=9)
# 
# plt.tight_layout()
# plt.show()

### Performance Metrics Comparison

## 7. Save Model

In [None]:
# Save the best model and scaler
# joblib.dump(best_model, '../Frontend/models/heart_disease_model.sav')
# joblib.dump(scaler, '../Frontend/models/heart_disease_scaler.sav')
# 
# print("✓ Model saved to: ../Frontend/models/heart_disease_model.sav")
# print("✓ Scaler saved to: ../Frontend/models/heart_disease_scaler.sav")
# 
# # Save feature names for reference
# feature_names = X.columns.tolist()
# import json
# with open('../Frontend/models/heart_disease_features.json', 'w') as f:
#     json.dump(feature_names, f)
# print("✓ Feature names saved")

## 8. Usage Example

In [None]:
# Example: Make a prediction
# def predict_heart_disease(age, sex, cp, trestbps, chol, fbs, restecg, 
#                          thalach, exang, oldpeak, slope, ca, thal):
#     # Load model and scaler
#     model = joblib.load('../Frontend/models/heart_disease_model.sav')
#     scaler = joblib.load('../Frontend/models/heart_disease_scaler.sav')
#     
#     # Create input array
#     input_data = np.array([[age, sex, cp, trestbps, chol, fbs, restecg,
#                             thalach, exang, oldpeak, slope, ca, thal]])
#     
#     # Scale and predict
#     input_scaled = scaler.transform(input_data)
#     prediction = model.predict(input_scaled)[0]
#     probability = model.predict_proba(input_scaled)[0]
#     
#     return prediction, probability
# 
# # Test
# pred, proba = predict_heart_disease(
#     age=63, sex=1, cp=3, trestbps=145, chol=233, fbs=1, restecg=0,
#     thalach=150, exang=0, oldpeak=2.3, slope=0, ca=0, thal=1
# )
# print(f"Prediction: {'Heart Disease' if pred == 1 else 'No Disease'}")
# print(f"Probability: {proba[1]:.2%}")

---

## Next Steps

1. **Obtain Dataset:** Download heart disease dataset from [Kaggle](https://www.kaggle.com/datasets/johnsmith88/heart-disease-dataset) or UCI ML Repository
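2. **Load Data:** Uncomment the `pd.read_csv` line in Section 2 and point it at your dataset file
3. **Train Model:** Uncomment the code in Sections 3–8 (it is commented out in this template), then execute all cells sequentially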
4. **Tune Hyperparameters:** Use GridSearchCV or RandomizedSearchCV for optimization
5. **Deploy:** Integrate trained model with Streamlit frontend