In [None]:
"""
Heart Disease Detection System
================================
Predicts if a patient has heart disease using patient vitals and medical data.
Optimized for Google Colab with <5 minute runtime.
"""

# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import warnings
# Silence only forward-compatibility chatter from the plotting/ML stack.
# A blanket filterwarnings('ignore') would also hide actionable diagnostics
# such as scikit-learn's ConvergenceWarning.
for _noisy_category in (FutureWarning, DeprecationWarning):
    warnings.filterwarnings('ignore', category=_noisy_category)

# Set style for better visualizations
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Banner shown once at the top of the run
print("=" * 60)
print("HEART DISEASE DETECTION SYSTEM")
print("=" * 60)

In [None]:
# ============================================================================
# STEP 1: LOAD DATASET
# ============================================================================
print("\n[1/7] Loading Dataset...")

import io  # wraps the uploaded bytes so pandas can parse them in memory

# Upload dataset from local system
from google.colab import files
print("Please upload your heart disease dataset (CSV file):")
uploaded = files.upload()

# files.upload() returns {filename: file bytes}; an empty dict means the
# upload dialog was dismissed without choosing a file.
if not uploaded:
    raise ValueError("No file was uploaded. Re-run this cell and select a CSV file.")

# Get the filename (only the first uploaded file is used)
filename = next(iter(uploaded))

# Parse the bytes that were just uploaded instead of re-opening `filename`
# from disk: when a file with the same name already exists, Colab may save
# the new upload under a de-duplicated name, so reading by the original name
# could silently load the older file.
df = pd.read_csv(io.BytesIO(uploaded[filename]))

# Every later step indexes df['target'], so fail early with a clear message.
if 'target' not in df.columns:
    raise ValueError(
        f"'{filename}' has no 'target' column; found columns: {list(df.columns)}"
    )

print(f"✓ Dataset loaded successfully: {filename}")
print(f"  Shape: {df.shape[0]} rows, {df.shape[1]} columns")


In [None]:
# ============================================================================
# STEP 2: EXPLORATORY DATA ANALYSIS (EDA)
# ============================================================================
print("\n[2/7] Exploratory Data Analysis...")

# Display basic information
print("\n--- Dataset Info ---")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would append a stray "None" line.
df.info()

print("\n--- First 5 Rows ---")
print(df.head())

print("\n--- Statistical Summary ---")
print(df.describe())

print("\n--- Missing Values ---")
missing = df.isnull().sum()
if missing.sum() == 0:
    print("✓ No missing values found")
else:
    print(missing[missing > 0])

print("\n--- Target Distribution ---")
print(df['target'].value_counts())
print(f"Class Balance: {df['target'].value_counts(normalize=True).to_dict()}")

# Visualizations
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Target distribution.
# value_counts() orders by frequency, so the bars are re-sorted by class value
# and the tick labels are derived from the actual index; otherwise the
# "Normal"/"Heart Disease" labels are swapped whenever class 1 is the majority.
class_labels = {0: 'Normal', 1: 'Heart Disease'}
target_counts = df['target'].value_counts().sort_index()
target_counts.plot(kind='bar', ax=axes[0, 0], color=['#2ecc71', '#e74c3c'])
axes[0, 0].set_title('Target Distribution', fontsize=14, fontweight='bold')
axes[0, 0].set_xlabel('Class (0=Normal, 1=Heart Disease)')
axes[0, 0].set_ylabel('Count')
axes[0, 0].set_xticklabels(
    [class_labels.get(value, str(value)) for value in target_counts.index],
    rotation=0,
)

# Age distribution by target
df.boxplot(column='age', by='target', ax=axes[0, 1])
# pandas adds a figure-level "Boxplot grouped by target" title for grouped
# boxplots; clear it so it does not sit on top of the 2x2 grid.
fig.suptitle('')
axes[0, 1].set_title('Age Distribution by Target', fontsize=14, fontweight='bold')
axes[0, 1].set_xlabel('Target')
axes[0, 1].set_ylabel('Age (years)')

# Correlation heatmap (numeric columns only: pandas 2.x raises on text columns)
corr_matrix = df.corr(numeric_only=True)
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm',
            ax=axes[1, 0], cbar_kws={'shrink': 0.8})
axes[1, 0].set_title('Feature Correlation Matrix', fontsize=14, fontweight='bold')

# Feature importance preview (using correlation with target).
# Drop the target's self-correlation by name rather than by position, so a
# feature that happens to tie at |r| = 1 cannot be removed by mistake.
target_corr = (
    corr_matrix['target']
    .drop('target')
    .abs()
    .sort_values(ascending=False)
)
target_corr.plot(kind='barh', ax=axes[1, 1], color='skyblue')
axes[1, 1].set_title('Feature Correlation with Target', fontsize=14, fontweight='bold')
axes[1, 1].set_xlabel('Absolute Correlation')

plt.tight_layout()
plt.show()

print("✓ EDA completed")


In [None]:
# ============================================================================
# STEP 3: DATA PREPROCESSING
# ============================================================================
print("\n[3/7] Data Preprocessing...")

# The label is the 'target' column; every other column is a feature
y = df['target']
X = df.drop(columns='target')

# Stratified 80/20 hold-out split with a fixed seed for repeatability
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

for split_label, split_frame in (("Training set", X_train), ("Test set", X_test)):
    print(f"  {split_label}: {len(split_frame)} samples")

# Standardize features (needed by SVM and Logistic Regression).
# The scaler learns its statistics from the training rows only and is then
# applied unchanged to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("✓ Data preprocessing completed")


In [None]:
# ============================================================================
# STEP 4: MODEL TRAINING
# ============================================================================
print("\n[4/7] Training Multiple Models...")

# Candidate classifiers, all seeded identically
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'Support Vector Machine': SVC(probability=True, random_state=42)
}

# These estimators are trained on the standardized matrices; the tree-based
# ones receive the original feature frames.
scale_sensitive = ('Logistic Regression', 'Support Vector Machine')

results = {}

for name, model in models.items():
    print(f"\n  Training {name}...")

    # Choose the matching train/test representation once, then share a single
    # fit/predict path for every estimator.
    if name in scale_sensitive:
        train_features, test_features = X_train_scaled, X_test_scaled
    else:
        train_features, test_features = X_train, X_test

    model.fit(train_features, y_train)
    y_pred = model.predict(test_features)
    y_pred_proba = model.predict_proba(test_features)[:, 1]  # probability of class 1

    # Hold-out metrics
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    results[name] = dict(
        model=model,
        accuracy=accuracy,
        roc_auc=roc_auc,
        y_pred=y_pred,
        y_pred_proba=y_pred_proba,
    )

    print(f"    Accuracy: {accuracy:.4f} | ROC-AUC: {roc_auc:.4f}")

print("\n✓ Model training completed")


In [None]:
# ============================================================================
# STEP 5: MODEL EVALUATION
# ============================================================================
print("\n[5/7] Evaluating Models...")

# Pick the entry with the highest hold-out accuracy (max() keeps the first on ties).
# NOTE(review): the winner is chosen on the same test split that is reported
# below, so the reported score is likely optimistic — consider selecting via
# cross-validation on the training data.
best_model_name = max(results, key=lambda candidate: results[candidate]['accuracy'])
best_entry = results[best_model_name]
best_model = best_entry['model']
best_accuracy = best_entry['accuracy']

print(f"\n🏆 Best Model: {best_model_name}")
print(f"   Accuracy: {best_accuracy:.4f}")
print(f"   ROC-AUC: {best_entry['roc_auc']:.4f}")

# Per-class precision/recall/F1 for the winner
class_names = ['Normal', 'Heart Disease']
print(f"\n--- Classification Report ({best_model_name}) ---")
print(classification_report(y_test, best_entry['y_pred'],
                          target_names=class_names))

# One row of three panels: score comparison, confusion matrix, ROC curves
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
comparison_ax, confusion_ax, roc_ax = axes

# Panel 1: grouped bars, one pair (accuracy, ROC-AUC) per model
model_names = list(results)
accuracies = [results[key]['accuracy'] for key in model_names]
roc_aucs = [results[key]['roc_auc'] for key in model_names]

x = np.arange(len(model_names))
width = 0.35

bar_series = (
    (-width / 2, accuracies, 'Accuracy', '#3498db'),
    (width / 2, roc_aucs, 'ROC-AUC', '#e74c3c'),
)
for offset, scores, series_label, series_color in bar_series:
    comparison_ax.bar(x + offset, scores, width, label=series_label, color=series_color)

comparison_ax.set_xlabel('Models', fontweight='bold')
comparison_ax.set_ylabel('Score', fontweight='bold')
comparison_ax.set_title('Model Performance Comparison', fontsize=14, fontweight='bold')
comparison_ax.set_xticks(x)
comparison_ax.set_xticklabels(model_names, rotation=45, ha='right')
comparison_ax.legend()
comparison_ax.set_ylim([0.5, 1.0])
comparison_ax.grid(axis='y', alpha=0.3)

# Panel 2: confusion matrix of the winning model
cm = confusion_matrix(y_test, best_entry['y_pred'])
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=confusion_ax,
            xticklabels=class_names,
            yticklabels=class_names)
confusion_ax.set_title(f'Confusion Matrix - {best_model_name}', fontsize=14, fontweight='bold')
confusion_ax.set_ylabel('True Label', fontweight='bold')
confusion_ax.set_xlabel('Predicted Label', fontweight='bold')

# Panel 3: ROC curve of every model plus the chance diagonal
for name, entry in results.items():
    fpr, tpr, _ = roc_curve(y_test, entry['y_pred_proba'])
    roc_ax.plot(fpr, tpr, label=f"{name} (AUC={entry['roc_auc']:.3f})", linewidth=2)

roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier', linewidth=1)
roc_ax.set_xlabel('False Positive Rate', fontweight='bold')
roc_ax.set_ylabel('True Positive Rate', fontweight='bold')
roc_ax.set_title('ROC Curves', fontsize=14, fontweight='bold')
roc_ax.legend(loc='lower right')
roc_ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# ============================================================================
# STEP 6: FEATURE IMPORTANCE (for whichever signal the best model exposes)
# ============================================================================
print("\n[6/7] Analyzing Feature Importance...")

# Detect the importance signal from the fitted estimator instead of matching
# on model names: tree ensembles expose `feature_importances_`, linear models
# expose `coef_`. A kernel SVC with the default RBF kernel exposes neither.
if hasattr(best_model, 'feature_importances_'):
    importance_values = np.asarray(best_model.feature_importances_)
    importance_axis_label = 'Importance Score'
elif hasattr(best_model, 'coef_'):
    # coef_ is (n_classes_or_1, n_features); average magnitudes across rows so
    # the result always has one value per feature.
    importance_values = np.abs(np.atleast_2d(best_model.coef_)).mean(axis=0)
    importance_axis_label = 'Absolute Coefficient'
else:
    importance_values = None
    importance_axis_label = None

if importance_values is not None:
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': importance_values
    }).sort_values('importance', ascending=False)

    print("\n--- Top 10 Important Features ---")
    print(feature_importance.head(10))

    importance_fig, importance_ax = plt.subplots(figsize=(10, 6))
    # `palette` without `hue` is deprecated in seaborn 0.13, so the feature
    # name is also passed as hue. dodge=False keeps one full-width bar per
    # feature on older seaborn versions.
    sns.barplot(data=feature_importance.head(10), x='importance', y='feature',
                hue='feature', palette='viridis', dodge=False, ax=importance_ax)
    # Older seaborn draws a legend for the hue levels; it only repeats the
    # y-axis labels, so remove it when present.
    if importance_ax.get_legend() is not None:
        importance_ax.get_legend().remove()
    importance_ax.set_title(f'Top 10 Feature Importance - {best_model_name}',
                            fontsize=14, fontweight='bold')
    importance_ax.set_xlabel(importance_axis_label, fontweight='bold')
    importance_ax.set_ylabel('Features', fontweight='bold')
    plt.tight_layout()
    plt.show()

    print("✓ Feature importance analysis completed")
else:
    # Do not claim the analysis ran when the model has nothing to report.
    print(f"  {best_model_name} does not expose feature importances or coefficients; step skipped")


In [None]:
# ============================================================================
# STEP 7: PREDICTION FUNCTION
# ============================================================================
print("\n[7/7] Setting Up Prediction System...")

def predict_heart_disease(patient_data, model=None, feature_scaler=None):
    """
    Predict heart disease for a new patient

    Parameters:
    -----------
    patient_data : dict or pd.DataFrame
        Patient features (same names as the training data). Key/column order
        does not matter and extra columns are ignored when the fitted
        scaler/model recorded its training feature names. Only the first row
        of a DataFrame is scored.
    model : fitted classifier, optional
        Object with `predict` and `predict_proba`. Defaults to the notebook's
        `best_model`.
    feature_scaler : fitted transformer, optional
        Object with `transform`, applied to the features before prediction.
        When `model` is omitted and the best model is Logistic Regression or
        Support Vector Machine, the notebook's `scaler` is used automatically.
        When `model` is supplied, scaling happens only if this is supplied too.

    Returns:
    --------
    prediction : int (0 or 1)
    probability : float (0 to 1)

    Raises:
    -------
    ValueError
        If `patient_data` lacks a feature that was present during training.
    """
    if isinstance(patient_data, dict):
        patient_data = pd.DataFrame([patient_data])

    # Fall back to the notebook-level winner (and its scaling rule) by default.
    if model is None:
        model = best_model
        needs_scaling = best_model_name in ('Logistic Regression', 'Support Vector Machine')
        if feature_scaler is None and needs_scaling:
            feature_scaler = scaler

    # Whichever object first receives the raw frame may have recorded the
    # training column order; align to it so that a dict with differently
    # ordered keys is neither rejected nor silently mis-assigned.
    first_stage = model if feature_scaler is None else feature_scaler
    expected_columns = getattr(first_stage, 'feature_names_in_', None)
    if expected_columns is not None:
        expected_columns = list(expected_columns)
        absent = [col for col in expected_columns if col not in patient_data.columns]
        if absent:
            raise ValueError(f"patient_data is missing required features: {absent}")
        patient_data = patient_data[expected_columns]

    if feature_scaler is None:
        features = patient_data
    else:
        features = feature_scaler.transform(patient_data)

    prediction = model.predict(features)[0]
    probability = model.predict_proba(features)[0][1]

    # Return native Python scalars (as documented) rather than NumPy scalars.
    if hasattr(prediction, 'item'):
        prediction = prediction.item()
    return prediction, float(probability)

# Demonstrate the predictor on the first held-out patient and compare the
# outcome with that patient's recorded label.
print("\n--- Example Prediction ---")
sample_patient = X_test.iloc[0].to_dict()
pred, prob = predict_heart_disease(sample_patient)

print(f"Patient Data: {sample_patient}")

# Label 1 is reported as "Heart Disease", anything else as "Normal"
predicted_label = 'Heart Disease' if pred == 1 else 'Normal'
print(f"Prediction: {predicted_label}")
print(f"Probability: {prob:.2%}")

actual_label = 'Heart Disease' if y_test.iloc[0] == 1 else 'Normal'
print(f"Actual: {actual_label}")

print("\n✓ Prediction system ready")


In [None]:
# ============================================================================
# FINAL SUMMARY
# ============================================================================
print("\n" + "=" * 60)
print("FINAL SUMMARY")
print("=" * 60)
# df still contains the 'target' label column, so it must be excluded from the
# feature count (df.shape[1] alone over-reports by one).
feature_count = df.drop(columns='target').shape[1]
print(f"✓ Dataset: {df.shape[0]} patients, {feature_count} features")
print(f"✓ Best Model: {best_model_name}")
print(f"✓ Test Accuracy: {best_accuracy:.2%}")
print(f"✓ ROC-AUC Score: {results[best_model_name]['roc_auc']:.4f}")
print("✓ Model ready for predictions!")
print("=" * 60)

# Usage tips (nothing is written to disk here; the fitted models live only in
# this kernel session)
print("\n💡 Tip: You can use 'predict_heart_disease()' function for new predictions")
print("💡 All models and results are stored in the 'results' dictionary")