# Heart Disease Prediction - Machine Learning Project

In this notebook, we'll build a machine learning model to predict heart disease using the UCI Heart Disease dataset. We'll perform exploratory data analysis, preprocess the data, train multiple models, and evaluate their performance.

In [None]:
# Import necessary libraries
import warnings
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
warnings.filterwarnings('ignore')

# Global plot appearance: apply the 'seaborn-v0_8' matplotlib style sheet first,
# then make "husl" the default seaborn colour palette.
plt.style.use('seaborn-v0_8')
sns.set_palette(palette="husl")

## 1. Data Loading and Initial Exploration

In [None]:
# Load the dataset (the path is resolved relative to the current working directory)
df = pd.read_csv('data/heart_disease.csv')

# Display basic information about the dataset
print("Dataset Shape:", df.shape)
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly: wrapping it in print() would add a stray "None" line.
df.info()
print("\nFirst 5 rows:")
df.head()

In [None]:
# Count nulls per column and report only the columns that actually contain some
print("Missing Values:")
missing_values = df.isna().sum()
print(missing_values.loc[missing_values.gt(0)])

# Descriptive statistics; the frame is the cell's last expression so it renders as output
print("\nStatistical Summary:")
df.describe()

## 2. Exploratory Data Analysis (EDA)

In [None]:
# Bar chart of the target column as loaded, i.e. before it is collapsed to two classes
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='target', ax=ax)
ax.set_title('Distribution of Heart Disease')
ax.set_xlabel('Heart Disease (0 = No, 1+ = Yes)')
ax.set_ylabel('Count')
plt.show()

# Collapse the target to binary: a value equal to 0 stays 0, anything else becomes 1
df['target'] = df['target'].ne(0).astype('int64')

print("Target distribution after conversion:")
print(df['target'].value_counts())

In [None]:
# Age histograms (30 bins, with KDE curves), coloured by the target class
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=df, x='age', hue='target', bins=30, kde=True, ax=ax)
ax.set_title('Age Distribution by Heart Disease')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Row counts per sex, with one bar per target class inside each sex group
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='sex', hue='target', ax=ax)
ax.set_title('Heart Disease by Sex')
ax.set_xlabel('Sex (0 = Female, 1 = Male)')
ax.set_ylabel('Count')
# Replace the default hue labels with readable ones
ax.legend(title='Heart Disease', labels=['No', 'Yes'])
plt.show()

In [None]:
# Row counts per chest-pain type (`cp`), with one bar per target class inside each group
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='cp', hue='target', ax=ax)
ax.set_title('Heart Disease by Chest Pain Type')
ax.set_xlabel('Chest Pain Type')
ax.set_ylabel('Count')
# Replace the default hue labels with readable ones
ax.legend(title='Heart Disease', labels=['No', 'Yes'])
plt.show()

In [None]:
# Pairwise correlations between all columns of df, drawn as an annotated heatmap
fig, ax = plt.subplots(figsize=(14, 10))
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

## 3. Data Preprocessing

In [None]:
# Separate features and target
X = df.drop(columns='target')
y = df['target']

# Split the data BEFORE fitting any preprocessing step, so that the statistics
# learned by the imputer and the scaler come from the training rows only
# (fitting the imputer on the full table would leak test-set information).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Handle missing values: column means are estimated on the training split and
# then reused, unchanged, to fill the test split.
imputer = SimpleImputer(strategy='mean')
X_train = pd.DataFrame(
    imputer.fit_transform(X_train_raw), columns=X.columns, index=X_train_raw.index
)
X_test = pd.DataFrame(
    imputer.transform(X_test_raw), columns=X.columns, index=X_test_raw.index
)

# Full feature table filled with the training-split means. Not used for
# modelling; kept so that any code still referring to `X_imputed` keeps working.
X_imputed = pd.DataFrame(imputer.transform(X), columns=X.columns, index=X.index)

# Scale the features (the scaler is likewise fit on the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Training set shape:", X_train_scaled.shape)
print("Test set shape:", X_test_scaled.shape)

## 4. Model Training and Evaluation

In [None]:
# Candidate classifiers, keyed by the display name used in the reports and plots
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(probability=True, random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42)
}

# Fit every candidate on the scaled training split and score it on the scaled test split
results = {}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)

    # Hard labels are used for accuracy; the second predict_proba column
    # (index 1) is used as the score for ROC-AUC.
    predicted_labels = model.predict(X_test_scaled)
    predicted_scores = model.predict_proba(X_test_scaled)[:, 1]

    accuracy = accuracy_score(y_test, predicted_labels)
    roc_auc = roc_auc_score(y_test, predicted_scores)

    # Keep the fitted estimator together with its metrics and raw test-set outputs
    results[name] = dict(
        model=model,
        accuracy=accuracy,
        roc_auc=roc_auc,
        predictions=predicted_labels,
        probabilities=predicted_scores,
    )

    print(f"{name} Results:")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  ROC-AUC: {roc_auc:.4f}")
    print()

## 5. Model Comparison

In [None]:
# One row per model, holding the two headline metrics collected in `results`
comparison_df = pd.DataFrame(
    [(model_name, scores['accuracy'], scores['roc_auc']) for model_name, scores in results.items()],
    columns=['Model', 'Accuracy', 'ROC-AUC'],
)

print("Model Comparison:")
print(comparison_df)

# Grouped bar chart: each metric is drawn half a bar-width to one side of the model's tick
fig, ax = plt.subplots(figsize=(10, 6))
x = np.arange(len(results))
width = 0.35

for offset, metric in ((-width / 2, 'Accuracy'), (width / 2, 'ROC-AUC')):
    ax.bar(x + offset, comparison_df[metric], width, label=metric)

ax.set_xlabel('Models')
ax.set_ylabel('Score')
ax.set_title('Model Performance Comparison')
ax.set_xticks(x)
ax.set_xticklabels(comparison_df['Model'])
ax.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 6. Confusion Matrices

In [None]:
# Plot one confusion matrix per trained model.
# The grid is sized from `results` instead of a hard-coded 3, so adding or
# removing a model in the training cell does not break this cell. Six inches of
# width per panel reproduces the previous 18x5 figure when there are three models.
n_models = len(results)
fig, axes = plt.subplots(1, n_models, figsize=(6 * n_models, 5), squeeze=False)
# squeeze=False always yields a 2-D array (even for a single model); flatten it
# so the panels can be walked in order.
axes = axes.ravel()

for panel, (name, result) in zip(axes, results.items()):
    cm = confusion_matrix(y_test, result['predictions'])
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=panel)
    panel.set_title(f'{name} Confusion Matrix')
    panel.set_xlabel('Predicted')
    panel.set_ylabel('Actual')

plt.tight_layout()
plt.show()

## 7. ROC Curves

In [None]:
# Overlay every model's ROC curve on a single set of axes
fig, ax = plt.subplots(figsize=(10, 8))

for name, result in results.items():
    fpr, tpr, _ = roc_curve(y_test, result['probabilities'])
    ax.plot(fpr, tpr, label=f"{name} (AUC = {result['roc_auc']:.3f})")

# Dashed diagonal as the reference line
ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves Comparison')
ax.legend()
ax.grid(True)
plt.show()

## 8. Feature Importance (Random Forest)

In [None]:
# Pair each feature column with the importance reported by the fitted Random Forest
rf_model = results['Random Forest']['model']
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_
})
# Most important feature first
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

print("Feature Importance (Random Forest):")
print(feature_importance)

# Horizontal bars, in the same (descending) order as the table
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=feature_importance, y='feature', x='importance', ax=ax)
ax.set_title('Feature Importance (Random Forest)')
ax.set_xlabel('Importance')
ax.set_ylabel('Features')
plt.show()

## 9. Save Best Model

In [None]:
# Select the best model based on ROC-AUC score.
# NOTE(review): the scores in `results` were computed on the test split, so the
# score of the winner picked here is an optimistic estimate — consider selecting
# on a validation split or with cross-validation.
best_model_name = max(results, key=lambda x: results[x]['roc_auc'])
best_model = results[best_model_name]['model']

print(f"Best Model: {best_model_name}")
print(f"ROC-AUC Score: {results[best_model_name]['roc_auc']:.4f}")

# Save the best model together with the fitted scaler and imputer.
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists first (a no-op when it is already there).
model_dir = Path('model')
model_dir.mkdir(parents=True, exist_ok=True)

joblib.dump(best_model, model_dir / 'best_model.pkl')
joblib.dump(scaler, model_dir / 'scaler.pkl')
joblib.dump(imputer, model_dir / 'imputer.pkl')

print("\nModel and preprocessing objects saved successfully!")

## 10. Model Inference Function

In [None]:
def predict_heart_disease(age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpeak, slope, ca, thal,
                          model_dir='model'):
    """
    Predict heart disease risk for a patient based on input features.

    The thirteen feature arguments are packed, in signature order, into a single
    row and passed through the saved imputer, the saved scaler and the saved model.
    NOTE(review): this assumes the signature order matches the column order the
    preprocessing objects were fitted on — confirm against the training data.

    Parameters
    ----------
    age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpeak, slope, ca, thal
        Feature values for one patient.
    model_dir : str or path-like, default 'model'
        Directory containing 'best_model.pkl', 'scaler.pkl' and 'imputer.pkl'.

    Returns
    -------
    tuple
        ``(prediction, probability)``: the label returned by ``model.predict`` for
        the row, and the value in the second column (index 1) of
        ``model.predict_proba`` for the row.
    """
    # Load the trained model and preprocessing objects.
    # SECURITY: joblib.load unpickles the files, which can execute arbitrary code;
    # only point `model_dir` at artifacts from a trusted source.
    model = joblib.load(f'{model_dir}/best_model.pkl')
    scaler = joblib.load(f'{model_dir}/scaler.pkl')
    imputer = joblib.load(f'{model_dir}/imputer.pkl')

    # Create a single-row 2-D feature array (one sample, thirteen features)
    features = np.array([[age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpeak, slope, ca, thal]])

    # Apply preprocessing in the same order as in training: impute, then scale
    features_imputed = imputer.transform(features)
    features_scaled = scaler.transform(features_imputed)

    # Make prediction: hard label plus the score in predict_proba's second column
    prediction = model.predict(features_scaled)[0]
    probability = model.predict_proba(features_scaled)[0][1]

    return prediction, probability

# Example prediction: feature values for one patient, passed as keyword arguments
example_patient = dict(
    age=63, sex=1, cp=1, trestbps=145, chol=233, fbs=1, restecg=2,
    thalach=150, exang=0, oldpeak=2.3, slope=3, ca=0, thal=6,
)
prediction, probability = predict_heart_disease(**example_patient)

print(f"Prediction: {'Heart Disease' if prediction == 1 else 'No Heart Disease'}")
print(f"Probability of Heart Disease: {probability:.4f}")

---

## Summary

In this notebook, we have:
1. Loaded and explored the UCI Heart Disease dataset
2. Performed exploratory data analysis with visualizations
3. Preprocessed the data (handled missing values, scaled features)
4. Trained and evaluated three machine learning models (Random Forest, SVM, Logistic Regression)
5. Compared model performances using accuracy and ROC-AUC scores
6. Analyzed feature importance using Random Forest
7. Saved the best performing model for deployment
8. Created a prediction function for running inference on new data

The best model can now be deployed as a web service using Flask.