# Heart Disease Classification (Kaggle)

This notebook trains and evaluates three models (Logistic Regression, Random Forest, and a Neural Network) that predict the presence of heart disease, using the classic Kaggle Heart Disease dataset.

In [None]:
# Install libraries if needed (uncomment if running in fresh Colab)
# !pip install numpy pandas scikit-learn matplotlib seaborn joblib tensorflow

import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import tensorflow as tf

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, RocCurveDisplay


In [None]:
# Load the dataset. The path is relative to the notebook's working directory,
# so heart.csv must be placed inside a `data/` sub-folder of that directory.
DATA_PATH = "data/heart.csv"
df = pd.read_csv(DATA_PATH)

# Quick sanity check: (rows, columns) followed by a preview of the first rows.
print("Dataset shape:", df.shape)
df.head()


In [None]:
# Column dtypes / non-null counts first, then the number of rows per target class.
df.info()

class_counts = df['target'].value_counts()
print("\nClass distribution:")
print(class_counts)

# Same information as the printed counts, drawn as a bar chart.
ax = sns.countplot(data=df, x='target')
ax.set_title("Target Distribution")
plt.show()


In [None]:
# Pairwise correlation between all columns of df, annotated to two decimals.
fig, ax = plt.subplots(figsize=(10, 8))
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()


In [None]:
# One seed shared by the split below and the scikit-learn models that follow.
RANDOM_SEED = 42

# Separate the label column from the feature columns.
y = df['target']
X = df.drop(columns=['target'])

# Hold out 20% of the rows for testing; stratify=y keeps the class ratio of y
# in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_SEED,
)


In [None]:
# Logistic regression with standardization bundled into one pipeline, so the
# scaler is fitted on the training rows only and re-applied at predict time.
pipe_lr = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('lr', LogisticRegression(max_iter=2000, random_state=RANDOM_SEED)),
])
pipe_lr.fit(X_train, y_train)

y_pred_lr = pipe_lr.predict(X_test)
# Column 1 of predict_proba is the probability of the second entry in classes_.
y_prob_lr = pipe_lr.predict_proba(X_test)[:, 1]

# Report the held-out metrics in a fixed order.
lr_metrics = {
    "Logistic Regression Accuracy:": accuracy_score(y_test, y_pred_lr),
    "F1:": f1_score(y_test, y_pred_lr),
    "ROC AUC:": roc_auc_score(y_test, y_prob_lr),
}
for metric_label, metric_value in lr_metrics.items():
    print(metric_label, metric_value)


In [None]:
# Random forest fitted on the unscaled training features.
rf = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=5,
    random_state=RANDOM_SEED,
)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
# Scores for ROC AUC: column 1 of predict_proba (second entry in classes_).
y_prob_rf = rf.predict_proba(X_test)[:, 1]

rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_f1 = f1_score(y_test, y_pred_rf)
rf_auc = roc_auc_score(y_test, y_prob_rf)

print("Random Forest Accuracy:", rf_accuracy)
print("F1:", rf_f1)
print("ROC AUC:", rf_auc)


In [None]:
# Pair each importance score from the fitted forest with its feature name and
# rank the features from most to least important.
feature_names = X.columns
importances = (
    pd.Series(rf.feature_importances_, index=feature_names)
    .sort_values(ascending=False)
)

importances.plot.bar(figsize=(10, 5), title="Random Forest Feature Importance")
plt.show()


In [None]:
# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialization, dropout masks and batch shuffling repeat on every
# Restart & Run All. (Some GPU kernels may still be non-deterministic.)
tf.keras.utils.set_random_seed(RANDOM_SEED)

# The network is not wrapped in a scikit-learn Pipeline, so it gets its own
# scaler: fitted on the training rows only, then re-applied to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Small fully connected binary classifier. The input shape is declared with an
# explicit Input layer; passing `input_shape=` to Dense is deprecated in Keras 3.
n_features = X_train_scaled.shape[1]
mlp = tf.keras.Sequential([
    tf.keras.Input(shape=(n_features,)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

mlp.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Stop once validation loss has not improved for 5 epochs and roll back to the
# best weights seen. `monitor='val_loss'` is the Keras default, spelled out here.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5, restore_best_weights=True)

# validation_split=0.2 holds out part of the training data for early stopping.
history = mlp.fit(
    X_train_scaled, y_train,
    validation_split=0.2,
    epochs=100,
    batch_size=16,
    callbacks=[early_stop],
    verbose=0,
)

# predict() returns shape (n_samples, 1); flatten it and threshold at 0.5.
y_pred_nn_prob = mlp.predict(X_test_scaled).ravel()
y_pred_nn = (y_pred_nn_prob > 0.5).astype(int)

print("Neural Network Accuracy:", accuracy_score(y_test, y_pred_nn))
print("F1:", f1_score(y_test, y_pred_nn))
print("ROC AUC:", roc_auc_score(y_test, y_pred_nn_prob))


In [None]:
# Persist every fitted artifact under ./models/.
# joblib.dump opens the target path directly and raises FileNotFoundError when
# the folder is missing, so create it first (no-op if it already exists).
os.makedirs("models", exist_ok=True)

joblib.dump(pipe_lr, "models/logistic_pipeline.joblib")
joblib.dump(rf, "models/random_forest.joblib")
# The neural network was trained on inputs transformed by `scaler`, so the
# scaler is saved separately to reproduce that transform at inference time.
joblib.dump(scaler, "models/scaler.joblib")
mlp.save("models/neural_network.keras")

print("Models saved to ./models/")


In [None]:
# Round-trip check: read every saved artifact back from ./models/ and score a
# single hand-written record with each model.
# NOTE: joblib.load unpickles the file, so only load artifacts you trust.
mlp_loaded = tf.keras.models.load_model("models/neural_network.keras")
scaler_loaded = joblib.load("models/scaler.joblib")
rf_loaded = joblib.load("models/random_forest.joblib")
pipe_lr_loaded = joblib.load("models/logistic_pipeline.joblib")

# One example record; wrapping it in a list yields a one-row DataFrame whose
# columns follow the key order below.
sample = {
    "age": 63,
    "sex": 1,
    "cp": 3,
    "trestbps": 145,
    "chol": 233,
    "fbs": 1,
    "restecg": 0,
    "thalach": 150,
    "exang": 0,
    "oldpeak": 2.3,
    "slope": 0,
    "ca": 0,
    "thal": 1,
}
X_sample = pd.DataFrame([sample])

# Logistic regression: the pipeline applies its own scaler internally.
prob_lr = pipe_lr_loaded.predict_proba(X_sample)[0, 1]
pred_lr = pipe_lr_loaded.predict(X_sample)[0]

# Random forest: scored on the raw, unscaled features.
prob_rf = rf_loaded.predict_proba(X_sample)[0, 1]
pred_rf = rf_loaded.predict(X_sample)[0]

# Neural network: needs the separately saved scaler, then a 0.5 threshold.
X_sample_scaled = scaler_loaded.transform(X_sample)
prob_nn = mlp_loaded.predict(X_sample_scaled).ravel()[0]
pred_nn = 1 if prob_nn > 0.5 else 0

# Print predicted label and probability for each model.
for model_label, model_pred, model_prob in (
    ("Logistic Regression:", pred_lr, prob_lr),
    ("Random Forest:", pred_rf, prob_rf),
    ("Neural Net:", pred_nn, prob_nn),
):
    print(model_label, model_pred, model_prob)
