In [20]:
import os

import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef,
    confusion_matrix
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from models.logistic_regression import train_model

In [19]:
# Read the heart-disease CSV and discard every row that has a missing value.
df = pd.read_csv("./data/HeartDiseaseTrain-Test.csv").dropna()

# Split the frame into the label column ("target") and the remaining features.
y = df["target"]
X = df.drop(columns="target")

# Hold out 20% of the rows for testing; stratify on the label and fix the
# seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [21]:
# Columns passed through StandardScaler.
num_features = [
    "age",
    "resting_blood_pressure",
    "cholestoral",
    "Max_heart_rate",
    "oldpeak",
]

# Columns passed through OneHotEncoder.
cat_features = [
    "sex",
    "chest_pain_type",
    "fasting_blood_sugar",
    "rest_ecg",
    "exercise_induced_angina",
    "slope",
    "vessels_colored_by_flourosopy",
    "thalassemia",
]

# Column-wise preprocessing: scale the numeric columns and one-hot encode the
# categorical ones. handle_unknown="ignore" keeps transform() from raising on
# categories that were not present when the encoder was fitted.
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), num_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features),
    ]
)

In [22]:
def evaluate_model(model, X_test, y_test):
    """
    Evaluates a trained classification model on test data.

    Column 1 of ``predict_proba`` is used as the positive-class score, so
    this function assumes a binary target.

    Parameters:
    - model : trained estimator (e.g. an sklearn Pipeline) exposing
      ``predict`` and, optionally, ``predict_proba`` or ``decision_function``
    - X_test : pandas DataFrame (features)
    - y_test : pandas Series (true labels)

    Returns:
    - metrics : dict with keys "Accuracy", "AUC", "Precision", "Recall",
      "F1" and "MCC"; "AUC" is None only when the model exposes neither
      ``predict_proba`` nor ``decision_function``
    - cm : confusion matrix from ``confusion_matrix(y_test, y_pred)``
    """

    # Hard class predictions drive every metric except AUC.
    y_pred = model.predict(X_test)

    # AUC needs a continuous score. Prefer the positive-class probability;
    # otherwise fall back to the raw decision values, which roc_auc_score
    # also accepts for binary targets. Previously such models got AUC=None.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X_test)
    else:
        y_score = None   # no continuous score available

    auc = roc_auc_score(y_test, y_score) if y_score is not None else None

    # Metrics
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "AUC": auc,
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "MCC": matthews_corrcoef(y_test, y_pred)
    }

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)

    return metrics, cm

In [None]:
# Train the model on the training split using the shared preprocessor.
# NOTE(review): train_model comes from models.logistic_regression; it is
# presumably a fitted preprocessing + classifier pipeline — confirm there.
logistic_model = train_model(preprocessor, X_train, y_train)

# Score the trained model on the held-out split.
metrics, cm = evaluate_model(logistic_model, X_test, y_test)

print(metrics)
print(cm)

# Save the model as a .pkl file. Create the output directory first so that
# joblib.dump does not fail on a fresh checkout where ./saved_models does
# not exist yet.
os.makedirs("./saved_models", exist_ok=True)
joblib.dump(logistic_model, "./saved_models/logistic_regression.pkl")

{'Accuracy': 0.8731707317073171, 'AUC': 0.9444761904761905, 'Precision': 0.8558558558558559, 'Recall': 0.9047619047619048, 'F1': 0.8796296296296297, 'MCC': 0.7471136777897657}
[[84 16]
 [10 95]]


['./saved_models/logistic_regression.pkl']

In [None]:
# Export the held-out split (features plus a trailing "target" column) as a
# standalone CSV. The labels are attached positionally via .values.
test_df = X_test.assign(target=y_test.values)
test_df.to_csv("./data/heart_test.csv", index=False)