In [2]:
# ==========================
# Heart Disease Training
# ==========================

# Step 1: Import Libraries
import glob
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# ==========================
# Step 2: Load and Merge Data
# ==========================
# Sort the matches: glob.glob() returns paths in arbitrary, filesystem-dependent
# order, and the row order of the merged frame feeds the seeded train/test split
# further down, so an unsorted list can give different results on another machine.
files = sorted(glob.glob("../data/heart+disease/processed.*.data"))
if not files:
    # Fail here with an actionable message instead of letting pd.concat raise
    # "No objects to concatenate" on an empty list.
    raise FileNotFoundError(
        "No files matched '../data/heart+disease/processed.*.data'; "
        "check the working directory and that the dataset is in place."
    )

# header=None: the first line of each file is read as data, not column names,
# so the columns are labelled with integers.
df_list = [pd.read_csv(file, header=None) for file in files]
# ignore_index=True renumbers the rows 0..n-1 across all files.
df = pd.concat(df_list, ignore_index=True)
print("Initial Dataset shape:", df.shape)

# ==========================
# Step 3: Clean Missing Values
# ==========================
# Treat the '?' and -9 markers as missing, discard every row that has at least
# one missing field, then coerce each remaining column to a numeric dtype.
df = (
    df.replace(['?', -9], np.nan)
      .dropna()
      .apply(pd.to_numeric)
)
print("After cleaning:", df.shape)


# ==========================
# Step 4: Features & Labels
# ==========================
# The last column is the label; every column before it is a feature.
X = df.iloc[:, :-1]

# Binarise the label: 0 stays 0 (no disease), anything above 0 becomes 1 (disease).
y = df.iloc[:, -1].gt(0).astype(int)
print("Target distribution:\n", y.value_counts())

# ==========================
# Step 5: Train/Test Split
# ==========================
# Hold out 20% of the rows, stratified on the label so both parts keep the
# same class ratio; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply that
# same transform to both the training rows and the held-out rows.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# ==========================
# Step 6: Train Models
# ==========================
# Logistic regression with the iteration cap set to 1000; fit() returns the
# estimator itself, so construction and training can be chained.
log_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Random forest of 200 trees with a fixed seed.
rf_model = RandomForestClassifier(n_estimators=200, random_state=42).fit(
    X_train, y_train
)

# ==========================
# Step 7: Evaluate Models
# ==========================
print("\nModel Performance:")
# For each model, print its accuracy on the training rows followed by its
# accuracy on the held-out rows.
for _label, _estimator in (
    ("Logistic Regression Accuracy (Train/Test):", log_model),
    ("Random Forest Accuracy (Train/Test):", rf_model),
):
    print(
        _label,
        _estimator.score(X_train, y_train),
        _estimator.score(X_test, y_test),
    )

# ==========================
# Step 8: Save Best Model
# ==========================
# Create the output directory first: open(..., "wb") does not create missing
# parent directories, so without this a fresh checkout would fail with
# FileNotFoundError only after all the training work has been done.
os.makedirs("../model", exist_ok=True)

# NOTE: the random forest is persisted unconditionally; it is not selected by
# comparing the scores printed above.
with open("../model/heart_model.pkl", "wb") as f:
    pickle.dump(rf_model, f)

# Persist the fitted scaler next to the model, since the model was trained on
# features transformed by it.
with open("../model/heart_scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

print("\n✅ Model and Scaler saved as heart_model.pkl & heart_scaler.pkl")


Initial Dataset shape: (920, 14)
After cleaning: (299, 14)
Target distribution:
 13
0    160
1    139
Name: count, dtype: int64

Model Performance:
Logistic Regression Accuracy (Train/Test): 0.8577405857740585 0.8166666666666667
Random Forest Accuracy (Train/Test): 1.0 0.8333333333333334

✅ Model and Scaler saved as heart_model.pkl & heart_scaler.pkl
