# Model Training

In [None]:
# Import machine learning libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# Load the original dataset 
df = pd.read_csv('../data/heart_disease.csv')
print("Raw dataset loaded")
print(f"Shape: {df.shape}")
df.head()

Raw dataset loaded
Shape: (303, 14)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,1
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0,0


In [7]:
# Separate features (X) and target (y)
X = df.drop('target', axis=1)  # Everything except the target column
y = df['target']               # Only the target column

print("Data split into features and target!")
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

Data split into features and target!
Features shape: (303, 13)
Target shape: (303,)


In [8]:
# Split data: 80% for training, 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("Data split into training and testing sets!")
print(f"Training set: {X_train.shape}")
print(f"Testing set: {X_test.shape}")
print(f"Target distribution in training: {np.bincount(y_train)}")
print(f"Target distribution in testing: {np.bincount(y_test)}")

Data split into training and testing sets!
Training set: (242, 13)
Testing set: (61, 13)
Target distribution in training: [131 111]
Target distribution in testing: [33 28]


In [10]:
# Initialize models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Support Vector Machine': SVC(random_state=42)
}

# Train & evaluate each model using PCA pipeline
results = {}
for name, model in models.items():
    print(f"\nTraining {name}...")
    pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("pca", PCA(n_components=12)),
        ("model", model)
    ])
    
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    results[name] = (acc, pipeline)
    print(f"{name} accuracy: {acc:.3f}")


Training Logistic Regression...
Logistic Regression accuracy: 0.869

Training Random Forest...
Random Forest accuracy: 0.885

Training Support Vector Machine...
Support Vector Machine accuracy: 0.869


In [13]:
# Pick the best model
best_model_name = max(results, key=lambda k: results[k][0])
best_accuracy, best_pipeline = results[best_model_name]

print(f"\nBEST MODEL: {best_model_name} ({best_accuracy:.3f} accuracy)")


BEST MODEL: Random Forest (0.885 accuracy)


In [14]:
# Save pipeline (includes scaler + PCA + model)
joblib.dump(best_pipeline, '../models/heart_disease_pipeline.pkl')
print("Pipeline saved successfully!")

Pipeline saved successfully!
