
# PRT565 — Assessment 2 (Machine Learning Coding Exercise)

**Student Name:** Md Sajjad Hossain Sawran  
**Student ID:** s377923  

**Dataset:** `heart (1).csv` (Kaggle upload)

**Aim:** Develop an ML-based model using **Logistic Regression**, **Decision Tree**, and **Random Forest** on the uploaded dataset.  
This notebook is fully commented and runs end-to-end.



## 0) Instructions
1. This notebook already points to your uploaded file.  
2. If you change files, update `DATA_PATH` and (if needed) `TARGET_COL` below.  
3. Run cells **top to bottom**.


In [None]:

# 1) Imports & Configuration
import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix, RocCurveDisplay, classification_report
)

import joblib

# One seed, reused by every split, CV splitter and model defined further down.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Plot defaults applied to every figure in the notebook.
plt.rcParams.update({
    'figure.figsize': (7, 5),
    'axes.grid': True,
})

# Print the installed scikit-learn version as a quick sanity check.
print('Ready. sklearn version:', __import__('sklearn').__version__)



## 2) Data loading
- `DATA_PATH` points to your Kaggle CSV.  
- `TARGET_COL` is auto-detected if possible; adjust if needed.


In [None]:

# 2) Dataset location and label column
DATA_PATH = Path(r"/mnt/data/heart (1).csv")
DETECTED_TARGET = 'target'
# Use the detected label column when there is one, otherwise the default name.
if DETECTED_TARGET is None:
    TARGET_COL = 'HeartDisease'
else:
    TARGET_COL = DETECTED_TARGET

print('Loading dataset from:', DATA_PATH)
df = pd.read_csv(DATA_PATH)
print('Shape:', df.shape)
print('Columns:', list(df.columns))

if TARGET_COL not in df.columns:
    print(f"WARNING: Target column '{TARGET_COL}' not found. Please set TARGET_COL to the correct name.")
    # Fallback search: take the first well-known label name present in the file.
    # If none is present, TARGET_COL is left unchanged.
    fallback = next(
        (
            candidate
            for candidate in ['HeartDisease','target','Target','output','Output','Outcome','class','Class','label','Label']
            if candidate in df.columns
        ),
        None,
    )
    if fallback is not None:
        TARGET_COL = fallback
        print('Auto-switched TARGET_COL to:', TARGET_COL)

# Last expression of the cell: the notebook renders the first rows.
df.head()



## 3) Quick data audit
- Dtypes, numeric summary, missing values, and class balance.


In [None]:

# Quick audit of the loaded frame: column dtypes, numeric summary statistics,
# missing-value counts and the class balance of the target column.
print('\nDTypes:\n', df.dtypes)

# DataFrame.describe() does not accept a `numeric_only` keyword (passing it
# raises TypeError); restricting the summary to numeric columns is done
# through `include`.
print('\nDescribe (numeric):\n', df.describe(include=[np.number]).T)

# Columns with the most missing values are listed first.
missing = df.isna().sum().sort_values(ascending=False)
print('\nMissing values per column:\n', missing)

# dropna=False so that missing labels, if any, show up in the counts.
print('\nTarget value counts:')
print(df[TARGET_COL].value_counts(dropna=False))



## 4) Train / Test split
- 25% test, stratified by target.


In [None]:

# Separate the label column from the feature columns.
y = df[TARGET_COL]
X = df.drop(columns=[TARGET_COL])

# Hold out 25% of the rows; stratifying on y keeps the class mix of both
# parts aligned with the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.25,
    random_state=RANDOM_STATE,
)
print('Train shape:', X_train.shape, ' Test shape:', X_test.shape)



## 5) Column typing
- Auto-detect numeric vs categorical; adjust manually if needed.


In [None]:

# Split the feature columns into two groups by dtype; each group gets its own
# preprocessing branch later on.
numeric_frame = X_train.select_dtypes(include=['int64','float64','int32','float32'])
categorical_frame = X_train.select_dtypes(include=['object','category','bool'])
numeric_cols = numeric_frame.columns.tolist()
categorical_cols = categorical_frame.columns.tolist()

print('Numeric columns:', numeric_cols)
print('Categorical columns:', categorical_cols)
# Every feature column must fall into one of the two groups.
assert set(numeric_cols) | set(categorical_cols) == set(X_train.columns), 'Column typing mismatch.'



## 6) Preprocessing pipelines
- Numeric: median imputer + standard scaler  
- Categorical: most_frequent imputer + one-hot


In [None]:

# Numeric branch: median imputation followed by standard scaling.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler(with_mean=True, with_std=True)),
]
numeric_preprocess = Pipeline(steps=numeric_steps)

# Categorical branch: most-frequent imputation followed by dense one-hot
# encoding; categories unseen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
categorical_preprocess = Pipeline(steps=categorical_steps)

# Route each column group to its branch.
preprocess = ColumnTransformer(
    transformers=[
        ('num', numeric_preprocess, numeric_cols),
        ('cat', categorical_preprocess, categorical_cols),
    ]
)
# Last expression of the cell: the notebook renders the transformer.
preprocess



## 7) Define models + hyperparameter grids
- Logistic Regression, Decision Tree, Random Forest


In [None]:

# Each candidate is a two-step pipeline (shared preprocessing -> classifier)
# paired with the hyperparameter grid that the search will explore.

# Logistic Regression (liblinear solver), searched over C and penalty.
logreg_pipe = Pipeline([
    ('prep', preprocess),
    ('clf', LogisticRegression(solver='liblinear', max_iter=2000, random_state=RANDOM_STATE)),
])
logreg_grid = dict(
    clf__C=[0.01, 0.1, 1.0, 10.0],
    clf__penalty=['l1', 'l2'],
)

# Decision Tree, searched over split criterion, depth and node-size limits.
dt_pipe = Pipeline([
    ('prep', preprocess),
    ('clf', DecisionTreeClassifier(random_state=RANDOM_STATE)),
])
dt_grid = dict(
    clf__criterion=['gini', 'entropy', 'log_loss'],
    clf__max_depth=[None, 4, 6, 8, 12, 16],
    clf__min_samples_split=[2, 5, 10],
    clf__min_samples_leaf=[1, 2, 4],
)

# Random Forest, searched over ensemble size, depth, node-size limits and the
# number of features considered per split.
rf_pipe = Pipeline([
    ('prep', preprocess),
    ('clf', RandomForestClassifier(n_jobs=-1, random_state=RANDOM_STATE)),
])
rf_grid = dict(
    clf__n_estimators=[100, 300, 500],
    clf__max_depth=[None, 6, 10, 16],
    clf__min_samples_split=[2, 5, 10],
    clf__min_samples_leaf=[1, 2, 4],
    clf__max_features=['sqrt', 'log2', None],
)

# (display name, pipeline, grid) triples consumed by the tuning loop.
models_and_grids = [
    ('LogisticRegression', logreg_pipe, logreg_grid),
    ('DecisionTree', dt_pipe, dt_grid),
    ('RandomForest', rf_pipe, rf_grid),
]

# One shared 5-fold stratified splitter so all models are tuned on the same folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
print('Pipelines and grids ready.')



## 8) Hyperparameter tuning (GridSearchCV, 5-fold)
Scoring: **accuracy**.


In [None]:

# Options common to every grid search: accuracy scoring on the shared CV
# splitter, all cores, refit of the best configuration on the full training set.
search_options = dict(
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=1,
    refit=True,
    return_train_score=True,
)

# Fitted searches keyed by model display name.
search_results = {}
for model_name, pipeline, param_grid in models_and_grids:
    print(f'\n=== Tuning {model_name} ===')
    search = GridSearchCV(estimator=pipeline, param_grid=param_grid, **search_options)
    search.fit(X_train, y_train)
    search_results[model_name] = search
    print(f'Best CV accuracy for {model_name}: {search.best_score_:.4f}')
    print('Best params:', search.best_params_)



## 9) Evaluation on the test set
- Accuracy, Precision, Recall, F1, ROC-AUC  
- Confusion Matrix & Classification Report  
- ROC curves for all models


In [None]:

def evaluate_model(name, estimator, X_te, y_te):
    """Score a fitted classifier on held-out data, print a report, return a summary.

    Parameters
    ----------
    name : str
        Label used in the printed header and stored in the returned summary.
    estimator
        Fitted object exposing ``predict`` and, optionally, ``predict_proba``
        or ``decision_function`` (used to obtain scores for ROC-AUC).
    X_te, y_te
        Held-out features and true labels.

    Returns
    -------
    dict
        Keys ``name``, ``acc``, ``prec``, ``rec``, ``f1``, ``auc`` and ``cm``
        (the confusion matrix). ``auc`` is NaN when ROC-AUC cannot be computed.
    """
    y_pred = estimator.predict(X_te)

    # Obtain a continuous score for ROC-AUC, preferring probabilities.
    if hasattr(estimator, 'predict_proba'):
        # Column 1 of predict_proba is used as the score of the positive class.
        y_prob = estimator.predict_proba(X_te)[:, 1]
    elif hasattr(estimator, 'decision_function'):
        # Min-max rescale raw margins into [0, 1]; the epsilon avoids a
        # division by zero when all margins are equal.
        raw = estimator.decision_function(X_te)
        y_prob = (raw - raw.min()) / (raw.max() - raw.min() + 1e-9)
    else:
        # No score available: use a constant placeholder.
        y_prob = np.zeros_like(y_pred, dtype=float)

    acc = accuracy_score(y_te, y_pred)
    prec = precision_score(y_te, y_pred, zero_division=0)
    rec = recall_score(y_te, y_pred, zero_division=0)
    f1 = f1_score(y_te, y_pred, zero_division=0)
    try:
        auc = roc_auc_score(y_te, y_prob)
    except ValueError:
        # roc_auc_score rejected the inputs; report NaN rather than abort.
        auc = np.nan
    cm = confusion_matrix(y_te, y_pred)

    # Single braces so the metric values are interpolated; doubled braces
    # would print the literal text "{acc:.4f}".
    print(f'\n[{name}] Test metrics')
    print(f'Accuracy:  {acc:.4f}')
    print(f'Precision: {prec:.4f}')
    print(f'Recall:    {rec:.4f}')
    print(f'F1-score:  {f1:.4f}')
    print(f'ROC-AUC:   {auc:.4f}')
    print('\nConfusion Matrix:\n', cm)
    print('\nClassification Report:\n', classification_report(y_te, y_pred, zero_division=0))

    # A plain dict; wrapping it in a second pair of braces would try to build
    # a set containing a dict and raise TypeError (dicts are unhashable).
    return {'name': name, 'acc': acc, 'prec': prec, 'rec': rec, 'f1': f1, 'auc': auc, 'cm': cm}

test_summaries = []

# ROC curves: every model is drawn on the same Axes so the curves can be
# compared directly. Without an explicit `ax`, each RocCurveDisplay call opens
# a figure of its own and the title below would only reach the last one.
fig, ax = plt.subplots()
for name, gs in search_results.items():
    est = gs.best_estimator_
    try:
        RocCurveDisplay.from_estimator(est, X_test, y_test, name=name, ax=ax)
    except Exception as exc:
        # Best-effort fallback kept from the original flow, but no longer
        # silent: say why the real curve is missing before drawing the
        # constant-score placeholder.
        print(f'Could not draw ROC curve for {name} from the estimator ({exc!r}); '
              'plotting a constant-score placeholder instead.')
        y_dummy = np.ones_like(y_test)
        RocCurveDisplay.from_predictions(y_test, y_dummy, name=name, ax=ax)
ax.set_title('ROC Curves (Test Set)')
plt.show()

# Print the per-model report and collect one summary dict per model.
for name, gs in search_results.items():
    test_summaries.append(evaluate_model(name, gs.best_estimator_, X_test, y_test))

# Rank the models by test accuracy, best first.
summary_df = pd.DataFrame(test_summaries).sort_values(by='acc', ascending=False)
summary_df



## 10) Model interpretation
- Logistic Regression coefficients  
- Tree/Forest feature importances


In [None]:

def get_feature_names(preprocess, numeric_cols, categorical_cols):
    """Return the output feature names of the preprocessing step, in column order.

    Parameters
    ----------
    preprocess
        Fitted column transformer whose ``named_transformers_['cat']`` is a
        pipeline containing a step called ``'onehot'``.
    numeric_cols : list
        Numeric input columns; they keep their names and come first.
    categorical_cols : list
        Categorical input columns; each expands into its one-hot names.

    Returns
    -------
    list
        A new list: ``numeric_cols`` followed by the one-hot feature names.
        The input lists are not modified.
    """
    feature_names = list(numeric_cols)
    if not categorical_cols:
        # With no categorical columns there is nothing the encoder could have
        # been fitted on, so it is not queried at all.
        return feature_names

    ohe = preprocess.named_transformers_['cat'].named_steps['onehot']
    feature_names.extend(ohe.get_feature_names_out(categorical_cols).tolist())
    return feature_names

# For each tuned model, print the quantities its classifier exposes:
# signed coefficients for logistic regression, feature importances for trees.
for model_name, search in search_results.items():
    fitted_pipe = search.best_estimator_
    print(f'\n=== {model_name}: Interpretation ===')
    columns = get_feature_names(fitted_pipe.named_steps['prep'], numeric_cols, categorical_cols)
    model = fitted_pipe.named_steps['clf']

    if isinstance(model, LogisticRegression):
        # Ascending sort: most negative coefficients first, most positive last.
        ranked = pd.Series(model.coef_.ravel(), index=columns).sort_values()
        print('Top negative coefficients (most protective):')
        print(ranked.head(10))
        print('\nTop positive coefficients (most risky):')
        print(ranked.tail(10))
        continue

    if isinstance(model, (DecisionTreeClassifier, RandomForestClassifier)):
        ranked = pd.Series(model.feature_importances_, index=columns).sort_values(ascending=False)
        print('Top feature importances:')
        print(ranked.head(15))
        continue

    print('No built-in interpretation for this estimator.')



## 11) Save artifacts
- Best model by test accuracy  
- Train/test splits (raw features with the target column re-attached)  
- Metrics summary CSV


In [None]:

# Winner = the summary with the highest test accuracy.
best_row = max(test_summaries, key=lambda summary: summary['acc'])
best_name = best_row['name']
best_model = search_results[best_name].best_estimator_

# Everything is written into ./outputs (created on first run).
out_dir = Path('outputs')
out_dir.mkdir(exist_ok=True)

# Persist the full fitted pipeline (preprocessing + classifier).
model_path = out_dir / f'best_model_{best_name}.pkl'
joblib.dump(best_model, model_path)

# Re-attach the labels to a copy of each split so the CSVs are self-contained.
X_train_assign = X_train.copy()
X_train_assign[TARGET_COL] = y_train.values
X_train_assign.to_csv(out_dir / 'train_split.csv', index=False)

X_test_assign = X_test.copy()
X_test_assign[TARGET_COL] = y_test.values
X_test_assign.to_csv(out_dir / 'test_split.csv', index=False)

summary_df.to_csv(out_dir / 'test_metrics_summary.csv', index=False)

print(f'Saved best model to: {model_path.resolve()}')
print('Artifacts in:', out_dir.resolve())
summary_df



## 12) Submission reminder
- Submit this **.ipynb** (with outputs) and your **CSV** file from Kaggle.  
- Ensure metrics and plots are visible after running.
