In [13]:
import warnings
warnings.filterwarnings("ignore", message=".*valid feature names.*", category=UserWarning)

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold, GridSearchCV, cross_val_predict, cross_val_score
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc, precision_recall_curve
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import files


In [16]:
## 2) Load & Merge Data
feat = pd.read_csv(files.PPG_MY_OWN)
meta = pd.read_csv(files.METADATA_PATH)
df = feat.merge(meta[['subject_ID', 'diabetes_label']], on="subject_ID", how="inner")

# 3) Define Features and Targets, Drop Constant Features
FEATURE_COLS = [c for c in df.columns if c not in ('subject_ID', 'diabetes_label')]
# Drop constant features
df_features = df[FEATURE_COLS]
constant_features = df_features.columns[df_features.nunique() <= 1]
df = df.drop(columns=constant_features)
print(f"Dropped constant features: {list(constant_features)}")
FEATURE_COLS = [c for c in df.columns if c not in ('subject_ID', 'diabetes_label')]
print(f"Remaining features: {len(FEATURE_COLS)}")

X = df[FEATURE_COLS].values
y = df['diabetes_label'].values
groups = df['subject_ID'].values

# 4) Plot Class Distribution
plt.figure(figsize=(6, 4))
sns.countplot(x='diabetes_label', data=df, hue='diabetes_label', palette='Set2', legend=False)
plt.title('Class Distribution of Diabetes Labels', fontsize=14)
plt.xlabel('Diabetes Label (0: No, 1: Yes)')
plt.ylabel('Count')
plt.savefig('class_distribution.png')
plt.close()

# 5) Subject-wise Stratified CV and Generate Splits
cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
cv_splits = list(cv.split(X, y, groups))  # Generate splits once

# 6) Helper to Restore DataFrame for LightGBM
to_df = FunctionTransformer(lambda X: pd.DataFrame(X, columns=FEATURE_COLS))

# 7) Define Pipelines with Adjusted Feature Selection
models = {
    "RandomForest": {
        "pipe": Pipeline([
            ("scaler", StandardScaler()),
            ("selector", SelectKBest(score_func=f_classif, k='all')),
            ("clf", RandomForestClassifier(random_state=42))
        ]),
        "params": {
            "clf__n_estimators": [50, 100, 200, 300],
            "clf__max_depth": [None, 10, 20, 30],
            "clf__max_features": ["sqrt", "log2", 0.5, 0.7]
        }
    },
    "SVM": {
        "pipe": Pipeline([
            ("scaler", StandardScaler()),
            ("selector", SelectKBest(score_func=f_classif, k='all')),
            ("clf", SVC(kernel="rbf", probability=True, random_state=42))
        ]),
        "params": {
            "clf__C": [0.1, 1, 10, 100],
            "clf__gamma": ["scale", "auto", 0.01, 0.1, 1]
        }
    },
    "GradientBoosting": {
        "pipe": Pipeline([
            ("scaler", StandardScaler()),
            ("selector", SelectKBest(score_func=f_classif, k='all')),
            ("clf", GradientBoostingClassifier(random_state=42))
        ]),
        "params": {
            "clf__n_estimators": [50, 100, 200, 300],
            "clf__learning_rate": [0.001, 0.01, 0.1, 0.2],
            "clf__max_depth": [3, 5, 7, 9],
            "clf__subsample": [0.6, 0.8, 1.0]
        }
    },
    "LightGBM": {
        "pipe": Pipeline([
            ("scaler", StandardScaler()),
            ("selector", SelectKBest(score_func=f_classif, k='all')),
            ("to_df", to_df),
            ("clf", LGBMClassifier(random_state=42, verbosity=-1, class_weight='balanced'))
        ]),
        "params": {
            "clf__n_estimators": [50, 100, 200, 300],
            "clf__learning_rate": [0.001, 0.01, 0.1, 0.2],
            "clf__num_leaves": [20, 31, 50, 70],
            "clf__max_depth": [-1, 5, 10, 15]
        }
    }
}

# 8) Tune Models and Collect OOF Predictions
best_estimators = {}
y_pred_all = {}
y_proba_all = {}
model_names = list(models.keys())

for name, spec in models.items():
    print(f"\n=== Tuning {name} ===")
    search = GridSearchCV(
        estimator=spec["pipe"],
        param_grid=spec["params"],
        scoring="accuracy",
        cv=cv_splits,
        n_jobs=-1,
        verbose=1
    )
    search.fit(X, y)
    best = search.best_estimator_
    best_estimators[name] = best
    print(f"{name} best params: {search.best_params_}")
    print(f"{name} CV acc: {search.best_score_:.4f}")

    y_pred_all[name] = cross_val_predict(best, X, y, cv=cv_splits, n_jobs=-1)
    y_proba_all[name] = cross_val_predict(
        best, X, y, method="predict_proba", cv=cv_splits, n_jobs=-1
    )[:, 1]

    print(f"\n{name} Classification Report:")
    print(classification_report(y, y_pred_all[name], target_names=["No", "Yes"], zero_division=1))

    if name == "RandomForest":
        importances = best.named_steps['clf'].feature_importances_
        selected_features = FEATURE_COLS
        plt.figure(figsize=(10, 6))
        sns.barplot(x=importances, y=selected_features, palette='viridis')
        plt.title('RandomForest Feature Importances', fontsize=14)
        plt.xlabel('Importance')
        plt.ylabel('Feature')
        plt.savefig('randomforest_feature_importances.png')
        plt.close()

# 9) Plot Evaluation Metrics for Base Models
plot_evaluation_metrics(y, y_pred_all, y_proba_all, model_names, plot_prefix="base")

# 10) Build and Evaluate Ensembles
estimators = [(n, est) for n, est in best_estimators.items()]

stack = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    cv=cv_splits,
    n_jobs=-1
)
vote = VotingClassifier(
    estimators=estimators,
    voting="soft",
    n_jobs=-1
)
weighted_vote = VotingClassifier(
    estimators=estimators,
    voting="soft",
    weights=[search.best_score_ for search in [GridSearchCV(
        estimator=spec["pipe"],
        param_grid=spec["params"],
        scoring="accuracy",
        cv=cv_splits,
        n_jobs=-1
    ).fit(X, y) for spec in models.values()]],
    n_jobs=-1
)



print("\nAll plots saved as PNG files.")

Dropped constant features: ['num_segments', 'length_to_max_ratio']
Remaining features: 4

=== Tuning RandomForest ===
Fitting 5 folds for each of 64 candidates, totalling 320 fits
RandomForest best params: {'clf__max_depth': 10, 'clf__max_features': 'sqrt', 'clf__n_estimators': 200}
RandomForest CV acc: 0.8129

RandomForest Classification Report:
              precision    recall  f1-score   support

          No       0.83      0.98      0.90       181
         Yes       0.20      0.03      0.05        38

    accuracy                           0.81       219
   macro avg       0.51      0.50      0.47       219
weighted avg       0.72      0.81      0.75       219


=== Tuning SVM ===
Fitting 5 folds for each of 20 candidates, totalling 100 fits



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=importances, y=selected_features, palette='viridis')


SVM best params: {'clf__C': 0.1, 'clf__gamma': 'scale'}
SVM CV acc: 0.8266

SVM Classification Report:
              precision    recall  f1-score   support

          No       0.83      1.00      0.91       181
         Yes       1.00      0.00      0.00        38

    accuracy                           0.83       219
   macro avg       0.91      0.50      0.45       219
weighted avg       0.86      0.83      0.75       219


=== Tuning GradientBoosting ===
Fitting 5 folds for each of 192 candidates, totalling 960 fits
GradientBoosting best params: {'clf__learning_rate': 0.01, 'clf__max_depth': 7, 'clf__n_estimators': 100, 'clf__subsample': 0.6}
GradientBoosting CV acc: 0.8312

GradientBoosting Classification Report:
              precision    recall  f1-score   support

          No       0.83      1.00      0.91       181
         Yes       1.00      0.03      0.05        38

    accuracy                           0.83       219
   macro avg       0.92      0.51      0.48       219
