In [1]:
%%capture
%pip install lightgbm catboost scikit-learn tabulate

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.metrics import (
    fbeta_score,
    precision_score,
    recall_score,
    roc_auc_score,
    average_precision_score
)

import lightgbm as lgb
from catboost import CatBoostClassifier

In [3]:
disease = 'hibp'

In [4]:
df = pd.read_parquet(f'data/processed/randhrs_{disease}_4yr.parquet')
df.head()

Unnamed: 0,birth_year,age,age_squared,female,white,black,hispanic,education_years,college_plus,self_rated_health,...,stopped_activity,drinks_per_week,heavy_drinking,age_x_bmi,sedentary_and_obese,depression_x_mobility,cognitive_decline_x_age,metabolic_risk_score,frailty_indicators,incident_hibp_4yr
0,1936.0,60.0,3600.0,False,True,False,False,3.0,False,4.0,...,0,3.0,0,1596.0,0,0.0,-300.0,0,0,0.0
1,1938.0,58.0,3364.0,True,True,False,False,5.0,True,3.0,...,0,0.0,0,2024.2,0,0.0,-58.0,2,0,0.0
2,1939.0,57.0,3249.0,False,True,False,False,3.0,False,3.0,...,0,0.0,0,1350.9,0,0.0,57.0,0,0,0.0
3,1946.0,50.0,2500.0,True,True,False,False,3.0,False,1.0,...,0,28.0,1,1290.0,0,0.0,200.0,0,0,0.0
4,1947.0,49.0,2401.0,True,True,False,False,4.0,False,2.0,...,0,0.0,0,1171.1,0,0.0,49.0,0,0,0.0


In [5]:
TARGET = f"incident_{disease}_4yr"

X = df.drop(columns=[TARGET])
y = df[TARGET]

In [6]:
def get_scores(y_true, y_pred_proba, threshold=0.5, verbose=True):
    """
    Medical evaluation metrics:
    - F2 score (recall-focused)
    - PR-AUC (important for imbalance)
    - ROC-AUC (industry benchmark)
    """

    # Convert probabilities to class predictions
    y_pred = (y_pred_proba >= threshold).astype(int)

    # Core metrics
    f2 = fbeta_score(y_true, y_pred, beta=2)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)

    pr_auc = average_precision_score(y_true, y_pred_proba)
    roc_auc = roc_auc_score(y_true, y_pred_proba)

    results = {
        "F2": f2,
        "Precision": precision,
        "Recall": recall,
        "PR-AUC": pr_auc,
        "ROC-AUC": roc_auc
    }

    if verbose:
        print(f"Threshold: {threshold}")
        print(f"F2-score:   {f2:.4f}")
        print(f"Recall:     {recall:.4f}")
        print(f"Precision:  {precision:.4f}")
        print(f"PR-AUC:     {pr_auc:.4f}")
        print(f"ROC-AUC:    {roc_auc:.4f}")

    return results

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

In [8]:
# Count class distribution in TRAIN set
neg = (y_train == 0).sum()
pos = (y_train == 1).sum()

scale_pos_weight = neg / pos

print("Negatives:", neg)
print("Positives:", pos)
print("scale_pos_weight:", scale_pos_weight)


Negatives: 37604
Positives: 8067
scale_pos_weight: 4.661460270236767


In [9]:
def find_best_f2_threshold(y_true, y_pred_proba):
    best_thresh = 0
    best_f2 = 0

    for t in np.linspace(0.05, 0.5, 100):
        y_pred = (y_pred_proba >= t).astype(int)
        score = fbeta_score(y_true, y_pred, beta=2)

        if score > best_f2:
            best_f2 = score
            best_thresh = t

    return best_thresh, best_f2

In [10]:
lgb_model = lgb.LGBMClassifier(
    n_estimators=1000,
    learning_rate=0.03,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    n_jobs=-1
)

lgb_model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric="auc"
)

[LightGBM] [Info] Number of positive: 8067, number of negative: 37604
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.028563 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4561
[LightGBM] [Info] Number of data points in the train set: 45671, number of used features: 133
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.176633 -> initscore=-1.539329
[LightGBM] [Info] Start training from score -1.539329


In [11]:
lgb_pred = lgb_model.predict_proba(X_test)[:, 1]

best_t, best_f2 = find_best_f2_threshold(y_test, lgb_pred)

print("Best threshold:", best_t)
print("Best F2:", best_f2)

lgb_results = get_scores(y_test, lgb_pred, threshold=best_t)

Best threshold: 0.25
Best F2: 0.5260237487515258
Threshold: 0.25
F2-score:   0.5260
Recall:     0.9400
Precision:  0.1905
PR-AUC:     0.2403
ROC-AUC:    0.6167


In [12]:
cat_model = CatBoostClassifier(
    iterations=3000,
    learning_rate=0.03,
    depth=6,
    eval_metric="AUC",
    loss_function="Logloss",
    scale_pos_weight=scale_pos_weight,
    random_seed=42,
    verbose=200
)

cat_model.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
    early_stopping_rounds=200,
    use_best_model=True
)

0:	test: 0.5752474	best: 0.5752474 (0)	total: 69ms	remaining: 3m 26s


200:	test: 0.6101358	best: 0.6101358 (200)	total: 3.35s	remaining: 46.7s
400:	test: 0.6124338	best: 0.6127226 (392)	total: 7.08s	remaining: 45.9s
600:	test: 0.6155337	best: 0.6155463 (591)	total: 10.4s	remaining: 41.3s
800:	test: 0.6147313	best: 0.6159183 (690)	total: 13.5s	remaining: 37.2s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.6159182952
bestIteration = 690

Shrink model to first 691 iterations.


<catboost.core.CatBoostClassifier at 0x743177c97a70>

In [13]:
cat_pred = cat_model.predict_proba(X_test)[:, 1]

best_t, best_f2 = find_best_f2_threshold(y_test, cat_pred)

print("Best threshold:", best_t)
print("Best F2:", best_f2)

cat_results = get_scores(y_test, cat_pred, threshold=best_t)

Best threshold: 0.3545454545454545
Best F2: 0.5291669042595655
Threshold: 0.3545454545454545
F2-score:   0.5292
Recall:     0.9202
Precision:  0.1960
PR-AUC:     0.2337
ROC-AUC:    0.6159


In [14]:
importances = cat_model.get_feature_importance()

# Create dataframe
feat_imp = pd.DataFrame({
    "feature": X_train.columns,
    "importance": importances
})

# Sort descending
feat_imp = feat_imp.sort_values(by="importance", ascending=False)

# Show top 15
print(feat_imp.head(15))

               feature  importance
127          age_x_bmi    6.783227
0           birth_year    5.944785
10                 bmi    3.806660
11              weight    2.619910
7      education_years    2.573822
9    self_rated_health    2.176246
12              height    2.118867
88   weight_change_pct    1.949322
36         weight_lag1    1.898497
35            bmi_lag1    1.810587
62         height_lag2    1.758560
86    bmi_acceleration    1.632135
85    bmi_velocity_4yr    1.619465
37         height_lag1    1.527037
33      marital_status    1.513607


In [15]:
ensemble_pred = (lgb_pred + cat_pred) / 2


best_t_ens, best_f2_ens = find_best_f2_threshold(y_test, ensemble_pred)

print("Best threshold:", best_t_ens)
print("Best F2:", best_f2_ens)

ensemble_results = get_scores(y_test, ensemble_pred, threshold=best_t_ens)

Best threshold: 0.2772727272727273
Best F2: 0.530643569784015
Threshold: 0.2772727272727273
F2-score:   0.5306
Recall:     0.9598
Precision:  0.1903
PR-AUC:     0.2410
ROC-AUC:    0.6208


In [16]:
def compare_models(models):
    metrics = ["F2", "Recall", "Precision", "PR-AUC", "ROC-AUC"]

    rows = []

    for metric in metrics:

        values = [model[1][metric] for model in models]
        max_value = max(values)

        row = [metric]

        for name, results in models:
            value = results[metric]

            if value == max_value:
                row.append(f"**{value:.4f}**")
            else:
                row.append(f"{value:.4f}")

        rows.append(row)

    columns = ["Metric"] + [model[0] for model in models]

    df = pd.DataFrame(rows, columns=columns)

    return df

In [17]:
models = [
    ("LightGBM", lgb_results),
    ("CatBoost", cat_results),
    ("Ensemble", ensemble_results),
]

comparison_table = compare_models(models)

print(f"Disease: {disease}")
print(comparison_table.to_markdown(index=False))


Disease: hibp
| Metric    |   LightGBM | CatBoost   | Ensemble   |
|:----------|-----------:|:-----------|:-----------|
| F2        |     0.526  | 0.5292     | **0.5306** |
| Recall    |     0.94   | 0.9202     | **0.9598** |
| Precision |     0.1905 | **0.1960** | 0.1903     |
| PR-AUC    |     0.2403 | 0.2337     | **0.2410** |
| ROC-AUC   |     0.6167 | 0.6159     | **0.6208** |
