In [1]:
%%capture
%pip install lightgbm catboost scikit-learn tabulate

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.metrics import (
    fbeta_score,
    precision_score,
    recall_score,
    roc_auc_score,
    average_precision_score
)

import lightgbm as lgb
from catboost import CatBoostClassifier

In [3]:
disease = 'lung'

In [4]:
df = pd.read_parquet(f'data/processed/randhrs_{disease}_4yr.parquet')
df.head()

Unnamed: 0,person_id,calendar_year,birth_year,age,age_squared,female,ethnicity,education_years,college_plus,self_rated_health,...,sleep_problem,sleep_change,new_sleep_problem,former_smoker,quit_smoking,sedentary,stopped_activity,drinks_per_week,heavy_drinking,incident_lung_4yr
0,3010,1996,1936.0,60.0,3600.0,False,White,3.0,False,4.0,...,0,0.0,0,0,0,0,0,3.0,0,0.0
1,3020,1996,1938.0,58.0,3364.0,True,White,5.0,True,3.0,...,0,0.0,0,0,0,0,0,0.0,0,0.0
2,10001010,1996,1939.0,57.0,3249.0,False,White,3.0,False,3.0,...,1,0.0,0,0,0,0,0,0.0,0,0.0
3,10003030,1996,1956.0,40.0,1600.0,True,White,5.0,True,3.0,...,1,0.0,0,0,0,1,0,0.0,0,0.0
4,10004010,1996,1939.0,57.0,3249.0,False,White,5.0,True,3.0,...,1,0.0,0,1,0,0,0,12.0,0,0.0


In [5]:
TARGET = f"incident_{disease}_4yr"

X = df.drop(columns=[TARGET])
y = df[TARGET]

In [6]:
print("TARGET:", TARGET)
print("Overall prevalence:", y.mean())
print("Total positives:", y.sum())
print("Total samples :", len(y))

TARGET: incident_lung_4yr
Overall prevalence: 0.033701207972975364
Total positives: 3811.0
Total samples : 113082


In [7]:
cutoff = df['calendar_year'].quantile(0.8)

train_mask = df['calendar_year'] <= cutoff
test_mask  = df['calendar_year'] > cutoff


train_df = df[train_mask].copy()
test_df  = df[test_mask].copy()

test_persons = set(test_df['person_id'])
train_df = train_df[~train_df['person_id'].isin(test_persons)]

In [8]:
print("\nAfter removing person overlap:")
print("Train prevalence:", y_train.mean())
print("Test prevalence :", y_test.mean())

print("Train positives:", y_train.sum())
print("Test positives :", y_test.sum())

print("Train samples:", len(y_train))
print("Test samples :", len(y_test))


After removing person overlap:


NameError: name 'y_train' is not defined

In [65]:
drop_cols = [TARGET]

if 'person_id' in df.columns:
    drop_cols.append('person_id')

X_train = train_df.drop(columns=drop_cols)
y_train = train_df[TARGET]

X_test  = test_df.drop(columns=drop_cols)
y_test  = test_df[TARGET]

In [66]:
def get_scores(y_true, y_pred_proba, threshold=0.5, verbose=True):
    """
    Medical evaluation metrics:
    - F2 score (recall-focused)
    - PR-AUC (important for imbalance)
    - ROC-AUC (industry benchmark)
    """

    # Convert probabilities to class predictions
    y_pred = (y_pred_proba >= threshold).astype(int)

    # Core metrics
    f2 = fbeta_score(y_true, y_pred, beta=2)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)

    pr_auc = average_precision_score(y_true, y_pred_proba)
    roc_auc = roc_auc_score(y_true, y_pred_proba)

    results = {
        "F2": f2,
        "Precision": precision,
        "Recall": recall,
        "PR-AUC": pr_auc,
        "ROC-AUC": roc_auc
    }

    if verbose:
        print(f"Threshold: {threshold}")
        print(f"F2-score:   {f2:.4f}")
        print(f"Recall:     {recall:.4f}")
        print(f"Precision:  {precision:.4f}")
        print(f"PR-AUC:     {pr_auc:.4f}")
        print(f"ROC-AUC:    {roc_auc:.4f}")

    return results

In [67]:
# Count class distribution in TRAIN set
neg = (y_train == 0).sum()
pos = (y_train == 1).sum()

scale_pos_weight = neg / pos

print("Negatives:", neg)
print("Positives:", pos)
print("scale_pos_weight:", scale_pos_weight)


Negatives: 42149
Positives: 3120
scale_pos_weight: 13.509294871794872


In [68]:
def find_best_f2_threshold(y_true, y_pred_proba):
    best_thresh = 0
    best_f2 = 0

    for t in np.linspace(0.05, 0.5, 100):
        y_pred = (y_pred_proba >= t).astype(int)
        score = fbeta_score(y_true, y_pred, beta=2)

        if score > best_f2:
            best_f2 = score
            best_thresh = t

    return best_thresh, best_f2

In [69]:
lgb_model = lgb.LGBMClassifier(
    n_estimators=1000,
    learning_rate=0.03,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    n_jobs=-1
)

lgb_model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric="auc"
)

[LightGBM] [Info] Number of positive: 3120, number of negative: 42149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005907 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3446
[LightGBM] [Info] Number of data points in the train set: 45269, number of used features: 120
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.068921 -> initscore=-2.603378
[LightGBM] [Info] Start training from score -2.603378


In [70]:
lgb_pred = lgb_model.predict_proba(X_test)[:, 1]

best_t, best_f2 = find_best_f2_threshold(y_test, lgb_pred)

print("Best threshold:", best_t)
print("Best F2:", best_f2)

lgb_results = get_scores(y_test, lgb_pred, threshold=best_t)

Best threshold: 0.5
Best F2: 0.22490400438837085
Threshold: 0.5
F2-score:   0.2249
Recall:     0.3877
Precision:  0.0839
PR-AUC:     0.0736
ROC-AUC:    0.6614


In [71]:
cat_features = X_train.select_dtypes(include=['category', 'object']).columns.tolist()

print(cat_features)

cat_model = CatBoostClassifier(
    iterations=3000,
    learning_rate=0.03,
    depth=6,
    eval_metric="AUC",
    loss_function="Logloss",
    scale_pos_weight=scale_pos_weight,
    random_seed=42,
    verbose=200
)

cat_model.fit(
    X_train, y_train,
    cat_features=cat_features,
    eval_set=(X_test, y_test),
    early_stopping_rounds=200,
    use_best_model=True
)

['ethnicity']
0:	test: 0.6929293	best: 0.6929293 (0)	total: 23.2ms	remaining: 1m 9s


200:	test: 0.7087474	best: 0.7130490 (131)	total: 3.36s	remaining: 46.8s
400:	test: 0.7119612	best: 0.7139501 (327)	total: 6.87s	remaining: 44.5s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.7139500886
bestIteration = 327

Shrink model to first 328 iterations.


<catboost.core.CatBoostClassifier at 0x74491113d8b0>

In [72]:
cat_pred = cat_model.predict_proba(X_test)[:, 1]

best_t, best_f2 = find_best_f2_threshold(y_test, cat_pred)

print("Best threshold:", best_t)
print("Best F2:", best_f2)

cat_results = get_scores(y_test, cat_pred, threshold=best_t)

Best threshold: 0.4954545454545454
Best F2: 0.2279724380589356
Threshold: 0.4954545454545454
F2-score:   0.2280
Recall:     0.7352
Precision:  0.0606
PR-AUC:     0.0895
ROC-AUC:    0.7140


In [73]:
importances = cat_model.get_feature_importance()

# Create dataframe
feat_imp = pd.DataFrame({
    "feature": X_train.columns,
    "importance": importances
})

# Sort descending
feat_imp = feat_imp.sort_values(by="importance", ascending=False)

# Show top 15
print(feat_imp.head(15))

                     feature  importance
27               ever_smoked    9.245431
8          self_rated_health    5.106070
12      mobility_limitations    4.689899
28            current_smoker    4.354246
5                  ethnicity    3.712996
1                 birth_year    3.470072
0              calendar_year    2.851832
13  large_muscle_limitations    2.774544
2                        age    2.127393
56    vigorous_activity_lag1    1.820926
31         vigorous_activity    1.762460
3                age_squared    1.758365
78       current_smoker_lag2    1.684297
52          ever_smoked_lag1    1.647570
96   any_mobility_limitation    1.616962


In [74]:
ensemble_pred = (lgb_pred + cat_pred) / 2


best_t_ens, best_f2_ens = find_best_f2_threshold(y_test, ensemble_pred)

print("Best threshold:", best_t_ens)
print("Best F2:", best_f2_ens)

ensemble_results = get_scores(y_test, ensemble_pred, threshold=best_t_ens)

Best threshold: 0.5
Best F2: 0.22972389991371872
Threshold: 0.5
F2-score:   0.2297
Recall:     0.5035
Precision:  0.0724
PR-AUC:     0.0839
ROC-AUC:    0.6961


In [75]:
def compare_models(models):
    metrics = ["F2", "Recall", "Precision", "PR-AUC", "ROC-AUC"]

    rows = []

    for metric in metrics:

        values = [model[1][metric] for model in models]
        max_value = max(values)

        row = [metric]

        for name, results in models:
            value = results[metric]

            if value == max_value:
                row.append(f"**{value:.4f}**")
            else:
                row.append(f"{value:.4f}")

        rows.append(row)

    columns = ["Metric"] + [model[0] for model in models]

    df = pd.DataFrame(rows, columns=columns)

    return df

In [76]:
models = [
    ("LightGBM", lgb_results),
    ("CatBoost", cat_results),
    ("Ensemble", ensemble_results),
]

comparison_table = compare_models(models)

print(f"Disease: {disease}")
print(comparison_table.to_markdown(index=False))


Disease: lung
| Metric    | LightGBM   | CatBoost   | Ensemble   |
|:----------|:-----------|:-----------|:-----------|
| F2        | 0.2249     | 0.2280     | **0.2297** |
| Recall    | 0.3877     | **0.7352** | 0.5035     |
| Precision | **0.0839** | 0.0606     | 0.0724     |
| PR-AUC    | 0.0736     | **0.0895** | 0.0839     |
| ROC-AUC   | 0.6614     | **0.7140** | 0.6961     |
