In [2]:
from zipfile import ZipFile
from pathlib import Path
import pandas as pd

zip_path = Path("data/randhrs1992_2022v1_SAS.zip")
out_path = Path("data/randhrs1992_2022v1.sas7bdat")

if not out_path.exists():
    with ZipFile(zip_path, "r") as z:
        z.extractall("data/")

df = pd.read_sas(out_path)




In [4]:
print(df.shape)
print(df["HHIDPN"].nunique())
print(df["HHIDPN"].isna().sum())

(45234, 19880)
45234
0


In [5]:
cols_to_check = ["HHIDPN"] + [f"R{w}CONDS" for w in range(2, 17)] + [f"R{w}CONDE" for w in range(1, 17)]
[c for c in cols_to_check if c not in df.columns][:20], len([c for c in cols_to_check if c not in df.columns])

([], 0)

In [11]:
import pandas as pd
import numpy as np

waves = list(range(2, 16))

base_feats = ["AGEY_E","SHLT","CESD","BMI","ADL5A","IADL5A","MOBILA","CONDE"]

rows = []
for w in waves:
    X_cols = [f"R{w}{f}" for f in base_feats]
    y_col = f"R{w+1}CONDS"

    tmp = df[["HHIDPN"] + X_cols + [y_col]].copy()
    tmp.columns = ["HHIDPN"] + base_feats + ["CONDS_next"]  # rename to generic
    tmp["wave"] = w
    tmp["y"] = (tmp["CONDS_next"] >= 1).astype(int)
    tmp = tmp.drop(columns=["CONDS_next"])
    rows.append(tmp)

train_long2 = pd.concat(rows, ignore_index=True)

print(train_long2.shape)
print(train_long2["y"].value_counts(normalize=True))
train_long2.head()


(633276, 11)
y
0    0.924805
1    0.075195
Name: proportion, dtype: float64


Unnamed: 0,HHIDPN,AGEY_E,SHLT,CESD,BMI,ADL5A,IADL5A,MOBILA,CONDE,wave,y
0,1010.0,56.0,4.0,,24.4,4.0,,2.0,6.0,2,0
1,2010.0,59.0,3.0,0.0,17.0,0.0,,2.0,2.0,2,1
2,3010.0,58.0,4.0,0.0,28.3,0.0,,0.0,1.0,2,0
3,3020.0,55.0,3.0,0.0,33.3,0.0,,0.0,2.0,2,0
4,10001010.0,55.0,1.0,4.0,22.4,0.0,,0.0,0.0,2,0


In [12]:
from sklearn.model_selection import train_test_split

ids = train_long2["HHIDPN"].unique()
train_ids, val_ids = train_test_split(ids, test_size=0.2, random_state=42)

train_df = train_long2[train_long2["HHIDPN"].isin(train_ids)].copy()
val_df   = train_long2[train_long2["HHIDPN"].isin(val_ids)].copy()

print("unique persons:", train_df.HHIDPN.nunique(), val_df.HHIDPN.nunique())
print("overlap:", len(set(train_df.HHIDPN).intersection(set(val_df.HHIDPN))))

unique persons: 36187 9047
overlap: 0


In [13]:
!pip -q install catboost

[33mDEPRECATION: Loading egg at /usr/local/lib/python3.12/dist-packages/lightning_utilities-0.14.3-py3.12.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /usr/local/lib/python3.12/dist-packages/dill-0.3.9-py3.12.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /usr/local/lib/python3.12/dist-packages/looseversion-1.3.0-py3.12.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /usr/local/lib/python3.12/dist-packages/opt_ei

In [14]:
import catboost
print(catboost.__version__)

1.2.8


In [15]:
import numpy as np
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, fbeta_score, precision_score, recall_score

features = ["AGEY_E","SHLT","CESD","BMI","ADL5A","IADL5A","MOBILA","CONDE"]

X_train, y_train = train_df[features], train_df["y"].astype(int)
X_val, y_val     = val_df[features], val_df["y"].astype(int)

pos = (y_train == 1).sum()
neg = (y_train == 0).sum()
scale_pos_weight = neg / pos
print("pos/neg:", pos, neg, "scale_pos_weight:", scale_pos_weight)

model = CatBoostClassifier(
    iterations=3000,
    learning_rate=0.03,
    depth=6,
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=42,
    verbose=200,
    scale_pos_weight=scale_pos_weight
)

model.fit(X_train, y_train, eval_set=(X_val, y_val), use_best_model=True)

p_val = model.predict_proba(X_val)[:, 1]

print("ROC-AUC:", roc_auc_score(y_val, p_val))
print("PR-AUC :", average_precision_score(y_val, p_val))

best_thr, best_f2, best_stats = None, -1, None
for thr in np.linspace(0.05, 0.95, 181):
    y_hat = (p_val >= thr).astype(int)
    f2 = fbeta_score(y_val, y_hat, beta=2)
    if f2 > best_f2:
        best_f2 = f2
        best_thr = thr
        best_stats = (
            precision_score(y_val, y_hat, zero_division=0),
            recall_score(y_val, y_hat, zero_division=0),
        )

print("Best F2:", best_f2, "thr:", best_thr, "precision/recall:", best_stats)

pos/neg: 37921 468697 scale_pos_weight: 12.359827008781414
0:	test: 0.7914769	best: 0.7914769 (0)	total: 68.8ms	remaining: 3m 26s
200:	test: 0.8182481	best: 0.8182490 (198)	total: 4.15s	remaining: 57.8s
400:	test: 0.8185814	best: 0.8185855 (398)	total: 8.17s	remaining: 53s
600:	test: 0.8184122	best: 0.8186459 (482)	total: 12.2s	remaining: 48.5s
800:	test: 0.8183309	best: 0.8186459 (482)	total: 16.1s	remaining: 44.3s
1000:	test: 0.8179232	best: 0.8186459 (482)	total: 20s	remaining: 40s
1200:	test: 0.8175926	best: 0.8186459 (482)	total: 24s	remaining: 36s
1400:	test: 0.8173301	best: 0.8186459 (482)	total: 28s	remaining: 32s
1600:	test: 0.8169962	best: 0.8186459 (482)	total: 32.1s	remaining: 28s
1800:	test: 0.8164415	best: 0.8186459 (482)	total: 36s	remaining: 24s
2000:	test: 0.8161047	best: 0.8186459 (482)	total: 40s	remaining: 20s
2200:	test: 0.8158624	best: 0.8186459 (482)	total: 44s	remaining: 16s
2400:	test: 0.8154848	best: 0.8186459 (482)	total: 48s	remaining: 12s
2600:	test: 0.8153

In [16]:
from sklearn.metrics import confusion_matrix

thr = 0.525
y_hat = (p_val >= thr).astype(int)
print(confusion_matrix(y_val, y_hat))
print("predicted positive rate:", y_hat.mean())

[[74183 42777]
 [  703  8995]]
predicted positive rate: 0.40875428318779705


In [17]:
import pandas as pd

imp = model.get_feature_importance()
fi = pd.DataFrame({"feature": features, "importance": imp}).sort_values("importance", ascending=False)
fi

Unnamed: 0,feature,importance
7,CONDE,36.098862
4,ADL5A,23.228685
0,AGEY_E,14.426484
1,SHLT,7.921465
6,MOBILA,7.547121
3,BMI,4.707449
5,IADL5A,3.2127
2,CESD,2.857234
