In [7]:
import os
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
LOG_INDICES = {14,15,16,19,20,22,25,26,27,30,31,32}



In [8]:

varmeans = np.array([84.58144338298742, 97.19395453339598, 36.977228240795384, 123.75046539637763, 82.40009988667639, 63.83055577034239, 18.72649785557987, 32.95765667291276, -0.6899191871174756, \
    24.075480562219358, 0.5548386348703284, 7.37893402619616, 41.02186880800917, 92.65418774854838, 260.22338482309493, 23.91545210569777, 102.48366144100076, 7.557530849328269, 105.82790991400108, \
        1.5106993531749389, 1.8361772575250843, 136.9322832898959, 2.646666023259181, 2.05145021490337, 3.544237652686153, 4.135527970939283, 2.114059461561731, 8.290099451999183, 30.79409334002751, 10.43083278791528, \
            41.231193461563706, 11.446405019759258, 287.38570591681315, 196.01391078961922, 62.00946887985519, 0.5592690422043409, 0.49657112470087744, 0.5034288752991226, -56.12512176894499, 26.994992301299437])

varstds = np.array([17.3252, 2.9369, 0.77, 23.2316, 16.3418, 13.956, 5.0982, 7.9517, 4.2943, 4.3765, 11.1232, 0.0746, 9.2672, 10.893, 855.7468, 19.9943, 120.1227, 2.4332, 5.8805, 1.8056, 3.6941, 51.3107, 2.5262, 0.3979, 1.4233, 0.6421, 4.3115, 24.8062, 5.4917, 1.9687, 26.2177, 7.731, 153.0029, 103.6354, 16.3862, 0.4965, 0.5, 0.5, 162.2569, 29.0054])

varlogmeans = np.array([4.4166, 4.576, 3.6101, 4.8009, 4.3928, 4.1334, 2.8926, 3.4633, 0.1, 0.1, 0.1, 1.9986, 3.6911, 4.5201, 4.104, 2.92, 4.3874, 1.9011, 4.6602, 0.1002, -0.5519, 4.8652, 0.7091, 0.7016, 1.1922, 1.4085, 0.0335, -0.8605, 3.4115, 2.327, 3.6051, 2.3098, 5.5347, 5.1412, 4.0841, 0.1, 0.1, 0.1, 0.1, 2.8862])
varlogstds = np.array([0.2069, 0.0338, 0.0209, 0.1862, 0.1929, 0.2133, 0.2811, 0.2632, 0.1, 0.1, 0.1, 0.0102, 0.2117, 0.1413, 1.3713, 0.6972, 0.6114, 0.6183, 0.0564, 0.6841, 1.4805, 0.3181, 0.6703, 0.1821, 0.3854, 0.1478, 1.0199, 2.723, 0.1788, 0.1896, 0.425, 0.5176, 0.5046, 0.5522, 0.3135, 0.1, 0.1, 0.1, 0.1, 0.9707])

In [9]:
def preprocess(raw_data):
    data = np.copy(raw_data)
    T, F = data.shape
    assert F == 40

    mask = ~np.isnan(data)

    # Forward-fill + impute bằng mean nếu toàn NaN
    forward = np.copy(data[0])
    for j in range(F):
        col = data[:, j]
        nan_mask = np.isnan(col)
        if nan_mask.all():
            col[:] = varmeans[j]
        else:
            first_valid = np.argmax(~nan_mask)
            col[:first_valid] = varmeans[j]
            for t in range(first_valid + 1, T):
                if np.isnan(col[t]):
                    col[t] = col[t-1]

    # Delta
    delta = np.zeros_like(data)
    for t in range(1, T):
        delta[t] = data[t] - data[t-1]

    # Normalize
    for i in range(F):
        if i in LOG_INDICES:
            data[:, i] = np.clip(data[:, i], 1e-6, None)
            data[:, i] = 10 * (np.log(data[:, i]) - varlogmeans[i]) / varlogstds[i]
        else:
            data[:, i] = 10 * (data[:, i] - varmeans[i]) / varstds[i]

    return np.concatenate([data, delta, mask.astype(float)], axis=1)

In [10]:
def build_windows(data_120):
    T, D = data_120.shape
    assert D == 120
    X_out = []

    for t in range(T):
        row = []
        for j in range(6):
            idx = t - j
            if idx < 0:
                row.extend([0]*120)
            else:
                row.extend(data_120[idx])
        X_out.append(row)

    return np.array(X_out)


In [11]:
npz = np.load("/kaggle/input/sepsis-challenge-2019/sepsis_training_data.npz")
X_all = npz["X"]
y_all = npz["y"]
lengths = npz["lengths"]

X_list = []
y_list = []

In [12]:
for i in range(len(lengths)):
    T = lengths[i]
    x_raw = X_all[i, :T, :]
    y_raw = y_all[i, :T]

    valid = (y_raw >= 0)
    x_raw = x_raw[valid]
    y_raw = y_raw[valid]

    if len(y_raw) == 0:
        continue

    feat120 = preprocess(x_raw)
    X_win = build_windows(feat120)

    X_list.append(X_win)
    y_list.append(y_raw)

X = np.vstack(X_list)
y = np.hstack(y_list)

print("Training shape:", X.shape, y.shape)  # expecting (N, 720)

Training shape: (1539217, 720) (1539217,)


In [13]:
dtrain = xgb.DMatrix(X, label=y)

params = {
    "objective": "binary:logistic",
    "eval_metric": ["logloss", "aucpr"],
    "eta": 0.1,
    "max_depth": 4,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "scale_pos_weight": 40,
    "tree_method": "hist",
    "seed": 42,
}

model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=300,
    evals=[(dtrain, "train")],
    verbose_eval=10,
    early_stopping_rounds=20
)


[0]	train-logloss:0.53966	train-aucpr:0.08570
[10]	train-logloss:0.46166	train-aucpr:0.10433
[20]	train-logloss:0.43518	train-aucpr:0.11276
[30]	train-logloss:0.42066	train-aucpr:0.12216
[40]	train-logloss:0.41090	train-aucpr:0.12808
[50]	train-logloss:0.40442	train-aucpr:0.13228
[60]	train-logloss:0.39905	train-aucpr:0.13736
[70]	train-logloss:0.39453	train-aucpr:0.14073
[80]	train-logloss:0.39114	train-aucpr:0.14378
[90]	train-logloss:0.38761	train-aucpr:0.14838
[100]	train-logloss:0.38446	train-aucpr:0.15197
[110]	train-logloss:0.38231	train-aucpr:0.15441
[120]	train-logloss:0.37926	train-aucpr:0.15754
[130]	train-logloss:0.37646	train-aucpr:0.16009
[140]	train-logloss:0.37392	train-aucpr:0.16207
[150]	train-logloss:0.37183	train-aucpr:0.16403
[160]	train-logloss:0.37018	train-aucpr:0.16668
[170]	train-logloss:0.36776	train-aucpr:0.16964
[180]	train-logloss:0.36525	train-aucpr:0.17326
[190]	train-logloss:0.36300	train-aucpr:0.17687
[200]	train-logloss:0.36147	train-aucpr:0.17882
[21

In [14]:

model.save_model("xgb_sepsis_720dim.json")
print("Saved model!")

Saved model!
