In [4]:
import pandas as pd
import numpy as np
from pathlib import Path

# Look-ahead window used for labelling, in hours and as a Timedelta.
HORIZON_HOURS = 1
HORIZON = pd.Timedelta(HORIZON_HOURS, unit="h")

# Input parquet files and the output directory (all relative paths).
OUT_DIR = Path("../data/processed")
TRAIN_PARQUET = Path("../data/processed/train_with_hist_shifted_v2.parquet")
TEST_PARQUET = Path("../data/processed/test_with_hist_shifted_v2.parquet")

# Create the output directory up front; a no-op when it already exists.
OUT_DIR.mkdir(parents=True, exist_ok=True)

In [5]:
def _read_split(parquet_path, origin):
    """Load one split, tag its origin and ensure a datetime ``ts`` column.

    Parameters
    ----------
    parquet_path : path-like
        Parquet file to read.
    origin : str
        Value stored in the ``__origin`` column of every row.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with ``__origin`` added and ``ts`` converted with
        ``pd.to_datetime``. When the file has no ``ts`` column it is derived
        from ``window_start``.
    """
    frame = pd.read_parquet(parquet_path)
    frame["__origin"] = origin
    # Only fall back to ``window_start`` when ``ts`` is absent.
    ts_source = "ts" if "ts" in frame.columns else "window_start"
    frame["ts"] = pd.to_datetime(frame[ts_source])
    return frame


train = _read_split(TRAIN_PARQUET, "train")
test = _read_split(TEST_PARQUET, "test")

# Stack both splits and order all rows by timestamp; the ``__origin`` tag
# is what allows the splits to be separated again later.
full = (
    pd.concat([train, test], ignore_index=True, sort=False)
    .sort_values("ts")
    .reset_index(drop=True)
)


In [6]:
def _label_sorted_segment(seg_ts, is_severe, horizon):
    """Compute horizon labels and censoring flags for one time-sorted segment.

    Parameters
    ----------
    seg_ts : numpy.ndarray of datetime64[ns]
        Timestamps of the rows of a single segment, in ascending order.
    is_severe : numpy.ndarray of bool
        True where the row itself has ``severe == 1``.
    horizon : numpy.timedelta64
        Length of the look-ahead window.

    Returns
    -------
    tuple of (numpy.ndarray of bool, numpy.ndarray of bool)
        ``has_future[i]`` is True when at least one severe row of the segment
        has a timestamp in ``(seg_ts[i], seg_ts[i] + horizon]``.
        ``censored[i]`` is True when ``seg_ts[i] + horizon`` lies after the
        segment's latest timestamp.
    """
    window_end = seg_ts + horizon
    sev_times = seg_ts[is_severe]
    # side="right" counts severe timestamps <= the probe, so the difference
    # below is the number of severe rows strictly after ts and up to
    # ts + horizon. With no severe rows both counts are 0 -> all False.
    n_until_now = np.searchsorted(sev_times, seg_ts, side="right")
    n_until_end = np.searchsorted(sev_times, window_end, side="right")
    has_future = n_until_end > n_until_now
    censored = window_end > seg_ts.max()
    return has_future, censored


idx_label = np.zeros(len(full), dtype=np.bool_)
idx_censored = np.zeros(len(full), dtype=np.bool_)

# Extract the columns once as plain numpy arrays instead of once per segment.
ts_all = full["ts"].values.astype("datetime64[ns]")
severe_all = full["severe"].astype(int).values == 1
horizon_td = HORIZON.to_timedelta64()

# ``GroupBy.indices`` yields integer *positions*, which is what the numpy
# arrays above need. ``GroupBy.groups`` yields index *labels*, which only
# coincide with positions while ``full`` carries a fresh RangeIndex.
# Rows whose segment_id is null belong to no group and keep the defaults
# (label False, not censored).
for seg_pos in full.groupby("segment_id").indices.values():
    # Stable sort by time inside the segment, then write the results back
    # through the same sorted positions so each value lands on the row it
    # was computed for, whatever the row order of ``full``.
    order = np.argsort(ts_all[seg_pos], kind="stable")
    sorted_pos = seg_pos[order]
    has_future, censored = _label_sorted_segment(
        ts_all[sorted_pos], severe_all[sorted_pos], horizon_td
    )
    idx_label[sorted_pos] = has_future
    idx_censored[sorted_pos] = censored

# NOTE(review): a row can be both positive and censored (event already seen,
# but the window runs past the segment's last row) - confirm that treating
# such rows as censored is intended.

In [7]:
# Attach the per-row results to the combined frame.
full["label_1h"] = idx_label.astype(int)
full["censored_1h"] = idx_censored

# Headline counts for a quick sanity check of the labelling.
n_total = len(full)
n_censored, n_label_pos = (
    int(full[col].sum()) for col in ("censored_1h", "label_1h")
)

summary_lines = (
    f"Total rows: {n_total:,}",
    f"Censored rows (no H hours of future data): {n_censored:,} ({n_censored/n_total:.2%})",
    f"Positive labels (label_1h==1): {n_label_pos:,} ({n_label_pos/n_total:.2%})",
)
print(*summary_lines, sep="\n")

Total rows: 2,196,754
Censored rows (no H hours of future data): 224,699 (10.23%)
Positive labels (label_1h==1): 125,287 (5.70%)


In [8]:
# Keep only rows that are not flagged as censored.
not_censored = ~full["censored_1h"]
full_nc = full.loc[not_censored].reset_index(drop=True)

# Split back into the original train / test sets and drop the helper tag.
origin_tag = full_nc["__origin"]
train_out = full_nc.loc[origin_tag == "train"].drop(columns="__origin")
test_out = full_nc.loc[origin_tag == "test"].drop(columns="__origin")

train_path = OUT_DIR / "train_with_label_1h.parquet"
test_path = OUT_DIR / "test_with_label_1h.parquet"

# Write both files first, then report, so "Saved:" only appears once
# every write has succeeded.
_outputs = ((train_out, train_path), (test_out, test_path))
for _frame, _dest in _outputs:
    _frame.to_parquet(_dest, index=False, compression="snappy")

print("Saved:")
for _frame, _dest in _outputs:
    print(" ", _dest, "rows:", len(_frame))

Saved:
  ..\data\processed\train_with_label_1h.parquet rows: 1653203
  ..\data\processed\test_with_label_1h.parquet rows: 318852


In [9]:
# Class balance of the saved splits (NaN would be counted as its own class).
train_label_counts = train_out["label_1h"].value_counts(dropna=False)
test_label_counts = test_out["label_1h"].value_counts(dropna=False)

print("Label distribution (train):")
display(train_label_counts)
print("Label distribution (test):")
display(test_label_counts)

# Show the first few positive rows as a spot check.
is_positive = full_nc["label_1h"].eq(1)
positive_preview = full_nc.loc[is_positive, ["segment_id", "ts", "label_1h"]].head(10)
print("Sample positive-label rows:")
display(positive_preview)

Label distribution (train):


label_1h
0    1543395
1     109808
Name: count, dtype: int64

Label distribution (test):


label_1h
0    303430
1     15422
Name: count, dtype: int64

Sample positive-label rows:


Unnamed: 0,segment_id,ts,label_1h
5,UNK,2012-07-01 00:21:00,1
11,UNK,2012-07-01 00:30:00,1
19,UNK,2012-07-01 01:00:00,1
76,UNK,2012-07-01 04:45:00,1
77,UNK,2012-07-01 04:50:00,1
160,UNK,2012-07-01 11:00:00,1
171,UNK,2012-07-01 11:30:00,1
172,UNK,2012-07-01 11:30:00,1
174,UNK,2012-07-01 11:35:00,1
179,UNK,2012-07-01 11:45:00,1
