In [None]:
!pip install relbench --quiet

In [None]:
!pip install -U "autogluon.tabular[all]"

In [None]:
# Obtain the RelBench "rel-trial" dataset object used by every later cell.
# download=True is forwarded to relbench (presumably it fetches the data when it is
# not available locally — confirm against the relbench documentation).
from relbench.datasets import get_dataset

dataset = get_dataset(name="rel-trial", download=True)

In [None]:
from relbench.tasks import get_task

# Entity table that supplies the per-study features, and the prediction task
# whose split tables supply the rows to learn from / predict on.
db = dataset.get_db()
table = db.table_dict["studies"]
task = get_task("rel-trial", "study-outcome", download=True)

# One task table per split, in train / val / test order.
train_task, val_task, test_task = (
    task.get_table(split_name).df for split_name in ("train", "val", "test")
)

## learning table
# Left-join the study features onto each split so every task row is kept.
# No `on=` is given, so pandas joins on every column name the two frames share.
# NOTE(review): presumably that is only "nct_id" — confirm against both schemas.
train_df, val_df, test_df = (
    split_rows.merge(table.df, how="left")
    for split_rows in (train_task, val_task, test_task)
)

In [None]:
# Base location for files written by this notebook. It is combined with file names
# by plain string concatenation further down, so the trailing slash is required.
# NOTE(review): this is an absolute path — confirm it exists and is writable in the
# environment where the notebook runs.
OUTPUT_PATH = '/run/'

# Tabular NFA

In [None]:
# Index every split by study id; drop=False keeps "nct_id" available as a column too.
train_df, val_df, test_df = (
    split_frame.set_index("nct_id", drop=False)
    for split_frame in (train_df, val_df, test_df)
)

# sanity check: ids are unique within each split ...
assert all(split_frame.index.is_unique for split_frame in (train_df, val_df, test_df))
# ... and no id is shared between any two splits, so the splits can later be
# stacked into one frame and addressed by label without ambiguity.
assert train_df.index.intersection(val_df.index).empty
assert train_df.index.intersection(test_df.index).empty
assert val_df.index.intersection(test_df.index).empty

In [None]:
import pandas as pd
import numpy as np
from typing import List, Optional, Tuple, Union, Dict

IndexLike = Union[pd.Index, pd.Series, List[int]]

def nfa_or_time_window_props(
    T: pd.DataFrame,
    df_idx: IndexLike,
    ref_idx: IndexLike,
    *,
    time_col: str,
    window: Union[str, pd.Timedelta],
    group_cols: List[str],
    numeric_cols: Optional[List[str]] = None,
    categorical_cols: Optional[List[str]] = None,
    prefix: str = "nfa",
) -> pd.DataFrame:
    """
    OR-neighborhood NFA with trailing time window and categorical proportions.

    Neighbors(i) = union over c in group_cols of:
        rows j in ref_idx where
          T[j,c] == T[i,c] AND
          t_i - window <= t_j < t_i

    The upper bound is strict, so a row is never its own neighbor and rows that
    share its exact timestamp are excluded as well.

    Parameters
    ----------
    T : table holding both the query rows and the reference rows.
    df_idx : index labels of the rows to compute features for.
    ref_idx : index labels of the rows that may serve as neighbors.
    time_col : column with each row's timestamp (parsed with ``pd.to_datetime``).
    window : length of the trailing window; anything ``pd.Timedelta`` accepts.
    group_cols : a reference row qualifies when it matches the query row on
        at least one of these columns. Missing values never match.
    numeric_cols : columns aggregated over the neighbors as NaN-ignoring
        mean / min / max. They are cast to float.
    categorical_cols : columns summarised as per-level proportions. The levels
        are the distinct non-null values found in the reference rows.
    prefix : prefix of every generated column name.

    Returns
    -------
    A copy of ``T.loc[df_idx]`` (same row order) with these columns appended:
    ``{prefix}_neighbor_count``; ``{prefix}_{col}_mean|min|max`` for each numeric
    column; ``{prefix}_prop_{col}__{level}`` for each categorical level.

    Categorical outputs are proportions (counts / neighbor_count), so neighbors
    whose category is missing still count in the denominator. Rows without any
    neighbor keep count 0 and NaN in every aggregate column.
    """
    # Function-scope import keeps this definition self-contained in the notebook.
    import warnings

    numeric_cols = list(numeric_cols or [])
    categorical_cols = list(categorical_cols or [])
    # pd.Timedelta accepts strings, Timedelta, datetime.timedelta and np.timedelta64.
    window = pd.Timedelta(window)
    window_ns = np.timedelta64(window.value, "ns")

    df = T.loc[pd.Index(df_idx)].copy()
    ref_df = T.loc[pd.Index(ref_idx)]

    query_time = pd.to_datetime(df[time_col]).to_numpy(dtype="datetime64[ns]")
    ref_time = pd.to_datetime(ref_df[time_col]).to_numpy(dtype="datetime64[ns]")

    # group lookups: value -> positions (not labels) of matching rows in ref_df
    group_maps: Dict[str, Dict[object, np.ndarray]] = {
        c: ref_df.groupby(c, sort=False).indices for c in group_cols
    }
    # Query-side group values, extracted once per column. This replaces a
    # per-row DataFrame.iterrows(), which materialises a Series for every row.
    query_groups = {c: df[c].tolist() for c in group_cols}

    # numeric reference matrix
    ref_num = ref_df[numeric_cols].to_numpy(dtype=float) if numeric_cols else None

    # categorical metadata (freeze levels on reference); code -1 marks a missing value
    cat_meta = {}
    for c in categorical_cols:
        levels = pd.Index(pd.unique(ref_df[c].dropna()))
        codes = pd.Categorical(ref_df[c], categories=levels).codes.astype(np.int32)
        cat_meta[c] = {"levels": levels, "codes": codes, "k": len(levels)}

    n = len(df)
    neigh_count = np.zeros(n, dtype=np.int64)

    if numeric_cols:
        mean_out = np.full((n, len(numeric_cols)), np.nan)
        min_out = np.full((n, len(numeric_cols)), np.nan)
        max_out = np.full((n, len(numeric_cols)), np.nan)

    cat_prop_blocks = {
        c: np.full((n, meta["k"]), np.nan, dtype=float)
        for c, meta in cat_meta.items()
    }

    # np.nanmean / nanmin / nanmax report all-NaN slices through the `warnings`
    # module ("Mean of empty slice", "All-NaN slice encountered"), which
    # np.errstate does not control. NaN is the intended result for such slices,
    # so those RuntimeWarnings are silenced for the duration of the loop.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)

        for i in range(n):
            # OR-neighborhood: collect the candidates of every group column.
            parts = []
            for c in group_cols:
                arr = group_maps[c].get(query_groups[c][i])
                if arr is not None:
                    parts.append(arr)
            if not parts:
                continue

            # A reference row matching on several columns must count only once.
            neigh_pos = np.unique(np.concatenate(parts))

            # time window filter: t_i - window <= t_j < t_i
            t_i = query_time[i]
            t_ref = ref_time[neigh_pos]
            mask = (t_ref >= t_i - window_ns) & (t_ref < t_i)
            neigh_pos = neigh_pos[mask]

            if len(neigh_pos) == 0:
                continue

            neigh_count[i] = len(neigh_pos)

            if numeric_cols:
                vals = ref_num[neigh_pos, :]
                mean_out[i, :] = np.nanmean(vals, axis=0)
                min_out[i, :] = np.nanmin(vals, axis=0)
                max_out[i, :] = np.nanmax(vals, axis=0)

            for c, meta in cat_meta.items():
                codes = meta["codes"][neigh_pos]
                codes = codes[codes >= 0]
                # If every neighbor has a missing category the row stays NaN.
                if codes.size:
                    counts = np.bincount(codes, minlength=meta["k"])
                    cat_prop_blocks[c][i, :] = counts / neigh_count[i]

    out = [df]

    out.append(pd.DataFrame(
        {f"{prefix}_neighbor_count": neigh_count},
        index=df.index
    ))

    if numeric_cols:
        out.append(pd.DataFrame(
            np.column_stack([mean_out, min_out, max_out]),
            index=df.index,
            columns=[
                *(f"{prefix}_{c}_mean" for c in numeric_cols),
                *(f"{prefix}_{c}_min" for c in numeric_cols),
                *(f"{prefix}_{c}_max" for c in numeric_cols),
            ],
        ))

    for c, meta in cat_meta.items():
        cols = [f"{prefix}_prop_{c}__{lvl}" for lvl in meta["levels"].astype(str)]
        out.append(pd.DataFrame(cat_prop_blocks[c], index=df.index, columns=cols))

    return pd.concat(out, axis=1)

In [None]:
def add_nfa_and_trim_train(
    df_train: pd.DataFrame,
    df_val: pd.DataFrame,
    df_test: pd.DataFrame,
    *,
    time_col: str,
    window: Union[str, pd.Timedelta],
    group_cols: List[str],
    numeric_cols: Optional[List[str]] = None,
    categorical_cols: Optional[List[str]] = None,
    train_drop_before: Union[str, pd.Timestamp],  # e.g. "2001-01-01"
    prefix: str = "nfa",
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Attach NFA features to the three splits and trim the earliest training rows.

    Each split draws its neighbors from a growing reference pool:
      - train: train rows only
      - val:   train + val rows
      - test:  train + val + test rows
    The time-window rule of ``nfa_or_time_window_props`` further restricts
    neighbors to rows strictly earlier than the row being described.

    Training rows whose ``time_col`` is earlier than ``train_drop_before`` serve
    only as neighbor context: they are removed from the returned training frame.
    Rows whose ``time_col`` is missing fail the ``>=`` comparison and are removed
    as well.

    Returns ``(train_trimmed, val, test)``, each carrying the NFA columns.
    """
    # One table holding every row, so each split can be addressed by index label.
    combined = pd.concat([df_train, df_val, df_test], axis=0).sort_values(time_col)

    # Settings shared by the three feature computations.
    shared_kwargs = dict(
        time_col=time_col,
        window=window,
        group_cols=group_cols,
        numeric_cols=numeric_cols,
        categorical_cols=categorical_cols,
        prefix=prefix,
    )

    train_ids = df_train.index
    val_ids = df_val.index
    test_ids = df_test.index

    # Reference pools grow split by split.
    train_val_ids = train_ids.union(val_ids)
    all_ids = train_val_ids.union(test_ids)

    train_feats = nfa_or_time_window_props(combined, train_ids, train_ids, **shared_kwargs)
    val_feats = nfa_or_time_window_props(combined, val_ids, train_val_ids, **shared_kwargs)
    test_feats = nfa_or_time_window_props(combined, test_ids, all_ids, **shared_kwargs)

    # Keep only training rows on/after the cutoff for supervised training.
    on_or_after_cutoff = pd.to_datetime(train_feats[time_col]) >= pd.to_datetime(train_drop_before)

    return train_feats.loc[on_or_after_cutoff], val_feats, test_feats

In [None]:
# Build the NFA-augmented train / val / test frames.
df_train_nfa, df_val_nfa, df_test_nfa = add_nfa_and_trim_train(
    train_df,
    val_df,
    test_df,
    time_col="start_date",
    window="365D",  # 1 year trailing window
    train_drop_before="2001-01-01",  # bootstrap year used only as NFA context
    # A reference study is a neighbor when it matches on ANY of these columns.
    group_cols=["phase", "study_type", "source_class", "enrollment_type", "has_dmc"],
    # Aggregated over the neighbors as mean / min / max; every entry is cast to float.
    # NOTE(review): confirm "limitations_and_caveats" is numeric in this table.
    numeric_cols=["enrollment", "number_of_arms", "number_of_groups", "limitations_and_caveats"],
    # Summarised as per-level proportions among the neighbors.
    categorical_cols=[
        "phase", "study_type", "enrollment_type", "has_dmc",
        "is_fda_regulated_drug", "is_fda_regulated_device", "is_unapproved_device",
        "is_ppsd", "is_us_export", "biospec_retention",
        "source_class", "fdaaa801_violation", "plan_to_share_ipd",
    ],
)

In [None]:
import numpy as np

# 1) take training schema as the only schema
train_cols = df_train_nfa.columns

# 2) decide what gets filled, once, so every split is treated identically
#    - categorical proportions: a missing proportion is encoded as 0
#    - neighbor count: 0 if ever missing
prop_cols = [c for c in train_cols if c.startswith("nfa_prop_")]
fill_values = {c: 0.0 for c in prop_cols}
if "nfa_neighbor_count" in train_cols:
    fill_values["nfa_neighbor_count"] = 0

# 3) apply the same fill to train, val and test.
#    Filling only val/test would encode the same situation (e.g. a row with no
#    neighbors) as NaN in training but as 0 at validation/test time.
#    fillna returns a new frame, so the (sliced) training frame is not mutated.
df_train_nfa = df_train_nfa.fillna(fill_values)

# 4) reindex val and test to the training schema (adds missing columns as NaN,
#    drops columns the training frame does not have), then fill
df_val_nfa = df_val_nfa.reindex(columns=train_cols).fillna(fill_values)
df_test_nfa = df_test_nfa.reindex(columns=train_cols).fillna(fill_values)

# 5) a count column created by reindex would be float; restore the integer dtype
if "nfa_neighbor_count" in train_cols:
    df_val_nfa["nfa_neighbor_count"] = df_val_nfa["nfa_neighbor_count"].astype(int)
    df_test_nfa["nfa_neighbor_count"] = df_test_nfa["nfa_neighbor_count"].astype(int)

# 6) final guarantee
assert df_train_nfa.columns.equals(df_val_nfa.columns)
assert df_train_nfa.columns.equals(df_test_nfa.columns)

In [None]:
from autogluon.tabular import TabularDataset, TabularPredictor


# Train an AutoGluon predictor on the NFA-augmented training frame, using the
# validation frame as tuning data.
# The model directory is derived from OUTPUT_PATH: the name `path` used here
# previously is never defined in this notebook, so a fresh kernel raised NameError.
# OUTPUT_PATH already ends with a slash, hence no leading slash on the suffix.
predictor = TabularPredictor(
    label="outcome",
    path=OUTPUT_PATH + "Tabular/tabular+nfa/",
    eval_metric="roc_auc",  # or "f1",
).fit(
    train_data=df_train_nfa,
    tuning_data=df_val_nfa,
    time_limit=3600,  # seconds
    presets="medium_quality",
    included_model_types=[
        "GBM",      # LightGBM
        "CAT",      # CatBoost
        "XGB",      # XGBoost
        "RF",       # optional
        "XT",       # optional
        "REALMLP",  # MLP (if available)
    ],
)



In [None]:
# Show the trained models as scored on the validation frame.
# NOTE(review): this frame was also passed to fit() as tuning_data, so these scores
# are likely optimistic compared with the held-out test evaluation.
predictor.leaderboard(df_val_nfa, silent=True).head(20) # Top 20 models

In [None]:
# Predicted class probabilities for the test rows.
proba  = predictor.predict_proba(df_test_nfa)
# Select the entry labelled 1 — presumably the positive-class probability column;
# verify that the "outcome" labels are the integers 0/1.
preds_proba = proba[1]

# Score the predictions with the task's own metrics.
# NOTE(review): no split is passed, so relbench presumably evaluates against the
# test table and expects predictions in that table's row order — confirm.
results = task.evaluate(preds_proba)

In [None]:
# Persist the evaluation result as a one-row table tagged with model and task.
# The task is recorded by its name (the same identifier passed to get_task earlier)
# rather than by the task object itself: interpolating the object put its repr
# into both the "task" column and the output file name.
TASK_NAME = "study-outcome"

df_tab_nfa = pd.DataFrame([results])  # presumably `results` is a dict of metric -> value
df_tab_nfa["model"] = "Tab+NFA"
df_tab_nfa["task"] = TASK_NAME

# NOTE(review): "Trail" in the file name looks like a typo for "Trial"; it is kept
# unchanged in case downstream code looks for this prefix.
df_tab_nfa.to_csv(OUTPUT_PATH + f"Tab+NFA_Trail_{TASK_NAME}.csv")