In [None]:
import os, random, warnings, json, time, ast, itertools
from pathlib import Path
from collections import defaultdict

import joblib
import optuna
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import ipywidgets as w
from ipywidgets import interact, fixed

import networkx as nx
import statsmodels.api as sm
from scipy.stats import rankdata, pointbiserialr
from tqdm.auto import tqdm

from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import StratifiedGroupKFold, GroupShuffleSplit, train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier

from lightgbm import LGBMClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool
import xgboost as xgb
from collections import defaultdict

import sklearn, warnings
from packaging import version
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

from boruta_shap import BorutaShap


In [None]:
# Thread budget handed to the model libraries below.
# os.cpu_count() returns None when the CPU count cannot be determined, so
# fall back to a single thread instead of passing None as a thread count.
N_CPU = os.cpu_count() or 1

In [None]:
# Read the raw sentence tables and sanity-check the training size.
train_raw = pd.read_csv("datasets/train.csv")
test_raw = pd.read_csv("datasets/test.csv")
assert len(train_raw) == 10000, "Expecting 10k training sentences"

In [None]:
# Registry of centrality functions: name -> callable(G) returning {node: score}.
CENT_CORE = {
    "pagerank"    : nx.pagerank,
    "eigenvector" : lambda G: nx.eigenvector_centrality_numpy(G),
    # voterank returns an ordered node list; turn it into a score so that the
    # first-elected node gets the highest value (nodes not listed get no entry).
    "voterank"    : lambda G: {n: r/len(G) for r, n in
                               enumerate(nx.voterank(G)[::-1], 1)},
}

CENT_EXTRA = {
    "degree"      : nx.degree_centrality,
    "closeness"   : nx.closeness_centrality,
    "harmonic"    : nx.harmonic_centrality,
    "katz"        : lambda G: nx.katz_centrality_numpy(G, alpha=0.01),
    "betweenness": nx.betweenness_centrality,
    "load"       : nx.load_centrality,
    # The hard-coded FEATURES lists further down use "subgraph" and
    # "subgraph_rank"; without this entry those columns are never created.
    "subgraph"   : nx.subgraph_centrality,
}

CENT_ALL = CENT_CORE | CENT_EXTRA
LOW_BETTER = {"eccentricity", "farness", "depth_leaf"}#lower rank is better


In [None]:
def build_nodes(df, keep_extra=True):
    """Expand a sentence table into one feature row per graph node.

    Parameters
    ----------
    df : DataFrame with columns ``language``, ``sentence``, ``edgelist`` (a
        Python-literal list of edges) and optionally ``root``.
    keep_extra : bool
        When True, the CENT_EXTRA centralities are computed in addition to
        CENT_CORE.

    Returns
    -------
    DataFrame with one row per node: identifiers, centralities min-max scaled
    inside each graph, structural extras, positional features, per-graph
    ranks / percentiles, ``meta_rank`` features and, when ``root`` is
    available, a binary ``target``.
    """
    has_root = "root" in df.columns          # hoisted: same answer for every row
    # A graph is identified by (language, sentence): the same sentence id is
    # looked up in every language elsewhere in this notebook, so grouping by
    # "sentence" alone would mix nodes of different graphs.
    graph_keys = ["language", "sentence"]

    rows = []
    for _, row in df.iterrows():
        G = nx.from_edgelist(ast.literal_eval(row.edgelist))
        n_tok = len(G)

        cents = {k: f(G) for k, f in CENT_CORE.items()}
        if keep_extra:
            cents.update({k: f(G) for k, f in CENT_EXTRA.items()})

        #structural extras
        ecc = nx.eccentricity(G)
        # reuse closeness when it was already computed as an extra centrality
        clos = cents["closeness"] if "closeness" in cents else nx.closeness_centrality(G)
        farness = {n: (1 / c if c else 0) for n, c in clos.items()}

        leaves = [v for v in G if G.degree(v) == 1]
        sp = dict(nx.all_pairs_shortest_path_length(G))
        # distance from each node to its farthest leaf
        depth_leaf = {v: max(sp[v][l] for l in leaves) for v in G}

        #branchiness = leaves within 2 hops
        branch2 = {
            v: sum(1 for nbr, d in sp[v].items()
                   if d <= 2 and G.degree(nbr) == 1 and nbr != v)
            for v in G
        }

        for n in G:
            feat = {
                "language" : row.language,
                "sentence" : row.sentence,
                "node"     : n,
                "n_tokens" : n_tok,
                **{k: v.get(n, 0.) for k, v in cents.items()},
                "eccentricity": ecc[n],
                "farness"     : farness[n],
                "depth_leaf"  : depth_leaf[n],
                "branch2"     : branch2[n],
            }
            if has_root:
                feat["target"] = int(n == row.root)
            rows.append(feat)

    nodes = pd.DataFrame(rows)

    #scale raw centralities graph-wise
    raw_cols = list(CENT_CORE) + (list(CENT_EXTRA) if keep_extra else [])
    nodes[raw_cols] = nodes.groupby(graph_keys)[raw_cols] \
                           .transform(lambda x: MinMaxScaler().fit_transform(
                                          x.values.reshape(-1, 1)).ravel())

    #position
    nodes["pos_idx"]  = nodes["node"].astype(int) - 1
    nodes["pos_frac"] = nodes["pos_idx"] / (nodes["n_tokens"] - 1)
    nodes["rev_idx"]  = nodes["n_tokens"] - nodes["pos_idx"] - 1

    #ranks + percentiles ("branch2" must match the column name created above)
    rank_cols = raw_cols + ["eccentricity", "farness", "branch2", "depth_leaf"]
    for c in rank_cols:
        lower_is_better = c in LOW_BETTER
        rk = nodes.groupby(graph_keys)[c] \
                  .transform(lambda x, asc=lower_is_better: rankdata(
                      x if asc else -x, method="average"))
        nodes[f"{c}_rank"] = rk
        nodes[f"{c}_pct"]  = rk / (nodes["n_tokens"] - 1)

    core_pct = [f"{c}_pct" for c in ["pagerank", "eigenvector", "voterank"]]
    nodes["meta_rank"] = nodes[core_pct].mean(axis=1)
    # residual of meta_rank against the per-language mean
    nodes["meta_rank_resid"] = nodes["meta_rank"] - \
                               nodes.groupby("language")["meta_rank"].transform("mean")
    return nodes

In [None]:
# Expand both splits into node-level feature tables.
train_nodes, test_nodes = (build_nodes(raw, keep_extra=True)
                           for raw in (train_raw, test_raw))

# Sentence-length buckets: right-closed intervals between consecutive edges,
# labelled 0..len(edges)-2.
BUCKET_EDGES  = [-1,5,10,15,20,25,30,40,1e9]
BUCKET_LABELS = range(len(BUCKET_EDGES)-1)
for df in (train_nodes, test_nodes):
    df["bucket"] = pd.cut(df["n_tokens"], bins=BUCKET_EDGES, labels=BUCKET_LABELS)
    df[["language","bucket"]] = df[["language","bucket"]].astype("category")


## EDA

In [None]:
#stats: sentence-length summary per language (train), longest mean first
summary = (
    train_nodes
      .drop_duplicates(subset=["language", "sentence"])
      .groupby(by="language")["n_tokens"]
      .agg(["count", "mean", "median", "min", "max", "std"])
      .sort_values(by="mean", ascending=False)
)
print(summary.head(21))

In [None]:
# sentence-length summary per language (test), longest mean first
summary = (
    test_nodes
      .drop_duplicates(subset=["language", "sentence"])
      .groupby(by="language")["n_tokens"]
      .agg(["count", "mean", "median", "min", "max", "std"])
      .sort_values(by="mean", ascending=False)
)
print(summary.head(21))

In [None]:
# Length-based behaviour recognition:
# for every language, scatter the root's relative position against sentence
# length with a LOWESS trend line, then print the mean root position per
# length bucket.

# pd.cut builds right-closed intervals, so the second bucket is (5, 10],
# i.e. lengths 6-10 (the label used to read "5–10").
LEN_BIN_EDGES  = [-1, 5, 10, 15, 20, 25, 30, 40, 1e9]
LEN_BIN_LABELS = ["≤5","6–10","11–15","16–20","21–25","26–30","31–40","41+"]

langs = sorted(train_nodes.language.unique())
for LANG in langs:
    roots = (
        train_nodes.query("language == @LANG and target==1")
                   .drop_duplicates("sentence")
    )

    x = roots["n_tokens"].values
    y = roots["pos_frac"].values

    fig, ax = plt.subplots(figsize=(6,4))
    ax.scatter(x, y, s=10, alpha=0.4)

    # LOWESS smoothing (fraction controls window)
    lowess = sm.nonparametric.lowess(y, x, frac=0.2)
    ax.plot(lowess[:,0], lowess[:,1], linewidth=2)

    ax.set_xlabel("sentence length (tokens)")
    ax.set_ylabel("root relative position (0 = first, 1 = last)")
    ax.set_title(f"{LANG}: root position vs length")
    plt.tight_layout()
    plt.show()
    plt.close(fig)   # one figure per language: release it once shown

    bins = pd.cut(roots["n_tokens"], bins=LEN_BIN_EDGES, labels=LEN_BIN_LABELS)
    # observed=False keeps empty buckets in the table regardless of the
    # pandas default for categorical groupers
    bucket_stats = roots.groupby(bins, observed=False)["pos_frac"].agg(["mean", "count"])
    print(bucket_stats)

In [None]:
#sentence-length distribution
# One row per (language, sentence) with its token count, for each split.
sent_train = train_nodes.drop_duplicates(subset=['language', 'sentence'])[['language', 'n_tokens']]

sent_test = test_nodes.drop_duplicates(subset=['language', 'sentence'])[['language', 'n_tokens']]

def make_bins(data, mode='unit', width=5):
    """Return histogram bin edges for a series of sentence lengths.

    mode='unit'  -> one bin per integer length, ``range(min, max + 2)``.
    otherwise    -> ``width``-wide bins starting at 1; the edges reach at
                    least 66 and are extended when the data goes beyond that,
                    so long sentences are not silently dropped from the plot.
    Empty ``data`` yields the default bucket edges in either mode.
    """
    default_edges = list(range(1, 71, width))
    if len(data) == 0:
        return default_edges

    if mode == 'unit':
        lo, hi = int(data.min()), int(data.max())
        return range(lo, hi + 2)

    top = int(data.max())
    # smallest edge of the form 1 + k*width that is >= top
    last_edge = 1 + width * -(-(top - 1) // width)
    return list(range(1, max(71, last_edge + 1), width))

def plot_len_hist(language, dataset='train', binning='unit'):
    """Histogram of sentence lengths for one language with q05/q95 markers.

    ``dataset`` selects the train table when equal to 'train', otherwise the
    test table; ``binning`` is forwarded to ``make_bins``.
    """
    source = sent_test if dataset != 'train' else sent_train
    lengths = source.loc[source.language == language, 'n_tokens']
    edges = make_bins(lengths, binning)
    low_q, high_q = lengths.quantile([0.05, 0.95]).values

    plt.figure(figsize=(6, 4))
    plt.hist(lengths, bins=edges)

    # dashed vertical markers for the 5% and 95% quantiles
    for position, tag in ((low_q, 'q05'), (high_q, 'q95')):
        plt.axvline(position, linestyle='--', label=tag)

    plt.xlabel('sentence length (tokens)')
    plt.ylabel('number of sentences')
    plt.title(f'Sentence-length distribution – {language} ({dataset})')
    plt.legend()
    plt.tight_layout()
    plt.show()

# Interactive picker: language / split / binning mode for plot_len_hist.
langs = sorted(sent_train["language"].unique())
interact(
    plot_len_hist,
    language=langs,
    dataset=['train', 'test'],
    binning={'exact (bin = 1)': 'unit', 'bucketised': 'bucket'},
)

In [None]:
#language vs root (absolute and relative)

# Rows of the root nodes only, with the columns the plot needs.
root_nodes = train_nodes.loc[
    train_nodes["target"] == 1,
    ["language", "sentence", "node", "n_tokens"],
].copy()

# root node id divided by the sentence length
root_nodes["rel"] = root_nodes["node"] / root_nodes["n_tokens"]
# languages in order of first appearance in the training table
lang_order = train_nodes["language"].drop_duplicates().tolist()

def plot_sentence(sent_id: int, view: str = "absolute"):
    """Plot, for one sentence id, where the root sits in every language.

    view="absolute": bars with the token count plus a marker at the root id.
    any other value: a marker at root id / token count on a 0-1 axis.
    Languages that have no row for ``sent_id`` are left blank on the x axis
    instead of raising a KeyError.
    """
    per_lang = root_nodes[root_nodes["sentence"] == sent_id].set_index("language")
    available = set(per_lang.index)
    # keep the global language order, restricted to languages that have this sentence
    present = [lang for lang in lang_order if lang in available]
    df = per_lang.loc[present].reset_index()

    fig = go.Figure()

    if view == "absolute":
        #bars for sentence length
        fig.add_bar(
            x=df["language"], y=df["n_tokens"],
            name="#tokens",
            marker_color="cornflowerblue",
            text=df["n_tokens"], textposition="outside"
        )
        #dots for root node id
        fig.add_scatter(
            x=df["language"], y=df["node"],
            mode="markers",
            name="root",
            marker=dict(size=11, color="crimson")
        )
        y_title = "value"
    else:  #relative
        #dots at node/length
        fig.add_scatter(
            x=df["language"], y=df["rel"],
            mode="markers",
            name="root / length",
            marker=dict(size=11, color="crimson")
        )
        fig.update_yaxes(range=[0, 1])
        y_title = "relative root position (0–1)"

    fig.update_layout(
        title=f"Sentence {sent_id}: {'absolute' if view=='absolute' else 'relative'} view",
        yaxis_title=y_title,
        # the full language list fixes the x-axis slots, present or not
        xaxis=dict(categoryorder="array", categoryarray=lang_order),
        bargap=0.28 if view == "absolute" else 0.4,
        width=980, height=480,
        legend=dict(yanchor="top", y=0.97, xanchor="left", x=0.01)
    )

    fig.show()

# Widgets driving plot_sentence: a slider over sentence ids and a view selector.
sent_ids = sorted(root_nodes["sentence"].unique())

sent_slider = w.SelectionSlider(
    options=sent_ids,
    value=sent_ids[0],
    description="sentence:",
    continuous_update=False,
)

view_dd = w.Dropdown(
    options=[("absolute", "absolute"), ("relative (0–1)", "relative")],
    value="absolute",
    description="view:",
)

ui = w.VBox([sent_slider, view_dd])
out = w.interactive_output(plot_sentence, {"sent_id": sent_slider, "view": view_dd})

display(ui, out)


In [None]:
# Guard so that re-running the cell does not recompute the prior columns.
if "lang_len_prior" not in train_nodes.columns:
    def _add_prior(train_nodes, other_nodes):
        """Attach the mean root position per (language, bucket) to both frames.

        The table is built from the root rows of ``train_nodes`` and written
        in place as ``lang_len_prior``; ``dist_from_prior`` is each node's
        ``pos_frac`` minus that prior. Pairs absent from the table get NaN.
        NOTE(review): the prior uses every training root, so rows later used
        for validation contribute to it - confirm this is acceptable.
        """
        is_root = train_nodes.target == 1
        tbl = (train_nodes.loc[is_root, ["language", "bucket", "pos_frac"]]
                          .groupby(["language", "bucket"])["pos_frac"]
                          .mean())

        for df in (train_nodes, other_nodes):
            keys = pd.MultiIndex.from_arrays([df["language"], df["bucket"]])
            prior = tbl.reindex(keys).to_numpy()
            df["lang_len_prior"]  = prior
            df["dist_from_prior"] = df["pos_frac"] - prior

    _add_prior(train_nodes, test_nodes)

In [None]:
#centralities
EXTRA_PCT = ["branch2_pct", "depth_leaf_pct"]
# CENT_ALL already contains "voterank", so it is not appended a second time
# (a duplicated value_var would be melted twice). The earlier assignment that
# also appended EXTRA_PCT was immediately overwritten, so it is dropped here.
CENT_PCT = [c + "_pct" for c in CENT_ALL]

# For every (language, sentence, centrality) rank the nodes by percentile and
# keep the rank obtained by the true root.
rank_df = (
    train_nodes
      .melt(id_vars=["language","sentence","node","target"],
            value_vars=CENT_PCT, var_name="cent", value_name="val")
      .assign(rank=lambda d:
              d.groupby(["language","sentence","cent"])["val"]
                .rank(method="min", ascending=True))   # pct: smaller is better
      .loc[lambda d: d.target==1]
)

# mean root rank per language (rows) and centrality (columns)
disp = (rank_df.groupby(["language","cent"])["rank"]
               .mean().unstack())
disp.style.format("{:.1f}").background_gradient("YlGn_r")


In [None]:
# num_cols = [c for c in train_nodes.columns
#             if pd.api.types.is_numeric_dtype(train_nodes[c])
#             and c not in ["target"]]
# corr_tb = {c: pointbiserialr(train_nodes["target"], train_nodes[c])[:2]
#            for c in num_cols}
# corr_df = pd.DataFrame(corr_tb, index=["r","p"]).T.sort_values("r")
# display(corr_df.tail(15))


In [None]:
# Absolute Spearman correlation between positional features and all *_pct columns.
pct_columns = [name for name in train_nodes.columns if name.endswith('_pct')]
cols = ['pos_frac','rev_idx','lang_len_prior','dist_from_prior'] + pct_columns

spearman = train_nodes[cols].corr(method="spearman")
plt.figure(figsize=(14,10))
sns.heatmap(spearman.abs(), cmap="viridis")
plt.show()


In [None]:
#filling NaNs
# count the nodes whose (language, bucket) prior could not be looked up
n_nan_train = train_nodes["lang_len_prior"].isna().sum()
n_nan_test  = test_nodes["lang_len_prior"].isna().sum()
print("NaNs in train:", n_nan_train)
print("NaNs in  test:", n_nan_test)
def missing_pairs(df):
    """Count rows without ``lang_len_prior`` per (language, bucket) pair.

    Returns a frame with columns ``language``, ``bucket`` and ``rows``, sorted
    by ``rows`` descending. Only pairs that actually occur among the missing
    rows are listed: with categorical columns ``value_counts`` can also emit
    every unobserved category combination with a count of 0, which would
    wrongly show up as a "missing pair".
    """
    counts = (
        df.loc[df["lang_len_prior"].isna(), ["language", "bucket"]]
          .value_counts()
          .rename("rows")
          .reset_index()
    )
    return counts.loc[counts["rows"] > 0].sort_values("rows", ascending=False)

# Show which (language, bucket) pairs have no prior, for each split.
for header, nodes in (("\nMissing pairs in TRAIN:", train_nodes),
                      ("\nMissing pairs in TEST:", test_nodes)):
    print(header)
    display(missing_pairs(nodes))


In [None]:
# Mean root position per (language, bucket), computed from training roots.
prior_tbl = (
    train_nodes.loc[train_nodes.target == 1]
               .groupby(["language", "bucket"])["pos_frac"]
               .mean()
)
# test rows for Japanese sentences in bucket 0
j0 = (test_nodes["language"] == "Japanese") & (test_nodes["bucket"] == 0)

if j0.any():
    # borrow the Japanese prior of bucket 1 for those rows
    jp_prior_b1 = prior_tbl[("Japanese", 1)] #mean pos_frac for bucket 1
    test_nodes.loc[j0, "lang_len_prior"]  = jp_prior_b1
    test_nodes.loc[j0, "dist_from_prior"] = test_nodes.loc[j0, "pos_frac"] - jp_prior_b1

In [None]:
# Re-check the missing (language, bucket) pairs after the manual fill.
for header, nodes in (("\nMissing pairs in TRAIN:", train_nodes),
                      ("\nMissing pairs in TEST:", test_nodes)):
    print(header)
    display(missing_pairs(nodes))

Feature selection with BorutaShap:

In [None]:
# Candidate features: every column except identifiers and the label.
FEATURES = [c for c in train_nodes.columns
            if c not in ["language","sentence","node","target"]]

X_full = train_nodes[FEATURES]
y_full = train_nodes["target"].values
sid    = train_nodes["sentence"].values

# LightGBM model whose SHAP importances drive the selection.
boruta_model = LGBMClassifier(
    n_estimators=400, learning_rate=0.03,
    num_leaves=63, max_depth=-1, random_state=0
)
boruta = BorutaShap(
    model=boruta_model,
    importance_measure="shap",
    classification=True
)
N_ROWS = X_full.shape[0]
# each node is weighted by 1 / n_tokens of its sentence
boruta.fit(
    X=X_full, y=y_full,
    sample_weight=1.0 / train_nodes["n_tokens"],
    n_trials=100,
    sample=False,
    stratify=y_full,
    verbose=False
)

FEATURES = boruta.Subset().columns.tolist()
print("Boruta kept", len(FEATURES), "features:")
print(FEATURES)


In [None]:
# Deduplicated feature list kept from a previous Boruta clean-up; it
# overrides the FEATURES selected by the BorutaShap run above.
FEATURES = ['subgraph', 'pagerank_pct', 'branch2_pct',
            'degree_pct', 'betweenness_pct', 'closeness',
            'meta_rank_resid', 'eigenvector', 'farness',
            'voterank_pct', 'harmonic_rank',
            'lang_len_prior', 'degree',
            'subgraph_rank', 'branch2_rank',
            'dist_from_prior', 'pos_frac',
            'language', 'bucket']

----

### Modelling

In [None]:
def per_language_sentence_accuracy(y, p, sid, lang):
    """Top-1 accuracy per language.

    For every (lang, sid) group the row with the highest score ``p`` is taken
    as the prediction; the result is the mean of ``y`` over those rows,
    indexed by language.

    ``observed=True`` restricts the grouping to pairs that actually occur.
    With a categorical ``lang`` the default grouping can also create empty
    (lang, sid) groups, for which ``idxmax`` has no valid row label.
    """
    df = pd.DataFrame({"sid": sid, "lang": lang, "y": y, "p": p})
    idx_top = df.groupby(["lang", "sid"], observed=True)["p"].idxmax()
    df_top = df.loc[idx_top]

    return df_top.groupby("lang", observed=True)["y"].mean()


In [None]:
#logreg

from collections import defaultdict
from sklearn.metrics import roc_auc_score, average_precision_score

def sentence_level_table(y, p, sid, lang):
    """Return the top-scored node of every (language, sentence) pair.

    Columns: ``sid``, ``language``, ``y`` (true label of the selected node)
    and ``p`` (its score). Grouping uses both language and sentence id, so
    the same sentence id in different languages gives one row per language.
    """
    nodes = pd.DataFrame({"sid": sid, "language": lang, "y": y, "p": p})
    top_idx = nodes.groupby(["language", "sid"], observed=True)["p"].idxmax()
    return nodes.loc[top_idx]

def update_lang_counters(df_sent, lang_ok, lang_tot):
    """Accumulate per-language hits and totals from a sentence-level table.

    ``df_sent`` holds one selected node per row with columns ``y`` and
    ``language``; a row is a hit when ``y == 1``. Both counter mappings are
    updated in place.
    """
    is_hit = df_sent["y"].eq(1).astype(int)
    for language, hits in is_hit.groupby(df_sent["language"]):
        lang_tot[language] += hits.size
        lang_ok[language]  += hits.sum()

# Column roles for the linear baseline.
CAT_COLS  = ['language', 'bucket']
NUM_COLS  = [c for c in FEATURES if c not in CAT_COLS]

X_full = train_nodes[FEATURES]
y_full = train_nodes["target"].values
sid    = train_nodes["sentence"].values

# The OneHotEncoder sparsity flag is spelled `sparse_output` from
# scikit-learn 1.2 on and `sparse` before that.
ohe_kwargs = dict(handle_unknown="ignore")
sparse_flag = ("sparse_output"
               if version.parse(sklearn.__version__) >= version.parse("1.2")
               else "sparse")
ohe_kwargs[sparse_flag] = True

ohe  = OneHotEncoder(**ohe_kwargs)
# one-hot the categorical columns, pass numeric ones through, drop the rest
prep = ColumnTransformer(
        transformers=[("cat", ohe, CAT_COLS),
                      ("num", "passthrough", NUM_COLS)],
        remainder="drop")

# L1-penalised logistic regression
logit = LogisticRegression(
            penalty="l1", solver="liblinear",
            C=0.8, max_iter=300, random_state=0)

pipe = Pipeline(steps=[("prep", prep), ("clf", logit)])

import numpy as np, pandas as pd
from sklearn.metrics import accuracy_score
def acc_at1(y_true, y_pred, sent_ids, lang=None):
    """Top-1 accuracy: share of groups whose highest-scored node has y == 1.

    Parameters
    ----------
    y_true, y_pred : node labels and node scores.
    sent_ids : sentence id of every node.
    lang : optional language of every node. When given, groups are
        (lang, sentence) pairs, so the same sentence id in different
        languages is scored separately; when omitted, nodes are grouped by
        sentence id alone.
    """
    nodes = pd.DataFrame({"sid": sent_ids, "y": y_true, "p": y_pred})
    keys = ["sid"]
    if lang is not None:
        # plain array: avoids index alignment and categorical grouping
        nodes["lang"] = np.asarray(lang)
        keys = ["lang", "sid"]
    top = nodes.loc[nodes.groupby(keys)["p"].idxmax(), "y"]
    # fraction of selected nodes that are true roots
    return float((top == 1).mean())

#outer 5-fold SGKF
from sklearn.model_selection import StratifiedGroupKFold
sent_ids = train_nodes.drop_duplicates("sentence")["sentence"].to_numpy()
# stratification label per sentence id: its most frequent length bucket
bucket_by_sid = (train_nodes
                 .groupby("sentence")["bucket"]
                 .agg(lambda s: s.value_counts().idxmax())
                 .astype(str)
                 .reindex(sent_ids))

length_labels = bucket_by_sid.to_numpy()

sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

acc_outer      = []
node_true_all  = []
node_pred_all  = []
lang_ok        = defaultdict(int)
lang_tot       = defaultdict(int)

for tr_pos, va_pos in sgkf.split(np.zeros_like(sent_ids),
                                 y=length_labels,
                                 groups=sent_ids):

    sent_tr = sent_ids[tr_pos]
    sent_va = sent_ids[va_pos]

    tr_mask = train_nodes["sentence"].isin(sent_tr)
    va_mask = train_nodes["sentence"].isin(sent_va)

    pipe.fit(X_full.loc[tr_mask], y_full[tr_mask],
             clf__sample_weight=1.0 /
                                train_nodes.loc[tr_mask, "n_tokens"])

    #node-level predictions on current outer-val fold
    y_pred_nodes = pipe.predict_proba(X_full.loc[va_mask])[:, 1]
    y_true_nodes = y_full[va_mask]

    node_pred_all.append(y_pred_nodes)
    node_true_all.append(y_true_nodes)

    # One prediction per (language, sentence): the same sentence id occurs in
    # several languages, so picking the arg-max per sentence id alone would
    # keep a single node across all of them. The language is taken as a
    # plain array so that only observed pairs form groups.
    df_val = pd.DataFrame({"sid": sid[va_mask],
                           "language": train_nodes.loc[va_mask, "language"].to_numpy(),
                           "y": y_true_nodes,
                           "p": y_pred_nodes})
    df_val_sent = df_val.loc[df_val.groupby(["language", "sid"])["p"].idxmax()]
    is_hit = df_val_sent["y"] == 1

    #sentence-level accuracy
    acc_outer.append(float(is_hit.mean()))

    #per-language counts
    for g, cnt in is_hit.groupby(df_val_sent["language"]):
        lang_ok[g]  += cnt.sum()
        lang_tot[g] += cnt.size


print(f"LogReg outer-CV accuracy@1 : "
      f"{np.mean(acc_outer):.3f} ± {np.std(acc_outer):.3f}")

# per-language sentence accuracy
print("\nPer-language sentence accuracy:")
for g, a in sorted({g: lang_ok[g]/lang_tot[g] for g in lang_ok}.items(),
                   key=lambda x: -x[1]):
    print(f"  {g:>3s} : {a:.3f}")

In [None]:
# Per-language tally for the most recent validation fold (the variables left
# over from the CV loop). The counters receive sentence counts: hits and
# totals per language, not the number of languages in the fold.
# NOTE(review): this adds the last fold on top of whatever lang_ok / lang_tot
# already hold from the loop above - confirm that is intended.
fold_nodes = pd.DataFrame({"sid": sid[va_mask],
                           "lang": train_nodes.loc[va_mask, "language"].to_numpy(),
                           "y": y_true_nodes,
                           "p": y_pred_nodes})
fold_top = fold_nodes.loc[fold_nodes.groupby(["lang", "sid"])["p"].idxmax()]
acc_lang_fold = fold_top.groupby("lang")["y"].mean()
for g, grp in fold_top.groupby("lang"):
    lang_ok[g]  += int(grp["y"].sum())
    lang_tot[g] += len(grp)


In [None]:
# Report accumulated per-language accuracy, best language first.
print("\nPer-language sentence accuracy:")
lang_accuracy = {g: lang_ok[g]/lang_tot[g] for g in lang_tot}
for g, a in sorted(lang_accuracy.items(), key=lambda item: -item[1]):
    print(f"  {g:>10s} : {a:.3f}")


In [None]:
def collapse_by_lang_sentence(y, p, sid, lang):
    """Return df with one row per (language, sentence).

    The kept row is the node with the highest score ``p`` in its group.
    ``observed=True`` limits the groups to pairs that occur in the data;
    with a categorical ``lang`` the default grouping may also create empty
    groups, for which ``idxmax`` cannot return a row label.
    """
    df_nodes = pd.DataFrame({"sid": sid, "lang": lang, "y": y, "p": p})
    idx_best = df_nodes.groupby(["lang", "sid"], observed=True)["p"].idxmax()
    return df_nodes.loc[idx_best]

def update_lang_totals(df_sent, correct_tot, all_tot):
    """Add this fold's per-language hit and sentence counts to the counters.

    ``df_sent`` has one row per predicted sentence with columns ``lang`` and
    ``y`` (1 when the selected node is the root). Both mappings are updated
    in place.
    """
    for language, rows in df_sent.groupby("lang"):
        correct_tot[language] += rows["y"].sum()
        all_tot[language]     += len(rows)

# Outer CV for the logistic baseline, scored per (language, sentence).
acc_global_cv  = []
lang_correct   = defaultdict(int)
lang_total     = defaultdict(int)
node_pred_all  = []
node_true_all  = []

for tr_pos, va_pos in sgkf.split(np.zeros_like(sent_ids), y=length_labels, groups=sent_ids):

    sent_tr, sent_va = sent_ids[tr_pos], sent_ids[va_pos]

    tr_mask = train_nodes["sentence"].isin(sent_tr)
    va_mask = train_nodes["sentence"].isin(sent_va)

    # each training node is weighted by 1 / n_tokens of its sentence
    pipe.fit(
        X_full.loc[tr_mask],
        y_full[tr_mask],
        clf__sample_weight=1.0 / train_nodes.loc[tr_mask, "n_tokens"],
    )

    y_true_nodes = y_full[va_mask]
    y_pred_nodes = pipe.predict_proba(X_full.loc[va_mask])[:, 1]
    node_pred_all.append(y_pred_nodes)
    node_true_all.append(y_true_nodes)

    # one top-scored node per (language, sentence) in the validation fold
    df_sent = collapse_by_lang_sentence(
        y_true_nodes, y_pred_nodes, sid[va_mask],
        train_nodes.loc[va_mask, "language"],
    )

    acc_global_cv.append(df_sent["y"].mean())
    update_lang_totals(df_sent, lang_correct, lang_total)

print(f"LogReg outer-CV accuracy@1 : "
      f"{np.mean(acc_global_cv):.3f} ± {np.std(acc_global_cv):.3f}")

print("\nPer-language sentence accuracy:")
lang_scores = {g: lang_correct[g]/lang_total[g] for g in lang_total}
for g, acc in sorted(lang_scores.items(), key=lambda x: -x[1]):
    print(f"  {g:>10s} : {acc:.3f}   (n={lang_total[g]})")

# optional node-level metrics
# y_nodes = np.concatenate(node_true_all)
# p_nodes = np.concatenate(node_pred_all)
# print(f"\nNode-level ROC-AUC          : {roc_auc_score(y_nodes, p_nodes):.3f}")
# print(f"Node-level average precision : "
#       f"{average_precision_score(y_nodes, p_nodes):.3f}")


In [None]:
# 5 boosting models + Random Forest
# outer 5-fold StratifiedGroupKFold on (sentence, bucket)
# inner 90 / 10 GroupShuffleSplit + Optuna tuning (no group leakage)
# models: LightGBM (classifier and LambdaRank ranker), CatBoost, XGBoost,
#         HistGradientBoosting, Random Forest
# accuracy@1 on (language, sentence) + per-language report
# seeded RNGs and sampler, Optuna SuccessiveHalving pruner

# Seed the Python and NumPy RNGs and the hash seed; SEED is reused by every model.
SEED = 42
random.seed(SEED); np.random.seed(SEED); os.environ["PYTHONHASHSEED"] = str(SEED)
optuna.logging.set_verbosity(optuna.logging.WARNING)
PRUNER = optuna.pruners.SuccessiveHalvingPruner(min_resource=10)

# Feature set for the tree models: the earlier hand-picked list plus "n_tokens".
FEATURES = ["subgraph","pagerank_pct","branch2_pct","degree_pct",
            "betweenness_pct","closeness","meta_rank_resid","eigenvector",
            "farness","voterank_pct","harmonic_rank","lang_len_prior",
            "degree","subgraph_rank","branch2_rank","dist_from_prior",
            "pos_frac","n_tokens","language","bucket"]
CAT_COLS  = ["language","bucket"]

# Node-level arrays aligned row by row with train_nodes.
X_full    = train_nodes[FEATURES]
y_full    = train_nodes["target"].to_numpy()
sid_full  = train_nodes["sentence"].astype("int64").to_numpy()
lang_full = train_nodes["language"].to_numpy()
# sample weight 1 / sqrt(n_tokens) of the node's sentence
w_full    = 1.0 / np.sqrt(train_nodes["n_tokens"].to_numpy())
# positional indices of the categorical columns inside X_full
CAT_IDX   = [X_full.columns.get_loc(c) for c in CAT_COLS]

# numeric frame for models that need it
# (categorical columns replaced by ordinal codes; unknown categories map to -1)
cat_enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
X_enc = X_full.copy()
X_enc[CAT_COLS] = cat_enc.fit_transform(X_full[CAT_COLS])

In [None]:
import time, sys
class _ConsoleLogger:
    def __init__(self):
        self.best = None
        self.best_id = None
    def __call__(self, study, trial):
        self.best     = study.best_value
        self.best_id  = study.best_trial.number
        msg = (f"[I] {time.strftime('%H:%M:%S')}  Trial {trial.number} "
               f"finished value: {trial.value:.6f}.  "
               f"Best is {self.best_id} ({self.best:.6f})")
        print(msg, file=sys.stderr)


def timeit_step(tag):
    """Decorator factory: report via ``tqdm.write`` how long each call took.

    ``tag`` is the label used in the message. The wrapped function's return
    value is passed through unchanged; its name and docstring are preserved.
    """
    import functools

    def _wrap(fn):
        @functools.wraps(fn)
        def inner(*a, **k):
            # monotonic clock: unaffected by system clock adjustments
            t0 = time.perf_counter()
            res = fn(*a, **k)
            tqdm.write(f"      ↳ {tag} took {time.perf_counter()-t0:5.1f}s")
            return res
        return inner
    return _wrap


In [None]:
def make_group_val_split(X, y, groups, val=0.10, seed=SEED):
    """Single group-aware train/validation split.

    Returns the ``(train_idx, val_idx)`` pair of the one split produced by
    ``GroupShuffleSplit`` with ``test_size=val`` and ``random_state=seed``.
    """
    splitter = GroupShuffleSplit(n_splits=1, test_size=val, random_state=seed)
    train_idx, val_idx = next(splitter.split(X, y, groups))
    return train_idx, val_idx

def acc_at1(y, p, sid, lang):
    """Accuracy@1 over (language, sentence) groups.

    For each group the node with the highest score ``p`` is the prediction;
    the result is the fraction of groups whose selected node has ``y == 1``.
    ``observed=True`` keeps the grouping to pairs present in the data, so a
    categorical ``lang`` cannot introduce empty groups.
    """
    df = pd.DataFrame({"sid": sid, "lang": lang, "y": y, "p": p})
    top = df.loc[df.groupby(["lang", "sid"], observed=True)["p"].idxmax(), "y"]
    return (top == 1).mean()

def lang_acc_series(y, p, sid, lang):
    """Accuracy@1 per language as a Series indexed by language.

    The top-scored node of every (language, sentence) group is selected and
    ``y`` is averaged per language. ``observed=True`` limits both groupings
    to values present in the data (relevant when ``lang`` is categorical).
    """
    df = pd.DataFrame({"sid": sid, "lang": lang, "y": y, "p": p})
    idx = df.groupby(["lang", "sid"], observed=True)["p"].idxmax()
    return df.loc[idx].groupby("lang", observed=True)["y"].mean()

def _split_inner(X, y, sid, lang, w):
    """Split all node-level inputs into inner train / validation parts.

    The split is grouped by ``sid``. Returns, in order: X, y, sid, lang and w,
    each as a (train, validation) pair flattened into one 10-tuple.
    """
    tr, va = make_group_val_split(X, y, sid)
    parts = [X.iloc[tr], X.iloc[va]]
    for arr in (y, sid, lang, w):
        parts.extend((arr[tr], arr[va]))
    return tuple(parts)

def _group_lengths_lang_sid(lang, sid):
    """Group lengths for each (lang, sid) in the original row order."""
    key = pd.Series(list(zip(lang, sid)))
    return (key.value_counts(sort=False).loc[key.unique()].to_numpy())


In [None]:
#funcs for optuna
def obj_lgb(trial, X, y, sid, lang, w):
    """Optuna objective for the LightGBM binary classifier.

    Splits the inputs into inner train / validation parts, fits an
    LGBMClassifier (up to 3000 trees, early stopping after 50 rounds on the
    weighted validation set) with hyper-parameters suggested by ``trial``,
    stores the best iteration in the trial user attribute ``n_round`` and
    returns ``1 - accuracy@1`` on the validation part (lower is better).
    """
    Xt, Xv, yt, yv, sidt, sidv, langt, langv, wt, wv = _split_inner(X, y, sid, lang, w)
    # NOTE: the order of the suggest_* calls is part of the sampled sequence.
    params = dict(objective="binary", metric="binary_logloss", random_state=SEED,
                  learning_rate=trial.suggest_float("learning_rate",0.01,0.2,log=True),
                  num_leaves=trial.suggest_int("num_leaves",31,255),
                  feature_fraction=trial.suggest_float("feature_fraction",0.6,1.0),
                  bagging_fraction=trial.suggest_float("bagging_fraction",0.6,1.0),
                  bagging_freq=5,
                  num_threads=N_CPU,
                  lambda_l1=trial.suggest_float("lambda_l1",0.0,2.0),
                  lambda_l2=trial.suggest_float("lambda_l2",0.0,2.0),
                  verbose=-1, force_col_wise=True)
    mdl = lgb.LGBMClassifier(**params, n_estimators=3000)
    mdl.fit(Xt, yt, sample_weight=wt, categorical_feature=CAT_IDX,
            eval_set=[(Xv, yv)], eval_sample_weight=[wv],
            callbacks=[lgb.early_stopping(50, verbose=False)])
    # fall back to the full budget when no best iteration is reported
    trial.set_user_attr("n_round", mdl.best_iteration_ or 3000)
    p = mdl.predict_proba(Xv, num_iteration=mdl.best_iteration_)[:,1]
    return 1 - acc_at1(yv, p, sidv, langv)


def obj_lgb_rank(trial, X, y, sid, lang, w):
    """Optuna objective for the LightGBM LambdaRank ranker.

    Each (language, sentence) pair is one ranking group. Fits an LGBMRanker
    (up to 3000 trees, early stopping after 50 rounds) with hyper-parameters
    suggested by ``trial``, stores the best iteration in the trial user
    attribute ``n_round`` and returns ``1 - accuracy@1`` of the ranking
    scores on the inner validation part.
    """
    Xt, Xv, yt, yv, sidt, sidv, langt, langv, wt, wv = \
        _split_inner(X, y, sid, lang, w)

    # group sizes in row order for the train and validation parts
    g_tr = _group_lengths_lang_sid(langt, sidt)
    g_va = _group_lengths_lang_sid(langv, sidv)

    params = dict(
        objective        = "lambdarank",
        metric           = "ndcg",
        eval_at          = [1],
        random_state     = SEED,
        learning_rate    = trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        num_leaves       = trial.suggest_int("num_leaves", 31, 255),
        feature_fraction = trial.suggest_float("feature_fraction", 0.6, 1.0),
        bagging_fraction = trial.suggest_float("bagging_fraction", 0.6, 1.0),
        bagging_freq     = 5,
        lambda_l1        = trial.suggest_float("lambda_l1", 0.0, 2.0),
        lambda_l2        = trial.suggest_float("lambda_l2", 0.0, 2.0),
        verbose          = -1,
        num_threads      = N_CPU,
    )

    rk = lgb.LGBMRanker(**params, n_estimators=3000)
    rk.fit(Xt, yt,
           group=g_tr,
           eval_set=[(Xv, yv)],
           eval_group=[g_va],
           sample_weight=wt,
           eval_sample_weight=[wv],
           callbacks=[lgb.early_stopping(50, verbose=False)])

    # fall back to the full budget when no best iteration is reported
    trial.set_user_attr("n_round", rk.best_iteration_ or 3000)

    # ranking scores (not probabilities); only their order inside a group matters
    p_val = rk.predict(Xv, num_iteration=rk.best_iteration_)
    return 1 - acc_at1(yv, p_val, sidv, langv)


def obj_cat(trial, X, y, sid, lang, w):
    """Optuna objective for the CatBoost classifier.

    Fits a CatBoostClassifier (up to 3000 iterations, early stopping after 50
    rounds on the validation pool) with hyper-parameters suggested by
    ``trial``, stores the number of boosting rounds to reuse in the trial
    user attribute ``n_round`` and returns ``1 - accuracy@1`` on the inner
    validation part.
    """
    Xt, Xv, yt, yv, sidt, sidv, langt, langv, wt, wv = _split_inner(X, y, sid, lang, w)
    train_pool = Pool(data=Xt, label=yt, cat_features=CAT_IDX, weight=wt)
    valid_pool = Pool(data=Xv, label=yv, cat_features=CAT_IDX, weight=wv)

    params = dict(
        loss_function="Logloss",
        random_seed=SEED,
        verbose=False,
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        depth=trial.suggest_int("depth", 4, 10),
        l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1.0, 10.0),
        border_count=trial.suggest_int("border_count", 32, 255),
        thread_count=N_CPU,
    )

    mdl = CatBoostClassifier(**params, iterations=3000)
    mdl.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=50)

    # get_best_iteration() is a zero-based index: the number of rounds is
    # index + 1, and index 0 is a valid result rather than "missing".
    best_iter = mdl.get_best_iteration()
    n_round = 3000 if best_iter is None else best_iter + 1
    trial.set_user_attr("n_round", n_round)

    p = mdl.predict_proba(valid_pool)[:, 1]
    return 1 - acc_at1(yv, p, sidv, langv)


def obj_xgb(trial, X, y, sid, lang, w):
    """Optuna objective for XGBoost (native ``xgb.train`` API).

    Categorical columns are replaced by their integer category codes. Trains
    up to 4000 rounds with early stopping after 50 rounds on the validation
    DMatrix, stores the number of boosting rounds to reuse in the trial user
    attribute ``n_round`` and returns ``1 - accuracy@1`` on the inner
    validation part.
    """
    Xt, Xv, yt, yv, sidt, sidv, langt, langv, wt, wv = _split_inner(X, y, sid, lang, w)
    Xt = Xt.copy();  Xv = Xv.copy()
    for c in CAT_COLS:
        Xt[c] = Xt[c].cat.codes
        Xv[c] = Xv[c].cat.codes
    dtr = xgb.DMatrix(Xt, label=yt, weight=wt)
    dva = xgb.DMatrix(Xv, label=yv, weight=wv)
    params = dict(objective="binary:logistic", seed=SEED,
                  learning_rate=trial.suggest_float("learning_rate",0.01,0.2,log=True),
                  max_depth=trial.suggest_int("max_depth",3,10),
                  min_child_weight=trial.suggest_int("min_child_weight",1,10),
                  subsample=trial.suggest_float("subsample",0.6,1.0),
                  colsample_bytree=trial.suggest_float("colsample_bytree",0.6,1.0),
                  reg_lambda=trial.suggest_float("reg_lambda",0.0,5.0),
                  alpha=trial.suggest_float("alpha",0.0,5.0),
                  nthread=N_CPU,
                  tree_method="hist", eval_metric="logloss")
    booster = xgb.train(params, dtr, 4000,
                        evals=[(dva,"val")], early_stopping_rounds=50,
                        verbose_eval=False)
    # best_iteration is a zero-based index, so the best model has index + 1
    # rounds. iteration_range is half-open, and (0, 0) means "use every
    # tree", so the end must be best_iteration + 1.
    n_best = booster.best_iteration + 1
    trial.set_user_attr("n_round", n_best)
    p = booster.predict(dva, iteration_range=(0, n_best))
    return 1 - acc_at1(yv, p, sidv, langv)


def obj_hgb(trial, X, y, sid, lang, w):
    """Optuna objective for scikit-learn's HistGradientBoostingClassifier.

    Uses the ordinal-encoded frame ``X_enc`` (rows selected by ``X.index``),
    fits the model with hyper-parameters suggested by ``trial`` and returns
    ``1 - accuracy@1`` on the inner validation part. No ``n_round`` attribute
    is recorded for this model.
    """
    # swap in the numeric version of the same rows
    X_num = X_enc.loc[X.index] 
    Xt, Xv, yt, yv, sidt, sidv, langt, langv, wt, wv = \
        _split_inner(X_num, y, sid, lang, w)

    hgb = HistGradientBoostingClassifier(
        learning_rate    = trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        max_depth        = trial.suggest_int("max_depth", 3, 10),
        max_leaf_nodes   = trial.suggest_int("max_leaf_nodes", 31, 255),
        l2_regularization= trial.suggest_float("l2_regularization", 0.0, 2.0),
        min_samples_leaf = 20,
        random_state     = SEED,
    )
    hgb.fit(Xt, yt, sample_weight=wt)
    p = hgb.predict_proba(Xv)[:, 1]
    return 1 - acc_at1(yv, p, sidv, langv)



def obj_rf(trial, X, y, sid, lang, w):
    """Optuna objective for the random forest.

    Uses the ordinal-encoded frame ``X_enc`` (rows selected by ``X.index``),
    fits a RandomForestClassifier with hyper-parameters suggested by
    ``trial`` and returns ``1 - accuracy@1`` on the inner validation part.
    """
    X_num = X_enc.loc[X.index]
    Xt, Xv, yt, yv, sidt, sidv, langt, langv, wt, wv = _split_inner(X_num, y, sid, lang, w)

    rf = RandomForestClassifier(
        # `step` is passed by keyword: positional use is deprecated in Optuna
        n_estimators       = trial.suggest_int("n_estimators", 200, 800, step=100),
        max_depth          = trial.suggest_int("max_depth", 5, 20),
        max_features       = trial.suggest_float("max_features", 0.3, 1.0),
        min_samples_leaf   = trial.suggest_int("min_samples_leaf", 1, 10),
        bootstrap          = True,
        n_jobs             = -1,
        random_state       = SEED,
    )
    rf.fit(Xt, yt, sample_weight=wt)
    p = rf.predict_proba(Xv)[:, 1]
    return 1 - acc_at1(yv, p, sidv, langv)


# Registry: model key -> Optuna objective with signature
# (trial, X, y, sid, lang, w) returning 1 - accuracy@1.
OBJECTIVES = {"lightgbm": obj_lgb,
              "lgbm_ranker" : obj_lgb_rank,
              "catboost": obj_cat,
              "xgboost":  obj_xgb,
              "hist_gbm": obj_hgb,
              "random_forest": obj_rf}

#optuna
class _TqdmOptuna:
    def __init__(self, n_trials):
        self.pb = tqdm(total=n_trials, desc="Optuna", leave=False, unit="trial")
    def __call__(self, study, trial):
        best = 1 - study.best_value if study.best_value is not None else None
        if best is not None:
            self.pb.set_postfix(best=f"{best:.3f}")
        self.pb.update(1)
    def close(self):
        self.pb.close()

def tune_model(key, X, y, sid, lang, w, n_trials=30, n_jobs=8):
    """Run an Optuna study for the objective registered under ``key``.

    Parameters
    ----------
    key : name in OBJECTIVES.
    X, y, sid, lang, w : node-level inputs forwarded to the objective.
    n_trials : number of trials.
    n_jobs : number of trials run in parallel (default 8, as before).
        NOTE(review): with n_jobs > 1 the trial order depends on thread
        scheduling, so a seeded sampler presumably no longer yields
        repeatable studies - use n_jobs=1 when that matters.

    Returns the finished ``optuna.Study`` (direction: minimize).
    """
    bar    = _TqdmOptuna(n_trials)
    logger = _ConsoleLogger()
    study  = optuna.create_study(direction="minimize",
                                 pruner=PRUNER,
                                 sampler=optuna.samplers.TPESampler(seed=SEED))
    try:
        study.optimize(lambda t: OBJECTIVES[key](t, X, y, sid, lang, w),
                       n_trials=n_trials,
                       n_jobs=n_jobs,
                       callbacks=[bar, logger],
                       show_progress_bar=False)
    finally:
        # always release the progress bar, even when a trial raises
        bar.close()
    return study

In [None]:
# Outer cross-validation setup: groups, stratification labels and the splitter.
# One row per distinct value of the "sentence" column; those values are the CV groups.
sent_df = train_nodes.drop_duplicates("sentence")
sent_ids      = sent_df["sentence"].to_numpy()
# Stratification label per sentence: the most frequent "bucket" value among its
# rows, cast to str and re-ordered so that it lines up with sent_ids.
bucket_labels = (train_nodes.groupby("sentence")["bucket"]
                 .agg(lambda s: s.value_counts().idxmax())
                 .astype(str)
                 .reindex(sent_ids).to_numpy())
# Five shuffled folds, seeded with SEED.
SGKF = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)

# Both start empty; nothing in this cell writes to them.
results, best_folds = {}, {}


def run_model(model_key, X_full=X_full, X_enc=X_enc, w_full=w_full):
    """Nested cross-validation of one model family.

    For every outer fold produced by ``SGKF`` (split over ``sent_ids``,
    stratified by ``bucket_labels``) the function

    1. tunes hyper-parameters with ``tune_model`` on the outer-train rows,
    2. refits the model with the study's best parameters,
    3. scores the outer-validation rows with ``acc_at1`` and
       ``lang_acc_series``,
    4. stores the fitted model (``fold<k>.pkl``) and its parameters
       (``fold<k>_params.json``) under ``models/<model_key>/``.

    A ``cv_summary.json`` with mean/std/per-fold accuracy and the per-fold
    parameters is written to the same directory, and a per-language summary
    is printed.

    Parameters
    ----------
    model_key : str
        One of the keys of ``OBJECTIVES``.
    X_full, X_enc, w_full
        Feature matrix, encoded feature matrix (used for ``hist_gbm`` and
        ``random_forest``) and sample weights. They default to the
        module-level objects that exist when this function is defined.
        ``y_full``, ``sid_full``, ``lang_full`` and ``train_nodes`` are read
        from module scope.

    Returns
    -------
    tuple[float, float]
        ``(mean_acc, std_acc)`` over the outer folds.

    Raises
    ------
    KeyError
        If ``model_key`` is not a key of ``OBJECTIVES``.
    """
    if model_key not in OBJECTIVES:
        # Fail before creating directories or starting a study; previously an
        # unknown key only surfaced as a KeyError inside the first trial.
        raise KeyError(f"unknown model_key {model_key!r}; "
                       f"expected one of {sorted(OBJECTIVES)}")

    fold_scores, per_fold_params = [], []
    lang_ok, lang_tot = defaultdict(int), defaultdict(int)
    best_so_far = 0.0
    n_folds = SGKF.get_n_splits()   # keeps the progress line in sync with the splitter

    out_dir = Path("models") / model_key
    out_dir.mkdir(parents=True, exist_ok=True)

    for fold, (tr_pos, va_pos) in enumerate(
            SGKF.split(np.zeros_like(sent_ids),
                       y=bucket_labels, groups=sent_ids), 1):

        tqdm.write(f"- {model_key}  fold {fold}/{n_folds}")

        # Row masks over train_nodes for the sentences of this fold.
        tr_mask = train_nodes["sentence"].isin(sent_ids[tr_pos])
        va_mask = train_nodes["sentence"].isin(sent_ids[va_pos])

        # tune on outer-train
        study = timeit_step("Optuna")(tune_model)(
                           model_key,
                           X_full.loc[tr_mask], y_full[tr_mask],
                           sid_full[tr_mask],   lang_full[tr_mask],
                           w_full[tr_mask])

        bp = study.best_params
        # Number of boosting rounds, read from the study's user attributes
        # when present; 3000 otherwise.
        br = study.user_attrs.get("n_round", 3000)
        per_fold_params.append((bp, br))

        # refit and predict
        @timeit_step("refit+predict")
        def _fit_predict():
            """Refit on outer-train; return (validation scores, fitted model)."""
            if model_key == "lightgbm":
                mdl = lgb.LGBMClassifier(**bp, n_estimators=br, random_state=SEED)
                mdl.fit(X_full.loc[tr_mask], y_full[tr_mask],
                        sample_weight=w_full[tr_mask],
                        categorical_feature=CAT_IDX)
                return mdl.predict_proba(X_full.loc[va_mask])[:,1], mdl

            elif model_key == "catboost":
                train_pool = Pool(
                    X_full.loc[tr_mask], y_full[tr_mask],
                    cat_features = CAT_IDX,
                    weight       = w_full[tr_mask]
                )
                valid_pool = Pool(
                    X_full.loc[va_mask], label=None,
                    cat_features = CAT_IDX
                )
                mdl = CatBoostClassifier(**bp, iterations=br, random_seed=SEED, verbose=False)
                mdl.fit(train_pool)
                return mdl.predict_proba(valid_pool)[:,1], mdl

            elif model_key == "xgboost":
                X_tr = X_full.loc[tr_mask].copy()
                X_va = X_full.loc[va_mask].copy()
                # The booster is fed integer category codes instead of categoricals.
                for c in CAT_COLS:
                    X_tr[c] = X_tr[c].cat.codes
                    X_va[c] = X_va[c].cat.codes
                booster = xgb.train(bp,
                                    xgb.DMatrix(X_tr, label=y_full[tr_mask], weight=w_full[tr_mask]),
                                    num_boost_round=br, verbose_eval=False)
                return booster.predict(xgb.DMatrix(X_va)), booster

            elif model_key == "hist_gbm":
                mdl = HistGradientBoostingClassifier(**bp, random_state=SEED)
                mdl.fit(X_enc.loc[tr_mask], y_full[tr_mask],
                        sample_weight=w_full[tr_mask])
                return mdl.predict_proba(X_enc.loc[va_mask])[:,1], mdl

            elif model_key == "lgbm_ranker":
                g_tr = _group_lengths(sid_full[tr_mask])
                rk = lgb.LGBMRanker(**bp, n_estimators=br, random_state=SEED)
                rk.fit(X_full.loc[tr_mask], y_full[tr_mask],
                       group=g_tr,
                       sample_weight=w_full[tr_mask],
                       categorical_feature=CAT_IDX)
                return rk.predict(X_full.loc[va_mask], num_iteration=br), rk

            else:  # random_forest
                mdl = RandomForestClassifier(**bp, n_jobs=-1, random_state=SEED)
                mdl.fit(X_enc.loc[tr_mask], y_full[tr_mask],
                        sample_weight=w_full[tr_mask])
                return mdl.predict_proba(X_enc.loc[va_mask])[:,1], mdl

        p_val, mdl = _fit_predict()

        # save model + params
        joblib.dump(mdl, out_dir / f"fold{fold}.pkl")
        with open(out_dir / f"fold{fold}_params.json","w") as f:
            json.dump({"params":bp,"best_round":br}, f)

        # fold metrics
        acc = acc_at1(y_full[va_mask], p_val, sid_full[va_mask], lang_full[va_mask])
        fold_scores.append(acc)
        best_so_far = max(best_so_far, acc)

        lg = lang_acc_series(y_full[va_mask], p_val, sid_full[va_mask], lang_full[va_mask])
        # NOTE(review): lg.count() is the number of non-null entries of `lg`
        # (it is used like a pandas Series keyed by language), so every
        # language gets the same weight within a fold. Confirm whether
        # per-language sentence counts were intended here.
        n_entries = lg.count()
        for g,a in lg.items():
            lang_ok[g]  += a * n_entries
            lang_tot[g] += n_entries

        tqdm.write(f"    acc@1={acc:.3f}  best_so_far={best_so_far:.3f} | "
                   f"{', '.join(f'{g}:{lg[g]:.3f}' for g in lg.head(21).index)}")

    # summary
    mean_acc, std_acc = float(np.mean(fold_scores)), float(np.std(fold_scores))
    summary = {"mean_acc": mean_acc,
               "std_acc": std_acc,
               "fold_acc": fold_scores,
               "fold_params": [dict(p) for p,_ in per_fold_params]}
    with open(out_dir / "cv_summary.json","w") as f:
        json.dump(summary,f,indent=2)

    print(f"\n{model_key}  acc@1 = {mean_acc:.3f} ± {std_acc:.3f}")
    # Per-language figures, best first.
    for g,a in sorted({g: lang_ok[g]/lang_tot[g] for g in lang_tot}.items(),
                      key=lambda x:-x[1]):
        print(f"  {g:>10s}: {a:.3f}")
    print("-"*46)
    return mean_acc,std_acc

In [None]:
# Rank models by the first element of each `results` value, largest first, and
# keep the top K_TOP. `results` is filled outside this cell; presumably each value
# is the (mean_acc, std_acc) tuple returned by run_model, i.e. an accuracy
# (higher is better), not a loss -- TODO confirm.

K_TOP = 6
ranked = sorted(results.items(), key=lambda kv: -kv[1][0])[:K_TOP]
print("\n⸻ Model leader-board (outer-CV) ⸻")
for i,(k,(m,_)) in enumerate(ranked,1):
    print(f"{i:>2}. {k:15s}  {m:.3f}")

# Train each top model on ALL data and save predictions.
# prob_cols collects the names of the per-model score columns added to test_nodes;
# weights_w collects the matching blend weights (results[key][0]).
prob_cols   = []
weights_w   = []
# The directory is created here; nothing in this cell writes into it.
TEST_PROB_DIR = Path("test_probs"); TEST_PROB_DIR.mkdir(exist_ok=True)

def _fold_param_aggregate(key):
    """Return the hyper-parameters of the best outer fold of model ``key``.

    Reads every ``models/<key>/fold*_params.json`` (in fold-number order) and
    the ``fold_acc`` list of ``models/<key>/cv_summary.json``. ``fold_acc``
    holds accuracies, so the fold with the *highest* value is selected. When
    the summary is missing or its length does not match the number of
    parameter files, the first fold is used.

    Parameters
    ----------
    key : str
        Model key, i.e. the sub-directory name under ``models/``.

    Returns
    -------
    tuple[dict, int]
        ``(params, best_round)``; ``best_round`` falls back to 3000 when the
        stored value is missing or falsy.

    Raises
    ------
    FileNotFoundError
        If no ``fold*_params.json`` file exists for ``key``.
    """
    fold_dir = Path("models") / key

    def _fold_number(path):
        # "fold12_params.json" -> 12, so that fold10 sorts after fold9.
        digits = "".join(ch for ch in path.stem.split("_")[0] if ch.isdigit())
        return (int(digits) if digits else 0, path.name)

    param_files = sorted(fold_dir.glob("fold*_params.json"), key=_fold_number)
    if not param_files:
        raise FileNotFoundError(f"no fold*_params.json found in {fold_dir}")

    spec = []
    for path in param_files:
        with path.open() as fh:
            spec.append(json.load(fh))

    # cv_summary.json is a JSON document, so it must be parsed as JSON
    # (np.loadtxt cannot read it).
    fold_scores = []
    summary_path = fold_dir / "cv_summary.json"
    if summary_path.exists():
        with summary_path.open() as fh:
            fold_scores = json.load(fh).get("fold_acc", [])

    if len(fold_scores) == len(spec):
        best_idx = int(np.argmax(fold_scores))   # accuracy: higher is better
    else:
        best_idx = 0

    p_best = spec[best_idx]["params"]
    n_best = spec[best_idx].get("best_round")
    return p_best, n_best or 3000

def _proba(m, key):
    """Score the rows labelled by ``test_nodes.index`` with fitted model ``m``.

    ``hist_gbm`` / ``random_forest`` read ``X_enc``; every other family reads
    ``X_full``. ``xgboost`` boosters and rankers are scored with ``predict``,
    the remaining classifiers with ``predict_proba`` (positive-class column).

    NOTE(review): ``X_full`` / ``X_enc`` are the same matrices the models are
    fitted on below; ``reindex`` selects rows from them by label and yields
    all-NaN rows for labels they do not contain. Confirm that they really
    hold the test rows, otherwise these are not test-set predictions.
    """
    if key in ["hist_gbm","random_forest"]:
        return m.predict_proba(X_enc.reindex(test_nodes.index))[:,1]
    if key=="xgboost":
        X_tmp = X_full.reindex(test_nodes.index).copy()
        for c in CAT_COLS:
            X_tmp[c] = X_tmp[c].cat.codes
        return m.predict(xgb.DMatrix(X_tmp))
    if "ranker" in key:
        return m.predict(X_full.reindex(test_nodes.index))
    return m.predict_proba(X_full.reindex(test_nodes.index))[:,1]


# Refit every ranked model on the full training data, score the test rows and
# write one submission per model.
for key,_ in ranked:
    print(f"\n- training full-corpus {key}")
    best_params, n_round = _fold_param_aggregate(key)

    if key=="lightgbm":
        mdl = lgb.LGBMClassifier(**best_params, n_estimators=n_round,
                                 random_state=SEED)
        mdl.fit(X_full, y_full, sample_weight=w_full,
                categorical_feature=CAT_IDX)

    elif key=="lgbm_ranker":
        # Group sizes in order of first appearance of each sentence id
        # (assumes rows of one sentence are contiguous -- TODO confirm).
        g_all = (pd.Series(sid_full)
                   .value_counts(sort=False)
                   .loc[pd.Series(sid_full).unique()]
                   .to_numpy())
        mdl = lgb.LGBMRanker(**best_params, n_estimators=n_round,
                             random_state=SEED)
        mdl.fit(X_full, y_full, group=g_all,
                sample_weight=w_full, categorical_feature=CAT_IDX)

    elif key=="catboost":
        pool = Pool(X_full, y_full, cat_features=CAT_IDX, weight=w_full)
        mdl  = CatBoostClassifier(**best_params, iterations=n_round,
                                  random_seed=SEED, verbose=False)
        mdl.fit(pool)

    elif key=="xgboost":
        X_num = X_full.copy()
        for c in CAT_COLS:
            X_num[c] = X_num[c].cat.codes
        dtrain = xgb.DMatrix(X_num, label=y_full, weight=w_full)
        mdl    = xgb.train(best_params, dtrain, num_boost_round=n_round)

    elif key=="hist_gbm":
        mdl = HistGradientBoostingClassifier(**best_params,
                                             random_state=SEED)
        mdl.fit(X_enc, y_full, sample_weight=w_full)

    else:                       # random_forest
        mdl = RandomForestClassifier(**best_params, n_jobs=-1,
                                     random_state=SEED)
        mdl.fit(X_enc, y_full, sample_weight=w_full)

    joblib.dump(mdl, f"{key}_FULL.pkl")

    # predict on test
    p_vec = _proba(mdl, key)
    col   = f"p_{key}"
    test_nodes[col] = p_vec
    prob_cols.append(col)
    weights_w.append(results[key][0])

    # Per (language, sentence): keep the node with the highest score.
    sub_i = (test_nodes
             .loc[test_nodes.groupby(["language","sentence"])[col].idxmax()]
             .loc[:,["node"]].rename(columns={"node":"root"}))
    sub_i.to_csv(f"submission_{key}.csv", index=False)
    print(f"  saved  {key}_FULL.pkl  &  submission_{key}.csv")

# Weighted blend of the per-model score columns; the weights are `weights_w`
# rescaled to sum to one.
# NOTE(review): columns produced with `predict` rather than `predict_proba`
# (ranker, xgboost booster) may not be on a probability scale -- confirm that a
# plain weighted sum across them is intended.
w_norm = np.array(weights_w)
w_norm = w_norm / w_norm.sum()
test_nodes["p_ensemble"] = test_nodes[prob_cols].mul(w_norm, axis=1).sum(axis=1)

# One row per (language, sentence): the node with the highest blended score,
# exported under the column name "root".
sub_blend = test_nodes.loc[
    test_nodes.groupby(["language","sentence"])["p_ensemble"].idxmax(),
    ["node"],
].rename(columns={"node":"root"})

sub_blend.to_csv("submission_ensemble.csv", index=False)
print("\n✔ blended submission saved → submission_ensemble.csv")
