In [29]:
# ranker_pipeline.py
# ---------------------------------------------------------------
import ast, json, joblib, warnings, networkx as nx
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold, GroupShuffleSplit
from sklearn.preprocessing import MinMaxScaler

import lightgbm as lgb
from lightgbm.callback import early_stopping, log_evaluation
from catboost import CatBoostRanker,Pool


# ------------------------------ feature engineering -------------
CENT_FUNCS = {
    "pagerank"   : nx.pagerank,
    "betweenness": nx.betweenness_centrality,
    "katz"       : lambda G: nx.katz_centrality_numpy(G, alpha=0.01),
    "closeness"  : nx.closeness_centrality,
}
KEEP_CENTS = list(CENT_FUNCS) + ["voterank"]      # five strongest

def voterank_scores(G):
    seeds = nx.voterank(G)
    score = {n: 0. for n in G}
    for r, n in enumerate(seeds[::-1], 1):
        score[n] = r / len(seeds)
    return score

def build_node_df(df_in: pd.DataFrame) -> pd.DataFrame:
    rows = []
    for _, row in df_in.iterrows():
        G = nx.from_edgelist(ast.literal_eval(row.edgelist))
        cents = {k: f(G) for k, f in CENT_FUNCS.items()}
        cents["voterank"] = voterank_scores(G)
        n_tok = len(G)

        for node in G:
            rec = {
                "language": row.language,
                "sentence": row.sentence,
                "node"    : node,
                "n_tokens": n_tok,
                **{k: cents[k][node] for k in cents},
            }
            if "root" in row:              # train set
                rec["target"] = int(node == row.root)
            rows.append(rec)

    df = pd.DataFrame(rows)

    # min-max within each sentence
    scaler = MinMaxScaler()
    df[KEEP_CENTS] = (
        df.groupby("sentence")[KEEP_CENTS]
          .transform(lambda x: scaler.fit_transform(x.values.reshape(-1, 1)).ravel())
    )
    # categorical dtype for language
    df["language"] = df["language"].astype("category")
    return df

# ------------------------------ utility -------------------------
def make_groups(sorted_sentence_ids: np.ndarray) -> np.ndarray:
    """Return an array of group sizes (one per sentence)."""
    _, counts = np.unique(sorted_sentence_ids, return_counts=True)
    return counts

def sort_by_sentence(idx: np.ndarray, sentence_ids: np.ndarray):
    order = np.argsort(sentence_ids[idx], kind="mergesort")
    return idx[order]

# ------------------------------ CV & training -------------------
def cross_validate_ranker(X: pd.DataFrame,
                          y: np.ndarray,
                          sid: np.ndarray,
                          model_type: str = "lgb",
                          n_splits: int = 5,
                          seed: int = 42):
    """
    Returns
    -------
    cv_acc   : list[float]   accuracy@1 for each fold
    best_it  : list[int]     best trees / iterations per fold
    models   : list          fitted fold models (optional use)
    """
    gkf = GroupKFold(n_splits)
    cv_acc, best_it, fold_models = [], [], []

    for fold, (tr_idx, va_idx) in enumerate(gkf.split(X, y, sid), 1):
        tr_idx = sort_by_sentence(tr_idx, sid)
        va_idx = sort_by_sentence(va_idx, sid)

        X_tr, y_tr, sid_tr = X.iloc[tr_idx], y[tr_idx], sid[tr_idx]
        X_va, y_va, sid_va = X.iloc[va_idx], y[va_idx], sid[va_idx]

        grp_tr = make_groups(sid_tr)
        grp_va = make_groups(sid_va)

        if model_type == "lgb":
            model = lgb.LGBMRanker(
                objective="lambdarank",
                metric="map",
                label_gain=[0, 1],
                n_estimators=1500,
                learning_rate=0.03,
                num_leaves=127,
                min_data_in_leaf=20,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=seed,
            )
            model.fit(
                X_tr, y_tr,
                group=grp_tr,
                eval_set=[(X_va, y_va)],
                eval_group=[grp_va],
                eval_at=[1],
                categorical_feature=["language"],
                callbacks=[early_stopping(50), log_evaluation(100)],
            )

        elif model_type == "cat":
            if not CATBOOST_OK:
                raise ValueError("CatBoost is not installed.")
            pool_tr = Pool(
                X_tr,
                y_tr,
                group_id=pd.Series(sid_tr).astype("category").cat.codes,
                cat_features=[X_tr.columns.get_loc("language")],
            )
            pool_va = Pool(
                X_va,
                y_va,
                group_id=pd.Series(sid_va).astype("category").cat.codes,
                cat_features=[X_tr.columns.get_loc("language")],
            )
            model = CatBoostRanker(
                iterations=1500,
                learning_rate=0.07,
                depth=6,
                loss_function="YetiRankPairwise",
                random_seed=seed,
                early_stopping_rounds=50,
                verbose=False,
            )
            model.fit(pool_tr, eval_set=pool_va, verbose=False)

        else:
            raise ValueError("model_type must be 'lgb' or 'cat'")

        # accuracy@1 on this fold
        prob = model.predict(X_va)
        acc1 = (
            pd.DataFrame({"sid": sid_va, "target": y_va, "prob": prob})
              .loc[lambda d: d.groupby("sid")["prob"].idxmax()]
              ["target"].mean()
        )

        cv_acc.append(acc1)
        fold_models.append(model)
        best_it.append(
            model.best_iteration_ if model_type == "lgb"
            else model.get_best_iteration()
        )
        print(f"fold {fold}: acc@1 = {acc1:.3f}  |  best_iter = {best_it[-1]}")

    return cv_acc, best_it, fold_models

def train_full_ranker(X: pd.DataFrame,
                      y: np.ndarray,
                      sid: np.ndarray,
                      best_iter: int,
                      model_type: str = "lgb",
                      seed: int = 42):
    order = np.argsort(sid, kind="mergesort")
    X_all, y_all, sid_all = X.iloc[order], y[order], sid[order]
    grp_all = make_groups(sid_all)

    if model_type == "lgb":
        model = lgb.LGBMRanker(
            objective="lambdarank",
            metric="map",
            label_gain=[0, 1],
            n_estimators=best_iter,
            learning_rate=0.03,
            num_leaves=127,
            min_data_in_leaf=20,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=seed,
        )
        model.fit(
            X_all, y_all,
            group=grp_all,
            categorical_feature=["language"],
        )
    else:   # cat
        pool_all = Pool(
            X_all,
            y_all,
            group_id=pd.Series(sid_all).astype("category").cat.codes,
            cat_features=[X_all.columns.get_loc("language")],
        )
        model = CatBoostRanker(
            iterations=best_iter,
            learning_rate=0.07,
            depth=6,
            loss_function="YetiRankPairwise",
            random_seed=seed,
            verbose=False,
        )
        model.fit(pool_all, verbose=False)

    return model

# ------------------------------ prediction / submission ----------

def predict_root(model, node_df: pd.DataFrame, model_type: str):
    FEATURES = KEEP_CENTS + ["n_tokens", "language"]

    if model_type == "cat":
        # 1️⃣ sort so rows for the SAME (language, sentence) stay contiguous
        node_df = (
            node_df.sort_values(["language", "sentence"], kind="mergesort")
                   .reset_index(drop=True)
        )

        # 2️⃣ build a UNIQUE query id = (language, sentence)
        lang_sent = list(zip(node_df["language"], node_df["sentence"]))
        group_codes = pd.Series(lang_sent).astype("category").cat.codes

        pool = Pool(
            node_df[FEATURES],
            group_id=group_codes,
            cat_features=[node_df[FEATURES].columns.get_loc("language")]
        )
        prob = model.predict(pool)

    else:                         # LightGBM, XGBoost …
        prob = model.predict(node_df[FEATURES])

    node_df["prob"] = prob

    # pick top node per (language, sentence)
    root_pred = (
        node_df.loc[node_df.groupby(["language", "sentence"])["prob"].idxmax()]
               [["language", "sentence", "node"]]
               .rename(columns={"node": "root"})
    )
    return root_pred


def build_submission(test_raw: pd.DataFrame,
                     root_pred: pd.DataFrame,
                     path: str = "submission.csv"):
    submission = (
        test_raw[["id", "language", "sentence"]]
          .merge(root_pred, on=["language", "sentence"], how="left")
          [["id", "root"]]
    )
    submission.to_csv(path, index=False)
    print(f"✓ {path} written: {submission.shape[0]} rows")
    return submission

# ------------------------------ offline eval ---------------------
def offline_eval(submission: pd.DataFrame,
                 test_raw: pd.DataFrame,
                 labeled_path: str = "datasets/labeled_test.csv"):
    if not Path(labeled_path).exists():
        print("labeled_test.csv not found – skipped offline scoring.")
        return
    labeled = pd.read_csv(labeled_path)
    merged  = labeled.merge(submission, on="id", suffixes=("_true", "_pred"))
    acc     = (merged.root_true == merged.root_pred).mean()
    print(f"Offline sentence accuracy = {acc:0.3f}")
    # per-language table
    acc_by_lang = (
        merged.merge(test_raw[["id", "language"]], on="id")
              .assign(hit=lambda d: d.root_true == d.root_pred)
              .groupby("language")["hit"].mean()
              .sort_values(ascending=False)
    )
    display(acc_by_lang)


In [31]:
TRAIN = pd.read_csv("datasets/train.csv")
TEST  = pd.read_csv("datasets/test.csv")

train_nodes = build_node_df(TRAIN)
test_nodes  = build_node_df(TEST)

FEATURES = KEEP_CENTS + ["n_tokens", "language"]
X_full   = train_nodes[FEATURES]
y_full   = train_nodes["target"].values
sid      = train_nodes["sentence"].values




In [33]:
# ------------------------------------------------------------- choose model
MODEL_TYPE = "cat"      # "lgb" or "cat"
# ------------------------------------------------------------- CV
cv_acc, best_it, _ = cross_validate_ranker(
    X_full, y_full, sid, model_type=MODEL_TYPE, n_splits=5
)
print("CV acc@1  mean =", np.mean(cv_acc).round(3))



fold 1: acc@1 = 0.470  |  best_iter = 178
fold 2: acc@1 = 0.470  |  best_iter = 145
fold 3: acc@1 = 0.590  |  best_iter = 158
fold 4: acc@1 = 0.530  |  best_iter = 114
fold 5: acc@1 = 0.460  |  best_iter = 89
CV acc@1  mean = 0.504


In [34]:
final_iter = int(np.round(np.mean(best_it)))
print("Refitting", MODEL_TYPE, "with", final_iter, "iterations")

# ------------------------------------------------------------- full train
final_model = train_full_ranker(
    X_full, y_full, sid, best_iter=final_iter, model_type=MODEL_TYPE
)

# save artefacts
joblib.dump(final_model, f"{MODEL_TYPE}_ranker_final.pkl")
meta = {"iterations": final_iter,
        "language_categories": train_nodes["language"].cat.categories.tolist()}
json.dump(meta, open(f"{MODEL_TYPE}_meta.json", "w"))

# ------------------------------------------------------------- predict & submit
root_pred_df = predict_root(final_model, test_nodes, model_type=MODEL_TYPE)
submission   = build_submission(TEST, root_pred_df, path=f"{MODEL_TYPE}_submission.csv")

# ------------------------------------------------------------- offline eval
offline_eval(submission, TEST)

Refitting cat with 137 iterations
✓ submission.csv written: 10395 rows
Offline sentence accuracy = 0.345


  node_df.loc[node_df.groupby(["language", "sentence"])["prob"].idxmax()]


language
Icelandic     0.452525
Russian       0.438384
Arabic        0.434343
Indonesian    0.430303
Swedish       0.420202
Finnish       0.404040
Polish        0.400000
Czech         0.395960
Turkish       0.369697
German        0.369697
Korean        0.359596
Chinese       0.331313
English       0.327273
Thai          0.327273
French        0.311111
Spanish       0.311111
Galician      0.292929
Portuguese    0.288889
Italian       0.264646
Hindi         0.234343
Japanese      0.088889
Name: hit, dtype: float64