In [6]:
import pandas as pd, numpy as np, networkx as nx, ast, lightgbm as lgb
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import MinMaxScaler
from lightgbm import LGBMRanker, early_stopping, log_evaluation



CENT_FUNCS = {
    "degree"       : nx.degree_centrality,
    "harmonic"     : nx.harmonic_centrality,
    "pagerank": nx.pagerank,
    "betweenness": nx.betweenness_centrality,
    "katz": lambda G: nx.katz_centrality_numpy(G, alpha=0.01),
    "closeness": nx.closeness_centrality,
    "subgraph"     : nx.subgraph_centrality,

}

def voterank_scores(G):
    seeds = nx.voterank(G)
    s = {n:0. for n in G}
    for r,n in enumerate(seeds[::-1],1): s[n]=r/len(seeds)
    return s

def build(df):
    rows=[]
    for _,row in df.iterrows():
        G = nx.from_edgelist(ast.literal_eval(row.edgelist))
        cents={k:f(G) for k,f in CENT_FUNCS.items()}
        cents['voterank']=voterank_scores(G)
        n_tok=len(G)
        for n in G:
            rec={'language':row.language,
                 'sentence':row.sentence,
                 'node':n,
                 'n_tokens':n_tok,
                 **{k:cents[k][n] for k in cents}}
            if 'root' in row: rec['target']=int(n==row.root)
            rows.append(rec)
    df_out=pd.DataFrame(rows)
    cent_cols=list(CENT_FUNCS)+['voterank']
    scaler=MinMaxScaler()
    df_out[cent_cols]=(
        df_out.groupby('sentence')[cent_cols]
              .transform(lambda x: scaler.fit_transform(x.values.reshape(-1,1)).ravel())
    )
    return df_out

In [8]:
"""
Optuna hyper-parameter search for the LGBMRanker used in the UPC-ML
root–prediction competition.

* Expects the data-preparation helpers `build`, `make_groups`, … to be
  already defined (same session / same notebook cell as your current code).
* Produces:
    - study object  ->  study_root.pkl
    - best params   ->  best_params.json
    - tuned model   ->  lgbm_ranker_optuna.pkl
"""

import json, joblib, optuna, numpy as np, pandas as pd, lightgbm as lgb
from sklearn.model_selection import GroupKFold
from lightgbm.callback import early_stopping, log_evaluation

# ------------------------------------------------------------------------------
# 0. ----------  DATA -----------------------------------------------------------
# ------------------------------------------------------------------------------

train_raw = pd.read_csv("datasets/train.csv")
test_raw  = pd.read_csv("datasets/test.csv")

train_nodes = build(train_raw)     # <-- your existing helper
test_nodes  = build(test_raw)

for df in (train_nodes, test_nodes):
    df["language"] = df["language"].astype("category")

KEEP_CENTS = ['pagerank','betweenness','katz','voterank',
              'closeness','degree','harmonic']
FEATURES   = KEEP_CENTS + ['n_tokens','language']
cat_feats  = ['language']

X_full = train_nodes[FEATURES]
y_full = train_nodes["target"].values
sid    = train_nodes["sentence"].values          # grouping key for CV

# ------------------------------------------------------------------------------
# 1. ----------  OPTUNA OBJECTIVE  ---------------------------------------------
# ------------------------------------------------------------------------------

def make_groups(sent_ids: np.ndarray):
    """Convert sorted sentence ids into LightGBM group array."""
    _, counts = np.unique(sent_ids, return_counts=True)
    return counts


def objective(trial: optuna.Trial) -> float:
    """Return 1 – accuracy@1  (because Optuna *minimises* the objective)."""
    
    # ---- sample a parameter set ------------------------------------------------
    params = {
        # core
        "objective": "lambdarank",
        "metric": "map",
        "label_gain": [0, 1],
        "boosting_type": "gbdt",
        "random_state": 42,
        # search space ----------------------------------------------------------
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "num_leaves":    trial.suggest_int("num_leaves",    31,  511),
        "max_depth":     trial.suggest_int("max_depth",    -1,   12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 5, 100),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0.0, 1.0),
        "feature_fraction":  trial.suggest_float("feature_fraction", 0.5, 1.0),
        "bagging_fraction":  trial.suggest_float("bagging_fraction", 0.5, 1.0),
        "bagging_freq":      trial.suggest_int("bagging_freq", 1, 10),
        "lambda_l1":         trial.suggest_float("lambda_l1", 0.0, 5.0),
        "lambda_l2":         trial.suggest_float("lambda_l2", 0.0, 5.0),
        # n_estimators will be determined by early-stopping
        "n_estimators": 3000,
    }
    
    # ---- 5-fold GroupKFold (same protocol as before) --------------------------
    gkf = GroupKFold(n_splits=5)
    acc_per_fold = []
    
    for tr_idx, va_idx in gkf.split(X_full, y_full, sid):
        
        # keep nodes from the same sentence contiguous
        order_tr = np.argsort(sid[tr_idx], kind="mergesort")
        order_va = np.argsort(sid[va_idx], kind="mergesort")
        
        X_tr, y_tr, sid_tr = X_full.iloc[tr_idx].iloc[order_tr], y_full[tr_idx][order_tr], sid[tr_idx][order_tr]
        X_va, y_va, sid_va = X_full.iloc[va_idx].iloc[order_va], y_full[va_idx][order_va], sid[va_idx][order_va]
        
        grp_tr = make_groups(sid_tr)
        grp_va = make_groups(sid_va)
        
        ranker = lgb.LGBMRanker(**params)
        ranker.fit(
            X_tr, y_tr,
            group=grp_tr,
            eval_set=[(X_va, y_va)],
            eval_group=[grp_va],
            eval_at=[1],
            categorical_feature=cat_feats,
            callbacks=[early_stopping(75, verbose=False)],
        )
        
        # MAP@1 == accuracy@1
        prob = ranker.predict(X_va, num_iteration=ranker.best_iteration_)
        fold_acc = (
            pd.DataFrame({"sid": sid_va, "target": y_va, "prob": prob})
              .loc[lambda d: d.groupby("sid")["prob"].idxmax()]
              ["target"]
              .mean()
        )
        acc_per_fold.append(fold_acc)
    
    # we *minimise* => return 1 – mean accuracy
    return 1.0 - float(np.mean(acc_per_fold))




In [16]:
lang_cats = train_nodes["language"].cat.categories.tolist()

In [9]:
# ------------------------------------------------------------------------------
# 2. ----------  RUN SEARCH  ----------------------------------------------------
# ------------------------------------------------------------------------------

study = optuna.create_study(
    direction="minimize",
    study_name="lgbm_ranker_root",
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=15),
)

study.optimize(objective, n_trials=120, show_progress_bar=True)

# ----- save stuff ----------------------------------------------------------
joblib.dump(study, "study_root.pkl")
best_params = study.best_trial.params
json.dump(best_params, open("best_params.json", "w"))

print("Best score (CV accuracy@1)  :", 1 - study.best_value)
print("Best param set saved to best_params.json")

# ------------------------------------------------------------------------------
# 3. ----------  TRAIN FINAL MODEL ON *ALL* DATA -------------------------------
# ------------------------------------------------------------------------------
# rebuild groups with sentences contiguous
order = np.argsort(sid, kind="mergesort")
X_all, y_all, sid_all = X_full.iloc[order], y_full[order], sid[order]
grp_all = make_groups(sid_all)

# final_params = {
#     "objective": "lambdarank",
#     "metric": "map",
#     "label_gain": [0, 1],
#     "random_state": 42,
#     # Optuna search winners ------------
#     **best_params,
#     # sanity: let trees run to completion with early stopping inside .fit
#     "n_estimators": 4000,
# }

# ranker_final = lgb.LGBMRanker(**final_params)
# ranker_final.fit(
#     X_all, y_all,
#     group=grp_all,
#     categorical_feature=cat_feats,
#     callbacks=[early_stopping(100), log_evaluation(100)],
# )

# joblib.dump(ranker_final, "lgbm_ranker_optuna.pkl")
# print("◼︎  Tuned model saved as lgbm_ranker_optuna.pkl")


[I 2025-05-26 14:18:52,616] A new study created in memory with name: lgbm_ranker_root


  0%|          | 0/120 [00:00<?, ?it/s]

[LightGBM] [Info] Total groups: 400, total data: 157986
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001109 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1778
[LightGBM] [Info] Number of data points in the train set: 157986, number of used features: 9
[LightGBM] [Info] Total groups: 100, total data: 39493
[LightGBM] [Info] Total groups: 400, total data: 157999
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000991 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1774
[LightGBM] [Info] Number of data points in the train set: 157999, number of used features: 9
[LightGBM] [Info] Total groups: 100, total data: 39480
[LightGBM] [Info] Total groups: 400, total data: 157964
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001052 seconds.
You can set `force_col_wise=true` to rem

ValueError: For early stopping, at least one dataset and eval metric is required for evaluation

In [14]:
final_params = {
    "objective": "lambdarank",
    "metric": "map",
    "label_gain": [0, 1],
    "random_state": 42,
    **best_params,          # ← from Optuna
    "n_estimators": 4000,   # generous cap
}

ranker_final = lgb.LGBMRanker(**final_params)

ranker_final.fit(
    X_all, y_all,
    group=grp_all,
    eval_set   = [(X_all, y_all)],
    eval_group = [grp_all],
    eval_at    = [1],
    categorical_feature = cat_feats,
    callbacks  = [early_stopping(100), log_evaluation(100)],
)

joblib.dump(ranker_final, "lgbm_ranker_optuna.pkl")
print("✓ Tuned model saved:", ranker_final.booster_.num_trees(), "trees")


[LightGBM] [Info] Total groups: 500, total data: 197479
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001353 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1777
[LightGBM] [Info] Number of data points in the train set: 197479, number of used features: 9




[100]	training's map@1: 0.518
[200]	training's map@1: 0.536
[300]	training's map@1: 0.542
[400]	training's map@1: 0.542
[500]	training's map@1: 0.542
[600]	training's map@1: 0.542
[700]	training's map@1: 0.542
[800]	training's map@1: 0.542
[900]	training's map@1: 0.542
[1000]	training's map@1: 0.542
[1100]	training's map@1: 0.542
[1200]	training's map@1: 0.542
[1300]	training's map@1: 0.542
[1400]	training's map@1: 0.542
[1500]	training's map@1: 0.542
[1600]	training's map@1: 0.542
[1700]	training's map@1: 0.542
[1800]	training's map@1: 0.542
[1900]	training's map@1: 0.542
[2000]	training's map@1: 0.542
[2100]	training's map@1: 0.542
[2200]	training's map@1: 0.542
[2300]	training's map@1: 0.542
[2400]	training's map@1: 0.542
[2500]	training's map@1: 0.542
[2600]	training's map@1: 0.542
[2700]	training's map@1: 0.542
[2800]	training's map@1: 0.542
[2900]	training's map@1: 0.542
[3000]	training's map@1: 0.542
[3100]	training's map@1: 0.542
[3200]	training's map@1: 0.542
[3300]	training's

In [18]:
with open("final_optuna_meta.json", "w") as fp:
    json.dump({
        "num_boost_round": ranker_final.booster_.num_trees(),
        "language_categories": lang_cats          # <-- save them
    }, fp)

In [22]:
ranker_final = joblib.load("lgbm_ranker_optuna.pkl")
print("model has", ranker_final.booster_.num_trees(), "trees")

model has 225 trees


In [24]:
with open("final_meta.json") as fp:
    meta = json.load(fp)
lang_cats = meta["language_categories"]

# 1. build node-level frame for competition test set --------------------
#test_raw   = pd.read_csv("datasets/test.csv")
#test_nodes = build(test_raw)

# 2. align language codes ----------------------------------------------
test_nodes["language"] = pd.Categorical(test_nodes["language"], categories=lang_cats)

# 3. predict and create submission -------------------------------------
#FEATURES = ['pagerank','betweenness','katz','voterank','closeness','n_tokens','language']
probs = ranker_final.predict(test_nodes[FEATURES])
test_nodes["prob"] = probs

root_pred = (
    test_nodes
      .loc[test_nodes.groupby(['language', 'sentence'])['prob'].idxmax()]
      .rename(columns={'node': 'root'})
      [['language', 'sentence', 'root']]
)


submission = (
    test_raw[['id', 'language', 'sentence']]
      .merge(root_pred, on=['language', 'sentence'], how='left')
      [['id', 'root']]
)
submission.to_csv("submission_optuna.csv", index=False)
print("✓ submission_optuna.csv written:", submission.shape)



✓ submission_optuna.csv written: (10395, 2)


  .loc[test_nodes.groupby(['language', 'sentence'])['prob'].idxmax()]


In [28]:
from pathlib import Path


In [None]:
# 4. optional offline check against labeled_test.csv -------------------
if Path("datasets/labeled_test.csv").exists():
    labeled  = pd.read_csv("datasets/labeled_test.csv")   # has id,root
    merged   = labeled.merge(submission, on="id", suffixes=("_true","_pred"))
    acc      = (merged.root_true == merged.root_pred).mean()
    print(f"Offline sentence accuracy = {acc:0.3f}")
    # (extra) Confusion table of languages
    acc_by_lang = (
        merged
          .merge(test_raw[["id","language"]], on="id")
          .assign(hit = lambda d: d.root_true == d.root_pred)
          .groupby("language")["hit"].mean()
          .sort_values(ascending=False)
    )
    display(acc_by_lang)
else:
    print("labeled_test.csv not found – skipped offline scoring.")