In [None]:
CENTRALITY_FUNCS = {
    "degree"       : nx.degree_centrality,
    "closeness"    : nx.closeness_centrality,
    "harmonic"     : nx.harmonic_centrality,
    "betweenness"  : nx.betweenness_centrality,
    "load"         : nx.load_centrality,
    "eigenvector"  : lambda G: nx.eigenvector_centrality_numpy(G),
    "katz"         : lambda G: nx.katz_centrality_numpy(G, alpha=0.01),
    "pagerank"     : nx.pagerank,
    "current_flow_betweenness": nx.current_flow_betweenness_centrality,
    "current_flow_closeness"  : nx.current_flow_closeness_centrality,
    "subgraph"     : nx.subgraph_centrality,
    "communicability_betw"    : nx.communicability_betweenness_centrality,
    "percolation"  : nx.percolation_centrality,
    "second_order" : nx.second_order_centrality,
}

In [10]:
import pandas as pd, numpy as np, networkx as nx, ast, lightgbm as lgb
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import MinMaxScaler
from lightgbm import LGBMRanker, early_stopping, log_evaluation



CENT_FUNCS = {
    "degree"       : nx.degree_centrality,
    "harmonic"     : nx.harmonic_centrality,
    "pagerank": nx.pagerank,
    "betweenness": nx.betweenness_centrality,
    "katz": lambda G: nx.katz_centrality_numpy(G, alpha=0.01),
    "closeness": nx.closeness_centrality,
    "subgraph"     : nx.subgraph_centrality,

}

def voterank_scores(G):
    seeds = nx.voterank(G)
    s = {n:0. for n in G}
    for r,n in enumerate(seeds[::-1],1): s[n]=r/len(seeds)
    return s

def build(df):
    rows=[]
    for _,row in df.iterrows():
        G = nx.from_edgelist(ast.literal_eval(row.edgelist))
        cents={k:f(G) for k,f in CENT_FUNCS.items()}
        cents['voterank']=voterank_scores(G)
        n_tok=len(G)
        for n in G:
            rec={'language':row.language,
                 'sentence':row.sentence,
                 'node':n,
                 'n_tokens':n_tok,
                 **{k:cents[k][n] for k in cents}}
            if 'root' in row: rec['target']=int(n==row.root)
            rows.append(rec)
    df_out=pd.DataFrame(rows)
    cent_cols=list(CENT_FUNCS)+['voterank']
    scaler=MinMaxScaler()
    df_out[cent_cols]=(
        df_out.groupby('sentence')[cent_cols]
              .transform(lambda x: scaler.fit_transform(x.values.reshape(-1,1)).ravel())
    )
    return df_out

In [33]:
train_raw=pd.read_csv('datasets/train.csv')
test_raw =pd.read_csv('datasets/test.csv')

train_nodes=build(train_raw)
test_nodes =build(test_raw )
for df in (train_nodes, test_nodes):
    df['language'] = df['language'].astype('category')

KEEP_CENTS=['pagerank','betweenness','katz','voterank','closeness','degree', 'harmonic']
FEATURES  =KEEP_CENTS+['n_tokens','language']
cat_feats = ['language']

X      = train_nodes[FEATURES]
y      = train_nodes['target'].values
#groups = train_nodes['sentence'].values
#gsize  = train_nodes.groupby('sentence').size().loc[groups].values  # per row





In [None]:
# # ------------ NEW: reserve 10 % sentences as a blind hold-out ------------
# from sklearn.model_selection import GroupShuffleSplit
# gss = GroupShuffleSplit(n_splits=1, test_size=0.10, random_state=42)
# train_idx, hold_idx = next(gss.split(X, y, groups))
# # -------------------------------------------------------------------------

# # Use only train_idx for CV and model selection
# X_tr, y_tr        = X.iloc[train_idx], y[train_idx]
# groups_tr         = groups[train_idx]
# gsize_tr          = gsize[train_idx]

# # Hold-out set (never touched until final evaluation)
# X_hold, y_hold    = X.iloc[hold_idx], y[hold_idx]
# groups_hold       = groups[hold_idx]
# gsize_hold        = gsize[hold_idx]

In [13]:
import numpy as np, pandas as pd, lightgbm as lgb
from lightgbm.callback import early_stopping, log_evaluation
from sklearn.model_selection import GroupKFold

# ---------------------------------------------------------------
# helper: turn an array of sentence-ids into “group sizes” vector
# ---------------------------------------------------------------
def make_groups(sent_ids: np.ndarray):
    """
    Parameters
    ----------
    sent_ids : 1-D array of sentence identifiers **already sorted**
               so identical ids are contiguous.

    Returns
    -------
    sizes : 1-D array, len = #sentences, each entry = #nodes in that sentence
    """
    _, counts = np.unique(sent_ids, return_counts=True)
    return counts

# ---------------------------------------------------------------
# prepare data
# ---------------------------------------------------------------
FEATURES   = KEEP_CENTS + ['n_tokens', 'language']
cat_feats  = ['language']          # column names (because X is a DataFrame)

X_full = train_nodes[FEATURES]
y_full = train_nodes['target'].values
sid    = train_nodes['sentence'].values     # sentence ids

gkf = GroupKFold(5)
val_acc = []

cv_best_iters = []



In [14]:
lang_cats = train_nodes["language"].cat.categories.tolist()

In [15]:
lang_cats

['Arabic',
 'Chinese',
 'Czech',
 'English',
 'Finnish',
 'French',
 'Galician',
 'German',
 'Hindi',
 'Icelandic',
 'Indonesian',
 'Italian',
 'Japanese',
 'Korean',
 'Polish',
 'Portuguese',
 'Russian',
 'Spanish',
 'Swedish',
 'Thai',
 'Turkish']

In [16]:

for fold, (tr_idx, va_idx) in enumerate(gkf.split(X_full, y_full, sid), 1):

    # ---- sort TRAIN rows sentence-contiguously ----
    order_tr  = np.argsort(sid[tr_idx], kind='mergesort')
    X_tr      = X_full.iloc[tr_idx].iloc[order_tr]
    y_tr      = y_full[tr_idx][order_tr]
    sid_tr    = sid[tr_idx][order_tr]
    grp_tr    = make_groups(sid_tr)

    # ---- sort VALID rows sentence-contiguously ----
    order_va  = np.argsort(sid[va_idx], kind='mergesort')
    X_va      = X_full.iloc[va_idx].iloc[order_va]
    y_va      = y_full[va_idx][order_va]
    sid_va    = sid[va_idx][order_va]
    grp_va    = make_groups(sid_va)

    # ---- model ----
    ranker = lgb.LGBMRanker(
        objective      = 'lambdarank',
        metric         = 'map',
        label_gain     = [0, 1],
        n_estimators   = 1500,
        learning_rate  = 0.03,
        num_leaves     = 127,
        min_data_in_leaf = 20,
        subsample      = 0.8,
        colsample_bytree = 0.8,
        random_state   = 42,
    )

    ranker.fit(
        X_tr, y_tr,
        group            = grp_tr,
        eval_set         = [(X_va, y_va)],
        eval_group       = [grp_va],
        eval_at          = [1],                 # MAP@1 == accuracy@1
        categorical_feature = cat_feats,
        callbacks        = [early_stopping(50), log_evaluation(50)],
    )

    # ---------- accuracy@1 on this fold ----------
    prob = ranker.predict(X_va)
    sent_acc = (
        pd.DataFrame({'sid': sid_va, 'target': y_va, 'prob': prob})
          .loc[lambda d: d.groupby('sid')['prob'].idxmax()]
          ['target']
          .mean()
    )
    val_acc.append(sent_acc)
    print(f"fold {fold} accuracy@1 = {sent_acc:.3f}")
    cv_best_iters.append(ranker.best_iteration_) 

print("CV accuracy@1 =", np.mean(val_acc))


[LightGBM] [Info] Total groups: 400, total data: 157986
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001042 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1778
[LightGBM] [Info] Number of data points in the train set: 157986, number of used features: 9
[LightGBM] [Info] Total groups: 100, total data: 39493
Training until validation scores don't improve for 50 rounds
[50]	valid_0's map@1: 0.4
Early stopping, best iteration is:
[3]	valid_0's map@1: 0.46
fold 1 accuracy@1 = 0.460
[LightGBM] [Info] Total groups: 400, total data: 157999
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001009 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1774
[LightGBM] [Info] Number of data points in the train set: 157999, number of used features: 9
[LightGBM] [Info] Total groups: 100, total data: 39480
Training until validation 

In [22]:
print("best iters per fold:", cv_best_iters)
final_num_boost = int(np.round(np.mean(cv_best_iters)))   # or max()
print("using", final_num_boost, "trees for final model")


best iters per fold: [3, 65, 64, 26, 49]
using 41 trees for final model


In [24]:
order   = np.argsort(sid, kind='mergesort')      # keep groups contiguous
X_all   = X_full.iloc[order]
y_all   = y_full[order]
grp_all = make_groups(sid[order])

ranker_final = lgb.LGBMRanker(
    objective='lambdarank',
    metric='map',
    label_gain=[0,1],
    n_estimators=final_num_boost,      # <<<<<<<<<<
    learning_rate=0.03,
    num_leaves=127,
    min_data_in_leaf=20,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)

ranker_final.fit(
    X_all, y_all,
    group=grp_all,
    categorical_feature=cat_feats
)


[LightGBM] [Info] Total groups: 500, total data: 197479
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001208 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1777
[LightGBM] [Info] Number of data points in the train set: 197479, number of used features: 9


In [26]:
import joblib, json

joblib.dump(ranker_final, "lgbm_ranker_final.pkl")

with open("final_meta.json", "w") as fp:
    json.dump({
        "num_boost_round": final_num_boost,
        "language_categories": lang_cats          # <-- save them
    }, fp)


In [28]:
ranker_final = joblib.load("lgbm_ranker_final.pkl")
print("model has", ranker_final.booster_.num_trees(), "trees")


model has 41 trees


---

In [42]:
from pathlib import Path

# 0. load model + meta --------------------------------------------------
ranker_final = joblib.load("lgbm_ranker_final.pkl")
with open("final_meta.json") as fp:
    meta = json.load(fp)
lang_cats = meta["language_categories"]

# 1. build node-level frame for competition test set --------------------
#test_raw   = pd.read_csv("datasets/test.csv")
#test_nodes = build(test_raw)

# 2. align language codes ----------------------------------------------
test_nodes["language"] = pd.Categorical(test_nodes["language"], categories=lang_cats)

# 3. predict and create submission -------------------------------------
#FEATURES = ['pagerank','betweenness','katz','voterank','closeness','n_tokens','language']
probs = ranker_final.predict(test_nodes[FEATURES])
test_nodes["prob"] = probs

root_pred = (
    test_nodes
      .loc[test_nodes.groupby(['language', 'sentence'])['prob'].idxmax()]
      .rename(columns={'node': 'root'})
      [['language', 'sentence', 'root']]
)


submission = (
    test_raw[['id', 'language', 'sentence']]
      .merge(root_pred, on=['language', 'sentence'], how='left')
      [['id', 'root']]
)
submission.to_csv("submission.csv", index=False)
print("✓ submission.csv written:", submission.shape)



✓ submission.csv written: (10395, 2)


  .loc[test_nodes.groupby(['language', 'sentence'])['prob'].idxmax()]


In [44]:
# 4. optional offline check against labeled_test.csv -------------------
if Path("datasets/labeled_test.csv").exists():
    labeled  = pd.read_csv("datasets/labeled_test.csv")   # has id,root
    merged   = labeled.merge(submission, on="id", suffixes=("_true","_pred"))
    acc      = (merged.root_true == merged.root_pred).mean()
    print(f"Offline sentence accuracy = {acc:0.3f}")
    # (extra) Confusion table of languages
    acc_by_lang = (
        merged
          .merge(test_raw[["id","language"]], on="id")
          .assign(hit = lambda d: d.root_true == d.root_pred)
          .groupby("language")["hit"].mean()
          .sort_values(ascending=False)
    )
    display(acc_by_lang)
else:
    print("labeled_test.csv not found – skipped offline scoring.")

Offline sentence accuracy = 0.355


language
Icelandic     0.450505
Russian       0.444444
Swedish       0.436364
Arabic        0.434343
Indonesian    0.430303
Finnish       0.408081
Polish        0.406061
Czech         0.393939
Turkish       0.371717
German        0.359596
Korean        0.359596
Galician      0.347475
English       0.345455
Spanish       0.341414
French        0.333333
Thai          0.331313
Chinese       0.319192
Italian       0.315152
Portuguese    0.311111
Hindi         0.232323
Japanese      0.074747
Name: hit, dtype: float64

In [69]:
# How many (language, sentence) pairs share the same sentence id?
dup = test_raw.groupby('sentence')['language'].nunique()
print("sentences seen in >1 language:", (dup > 1).sum())


sentences seen in >1 language: 495


In [71]:
test_raw.sort_values(['sentence','language']).head(12)[['id','language','sentence']].head()


Unnamed: 0,id,language,sentence
9405,9406,Arabic,1
8415,8416,Chinese,1
7920,7921,Czech,1
1485,1486,English,1
495,496,Finnish,1


In [73]:
test_nodes.groupby('sentence').first()

Unnamed: 0_level_0,language,node,n_tokens,pagerank,betweenness,katz,closeness,voterank,prob
sentence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,Japanese,38,43,0.137674,0.057698,0.021463,0.109101,0.545455,-1.609072
3,Japanese,17,49,0.175358,0.119976,0.029934,0.192211,0.913043,-0.728393
4,Japanese,15,38,0.232180,0.335790,0.047725,0.219844,0.850000,-1.200192
6,Japanese,6,19,0.148543,0.148693,0.012601,0.151507,0.333333,-1.559605
7,Japanese,3,12,0.493944,0.787879,0.087372,0.406272,1.000000,-0.477845
...,...,...,...,...,...,...,...,...,...
985,Japanese,2,30,0.335636,0.200693,0.119627,0.040605,0.933333,-0.289833
986,Japanese,16,23,0.227850,0.130492,0.247035,0.171562,0.583333,-1.383158
987,Japanese,35,47,0.099163,0.152643,0.021459,0.175242,0.727273,-1.304062
991,Japanese,16,31,0.206654,0.091304,0.281230,0.164409,0.882353,-0.914346


In [40]:
# # -- make sure language categories match training --
# for df in (test_nodes,):
#     df["language"] = pd.Categorical(
#         df["language"],
#         categories = ranker_final.feature_name_[-1].categories   # last entry is language
#     )

# FEATURES  = ['pagerank','betweenness','katz','voterank','closeness',
#              'n_tokens','language']
# X_test = test_nodes[FEATURES]

# # 3. Predict a probability for every node -------------------------------
# test_nodes["prob"] = ranker_final.predict(X_test)

# # 4. Pick the top node per sentence  (MAP@1 → root guess) ---------------
# root_pred = (
#     test_nodes
#         .loc[test_nodes.groupby("sentence")["prob"].idxmax()]
#         .loc[:, ["sentence", "node"]]
#         .rename(columns={"node": "root"})
# )

# # 5. Build submission.csv  (Kaggle expects id,root) ---------------------
# submission = (
#     raw_test[["id","sentence"]]        # 'id' is the row identifier Kaggle gave you
#         .merge(root_pred, on="sentence", how="left")
#         .loc[:, ["id","root"]]
# )
# submission.to_csv("submission.csv", index=False)
# print("✓ submission.csv written:", submission.shape)



AttributeError: 'str' object has no attribute 'categories'

In [None]:
# -----------------------------------------------------------------------
#           OPTIONAL – Offline accuracy on professor’s labels
# -----------------------------------------------------------------------
if Path("labeled_test.csv").exists():
    labeled = pd.read_csv("labeled_test.csv")              # has columns id,sentence,root
    merged  = labeled.merge(submission, on="id", suffixes=("_true","_pred"))
    sent_acc = (merged.root_true == merged.root_pred).mean()
    print(f"Offline sentence accuracy = {sent_acc:0.3f}")

    # (extra) Confusion table of languages
    acc_by_lang = (
        merged
          .merge(raw_test[["id","language"]], on="id")
          .assign(hit = lambda d: d.root_true == d.root_pred)
          .groupby("language")["hit"].mean()
          .sort_values(ascending=False)
    )
    display(acc_by_lang)

else:
    print("labeled_test.csv not found – skipped offline scoring.")

In [19]:
order_all = np.argsort(sid, kind='mergesort')
X_all     = X_full.iloc[order_all]
y_all     = y_full[order_all]
grp_all   = make_groups(sid[order_all])

ranker.fit(X_all, y_all, group=grp_all, categorical_feature=cat_feats, ...)


SyntaxError: positional argument follows keyword argument (1455971992.py, line 6)

In [23]:
order   = np.argsort(sid, kind='mergesort')
X_all   = X_full.iloc[order]
y_all   = y_full[order]
grp_all = make_groups(sid[order])

ranker_final = lgb.LGBMRanker(
        objective      = 'lambdarank',
        metric         = 'map',
        label_gain     = [0, 1],
        n_estimators   = 1500,
        learning_rate  = 0.03,
        num_leaves     = 127,
        min_data_in_leaf = 20,
        subsample      = 0.8,
        colsample_bytree = 0.8,
        random_state   = 42,
    )
ranker_final.fit(
    X_all, y_all,
    group=grp_all,
    categorical_feature=cat_feats,
    num_boost_round=ranker.best_iteration_   # from CV fold with best MAP@1
)


TypeError: LGBMRanker.fit() got an unexpected keyword argument 'num_boost_round'

In [15]:
gkf = GroupKFold(5)
val_acc = []

for tr, va in gkf.split(X, y, groups):
    ranker = LGBMRanker(
        objective      = 'lambdarank',
        metric         = 'map',
        label_gain     = [0, 1],
        n_estimators   = 1500,
        learning_rate  = 0.03,
        num_leaves     = 127,
        min_data_in_leaf = 20,
        subsample      = 0.8,
        colsample_bytree = 0.8,
        random_state   = 42,
    )

    ranker.fit(
        X.iloc[tr], y[tr],
        group         = group_sizes[tr],
        eval_set      = [(X.iloc[va], y[va])],
        eval_group    = [group_sizes[va]],
        eval_at       = [1],
        categorical_feature = cat_feats,
        callbacks     = [
            early_stopping(50),
            log_evaluation(50)              # set to 0 for silence
        ],
    )

    # ---------- accuracy@1 on this fold ----------
    prob = ranker.predict(X.iloc[va])
    sent_acc = (
        train_nodes.iloc[va]                  # same rows
                   .assign(prob=prob)
                   .groupby('sentence')
                   .apply(lambda g: g.loc[g.prob.idxmax(), 'target'])
                   .mean()
    )
    val_acc.append(sent_acc)
    print(f"fold accuracy@1 : {sent_acc:.3f}")

print("CV accuracy@1 :", np.mean(val_acc))



[LightGBM] [Fatal] Sum of query counts (157986) differs from the length of #data (70852378)


LightGBMError: Sum of query counts (157986) differs from the length of #data (70852378)

In [6]:
gkf = GroupKFold(n_splits=5)
val_scores=[]
pred_proba=np.zeros(len(X))

for tr,va in gkf.split(X,y,groups):
    model=lgb.LGBMRanker(
        objective='lambdarank',
        metric='map',
        label_gain=[0,1],
        n_estimators=1500,
        learning_rate=0.03,
        num_leaves=127,
        min_data_in_leaf=20,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
    )
    model.fit(    
        X.iloc[tr], y[tr],
        group=group_sizes[tr],
        eval_set=[(X.iloc[va], y[va])],
        eval_group=[group_sizes[va]],
        eval_at=[1],
        categorical_feature=cat_feats,
        callbacks=[
        early_stopping(50),
        log_evaluation(50)  # 0 = silent; set to 10 for every 10 rounds
        ]
    )
    prob=model.predict(X.iloc[va])
    pred_proba[va]=prob
    # accuracy@1
    sent_pred=(train_nodes.iloc[va]
               .assign(prob=prob)
               .groupby('sentence')
               .apply(lambda g: g.loc[g.prob.idxmax(),'target'])
               .values)
    acc=sent_pred.mean()
    val_scores.append(acc)
    print(f'fold acc: {acc:.3f}')

print('CV accuracy@1',np.mean(val_scores))


ValueError: pandas dtypes must be int, float or bool.
Fields with bad pandas dtypes: language: object

In [None]:
gkf = GroupKFold(n_splits=5)
val_scores=[]
pred_proba=np.zeros(len(X))

for tr,va in gkf.split(X,y,groups):
    model=lgb.LGBMRanker(
        objective='lambdarank',
        metric='map',
        label_gain=[0,1],
        n_estimators=1500,
        learning_rate=0.03,
        num_leaves=127,
        min_data_in_leaf=20,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
    )
    model.fit(    
        X.iloc[tr], y[tr],
        group=group_sizes[tr],
        eval_set=[(X.iloc[va], y[va])],
        eval_group=[group_sizes[va]],
        eval_at=[1],
        categorical_feature=cat_feats,
        callbacks=[
        early_stopping(50),
        log_evaluation(50)  # 0 = silent; set to 10 for every 10 rounds
        ]
    )
    prob=model.predict(X.iloc[va])
    pred_proba[va]=prob
    # accuracy@1
    sent_pred=(train_nodes.iloc[va]
               .assign(prob=prob)
               .groupby('sentence')
               .apply(lambda g: g.loc[g.prob.idxmax(),'target'])
               .values)
    acc=sent_pred.mean()
    val_scores.append(acc)
    print(f'fold acc: {acc:.3f}')

print('CV accuracy@1',np.mean(val_scores))

Why is this code not saving the models?
How can I later choose the best one? How can I create a submission? How can I apply it to test data?

Please, give me the full code!