In [1]:
# 01_setup.py
import ast, json, warnings, os, gc, random
import pandas as pd, numpy as np, networkx as nx
from tqdm.auto import tqdm
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
import joblib, matplotlib.pyplot as plt
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE); random.seed(RANDOM_STATE)
warnings.filterwarnings('ignore')


In [35]:
CENTRALITY_FUNCS = {
    "degree"       : nx.degree_centrality,
    "closeness"    : nx.closeness_centrality,
    "harmonic"     : nx.harmonic_centrality,
    "betweenness"  : nx.betweenness_centrality,
    "load"         : nx.load_centrality,
    "eigenvector"  : lambda G: nx.eigenvector_centrality_numpy(G),
    "katz"         : lambda G: nx.katz_centrality_numpy(G, alpha=0.01),
    "pagerank"     : nx.pagerank,
    "current_flow_betweenness": nx.current_flow_betweenness_centrality,
    "current_flow_closeness"  : nx.current_flow_closeness_centrality,
    "subgraph"     : nx.subgraph_centrality,
    "communicability_betw"    : nx.communicability_betweenness_centrality,
    "percolation"  : nx.percolation_centrality,
    "second_order" : nx.second_order_centrality,
}

voterank pseudoscores:

The most influential node gets score 1,

The least influential seed gets a small positive score,

All non-seed nodes get 0.

In [38]:
def voterank_scores(G):
    """Convert VoteRank seed list → pseudo-score (1 for first seed, 0 for others)."""
    seeds = nx.voterank(G)
    score = {n: 0. for n in G}
    for rank, node in enumerate(seeds[::-1], 1):
        score[node] = rank / len(seeds)
    return score



In [112]:
KEEP_CENTS = ['pagerank','betweenness','katz','voterank','closeness']


In [59]:
def vizualize_graph(G, row):
    pos = nx.spring_layout(G, seed=RANDOM_STATE)  # or nx.kamada_kawai_layout(G)
    root_node = row.get('root', -1)
    
    node_colors = []
    for node in G.nodes():
        if str(node) == str(root_node):
            node_colors.append('red')  # highlight root
        else:
            node_colors.append('skyblue')
    
    plt.figure(figsize=(5, 4))
    nx.draw_networkx(G, pos, with_labels=True, node_color=node_colors, node_size=600, font_size=10)
    plt.title(f"language: {row.language}; sentence: {row.sentence}")
    plt.axis('off')
    plt.show()


In [124]:
def parse_edgelist(edgelist_str):
    return ast.literal_eval(edgelist_str)

def build_node_dataframe(df_in, visualize=False):
    rows = []
    for sent_id, row in tqdm(df_in.iterrows(), total=len(df_in)):
        edges   = parse_edgelist(row.edgelist)
        G       = nx.from_edgelist(edges)
        if visualize:
            vizualize_graph(G)
        # all classic centralities
        cent    = {name: func(G) for name, func in CENTRALITY_FUNCS.items()}
        cent['voterank'] = voterank_scores(G)
        for node in G.nodes():
            feat = {f"{c}_{name}": cent[name][node] for name in cent for c in ['']}
            feat['language']    = row.language
            feat['sentence']    = row.sentence
            feat['node']        = node
            feat['n_tokens']    = row.n
            # normalised centralities per sentence
            for k in list(feat.keys()): #remove this
                if k.startswith('_'): continue
            rows.append((
                row.language, row.sentence, node,  # id fields
                *[cent[name][node] for name in cent],  # raw
                int(node == row.get('root', -1))       # y (-1 for test)
            ))
    # column names
    names = (['language','sentence','node', 'n_tokens'] +
             list(CENTRALITY_FUNCS.keys()) + ['voterank'] +
             ['target'])
    df_nodes = pd.DataFrame(rows, columns=names)
    # sentence-wise min-max
    scaler = MinMaxScaler()
    centrality_cols = list(CENTRALITY_FUNCS.keys())
    
    df_nodes[centrality_cols] = (
        df_nodes.groupby('sentence')[centrality_cols]
        .transform(lambda x: scaler.fit_transform(x.values.reshape(-1, 1)).ravel())
    )
    return df_nodes

In [None]:
# 03_prepare_data.py
train_raw = pd.read_csv('datasets/train.csv')
test_raw  = pd.read_csv('datasets/test.csv')

train_nodes = build_node_dataframe(train_raw, visualize=True)  # contains 'target'
test_nodes  = build_node_dataframe(test_raw)   # target = -1 placeholder

FEATURES = [c for c in train_nodes.columns if c not in
            ('language','sentence','node','target')]
X = train_nodes[FEATURES].values
y = train_nodes['target'].values
groups = train_nodes['sentence'].values

In [120]:
# 03_prepare_data.py
train_raw = pd.read_csv('datasets/train.csv')
test_raw  = pd.read_csv('datasets/test.csv')
KEEP_CENTS = ['pagerank', 'betweenness', 'katz', 'voterank', 'closeness']

TRAIN_COLS = ['language', 'sentence', 'node', 'target', 'n_tokens'] + KEEP_CENTS
TEST_COLS = [c for c in TRAIN_COLS if c != 'target']

train_nodes = build_node_dataframe(train_raw)[TRAIN_COLS]
test_nodes  = build_node_dataframe(test_raw )[TEST_COLS]     # no ‘target’


FEATURES = KEEP_CENTS + ['n_tokens']
X = train_nodes[FEATURES].values
y = train_nodes['target'].values
groups = train_nodes['sentence'].values
group_sizes = train_nodes.groupby('sentence').size().values


  0%|          | 0/10500 [00:00<?, ?it/s]

KeyError: "['n_tokens'] not in index"

In [67]:
# 04_model_cv.py
gkf = GroupKFold(n_splits=5)
cv_preds, cv_f1 = [], []

ranker = lgb.LGBMRanker(
    objective='lambdarank',
    metric='map',
    label_gain=[0,1],
    n_estimators=4000,
    learning_rate=0.02,
    num_leaves=127,
    scale_pos_weight=18,
    random_state=RANDOM_STATE,
)

for fold,(tr,va) in enumerate(gkf.split(X,y,groups)):
    ranker.fit(
        X[tr], y[tr], group=group_sizes[tr],
        eval_set=[(X[va], y[va])],
        eval_group=[group_sizes[va]],
        eval_at=[1],
        callbacks=[lgb.early_stopping(200)]
    )
    va_pred = model.predict_proba(X[va])[:,1]
    cv_preds.append((va,va_pred))
    cv_f1.append(f1_score(y[va], va_pred>0.5))
    joblib.dump(model, f'model_fold{fold}.pkl')
print('CV F1 mean:', np.mean(cv_f1))


[LightGBM] [Info] Number of positive: 8400, number of negative: 149586
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001648 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3737
[LightGBM] [Info] Number of data points in the train set: 157986, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 8400, number of negative: 149599
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001603 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3733
[LightGBM] [Info] Number of data points in the train set: 157999, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [In

Why LightGBM first? – Tree-boosters automatically capture non-linear interactions among centralities, are robust to feature scaling, support class_weight, and give built-in feature importances (gain + split count). They also train in <1 s per fold for these tiny graphs.

In [69]:
# 05a_single_feature.py
single_scores = {}
for f in FEATURES:
    model = LogisticRegression(class_weight='balanced', max_iter=1000)
    scores = []
    for tr,va in gkf.split(train_nodes[[f]], y, groups):
        model.fit(train_nodes.loc[tr,[f]], y[tr])
        pred = model.predict(train_nodes.loc[va,[f]])
        scores.append(f1_score(y[va],pred))
    single_scores[f] = np.mean(scores)
pd.Series(single_scores).sort_values(ascending=False).head(10)


subgraph                    0.248093
degree                      0.239313
communicability_betw        0.238556
betweenness                 0.237782
load                        0.237782
current_flow_betweenness    0.237782
percolation                 0.237782
eigenvector                 0.234125
pagerank                    0.226809
voterank                    0.211187
dtype: float64

In [71]:
# 05b_perm_importance.py
from sklearn.inspection import permutation_importance
best_model = joblib.load('model_fold0.pkl')
pi = permutation_importance(best_model,
                            X[va], y[va],
                            n_repeats=20,
                            random_state=RANDOM_STATE, n_jobs=-1)
imp = pd.Series(pi.importances_mean, index=FEATURES).sort_values(ascending=False)




In [73]:
# 06_language_breakdown.py
all_va = np.concatenate([idx for idx,_ in cv_preds])
all_pred = np.concatenate([pred for _,pred in cv_preds])
df_check = train_nodes.iloc[all_va].copy()
df_check['pred'] = all_pred
for lang, grp in df_check.groupby('language'):
    f1 = f1_score(grp['target'], grp['pred']>0.5)
    print(f"{lang: <8}: F1 = {f1:0.3f}, N sentences = {grp['sentence'].nunique()}")


Arabic  : F1 = 0.274, N sentences = 500
Chinese : F1 = 0.234, N sentences = 500
Czech   : F1 = 0.279, N sentences = 500
English : F1 = 0.282, N sentences = 500
Finnish : F1 = 0.319, N sentences = 500
French  : F1 = 0.252, N sentences = 500
Galician: F1 = 0.260, N sentences = 500
German  : F1 = 0.287, N sentences = 500
Hindi   : F1 = 0.189, N sentences = 500
Icelandic: F1 = 0.296, N sentences = 500
Indonesian: F1 = 0.291, N sentences = 500
Italian : F1 = 0.252, N sentences = 500
Japanese: F1 = 0.094, N sentences = 500
Korean  : F1 = 0.252, N sentences = 500
Polish  : F1 = 0.286, N sentences = 500
Portuguese: F1 = 0.274, N sentences = 500
Russian : F1 = 0.299, N sentences = 500
Spanish : F1 = 0.273, N sentences = 500
Swedish : F1 = 0.297, N sentences = 500
Thai    : F1 = 0.261, N sentences = 500
Turkish : F1 = 0.291, N sentences = 500


In [77]:
# 06_language_breakdown.py  (continued)

lang_f1 = {lang: f1_score(grp['target'], grp['pred'] > 0.5)
           for lang, grp in df_check.groupby('language')}

overall_mean = np.mean(list(lang_f1.values()))
overall_std  = np.std (list(lang_f1.values()))

# Languages whose F1 is more than 1 standard deviation below the mean
low_performing_langs = [lang for lang, score in lang_f1.items()
                        if score < overall_mean - overall_std]

print("Low-performing languages:", low_performing_langs)


Low-performing languages: ['Hindi', 'Japanese']


Strategy: If a language’s F1 < overall mean – 1 σ, train a language-specific model:

In [79]:
lang_models = {}
for lang in low_performing_langs:
    idx = train_nodes.language == lang
    gkf_lang = GroupKFold(3)
    Xl,yl,gl = train_nodes.loc[idx,FEATURES], y[idx], groups[idx]
    model = XGBClassifier(
        n_estimators=400, max_depth=5,
        learning_rate=0.05, subsample=0.8,
        colsample_bytree=0.8, reg_lambda=1.0,
        scale_pos_weight = (yl==0).sum()/(yl==1).sum(),
        random_state=RANDOM_STATE, objective='binary:logistic'
    )
    model.fit(Xl,yl, eval_set=[(Xl,yl)], verbose=False)
    lang_models[lang] = model


In [81]:
# 07_train_full_and_predict.py
base_model = lgb.LGBMClassifier(
    n_estimators= int(np.mean([model.best_iteration_ for model in
                               [joblib.load(f'model_fold{i}.pkl') for i in range(5)]])),
    learning_rate=0.05, num_leaves=63, random_state=RANDOM_STATE,
    class_weight='balanced', objective='binary'
).fit(X, y)

def predict_nodes(df_nodes):
    X_ = df_nodes[FEATURES].values
    probs = np.zeros(len(X_))
    for i,row in df_nodes.iterrows():
        lang = row.language
        if lang in lang_models:
            probs[i] = lang_models[lang].predict_proba(X_[i].reshape(1,-1))[:,1]
        else:
            probs[i] = base_model.predict_proba(X_[i].reshape(1,-1))[:,1]
    df_nodes['prob'] = probs
    # choose node with highest prob per sentence
    return (df_nodes.sort_values('prob',ascending=False)
                    .groupby('sentence').first()['node'])

test_root_pred = predict_nodes(test_nodes)
submission = (test_raw
              .assign(root = test_raw['sentence'].map(test_root_pred))
              [['id','root']])
submission.to_csv('submission.csv', index=False)


[LightGBM] [Info] Number of positive: 10500, number of negative: 186979
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000616 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3735
[LightGBM] [Info] Number of data points in the train set: 197479, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


In [83]:
labeled = pd.read_csv('datasets/labeled_test.csv')
merged  = labeled.merge(submission, on='id', suffixes=('_true','_pred'))
print("Sentence-level accuracy:", (merged.root_true==merged.root_pred).mean())
print(classification_report(merged.root_true, merged.root_pred))


Sentence-level accuracy: 0.08148148148148149
              precision    recall  f1-score   support

           1       0.09      0.13      0.11       690
           2       0.10      0.14      0.11       675
           3       0.10      0.12      0.11       687
           4       0.10      0.17      0.13       641
           5       0.09      0.12      0.10       693
           6       0.07      0.08      0.07       626
           7       0.10      0.07      0.08       653
           8       0.08      0.08      0.08       607
           9       0.07      0.08      0.07       547
          10       0.08      0.09      0.09       510
          11       0.07      0.04      0.05       491
          12       0.06      0.08      0.07       407
          13       0.08      0.04      0.06       390
          14       0.06      0.07      0.06       346
          15       0.08      0.08      0.08       336
          16       0.05      0.04      0.04       312
          17       0.04      0.02   

-------

Experiment

In [90]:
from sklearn.model_selection import GroupKFold
from sklearn.metrics import accuracy_score
import lightgbm as lgb, numpy as np, pandas as pd, joblib, random, gc
RANDOM_STATE = 42
CENTRALITIES = ['pagerank','betweenness','katz','voterank','closeness',
                'degree','harmonic','load','eigenvector','percolation',
                'current_flow_betweenness','current_flow_closeness',
                'subgraph','communicability_betw','second_order']


In [92]:
def map_at_1(df):
    top = (df.sort_values('pred', ascending=False)
             .groupby('sentence').first())
    return (top['target']==1).mean()


In [96]:
gkf = GroupKFold(5)
single_scores = {c: [] for c in CENTRALITIES}

for fold, (tr,va) in enumerate(gkf.split(train_nodes, groups=train_nodes['sentence'])):
    for cent in CENTRALITIES:
        # pick the node with max centrality inside every sentence
        df_va = train_nodes.iloc[va, :][['sentence', 'target', cent]].copy()
        df_va['pred'] = df_va[cent]
        single_scores[cent].append(map_at_1(df_va))

cv1 = (pd.DataFrame(single_scores)
         .agg(['mean','std']).T.sort_values('mean', ascending=False))
display(cv1.head(15))


Unnamed: 0,mean,std
current_flow_closeness,0.404,0.020736
closeness,0.398,0.0249
katz,0.394,0.041593
communicability_betw,0.382,0.035637
current_flow_betweenness,0.37,0.04062
voterank,0.368,0.047645
percolation,0.364,0.037815
betweenness,0.362,0.035637
load,0.362,0.035637
degree,0.358,0.040249


In [98]:
FEATURES = CENTRALITIES         # (or centralities + extra feats)
X  = train_nodes[FEATURES].values
y  = train_nodes['target'].values
groups = train_nodes['sentence'].values
group_sizes = train_nodes.groupby('sentence').size().values

ranker = lgb.LGBMRanker(
    objective='lambdarank', metric='map', label_gain=[0,1],
    n_estimators=2000, learning_rate=0.03, num_leaves=127,
    random_state=RANDOM_STATE)

# Fit once on ALL data (permutation importance is fast)
ranker.fit(X, y, group=group_sizes)

from sklearn.inspection import permutation_importance
pi = permutation_importance(
        ranker, X, y, n_repeats=10, random_state=RANDOM_STATE,
        scoring=lambda est, X_, y_: map_at_1(
            pd.DataFrame({'sentence':train_nodes['sentence'],
                          'target':y_, 'pred':est.predict(X_)}))
)
imp = (pd.Series(pi.importances_mean, index=FEATURES)
         .sort_values(ascending=False))
print(imp.head(10))


[LightGBM] [Info] Total groups: 500, total data: 197479
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001962 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3735
[LightGBM] [Info] Number of data points in the train set: 197479, number of used features: 15
voterank                0.4576
subgraph                0.2736
harmonic                0.0664
eigenvector             0.0624
communicability_betw    0.0408
pagerank                0.0288
betweenness             0.0162
katz                    0.0158
second_order            0.0136
closeness               0.0082
dtype: float64


In [100]:
remaining   = CENTRALITIES.copy()
selected, history = [], []

while remaining:
    best_gain, best_feat = -1, None
    base_feats = selected.copy()
    base_score = 0 if not base_feats else history[-1][1]

    for feat in remaining:
        feats = base_feats + [feat]
        scores = []
        for tr,va in gkf.split(train_nodes, groups=groups):
            X_tr = train_nodes.iloc[tr][feats].values
            X_va = train_nodes.iloc[va][feats].values
            y_tr, y_va = y[tr], y[va]
            gs_tr = group_sizes[tr]

            ranker = lgb.LGBMRanker(
                objective='lambdarank', metric='map', label_gain=[0,1],
                n_estimators=800, learning_rate=0.05,
                num_leaves=63, random_state=fold)
            ranker.fit(X_tr, y_tr, group=gs_tr,
                       verbose=False)
            pred = ranker.predict(X_va)
            acc  = map_at_1(pd.DataFrame(
                   {'sentence':train_nodes.iloc[va]['sentence'],
                    'target':y_va, 'pred':pred}))
            scores.append(acc)
        gain = np.mean(scores) - base_score
        if gain > best_gain:
            best_gain, best_feat = gain, feat

    if best_gain < 0.002:            #   <-- stopping criterion
        break
    selected.append(best_feat)
    remaining.remove(best_feat)
    history.append((best_feat, base_score+best_gain))
    print(f"Added {best_feat:<15}  ⬆ +{best_gain:.4f}")

print("\nSelected order:", selected)


IndexError: index 500 is out of bounds for axis 0 with size 500

In [104]:
def prepare_subset(idx, feats):
    """Return X, y, group_sizes for the rows in `idx`."""
    # 1⃣  Keep rows in sentence order so that each group is contiguous
    sub = train_nodes.iloc[idx].sort_values('sentence')
    X_sub = sub[feats].values
    y_sub = sub['target'].values

    # 2⃣  One size per sentence in this subset
    g_sub = sub.groupby('sentence').size().values          # e.g. [12, 10, 7 …]

    return X_sub, y_sub, g_sub

gkf = GroupKFold(5)
selected, remaining, history = [], CENTRALITIES.copy(), []

for fold, (tr, va) in enumerate(gkf.split(train_nodes, groups=train_nodes['sentence'])):
    base_feats = selected.copy()
    base_score = 0 if not base_feats else history[-1][1]

    best_gain, best_feat = -1, None
    for feat in remaining:
        feats = base_feats + [feat]
        # --- train subset
        X_tr, y_tr, g_tr = prepare_subset(tr, feats)
        X_va, y_va, g_va = prepare_subset(va, feats)


        ranker = lgb.LGBMRanker(
            objective='lambdarank', metric='map', label_gain=[0, 1],
            n_estimators=800, learning_rate=0.05,
            num_leaves=63, random_state=fold,
        )
        
        silent_cb   = lgb.log_evaluation(period=50)        # 0 ⇒ suppress stdout
        early_stop  = lgb.early_stopping(100)             # patience 100 rounds
        
        ranker.fit(
            X_tr, y_tr,
            group=g_tr,
            eval_set=[(X_va, y_va)],
            eval_group=[g_va],
            eval_at=[1],
            callbacks=[early_stop, silent_cb]              # ← instead of verbose=
        )

        # sentence-level accuracy on the fold
        preds_va = ranker.predict(X_va)
        acc = map_at_1(pd.DataFrame({
                  'sentence': train_nodes.iloc[va]['sentence'].values,
                  'target'  : y_va,
                  'pred'    : preds_va}))
        gain = acc - base_score
        if gain > best_gain:
            best_gain, best_feat = gain, feat

    # stop if new feature adds <0.002 to MAP@1
    if best_gain < 0.002:
        break

    selected.append(best_feat)
    remaining.remove(best_feat)
    history.append((best_feat, base_score + best_gain))
    print(f"Added {best_feat:<15} ⬆ +{best_gain:.4f}")

print("\nSelected order:", selected)


[LightGBM] [Info] Total groups: 400, total data: 157986
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000494 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 157986, number of used features: 1
[LightGBM] [Info] Total groups: 100, total data: 39493
Training until validation scores don't improve for 100 rounds
[50]	valid_0's map@1: 0.32
[100]	valid_0's map@1: 0.32
Early stopping, best iteration is:
[2]	valid_0's map@1: 0.32
[LightGBM] [Info] Total groups: 400, total data: 157986
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000821 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 157986, number of used features: 1
[LightGBM] [Info] Total groups: 100, total data: 39493
Training until validatio

In [106]:
summary = (pd.concat([
              cv1['mean'].rename('single_feat_ACC'),
              imp.rename('perm_drop'),
          ], axis=1)
          .loc[CENTRALITIES]
          .sort_values('single_feat_ACC', ascending=False))
display(summary.style.bar(subset=['perm_drop']))


Unnamed: 0,single_feat_ACC,perm_drop
current_flow_closeness,0.404,0.0044
closeness,0.398,0.0082
katz,0.394,0.0158
communicability_betw,0.382,0.0408
current_flow_betweenness,0.37,0.005
voterank,0.368,0.4576
percolation,0.364,0.0004
betweenness,0.362,0.0162
load,0.362,0.0
degree,0.358,0.0072


In [108]:
big5_count = {c:0 for c in CENTRALITIES}
for seed in range(5):
    random.seed(seed); np.random.seed(seed)
    selected = run_forward_selection(random_state=seed)   # wrapper around the loop
    for c in selected[:5]:
        big5_count[c] += 1
pd.Series(big5_count).sort_values(ascending=False)


NameError: name 'run_forward_selection' is not defined