In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("../dataset/dataset.csv")

In [3]:
df.head()

Unnamed: 0,question_id,question,truth,prediction,f1,sementic_similarity,tokens_length,isdigit,num_entities,config_id,retriever_id,answer_model_id,question_type_id,Chunk_size,Chunk_overlap,k,Reranker,result,reward,good
0,53,When did Beyonce start becoming popular?,in the late 1990s,late 1990s,0.266667,0.937186,6,False,1,0,1,1,1,400,100,3,False,0.73603,0.73603,1
1,30,What areas did Beyonce compete in when she was...,singing and dancing,singing and dancing,0.333333,1.0,11,False,0,0,1,1,1,400,100,3,False,0.8,0.8,1
2,52,When did Beyonce leave Destiny's Child and bec...,2003,2003,1.0,1.0,10,False,2,0,1,1,1,400,100,3,False,1.0,1.0,1
3,22,In what city and state did Beyonce grow up?,"Houston, Texas",Texas,0.333333,0.770843,9,False,1,0,1,1,2,400,100,3,False,0.63959,0.63959,0
4,24,In which decade did Beyonce become famous?,late 1990s,1990s,0.333333,0.93685,7,False,1,0,1,1,2,400,100,3,False,0.755795,0.755795,1


In [4]:
features = [
    "retriever_id",
    "answer_model_id",
    "question_type_id",
    "Chunk_size",
    "Chunk_overlap",
    "k",
    "Reranker",
    "tokens_length",
    "isdigit",
    "num_entities"
]

X = df[features]
Y = df["reward"]

In [5]:
from sklearn.model_selection import GroupShuffleSplit

gss = GroupShuffleSplit(test_size = 0.2, random_state = 42)
train_idx, val_idx = next(gss.split(X, Y, groups = df["question_id"]))

x_train, x_val = X.iloc[train_idx], X.iloc[val_idx]
y_train, y_val = Y.iloc[train_idx], Y.iloc[val_idx]

In [6]:
groups_train = df["question_id"].iloc[train_idx]
groups_val = df["question_id"].iloc[val_idx]

In [7]:
train_order = groups_train.argsort()
val_order = groups_val.argsort()

x_train, x_val = x_train.iloc[train_order], x_val.iloc[val_order]
y_train, y_val = y_train.iloc[train_order], y_val.iloc[val_order]

groups_train, groups_val = groups_train.iloc[train_order], groups_val.iloc[val_order]

In [8]:
def group_size(qids):
    return qids.value_counts(sort = False).values.tolist()

groups_train = group_size(groups_train)
groups_val = group_size(groups_val)

In [9]:
from lightgbm import LGBMRanker

In [10]:
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=31,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

In [11]:
def make_relevance(grp):
    ranks = grp["reward"].rank(method="first", ascending=False)

    rel = (len(grp) - ranks).astype(int)

    return rel.clip(0, 3)


df["relevance"] = df.groupby("question_id", group_keys=False).apply(make_relevance)

y_train = df.loc[train_idx, "relevance"]
y_val   = df.loc[val_idx, "relevance"]

  df["relevance"] = df.groupby("question_id", group_keys=False).apply(make_relevance)


In [12]:
ranker.fit(
    x_train,
    y_train,
    group=groups_train,
    eval_set=[(x_val, y_val)],
    eval_group=[groups_val],
    eval_at=[1, 3, 5],
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000178 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 34
[LightGBM] [Info] Number of data points in the train set: 1680, number of used features: 9


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,500
,subsample_for_bin,200000
,objective,'lambdarank'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [13]:
from sklearn.metrics import ndcg_score
import numpy as np

ndcgs = []

for qid, grp in df.loc[val_idx].groupby("question_id"):
    y_true = grp["relevance"].values.reshape(1, -1)
    y_score = ranker.predict(X.loc[grp.index]).reshape(1, -1)

    ndcgs.append(ndcg_score(y_true, y_score))

np.mean(ndcgs)

np.float64(0.9812859159053394)

In [14]:
regrets = []

for qid, grp in df.loc[val_idx].groupby("question_id"):
    oracle_reward = grp["reward"].max()

    X_grp = X.loc[grp.index]
    scores = ranker.predict(X_grp)

    # top-2 safety selection
    top2_pos = np.argsort(scores)[-2:]
    top2_rewards = grp.iloc[top2_pos]["reward"].values

    chosen_pos = top2_pos[np.argmax(top2_rewards)]
    chosen_reward = grp.iloc[chosen_pos]["reward"]

    regrets.append(oracle_reward - chosen_reward)

np.mean(regrets), np.median(regrets)


(np.float64(0.1596982192857143), np.float64(0.0))

In [15]:
ranker.booster_.save_model("../models/ranker.txt")

<lightgbm.basic.Booster at 0x7e28f42df8c0>