In [1]:
import pandas as pd

## Dataset loading

In [2]:
data = pd.read_csv("../dataset/dataset.csv")

In [3]:
data.head()

Unnamed: 0,question,truth,prediction,f1,sementic_similarity,tokens_length,isdigit,num_entities,question_type,config_id,Retriever,Chunk_size,Chunk_overlap,k,Answer_model,Reranker,reward,question_id
0,When did Beyonce start becoming popular?,in the late 1990s,late 1990s,0.266667,0.937186,6,False,1,factoid_single,0,faiss,400,100,3,qa,False,0.73603,53
1,What areas did Beyonce compete in when she was...,singing and dancing,singing and dancing,0.333333,1.0,11,False,0,factoid_single,0,faiss,400,100,3,qa,False,0.8,30
2,When did Beyonce leave Destiny's Child and bec...,2003,2003,1.0,1.0,10,False,2,factoid_single,0,faiss,400,100,3,qa,False,1.0,52
3,In what city and state did Beyonce grow up?,"Houston, Texas",Texas,0.333333,0.770843,9,False,1,other,0,faiss,400,100,3,qa,False,0.63959,22
4,In which decade did Beyonce become famous?,late 1990s,1990s,0.333333,0.93685,7,False,1,other,0,faiss,400,100,3,qa,False,0.755795,24


In [4]:
data["good"] = 0

for qid, grp in data.groupby("question_id"):
    top_indices = grp.sort_values("reward", ascending = False).head(5).index
    data.loc[top_indices, "good"] = 1

In [5]:
data.groupby("question_id")["good"].sum().value_counts()

good
5    70
Name: count, dtype: int64

## Model training

In [6]:
data["retriever_id"] = data["Retriever"].astype("category").cat.codes
data["answer_model_id"] = data["Answer_model"].astype("category").cat.codes
data["question_type_id"] = data["question_type"].astype("category").cat.codes

cat_features = [
    "retriever_id",
    "answer_model_id",
    "question_type_id"
]

In [7]:
feature_cols = [
    "tokens_length",
    "isdigit",
    "num_entities",
    "question_type_id",
    "retriever_id",
    "Chunk_size",
    "Chunk_overlap",
    "k",
    "answer_model_id",
    "Reranker"
]

X = data[feature_cols]
Y = data["good"]

In [8]:
from sklearn.model_selection import GroupShuffleSplit

In [9]:
gss = GroupShuffleSplit(test_size = 0.2, random_state = 42)

train, val = next(
    gss.split(X, Y, groups = data["question_id"])
)

In [10]:
x_train, x_val = X.iloc[train], X.iloc[val]
y_train, y_val = Y.iloc[train], Y.iloc[val]

In [11]:
from lightgbm import LGBMClassifier

In [12]:
proposer = LGBMClassifier(
    objective="binary",
    n_estimators=300,
    learning_rate=0.05,
    num_leaves=31,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    class_weight="balanced",
    random_state=42
)

In [13]:
proposer.fit(
    x_train, y_train,
    categorical_feature=cat_features,
    eval_set=[(x_val, y_val)],
    eval_metric="binary_logloss",
)

[LightGBM] [Info] Number of positive: 280, number of negative: 1400
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000106 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37
[LightGBM] [Info] Number of data points in the train set: 1680, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,300
,subsample_for_bin,200000
,objective,'binary'
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [14]:
proba_val = proposer.predict_proba(x_val)[:, 1]

In [15]:
proba_val.min(), proba_val.max(), proba_val.mean()

(np.float64(4.0985442709827244e-07),
 np.float64(0.9992576480324007),
 np.float64(0.18616817541575606))

In [16]:
import numpy as np

N = 5
hits = []

for qid, grp in data.loc[val].groupby("question_id"):
    oracle_idx = grp["reward"].idxmax()

    X_grp = X.loc[grp.index]
    proba = proposer.predict_proba(X_grp)[:, 1]

    topN_idx = grp.index[np.argsort(proba)[::-1][:N]]

    hits.append(oracle_idx in topN_idx)

recall_at_5 = np.mean(hits)


In [17]:
recall_at_5

np.float64(0.8571428571428571)