In [1]:
# re-install tira from github, for faster prototyping
# !pip3 uninstall -y tira
# !pip3 install git+https://github.com/tira-io/tira.git@development#\&subdirectory=python-client

In [2]:
# %pip install lightgbm
# %pip install scikit-optimize

In [3]:
import pyterrier as pt
import pandas as pd
from tira.third_party_integrations import ensure_pyterrier_is_loaded
from tira.rest_api_client import Client
from feature_extraction import *
import pickle
import lightgbm as lgb
import skopt
import warnings
from pprint import pprint
import json

warnings.filterwarnings('ignore')

ensure_pyterrier_is_loaded()
tira = Client()

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [4]:
dataset = pt.get_dataset('irds:ir-benchmarks/longeval-2023-01-20240423-training')

bm25 = tira.pt.from_submission('ir-benchmarks/tira-ir-starter/BM25 Re-Rank (tira-ir-starter-pyterrier)', dataset)
extract_features = get_all_features(tira, dataset)

The download is derived from The LongEval Dataset under the "Qwant LongEval Attribution-NonCommercial-ShareAlike License". Hence, the download is also under this License. By using it, you agree to the terms of this license. Please find details at: https://lindat.mff.cuni.cz/repository/xmlui/page/Qwant_LongEval_BY-NC-SA_License


Download: 3.11MiB [00:00, 6.00MiB/s]


Download finished. Extract...
Extraction finished:  /root/.tira/extracted_runs/ir-benchmarks/longeval-2023-01-20240423-training/ows


In [3]:
BASE_PARAMS = {
    'task': 'train',
    'metric': 'ndcg',
    'importance_type': 'gain',
    'eval_at': '10',
    'early_stopping_rounds': 30,
    'verbosity': -1,
}

In [4]:
SEARCH_PARAMS = {
    'boosting_type': 'gbdt',
    'learning_rate': 0.4,
    # 'max_depth': 7,
    'num_leaves': 31,
    'n_estimators': 100,
    'feature_fraction': 0.8,
    'subsample': 0.2,
}

SPACE = [
    skopt.space.Categorical(['gbdt', 'dart'], name='boosting_type'),
    skopt.space.Real(0.01, 0.5, name='learning_rate', prior='log-uniform'),
    # skopt.space.Integer(1, 15, name='max_depth'),
    skopt.space.Integer(2, 255, name='num_leaves'),
    skopt.space.Integer(2, 500, name='n_estimators'),
    skopt.space.Real(0.1, 1.0, name='feature_fraction', prior='uniform'),
    skopt.space.Real(0.1, 1.0, name='subsample', prior='uniform')
]

In [5]:
from sklearn.model_selection import train_test_split

topics = pd.read_csv('../splits/2024/topics_train.csv')
qrels = pd.read_csv('../splits/2024/qrels_train.csv')

topics_train, topics_val = train_test_split(topics, test_size=0.2)

qrels_train = qrels.loc[qrels['qid'].isin(topics_train['qid'])]
qrels_val = qrels.loc[qrels['qid'].isin(topics_val['qid'])]

assert len(qrels_train) + len(qrels_val) == len(qrels)

# topics_train = pd.read_csv('../splits/2023/topics_train.csv')
# qrels_train = pd.read_csv('../splits/2023/qrels_train.csv')
# topics_val = pd.read_csv('../splits/2023/topics_val.csv')
# qrels_val = pd.read_csv('../splits/2023/qrels_val.csv')

In [6]:
@skopt.utils.use_named_args(SPACE)
def objective(**params):
    return -train_and_evaluate(**params, **BASE_PARAMS)

def train_and_evaluate(**params):
    lmart = lgb.LGBMRanker(**params)

    fit_kwargs = {}
    pipeline = (bm25 % 1000) >> extract_features >> pt.ltr.apply_learned_model(lmart, form="ltr", fit_kwargs=fit_kwargs)

    pipeline.fit(topics_train, qrels_train, topics_val, qrels_val)

    return pt.Experiment(
        [pipeline],
        topics_val,
        qrels_val,
        eval_metrics=['ndcg_cut_10']
    )['ndcg_cut_10'].iloc[0]

In [7]:
results = skopt.forest_minimize(objective, SPACE, n_calls=100, n_random_starts=10)

In [20]:
best_params = {param.name: getattr(value, "item", lambda: value)() for param, value in zip(SPACE, results.x)}

pprint({**best_params, **BASE_PARAMS})

{'boosting_type': 'gbdt',
 'early_stopping_rounds': 30,
 'eval_at': '10',
 'feature_fraction': 0.18997710683028857,
 'importance_type': 'gain',
 'learning_rate': 0.30611910385605584,
 'metric': 'ndcg',
 'n_estimators': 395,
 'num_leaves': 3,
 'subsample': 0.27877340579034704,
 'task': 'train',
 'verbosity': -1}


In [None]:
with open('params.json', 'w') as f:
    json.dump({**best_params, **BASE_PARAMS}, f)

In [6]:
with open('params.json') as f:
    train_params = json.load(f)

In [7]:
lmart = lgb.LGBMRanker(**train_params)

In [8]:

pipeline = (bm25 % 1000) >> extract_features >> pt.ltr.apply_learned_model(lmart, form="ltr")

In [9]:
pipeline.fit(topics_train, qrels_train, topics_val, qrels_val)

In [10]:
with open('lmart.pkl', 'wb') as f:
    pickle.dump(lmart, f)
