In [1]:
# re-install tira from github, for faster prototyping
# !pip3 uninstall -y tira
# !pip3 install git+https://github.com/tira-io/tira.git@development#\&subdirectory=python-client

In [2]:
# %pip install lightgbm
# %pip install scikit-optimize

In [3]:
import json
import os
import pickle
import warnings

import pyterrier as pt
import pandas as pd
import lightgbm as lgb
import skopt

from tira.third_party_integrations import ensure_pyterrier_is_loaded
from tira.rest_api_client import Client

from config import MODELS

# Silence every warning for the rest of the session: no category, message or
# module filter is given, so this also hides deprecation notices.
warnings.filterwarnings('ignore')

# Initialise PyTerrier through TIRA's helper before any `pt.*` call is made
# (the cell output reports the PyTerrier/Terrier versions that were loaded).
ensure_pyterrier_is_loaded()
# TIRA REST client; used later to fetch the BM25 run and handed to the
# feature-extractor factories from `MODELS`.
tira = Client()

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [4]:
# Training collection, addressed by its ir_datasets identifier.
dataset = pt.get_dataset(
    'irds:ir-benchmarks/longeval-2023-01-20240423-training'
)

# First-stage retrieval: BM25 results for `dataset`, obtained from an existing
# TIRA submission instead of being computed in this notebook.
bm25 = tira.pt.from_submission(
    'ir-benchmarks/tira-ir-starter/BM25 Re-Rank (tira-ir-starter-pyterrier)',
    dataset,
)

In [5]:
# Fixed LightGBM settings shared by every training run. They are merged with
# the values chosen by the hyper-parameter search before the ranker is built.
BASE_PARAMS = {
    'task': 'train',
    # Validation metric used for early stopping: nDCG, evaluated at cutoff 10.
    'metric': 'ndcg',
    'importance_type': 'gain',
    'eval_at': '10',
    # Stop boosting once the validation metric has not improved for 30 rounds.
    'early_stopping_rounds': 30,
    # LightGBM only performs bagging when the bagging frequency is non-zero
    # (the default is 0 = disabled). Without this entry the tuned `subsample`
    # fraction is silently ignored; 1 re-samples the data at every iteration.
    'subsample_freq': 1,
    'verbosity': -1,
}

In [6]:
# A single hand-picked configuration using the same keys as the dimensions of
# SPACE. It is not referenced by the cells shown in this notebook.
SEARCH_PARAMS = dict(
    boosting_type='gbdt',
    learning_rate=0.4,
    # max_depth=7,
    num_leaves=31,
    n_estimators=100,
    feature_fraction=0.8,
    subsample=0.2,
)

# Search space handed to scikit-optimize. Each dimension's `name` is the
# LGBMRanker keyword it controls, and the order of this list is the order of
# the values in an optimisation result's `x`.
# Note: LightGBM ignores `subsample` unless bagging is enabled through a
# positive `subsample_freq`.
SPACE = [
    skopt.space.Categorical(['gbdt', 'dart'], name='boosting_type'),
    # Log-uniform prior: sample learning rates evenly across orders of magnitude.
    skopt.space.Real(0.01, 0.5, prior='log-uniform', name='learning_rate'),
    # skopt.space.Integer(1, 15, name='max_depth'),
    skopt.space.Integer(2, 255, name='num_leaves'),
    skopt.space.Integer(2, 500, name='n_estimators'),
    skopt.space.Real(0.1, 1.0, prior='uniform', name='feature_fraction'),
    skopt.space.Real(0.1, 1.0, prior='uniform', name='subsample'),
]

In [7]:
# from sklearn.model_selection import train_test_split

# topics = pd.read_csv('../splits/2024/topics_train.csv')
# qrels = pd.read_csv('../splits/2024/qrels_train.csv')

# topics_train, topics_val = train_test_split(topics, test_size=0.2)

# qrels_train = qrels.loc[qrels['qid'].isin(topics_train['qid'])]
# qrels_val = qrels.loc[qrels['qid'].isin(topics_val['qid'])]

# assert len(qrels_train) + len(qrels_val) == len(qrels)

# Load the pre-computed train/validation split from disk so that every run of
# the notebook sees exactly the same queries and relevance judgements.
topics_train, topics_val = (
    pd.read_csv('../splits/2024/topics_train.csv'),
    pd.read_csv('../splits/2024/topics_val.csv'),
)
qrels_train, qrels_val = (
    pd.read_csv('../splits/2024/qrels_train.csv'),
    pd.read_csv('../splits/2024/qrels_val.csv'),
)

In [8]:
def train_pipeline(extract_features, **params):
    """Fit a LightGBM learning-to-rank pipeline on the training split.

    The pipeline re-ranks the BM25 run (cut off at 1000 results via
    PyTerrier's `%` operator), passes it through `extract_features` and
    scores the result with an `lgb.LGBMRanker`. Fitting uses the
    module-level train topics/qrels, with the validation topics/qrels
    supplied as the validation set.

    Args:
        extract_features: PyTerrier transformer placed between the BM25
            stage and the learned model.
        **params: Keyword arguments forwarded unchanged to
            `lgb.LGBMRanker`.

    Returns:
        A `(pipeline, ranker)` tuple: the fitted PyTerrier pipeline and the
        underlying `LGBMRanker` instance it wraps.
    """
    ranker = lgb.LGBMRanker(**params)
    learned_stage = pt.ltr.apply_learned_model(ranker, form="ltr")

    first_stage = bm25 % 1000
    ltr_pipeline = first_stage >> extract_features >> learned_stage
    ltr_pipeline.fit(topics_train, qrels_train, topics_val, qrels_val)

    return ltr_pipeline, ranker

def get_train_objective(extract_features):
    """Create the objective function that scikit-optimize minimises.

    Args:
        extract_features: PyTerrier transformer used as the feature stage
            of every pipeline trained by the objective.

    Returns:
        A callable wrapped with `skopt.utils.use_named_args(SPACE)`. For one
        point of the search space it trains a pipeline with those values
        plus `BASE_PARAMS` and returns the negated nDCG@10 measured on the
        validation split.
    """

    def train_and_evaluate(**params):
        # Train with the given ranker parameters and score on validation.
        fitted_pipeline, _ = train_pipeline(extract_features, **params)

        scores = pt.Experiment(
            [fitted_pipeline],
            topics_val,
            qrels_val,
            eval_metrics=['ndcg_cut_10'],
        )
        # Only one system was evaluated, so its score is in the first row.
        return scores['ndcg_cut_10'].iloc[0]

    @skopt.utils.use_named_args(SPACE)
    def objective(**params):
        # Negate: the optimiser minimises, while a higher nDCG is better.
        return -train_and_evaluate(**params, **BASE_PARAMS)

    return objective

In [10]:
# True: run the hyper-parameter search and store the result as JSON.
# False: reuse the JSON written by an earlier search.
OPTIMIZE_HYPERPARAMS = True
# Seed for the search so that re-running the notebook evaluates the same
# candidate configurations.
RANDOM_SEED = 42
# Output directory for the tuned parameters (JSON) and fitted rankers (pickle).
MODELS_DIR = 'models'

# Create the output directory up front; otherwise a missing directory is only
# noticed after the complete search for the first model has finished.
os.makedirs(MODELS_DIR, exist_ok=True)

for name, get_feature_extractor in MODELS.items():
    extract_features = get_feature_extractor(tira, dataset)
    params_path = f'{MODELS_DIR}/params_{name}.json'

    if OPTIMIZE_HYPERPARAMS:
        print(f'Optimizing parameters for {name}')

        results = skopt.forest_minimize(
            get_train_objective(extract_features),
            SPACE,
            n_calls=100,
            n_random_starts=10,
            random_state=RANDOM_SEED,
        )

        # `results.x` lists the best value per dimension, in SPACE order.
        # Values that expose `.item()` (e.g. numpy scalars) are converted to
        # plain Python objects so they can be written as JSON.
        tuned_params = {param.name: getattr(value, "item", lambda: value)() for param, value in zip(SPACE, results.x)}
        all_params = {**tuned_params, **BASE_PARAMS}

        with open(params_path, 'w') as f:
            json.dump(all_params, f)

    else:
        print(f'Using pre-optimized parameters for {name}')

        with open(params_path, 'r') as f:
            all_params = json.load(f)

    # Train once more with the selected parameters and keep only the ranker.
    _, lmart = train_pipeline(extract_features, **all_params)

    with open(f'{MODELS_DIR}/lmart_{name}.pkl', 'wb') as f:
        pickle.dump(lmart, f)

Optimizing parameters for wows_only
Optimizing parameters for wows_base_rerank
Optimizing parameters for wows_all_rerank
Optimizing parameters for wows_rerank_and_keyquery
Optimizing parameters for wows_rerank_and_reverted_index
Optimizing parameters for all
