In [None]:
# import libs

from collections import Counter
from copy import deepcopy

import numpy as np
import pandas as pd
from catboost import CatBoostRanker, Pool
from sklearn.model_selection import GroupShuffleSplit

In [2]:
# Load the dataset (relevance label, query id and feature columns) from the
# working directory.

df = pd.read_csv('data.csv')

In [3]:
# Count the missing values in every column.

df.isnull().sum()

rank           0
query_id       0
feature_0      0
feature_1      0
feature_2      0
              ..
feature_139    0
feature_140    0
feature_141    0
feature_142    0
feature_143    0
Length: 146, dtype: int64

In [4]:
# Hold out 20% of the queries for evaluation. Splitting on query_id keeps all
# rows of a query on the same side of the split.

splitter = GroupShuffleSplit(n_splits=1, test_size=.20, random_state=42)
gss = splitter.split(df, groups=df['query_id'])

X_train_inds, X_test_inds = next(gss)

train_data = df.iloc[X_train_inds]
test_data = df.iloc[X_test_inds]

In [5]:
# Separate each split into a feature matrix, relevance labels and query ids.

non_feature_cols = ['rank', 'query_id']

X_train = train_data.drop(columns=non_feature_cols).values
y_train = train_data['rank'].values
queries_train = train_data['query_id'].values

X_test = test_data.drop(columns=non_feature_cols).values
y_test = test_data['rank'].values
queries_test = test_data['query_id'].values

In [6]:
# relevance labels statistics: number of training rows per relevance label
# (Counter is imported in the notebook's import cell)

Counter(y_train).items()

dict_items([(0, 97647), (1, 60601), (2, 25863), (4, 1473), (3, 3405)])

In [7]:
# Rescale the relevance labels by the largest label seen in training, so the
# training labels fall in [0, 1]; the test labels use the same divisor.

max_relevance = y_train.max()
y_train, y_test = y_train / max_relevance, y_test / max_relevance

In [8]:
# creation of CatBoost pools: features, labels and the query id of every row

train = Pool(X_train, label=y_train, group_id=queries_train)
test = Pool(X_test, label=y_test, group_id=queries_test)


In [9]:
# Shared CatBoost settings: iteration budget, ranking metrics tracked at top-5,
# silent logging, fixed seed and thread count.

params = dict(
    iterations=3000,
    custom_metric=['NDCG:top=5', 'PFound:top=5', 'AverageGain:top=5'],
    verbose=False,
    random_seed=0,
    thread_count=-1,
)

In [10]:
# create func for fitting

def fit_model(loss_function, additional_params=None, train_pool=None, test_pool=None):
    """Train a CatBoostRanker using the notebook-level ``params``.

    Parameters
    ----------
    loss_function : str
        CatBoost ranking objective, e.g. 'QueryRMSE'. It is also used as
        ``train_dir``, so the training directory is named after the objective.
    additional_params : dict, optional
        Extra CatBoost parameters. They are applied last, so they override
        the shared ``params`` as well as ``loss_function`` and ``train_dir``.
    train_pool, test_pool : catboost.Pool, optional
        Training and evaluation pools. When omitted (``None``), the
        notebook-level ``train`` / ``test`` pools are used.

    Returns
    -------
    CatBoostRanker
        The fitted model.
    """
    # Resolve the default pools at call time rather than at definition time:
    # defaults written as `train_pool=train` are frozen when the `def` cell
    # runs, so re-creating the pools later would silently train on stale data.
    if train_pool is None:
        train_pool = train
    if test_pool is None:
        test_pool = test

    # Work on a copy so the shared `params` dict is never mutated.
    parameters = deepcopy(params)
    parameters['loss_function'] = loss_function
    parameters['train_dir'] = loss_function

    if additional_params is not None:
        parameters.update(additional_params)

    model = CatBoostRanker(**parameters)
    model.fit(train_pool, eval_set=test_pool, plot=True)

    return model

In [11]:
# Train the ranker with the QueryRMSE objective on the train pool and
# evaluate it on the held-out pool. fit_model already starts from a copy of
# `params`, so they do not need to be passed again.

model = fit_model('QueryRMSE', train_pool=train, test_pool=test)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [23]:
# Best value of each tracked metric on the evaluation pool.
model.get_best_score()['validation']

{'PFound:top=5': 0.6975775578613285,
 'NDCG:top=5;type=Base': 0.5790497505705205,
 'QueryRMSE': 0.17765738143770826,
 'AverageGain:top=5': 0.3385416666666665}