# <i>CatBoost learning to rank on Microsoft dataset</i>

In [1]:
from catboost import CatBoostRanker, Pool, MetricVisualizer
from copy import deepcopy
import numpy as np
import os
import pandas as pd

In [2]:
from catboost.datasets import msrank_10k

# Load the 10k-row MSRANK sample as a (train, test) pair of DataFrames.
train_df, test_df = msrank_10k()

# Column 0 is used as the relevance label and column 1 as the query id;
# every remaining column is treated as a feature.
y_train, queries_train = train_df[0].values, train_df[1].values
X_train = train_df.drop(columns=[0, 1]).values

y_test, queries_test = test_df[0].values, test_df[1].values
X_test = test_df.drop(columns=[0, 1]).values

In [9]:
# Display the training feature matrix (label and query-id columns removed).
X_train

array([[3., 3., 0., ..., 0., 0., 0.],
       [3., 0., 3., ..., 0., 0., 0.],
       [3., 0., 2., ..., 0., 0., 0.],
       ...,
       [2., 0., 2., ..., 0., 0., 0.],
       [2., 0., 1., ..., 0., 0., 0.],
       [2., 1., 1., ..., 0., 0., 0.]])

In [10]:
# Display the training labels.
# NOTE(review): the execution count suggests this cell ran after the
# normalization cell, so the printed values are presumably already scaled.
y_train

array([0.5 , 0.5 , 0.  , ..., 0.5 , 0.5 , 0.25])

In [11]:
# Display the per-row query ids that are later passed to Pool as group_id.
queries_train

array([   1,    1,    1, ..., 1291, 1291, 1291], dtype=int64)

In [3]:
# Rescale relevance labels so that the largest training label becomes 1.
# The test labels are divided by the *training* maximum as well, which keeps
# both splits on the same scale.
max_relevance = np.max(y_train)

# Rebind rather than divide in place: the arrays come from `DataFrame.values`,
# which may be a read-only view (pandas Copy-on-Write) or an integer array,
# and `/=` raises in both cases. Out-of-place division also leaves the source
# DataFrames untouched.
y_train = y_train / max_relevance
y_test = y_test / max_relevance

__Number of queries__

In [4]:
# Count the distinct query ids present in the training split.
num_queries = len(np.unique(queries_train))
num_queries

87

### Creation of CatBoost pools

In [5]:
# Wrap each split in a CatBoost Pool; group_id ties every row to its query so
# that ranking losses and metrics are computed per query.
train = Pool(X_train, label=y_train, group_id=queries_train)

test = Pool(X_test, label=y_test, group_id=queries_test)

In [6]:
# Baseline settings shared by every training run; fit_model copies this dict
# and layers the loss function and any per-run overrides on top.
default_parameters = dict(
    iterations=500,
    custom_metric=['NDCG', 'PFound', 'AverageGain:top=10'],
    verbose=False,
    random_seed=0,
)

# Empty placeholder dict kept at module level.
parameters = dict()

In [7]:
def fit_model(loss_function, additional_params=None, train_pool=train, test_pool=test):
    """Train and return a CatBoostRanker for the given loss function.

    The settings start as a deep copy of ``default_parameters``; then
    ``loss_function`` is set and ``train_dir`` is set to the same string, so
    each loss writes to its own directory. Entries in ``additional_params``
    are applied last and therefore override any of the above.

    Args:
        loss_function: Loss name passed to CatBoostRanker; also the default
            ``train_dir``.
        additional_params: Optional mapping of extra or overriding settings.
        train_pool: Pool to fit on. The default is the ``train`` object that
            existed when this function was defined.
        test_pool: Pool used as ``eval_set``. The default is the ``test``
            object that existed when this function was defined.

    Returns:
        The fitted CatBoostRanker.
    """
    settings = deepcopy(default_parameters)
    settings.update(loss_function=loss_function, train_dir=loss_function)
    if additional_params is not None:
        settings.update(additional_params)

    ranker = CatBoostRanker(**settings)
    ranker.fit(train_pool, eval_set=test_pool, plot=False)
    return ranker

In [11]:
# RMSE baseline; the default custom metrics are replaced with top-10
# precision, recall and MAP for this run.
model = fit_model(
    'RMSE',
    additional_params={
        'custom_metric': ['PrecisionAt:top=10', 'RecallAt:top=10', 'MAP:top=10'],
    },
)



In [8]:
# YetiRank run with the default settings; rebinds `model` to the new ranker.
model = fit_model(loss_function='YetiRank')

In [None]:
# YetiRank with learning_rate 0.3, written to its own train_dir instead of
# the default 'YetiRank' directory.
fit_model(
    'YetiRank',
    additional_params={'train_dir': 'YetiRank-lr-0.3', 'learning_rate': 0.3},
)

In [None]:
# YetiRank with metric_period set to 50 (per the CatBoost docs, metrics are
# then calculated every 50 iterations rather than on each one).
fit_model('YetiRank', additional_params={'metric_period': 50})

In [None]:
# YetiRank with task_type 'GPU'; presumably requires a CUDA-capable device
# and a GPU-enabled CatBoost build — confirm before running.
fit_model('YetiRank', additional_params={'task_type': 'GPU'})