In [1]:
import os
from collections import Counter
from copy import deepcopy

import ipywidgets
import numpy as np
import pandas as pd
from catboost import CatBoostRanker, Pool, MetricVisualizer

# Data preprocessing

In [2]:
def preprocess(dataset_name, num_features=245):
    """Convert a sparse ``imat2009_<dataset_name>_new.txt`` file into a dense CSV.

    Every non-blank input line is split on whitespace and interpreted as::

        <rel_val> <feature_id>:<value> ... <ignored token> <q_id>

    The first token is the relevance label and the last token is the query
    id.  The second-to-last token is skipped without being inspected
    (presumably a ``#`` separator -- confirm against the raw data).  All
    tokens in between are ``id:value`` pairs.

    The output ``imat2009_<dataset_name>_preprocessed.csv`` has the header
    ``rel_val,1,...,<num_features>,q_id`` and one row per input record.
    Features missing from a record are written as ``0``.  Rows are
    terminated with ``\\r\\n``.

    Args:
        dataset_name: Middle part of the file names, e.g. ``'train'``.
        num_features: Highest feature id to emit as a column (ids start at 1).

    Raises:
        IndexError: If a non-blank line consists of a single token.
        ValueError: If a feature token is not of the form ``id:value``.
    """
    source_path = 'imat2009_' + dataset_name + '_new.txt'
    target_path = 'imat2009_' + dataset_name + '_preprocessed.csv'
    feature_ids = [str(num) for num in range(1, num_features + 1)]

    # newline='' writes the explicit '\r\n' terminators verbatim; without it
    # text mode on Windows would expand them to '\r\r\n'.
    with open(source_path, 'r') as original_dataset, \
            open(target_path, 'w', newline='') as processed_dataset:
        header = ['rel_val'] + feature_ids + ['q_id']
        processed_dataset.write(','.join(header) + '\r\n')

        # Stream the file line by line instead of loading it whole.
        for line in original_dataset:
            vals = line.split()

            # Skip blank lines (including a trailing one) rather than
            # unconditionally discarding the last line of the file.
            if not vals:
                continue

            rel_val = vals.pop(0)
            q_id = vals[-1]

            # The last two remaining tokens are the ignored token and q_id.
            features = dict(token.split(':') for token in vals[:-2])

            row = [rel_val]
            row.extend(features.get(f_id, '0') for f_id in feature_ids)
            row.append(q_id)
            processed_dataset.write(','.join(row) + '\r\n')

# Convert both raw splits to CSV, training split first.
for split_name in ('train', 'test'):
    preprocess(split_name)

# Reading the dataset

In [3]:
# Load the two preprocessed CSV files (train first, then test).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'imat2009_train_preprocessed.csv',
        'imat2009_test_preprocessed.csv',
    )
)

In [4]:
# Split each frame into (feature matrix, relevance labels, query ids).
# Everything except the 'rel_val' and 'q_id' columns is treated as a feature.
x_train, y_train, queries_train = (
    train_df.drop(columns=['rel_val', 'q_id']).values,
    train_df['rel_val'].values,
    train_df['q_id'].values,
)

x_test, y_test, queries_test = (
    test_df.drop(columns=['rel_val', 'q_id']).values,
    test_df['rel_val'].values,
    test_df['q_id'].values,
)

# Dataset analysis

Number of documents in training and testing datasets, respectively

In [5]:
# One row of the feature matrix corresponds to one document.
num_train_documents, num_test_documents = len(x_train), len(x_test)
print(f'{num_train_documents} , {num_test_documents}')

77714 , 19576


Number of features

In [6]:
# x_train is a 2-D matrix (rows = documents), so its last axis is the feature axis.
x_train.shape[-1]

245

# Relevance labels statistics

Relevance labels range from 0 (irrelevant) to 4 (highly relevant) and may take fractional values. The output below lists the number of training documents for each label value.

In [7]:
# Tally how many training documents carry each distinct relevance label.
# (Counter is imported in the notebook's import cell.)
Counter(y_train).items()

dict_items([(1.0, 20086), (0.0, 25776), (2.0, 24424), (4.0, 952), (3.0, 1744), (0.5, 1982), (1.5, 1033), (0.25, 77), (1.33333, 110), (1.2, 3), (2.37037, 39), (0.666671, 340), (2.33333, 79), (0.333329, 268), (2.16049, 19), (2.5, 337), (2.87037, 26), (1.66667, 107), (2.12037, 4), (2.25, 19), (2.24074, 25), (0.2, 10), (1.6, 6), (0.8, 5), (0.6, 10), (0.875, 1), (2.66667, 31), (3.1625, 2), (1.75, 12), (0.75, 55), (2.61111, 4), (0.222229, 1), (0.4, 5), (1.25, 23), (1.97143, 2), (3.5, 16), (2.24691, 10), (2.16667, 1), (1.95239, 1), (1.4, 4), (3.66667, 5), (3.8, 2), (0.125, 1), (2.05556, 2), (3.33333, 4), (2.2, 5), (2.58025, 2), (1.16667, 2), (2.91358, 1), (2.07407, 3), (2.11729, 1), (3.25, 1), (2.375, 1), (3.21666, 1), (2.74074, 5), (2.12346, 3), (0.166671, 8), (0.833329, 5), (1.14286, 1), (3.53, 1), (3.4, 1), (2.75, 1), (3.58125, 1), (2.40741, 1), (0.583329, 1), (1.8, 1), (2.42857, 1), (2.0463, 1), (1.77143, 1), (3.75, 1), (0.888886, 1)])

Number of queries in training and testing datasets, respectively

In [9]:
# Count the distinct query ids in each split.
num_train_queries = len(np.unique(queries_train))
num_test_queries = len(np.unique(queries_test))
print(f'{num_train_queries} , {num_test_queries}')

7300 , 1824


# Creation of CatBoost pools

In [10]:
# Wrap features, labels and query ids of each split in a CatBoost Pool;
# group_id tells CatBoost which documents belong to the same query.
train = Pool(x_train, label=y_train, group_id=queries_train)
test = Pool(x_test, label=y_test, group_id=queries_test)

## Fitting model

In [11]:
# Settings shared by every training run; fit_model copies this dict and
# adds the loss-specific entries on top.
default_parameters = dict(
    iterations=2000,
    custom_metric=['NDCG', 'PrecisionAt:top=10', 'PrecisionAt:top=20', 'MAP'],
    verbose=False,
    random_seed=0,
)

# Empty module-level dict; fit_model builds its own local `parameters`.
parameters = dict()

In [12]:
# Sentinel meaning "use the notebook-level pool that exists when fit_model is
# called".  A dedicated object is used so that an explicit ``None`` (e.g. to
# train without an eval set) is still passed through untouched.
_NOTEBOOK_POOL = object()


def fit_model(loss_function, additional_params=None,
              train_pool=_NOTEBOOK_POOL, test_pool=_NOTEBOOK_POOL):
    """Train a CatBoostRanker with the given loss and return the fitted model.

    The model is configured from a deep copy of ``default_parameters`` plus
    ``loss_function``; ``train_dir`` is set to the loss name so each loss
    writes its training artefacts to its own directory.  Entries in
    ``additional_params`` are applied last and override everything else.

    Args:
        loss_function: Name of the CatBoost loss function; also used as
            ``train_dir``.
        additional_params: Optional dict of extra or overriding CatBoost
            parameters.
        train_pool: Pool to train on.  Defaults to the notebook-level
            ``train`` pool, looked up at call time.
        test_pool: Pool passed as ``eval_set``.  Defaults to the
            notebook-level ``test`` pool, looked up at call time.

    Returns:
        The fitted ``CatBoostRanker``.
    """
    # Resolve the defaults at call time: binding them in the signature would
    # freeze whatever pools existed when this cell was executed, so pools
    # rebuilt later would be silently ignored.
    if train_pool is _NOTEBOOK_POOL:
        train_pool = train
    if test_pool is _NOTEBOOK_POOL:
        test_pool = test

    # Deep copy so the shared defaults (including the metric list) are never
    # mutated by a single run.
    parameters = deepcopy(default_parameters)
    parameters['loss_function'] = loss_function
    parameters['train_dir'] = loss_function

    if additional_params is not None:
        parameters.update(additional_params)

    model = CatBoostRanker(**parameters)
    model.fit(train_pool, eval_set=test_pool, plot=True)

    return model

In [13]:
# Train a ranker with the YetiRankPairwise loss on the default pools.
model = fit_model(loss_function='YetiRankPairwise')

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [15]:
# Train a ranker with the PairLogitPairwise loss on the default pools.
model2 = fit_model(loss_function='PairLogitPairwise')

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRanker at 0x16b2cbcd0>

In [16]:
# Train a ranker with the QueryRMSE loss on the default pools.
model3 = fit_model(loss_function='QueryRMSE')

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [22]:
# Train a ranker with the PairLogit loss on the default pools.
model4 = fit_model(loss_function='PairLogit')

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [23]:
# Train a ranker with the YetiRank loss on the default pools.
model5 = fit_model(loss_function='YetiRank')

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [24]:
# Train a ranker with the RMSE loss on the default pools.
model6 = fit_model(loss_function='RMSE')



MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))