In [1]:
# !pip install catboost

In [74]:
import torch
import torchvision.utils

import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.metrics import ndcg_score
from scipy import stats as sts

# Use the first CUDA GPU when torch can see one, otherwise fall back to the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# Bare expression so the notebook displays the selected device.
device

'cuda:0'

In [3]:
from math import log2
from torch import Tensor, sort

def num_swapped_pairs(ys_true: Tensor, ys_pred: Tensor) -> int:
    """Count item pairs that the predictions order differently from the labels.

    Items are first arranged by descending ``ys_true``. A pair (i, j) with
    i ahead of j in that arrangement is counted when the prediction of i is
    strictly lower than the prediction of j. Equal predictions are never
    counted as swapped.

    Args:
        ys_true: 1-D tensor of ground-truth relevances.
        ys_pred: tensor with one predicted score per item.

    Returns:
        The number of swapped pairs as a Python ``int`` (0 for empty input).

    Note:
        The comparison is vectorised and materialises an n x n boolean
        matrix, so memory grows quadratically with the list length.
    """
    # Sort once and reuse the indices (the original sorted ys_true twice).
    _, sorted_ys_true_idx = sort(ys_true, descending=True)
    preds_by_true_rank = ys_pred[sorted_ys_true_idx].reshape(-1)

    rank = torch.arange(len(preds_by_true_rank), device=preds_by_true_rank.device)
    # ranked_above[i, j]: item i precedes item j in the ground-truth ordering.
    ranked_above = rank.unsqueeze(1) < rank.unsqueeze(0)
    # scored_below[i, j]: the model scores item i strictly lower than item j.
    scored_below = preds_by_true_rank.unsqueeze(1) < preds_by_true_rank.unsqueeze(0)

    return int((ranked_above & scored_below).sum().item())


def compute_gain(y_value: float, gain_scheme: str) -> float:
    """Turn a relevance label into a DCG gain.

    Args:
        y_value: relevance of a single item.
        gain_scheme: ``'const'`` keeps the relevance unchanged, ``'exp2'``
            uses ``2 ** relevance - 1``.

    Returns:
        The gain for ``y_value`` under the chosen scheme.
    """
    assert gain_scheme in ['const', 'exp2']
    if gain_scheme == 'exp2':
        return 2 ** y_value - 1
    return y_value


def dcg(ys_true: Tensor, ys_pred: Tensor, gain_scheme: str) -> float:
    """Discounted cumulative gain of the ordering induced by ``ys_pred``.

    Items are ranked by descending predicted score; the item at 0-based
    position ``p`` contributes ``gain(relevance) / log2(p + 2)``.

    Args:
        ys_true: ground-truth relevances.
        ys_pred: predicted scores, one per item.
        gain_scheme: forwarded to ``compute_gain``.

    Returns:
        The accumulated DCG (the integer 0 for an empty list).
    """
    ranking = sort(ys_pred, descending=True)[1]
    relevances_in_predicted_order = ys_true[ranking]
    return sum(
        compute_gain(relevance, gain_scheme=gain_scheme) / log2(position + 2)
        for position, relevance in enumerate(relevances_in_predicted_order)
    )


def ndcg(ys_true: Tensor, ys_pred: Tensor, gain_scheme: str = 'const') -> float:
    """Normalised DCG: DCG of the predicted ordering over DCG of the ideal one.

    https://en.wikipedia.org/wiki/Discounted_cumulative_gain#cite_note-4

    Args:
        ys_true: ground-truth relevances.
        ys_pred: predicted scores, one per item.
        gain_scheme: gain scheme forwarded to ``dcg`` (``'const'`` or ``'exp2'``).

    Returns:
        ``dcg / ideal_dcg``. When the ideal DCG is zero (empty list or every
        relevance equal to zero) the ratio is undefined and ``0.0`` is
        returned instead of NaN / a ZeroDivisionError.
    """
    perfect_dcg = dcg(ys_true, ys_true, gain_scheme=gain_scheme)
    # No ordering can earn any gain here, so report 0 rather than dividing 0 by 0.
    if perfect_dcg == 0:
        return 0.0

    dcg_value = dcg(ys_true, ys_pred, gain_scheme=gain_scheme)
    return dcg_value / perfect_dcg

# RankNet

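Модель обучается присваивать более высокий score тому документу, который должен стоять выше в списке. В итоге она выдаёт для каждого документа число, по которому документы можно сравнивать между собой и формировать топ. Пары документов с одинаковой релевантностью тоже можно использовать при обучении — тогда таргет равен 0.5.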

$$C_{ij}=C(o_{ij})=-\bar{P}_{ij}\log(P_{ij})-(1-\bar{P}_{ij})\log(1-P_{ij})$$

$$o_{ij}=f(x_i)-f(x_j)$$

$$P_{ij}=\frac{e^{o_{ij}}}{1+e^{o_{ij}}}$$

$$\text{out}_{i} = \frac{1}{1 + e^{-\text{input}_{i}}}$$

In [4]:
class RankNet(torch.nn.Module):
    """Pairwise ranker built on a shared one-hidden-layer scoring network.

    ``predict`` maps each item to a single raw score. ``forward`` takes two
    batches of items and returns the sigmoid of the score difference.
    """

    def __init__(self, num_input_features, hidden_dim=16):
        """Build the scoring MLP: Linear -> ReLU -> Linear(hidden_dim, 1)."""
        super().__init__()
        self.hidden_dim = hidden_dim

        scorer_layers = [
            torch.nn.Linear(num_input_features, hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, 1),
        ]
        self.model = torch.nn.Sequential(*scorer_layers)
        self.out_activation = torch.nn.Sigmoid()

    def predict(self, inp):
        """Return the raw (pre-sigmoid) score of every row of ``inp``."""
        return self.model(inp)

    def forward(self, input_1, input_2):
        """Return sigmoid(score(input_1) - score(input_2)) for paired rows."""
        score_gap = self.predict(input_1) - self.predict(input_2)
        return self.out_activation(score_gap)

In [5]:
# Toy RankNet over 10 input features (default hidden size) for the forward/backward checks below.
ranknet_model = RankNet(num_input_features=10)

In [6]:
# Two random batches (4 rows x 10 features each) to feed the pairwise model.
inp_1, inp_2 = torch.rand(4, 10), torch.rand(4, 10)
# batch_size x input_dim
inp_2

tensor([[0.8161, 0.5248, 0.7568, 0.1942, 0.9722, 0.1711, 0.7715, 0.9777, 0.8148,
         0.2025],
        [0.7843, 0.1213, 0.2313, 0.8766, 0.3193, 0.9272, 0.1633, 0.6434, 0.1508,
         0.8880],
        [0.4826, 0.0589, 0.0111, 0.6643, 0.9005, 0.6499, 0.3386, 0.2823, 0.5310,
         0.3075],
        [0.5934, 0.3708, 0.3285, 0.3355, 0.2387, 0.2685, 0.3257, 0.3929, 0.5490,
         0.9919]])

In [7]:
# Forward pass: sigmoid of the score difference between row k of inp_1 and row k of inp_2.
preds = ranknet_model(inp_1, inp_2)
preds

tensor([[0.5015],
        [0.4910],
        [0.4852],
        [0.4768]], grad_fn=<SigmoidBackward0>)

In [8]:
# Keep a handle on the first Linear layer of the scoring MLP to inspect its gradients.
first_linear_layer = ranknet_model.model[0]

In [9]:
# Gradient of the first layer's weights before any backward() call on this model (the cell shows no output).
first_linear_layer.weight.grad

In [10]:
# Binary cross-entropy on the pairwise probabilities.
criterion = torch.nn.BCELoss()
# All-ones target: every first item is treated as the one that should rank higher.
loss = criterion(preds, torch.ones_like(preds))
# Back-propagate so the parameters' .grad fields get populated.
loss.backward()

In [11]:
# Same attribute after backward(): it now holds the gradient of the loss w.r.t. the weights.
first_linear_layer.weight.grad

tensor([[-0.0118,  0.0006, -0.0014, -0.0043, -0.0133, -0.0004, -0.0066, -0.0130,
         -0.0051, -0.0046],
        [ 0.0175,  0.0027,  0.0052,  0.0196,  0.0071,  0.0207,  0.0037,  0.0144,
          0.0034,  0.0199],
        [ 0.0099, -0.0031,  0.0039,  0.0060,  0.0100,  0.0084,  0.0024,  0.0142,
          0.0060,  0.0132],
        [-0.0143,  0.0360,  0.0239,  0.0092, -0.0049,  0.0084,  0.0097, -0.0226,
          0.0136, -0.0219],
        [-0.0084,  0.0106,  0.0050, -0.0002, -0.0083, -0.0038, -0.0002, -0.0107,
         -0.0020, -0.0109],
        [-0.0042,  0.0069,  0.0084, -0.0011, -0.0035,  0.0004,  0.0010, -0.0071,
          0.0014, -0.0018],
        [-0.0036,  0.0011, -0.0014, -0.0022, -0.0037, -0.0031, -0.0009, -0.0052,
         -0.0022, -0.0049],
        [-0.0045,  0.0113,  0.0075,  0.0029, -0.0015,  0.0026,  0.0030, -0.0071,
          0.0043, -0.0069],
        [ 0.0089, -0.0223, -0.0148, -0.0057,  0.0030, -0.0052, -0.0060,  0.0140,
         -0.0084,  0.0136],
        [ 0.0020,  

In [12]:
# Reset the gradients accumulated by the backward() call above.
ranknet_model.zero_grad()

## Попробуем обучить

In [13]:
from catboost.datasets import msrank_10k


def _prepare_msrank_frame(raw_frame):
    """Name the MSRank columns and cast every column to float.

    Column 0 becomes ``'target'``, column 1 becomes ``'query'`` and columns
    2..137 become ``'feature_2'`` .. ``'feature_137'``. A new frame is
    returned; ``raw_frame`` is left untouched.
    """
    column_names = {0: 'target', 1: 'query'}
    column_names.update({i: f'feature_{i}' for i in range(2, 138)})
    # One frame-wide cast replaces the per-column astype loop.
    return raw_frame.rename(columns=column_names).astype(float)


# msrank_10k() returns the (train, test) pair; both go through the same preparation.
msrank_10k_train, msrank_10k_test = (
    _prepare_msrank_frame(frame) for frame in msrank_10k()
)

msrank_10k_train.head()

Unnamed: 0,target,query,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_128,feature_129,feature_130,feature_131,feature_132,feature_133,feature_134,feature_135,feature_136,feature_137
0,2.0,1.0,3.0,3.0,0.0,0.0,3.0,1.0,1.0,0.0,...,62.0,11089534.0,2.0,116.0,64034.0,13.0,3.0,0.0,0.0,0.0
1,2.0,1.0,3.0,0.0,3.0,0.0,3.0,1.0,0.0,1.0,...,54.0,11089534.0,2.0,124.0,64034.0,1.0,2.0,0.0,0.0,0.0
2,0.0,1.0,3.0,0.0,2.0,0.0,3.0,1.0,0.0,0.666667,...,45.0,3.0,1.0,124.0,3344.0,14.0,67.0,0.0,0.0,0.0
3,2.0,1.0,3.0,0.0,3.0,0.0,3.0,1.0,0.0,1.0,...,56.0,11089534.0,13.0,123.0,63933.0,1.0,3.0,0.0,0.0,0.0
4,1.0,1.0,3.0,0.0,3.0,0.0,3.0,1.0,0.0,1.0,...,64.0,5.0,7.0,256.0,49697.0,1.0,13.0,0.0,0.0,0.0


### Перед подачей в нейросеть признаки нужно отнормировать. Нормируем отдельно внутри каждого запроса (группы), так как распределения признаков у разных запросов различаются

In [14]:
# Distinct query ids of the training split (shown as floats because every column was cast to float).
msrank_10k_train['query'].unique()

array([1.000e+00, 1.600e+01, 3.100e+01, 4.600e+01, 6.100e+01, 7.600e+01,
       9.100e+01, 1.060e+02, 1.210e+02, 1.360e+02, 1.510e+02, 1.660e+02,
       1.810e+02, 1.960e+02, 2.110e+02, 2.260e+02, 2.410e+02, 2.560e+02,
       2.710e+02, 2.860e+02, 3.010e+02, 3.160e+02, 3.310e+02, 3.460e+02,
       3.610e+02, 3.760e+02, 3.910e+02, 4.060e+02, 4.210e+02, 4.360e+02,
       4.510e+02, 4.660e+02, 4.810e+02, 4.960e+02, 5.110e+02, 5.260e+02,
       5.410e+02, 5.560e+02, 5.710e+02, 5.860e+02, 6.010e+02, 6.160e+02,
       6.310e+02, 6.460e+02, 6.610e+02, 6.760e+02, 6.910e+02, 7.060e+02,
       7.210e+02, 7.360e+02, 7.510e+02, 7.660e+02, 7.810e+02, 7.960e+02,
       8.110e+02, 8.260e+02, 8.410e+02, 8.560e+02, 8.710e+02, 8.860e+02,
       9.010e+02, 9.160e+02, 9.310e+02, 9.460e+02, 9.610e+02, 9.760e+02,
       9.910e+02, 1.006e+03, 1.021e+03, 1.036e+03, 1.051e+03, 1.066e+03,
       1.081e+03, 1.096e+03, 1.111e+03, 1.126e+03, 1.141e+03, 1.156e+03,
       1.171e+03, 1.186e+03, 1.201e+03, 1.216e+03, 

In [16]:
from sklearn.preprocessing import StandardScaler


def _standardize_features_per_query(frame):
    """Standardise the feature columns of ``frame`` in place, one query at a time.

    For every distinct value of ``frame['query']`` a fresh ``StandardScaler``
    is fitted on that query's rows (columns from ``'feature_2'`` to the last
    one) and the scaled values are written back into the same rows.
    """
    for query_id in frame['query'].unique():
        in_query = frame['query'] == query_id
        frame.loc[in_query, 'feature_2':] = StandardScaler().fit_transform(
            frame.loc[in_query, 'feature_2':]
        )


# Train and test are scaled independently, each within its own query groups.
_standardize_features_per_query(msrank_10k_train)
_standardize_features_per_query(msrank_10k_test)

In [17]:
# Split each frame into a feature matrix, a relevance vector and integer query ids (plain NumPy arrays).
x_train = msrank_10k_train.drop(['target', 'query'], axis=1).values
y_train = msrank_10k_train['target'].values
# The 'query' column was cast to float earlier; convert the ids back to int.
query_train = msrank_10k_train['query'].values.astype(int)

x_test = msrank_10k_test.drop(['target', 'query'], axis=1).values
y_test = msrank_10k_test['target'].values
query_test = msrank_10k_test['query'].values.astype(int)

У нас есть датасет, в котором запросы записаны в колонке 1, релевантности — в колонке 0, а остальные колонки — это признаки. Для RankNet нужно построить датасет из пар документов в рамках одного запроса: если первый документ релевантнее второго, таргет равен 1, если менее релевантен — 0, а при одинаковой релевантности — 0.5.

In [18]:
# Sorted distinct training query ids, now as integers.
np.unique(query_train)

array([   1,   16,   31,   46,   61,   76,   91,  106,  121,  136,  151,
        166,  181,  196,  211,  226,  241,  256,  271,  286,  301,  316,
        331,  346,  361,  376,  391,  406,  421,  436,  451,  466,  481,
        496,  511,  526,  541,  556,  571,  586,  601,  616,  631,  646,
        661,  676,  691,  706,  721,  736,  751,  766,  781,  796,  811,
        826,  841,  856,  871,  886,  901,  916,  931,  946,  961,  976,
        991, 1006, 1021, 1036, 1051, 1066, 1081, 1096, 1111, 1126, 1141,
       1156, 1171, 1186, 1201, 1216, 1231, 1246, 1261, 1276, 1291])

In [19]:
# # 1 if i-th score larger than j-th

# comparison = []

# for query in tqdm(np.unique(query_train)):
#     subset = msrank_10k_train[msrank_10k_train['query'] == query]
#     indexes = subset.index

#     for i, idx1 in enumerate(indexes):
#         for idx2 in indexes[i:]:
#             val1 = subset.loc[idx1, 'target']
#             val2 = subset.loc[idx2, 'target']
#             if val1 > val2:
#                 comparison.append((query, idx1, idx2, 1))  # i должен быть отранжирован выше j
#             elif val1 < val2:
#                 comparison.append((query, idx1, idx2, 0))  # j должен быть отранжирован выше i
#             else:
#                 comparison.append((query, idx1, idx2, 0.5))  # одинаковый ранг

# print(f'Length of comparison: {len(comparison)}')
# print(f'Classes balance:\n{pd.Series(np.array([x[3] for x in comparison])).value_counts()}')

# comparison[:10]

In [63]:
# 1 if i-th score larger than j-th
# отдельно для каждого запроса

# Pairwise targets, built separately for every training query:
# 1 if document i is more relevant than document j, 0 if less, 0.5 if equal.
# Every ordered pair is kept: both (i, j) and (j, i), and also (i, i) with target 0.5.

comparisons = {}

for query in tqdm(np.unique(query_train), position=0, leave=None):
    subset = msrank_10k_train[msrank_10k_train['query'] == query]
    indexes = subset.index
    # Read the targets once per query instead of doing two .loc scalar
    # lookups for every pair inside the quadratic loop.
    targets = subset['target'].to_numpy()
    query_pairs = []

    for idx1, val1 in zip(indexes, targets):
        for idx2, val2 in zip(indexes, targets):
            if val1 > val2:
                pair_target = 1  # i should be ranked above j
            elif val1 < val2:
                pair_target = 0  # j should be ranked above i
            else:
                pair_target = 0.5  # same relevance
            query_pairs.append((query, idx1, idx2, pair_target))

    comparisons[query] = query_pairs

# Preview the first pairs of the last processed query.
comparisons[query][:10]

100%|██████████████████████████████████████████████████████████████████████████████████| 87/87 [00:12<00:00,  6.79it/s]


[(1291, 9876, 9876, 0.5),
 (1291, 9876, 9877, 0.5),
 (1291, 9876, 9878, 0),
 (1291, 9876, 9879, 0.5),
 (1291, 9876, 9880, 0.5),
 (1291, 9876, 9881, 0.5),
 (1291, 9876, 9882, 0.5),
 (1291, 9876, 9883, 0),
 (1291, 9876, 9884, 0.5),
 (1291, 9876, 9885, 0.5)]

In [64]:
# Positional, end-exclusive slice (9 rows) to eyeball the targets behind the pairs listed above.
# NOTE(review): this lines up with the index labels stored in `comparisons` only if the frame
# still has its default 0..n-1 index — confirm.
msrank_10k_train.iloc[9876:9885]

Unnamed: 0,target,query,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_128,feature_129,feature_130,feature_131,feature_132,feature_133,feature_134,feature_135,feature_136,feature_137
9876,0.0,1291.0,0.478644,-0.412082,-0.067505,-0.767129,0.444457,0.478644,-0.412082,-0.067505,...,-0.751362,-0.234745,-0.335436,-1.257367,1.063976,-0.61028,0.853848,0.0,-0.155194,-0.372095
9877,0.0,1291.0,0.478644,-0.412082,-0.067505,-0.767129,0.444457,0.478644,-0.412082,-0.067505,...,-1.033051,-0.234745,-0.46421,-1.257326,1.088024,-0.655702,-1.008002,0.0,-0.155194,-0.372095
9878,1.0,1291.0,0.478644,-0.412082,-0.067505,0.79228,0.444457,0.478644,-0.412082,-0.067505,...,0.868354,-0.226583,-0.46421,-1.259872,0.506192,1.910695,2.048535,0.0,-0.155194,-0.372095
9879,0.0,1291.0,0.478644,1.717007,-0.067505,0.79228,0.444457,0.478644,1.717007,-0.067505,...,1.502156,-0.233817,-0.206662,-1.110848,1.817741,-0.814683,-1.209702,0.0,-0.155194,-0.372095
9880,0.0,1291.0,0.478644,-0.412082,1.327604,0.79228,0.444457,0.478644,-0.412082,1.327604,...,-0.187982,-0.234745,-0.46421,-1.259215,0.008214,3.591345,0.838333,0.0,-0.155194,-0.372095
9881,0.0,1291.0,0.478644,-0.412082,-0.067505,0.79228,0.444457,0.478644,-0.412082,-0.067505,...,0.868354,-0.199377,0.952306,-0.481614,-0.043994,-0.474011,0.621117,0.0,-0.155194,-0.372095
9882,0.0,1291.0,-1.217124,-0.412082,-0.067505,0.79228,-1.392631,-1.217124,-0.412082,-0.067505,...,-0.892207,-0.234745,-0.46421,1.36527,0.418512,-0.474011,-0.852847,0.0,-0.155194,-0.372095
9883,1.0,1291.0,0.478644,-0.412082,1.327604,2.35169,0.444457,0.478644,-0.412082,1.327604,...,0.516242,-0.234745,-0.335436,-1.170679,1.426349,-0.337742,1.071064,0.0,-0.155194,-0.372095
9884,0.0,1291.0,0.478644,-0.412082,-1.462614,-0.767129,0.444457,0.478644,-0.412082,-1.462614,...,-1.033051,-0.234745,-0.46421,-0.305077,-1.228906,0.343603,1.738227,0.0,-0.155194,-0.372095


In [65]:
class MSRankPairDataset(torch.utils.data.Dataset):
    """Dataset of document pairs described by ``(query, i, j, target)`` tuples.

    ``data`` is a DataFrame whose columns from ``'feature_2'`` onwards hold
    the features; ``i`` and ``j`` are index labels of rows in ``data``.
    """

    def __init__(self, data, comparison):
        super().__init__()
        self.comparison = comparison
        self.data = data

    def __len__(self):
        """Number of pairs."""
        return len(self.comparison)

    def _feature_tensor(self, row_label):
        """Features of the row labelled ``row_label`` as a float32 tensor."""
        row = self.data.loc[row_label, 'feature_2':]
        return torch.as_tensor(row.values, dtype=torch.float32)

    def __getitem__(self, index):
        """Return ``(query, features_i, features_j, target)`` as float32 tensors."""
        query, i, j, target = self.comparison[index]
        first_features = self._feature_tensor(i)
        second_features = self._feature_tensor(j)
        return (
            torch.as_tensor(query, dtype=torch.float32),
            first_features,
            second_features,
            torch.as_tensor(target, dtype=torch.float32),
        )

In [66]:
# Demo dataset over a single query. Pick the query explicitly (the last id of
# np.unique(query_train)) instead of relying on the loop variable `query`
# left behind by the pair-building cell.
demo_query = np.unique(query_train)[-1]
msrank_dataset = MSRankPairDataset(msrank_10k_train, comparisons[demo_query])
msrank_dataloader = torch.utils.data.DataLoader(msrank_dataset, batch_size=128, shuffle=True)

In [67]:
# Peek at one shuffled batch and stop. The query ids are unpacked into
# `batch_query` so the notebook-level `query` variable is not overwritten
# with a tensor; `first`, `second` and `tgt` are reused by the next cell.
for batch_query, first, second, tgt in msrank_dataloader:
    print(batch_query)
    print(first)
    print(second)
    print(tgt)
    break

tensor([1291., 1291., 1291., 1291., 1291., 1291., 1291., 1291., 1291., 1291.,
        1291., 1291., 1291., 1291., 1291., 1291., 1291., 1291., 1291., 1291.,
        1291., 1291., 1291., 1291., 1291., 1291., 1291., 1291., 1291., 1291.,
        1291., 1291., 1291., 1291., 1291., 1291., 1291., 1291., 1291., 1291.,
        1291., 1291., 1291., 1291., 1291., 1291., 1291., 1291., 1291., 1291.,
        1291., 1291., 1291., 1291., 1291., 1291., 1291., 1291., 1291., 1291.,
        1291., 1291., 1291., 1291., 1291., 1291., 1291., 1291., 1291., 1291.,
        1291., 1291., 1291., 1291., 1291., 1291., 1291., 1291., 1291., 1291.,
        1291., 1291., 1291., 1291., 1291., 1291., 1291., 1291., 1291., 1291.,
        1291., 1291., 1291., 1291., 1291., 1291., 1291., 1291., 1291., 1291.,
        1291., 1291., 1291., 1291., 1291., 1291., 1291., 1291., 1291., 1291.,
        1291., 1291., 1291., 1291., 1291., 1291., 1291., 1291., 1291., 1291.,
        1291., 1291., 1291., 1291., 1291., 1291., 1291., 1291.])

In [68]:
# test the model works

# Smoke test on the batch left over from the previous cell: untrained RankNet + BCE.
ranknet_model = RankNet(num_input_features=136)

criterion = torch.nn.BCELoss()

preds = ranknet_model(first, second)
# Reshape the targets into a column so they line up with the (batch, 1) predictions.
criterion(preds, tgt.view(-1, 1))

tensor(0.6999, grad_fn=<BinaryCrossEntropyBackward0>)

### Нужно пройтись по каждому запросу в обучении

In [69]:
# One pair dataset and one shuffled DataLoader per training query, keyed by query id.
datasets = {}
dataloaders = {}

for query in np.unique(query_train):
    # Rows of this query only; the pair tuples in comparisons[query] address them by index label.
    subset = msrank_10k_train.loc[msrank_10k_train['query'] == query]
    dataset = MSRankPairDataset(data=subset, comparison=comparisons[query])
    datasets[query] = dataset
    dataloaders[query] = torch.utils.data.DataLoader(dataset, batch_size=128, shuffle=True)

In [70]:
# случайная подвыборка тестового датасета

# idx = np.random.choice(msrank_10k_test.shape[0], 1000)
# test_x = torch.as_tensor(msrank_10k_test.loc[idx, 'feature_2':].values, dtype=torch.float32).to(device)
# test_y = torch.as_tensor(msrank_10k_test.loc[idx, 'target'].values, dtype=torch.float32).to(device)
# test_query = msrank_10k_test.loc[idx, 'query'].values

# N_valid = test_x.shape[0]

In [71]:
def compute_gain(y_value: float, gain_scheme: str) -> float:
    """Turn a relevance label into a DCG gain.

    ``'const'`` returns the relevance unchanged, ``'exp2'`` returns
    ``2 ** relevance - 1``.
    """
    assert gain_scheme in ['const', 'exp2']
    if gain_scheme == 'const':
        return y_value
    elif gain_scheme == 'exp2':
        return 2 ** y_value - 1


def dcg_k(ys_true: Tensor, ys_pred: Tensor, gain_scheme: str, k: int) -> float:
    """DCG over the ``k`` items with the highest predicted score.

    Items are ranked by descending ``ys_pred``; the item at 0-based position
    ``p`` (``p < k``) contributes ``gain(relevance) / log2(p + 2)``.

    Returns:
        The accumulated DCG (the integer 0 when no item is visited).
    """
    dcg_value = 0
    _, sorted_ys_pred_idx = sort(ys_pred, descending=True)
    for i, rel in enumerate(ys_true[sorted_ys_pred_idx][:k]):
        dcg_value += compute_gain(rel, gain_scheme=gain_scheme) / log2(i + 2)
    return dcg_value


def ndcg_k(ys_true: Tensor, ys_pred: Tensor, gain_scheme: str = 'const', k=5) -> float:
    """nDCG@k: DCG@k of the predicted ordering over DCG@k of the ideal one.

    Args:
        ys_true: ground-truth relevances.
        ys_pred: predicted scores, one per item.
        gain_scheme: ``'const'`` or ``'exp2'``.
        k: number of top positions taken into account.

    Returns:
        ``dcg@k / ideal_dcg@k``. When the ideal DCG@k is zero (empty list or
        no relevant item at all) the ratio is undefined and ``0.0`` is
        returned, so that averaging over queries is not poisoned by NaN.
    """
    perfect_dcg = dcg_k(ys_true, ys_true, gain_scheme=gain_scheme, k=k)
    # Nothing to gain for any ordering: report 0 instead of dividing 0 by 0.
    if perfect_dcg == 0:
        return 0.0

    dcg_value = dcg_k(ys_true, ys_pred, gain_scheme=gain_scheme, k=k)
    return dcg_value / perfect_dcg

In [75]:
# Fresh model for the real training run: 136 input features, 32 hidden units.
ranknet_model = RankNet(num_input_features=136, hidden_dim=32)
ranknet_model.to(device)

# RankNet.forward already applies a sigmoid, so plain BCELoss is used on its output.
criterion = torch.nn.BCELoss()
# Adam with its default hyper-parameters.
optimizer = torch.optim.Adam(ranknet_model.parameters())

In [76]:
epochs = 5
# Total number of training pairs over all queries (a sample count, not a
# batch count). Each batch loss below is weighted by its batch size, so the
# weighted sum divided by this total is the mean loss per pair.
num_train_pairs = sum(len(dataset) for dataset in datasets.values())

for epoch in range(epochs):
    print(f'Epoch: {epoch}')
    train_losses = []
    for query, msrank_dataloader in dataloaders.items():
        # The first element of every batch is the query id, which training does not need.
        for _, batch_x1, batch_x2, tgt in msrank_dataloader:
            optimizer.zero_grad()
            batch_x1 = batch_x1.to(device)
            batch_x2 = batch_x2.to(device)
            tgt = tgt.to(device)

            batch_pred = ranknet_model(batch_x1, batch_x2)
            batch_loss = criterion(batch_pred, tgt.view(-1, 1))
            batch_loss.backward()

            optimizer.step()

            train_losses.append(batch_loss.item() * len(batch_x1))

    print(f'Train loss: {np.sum(train_losses) / num_train_pairs}')

    # Per-query evaluation on the test split.
    ndcg_scores = []
    with torch.no_grad():
        # A dedicated loop variable keeps the `query_test` array of test query
        # ids from being overwritten by a scalar.
        for test_query_id in msrank_10k_test['query'].unique():
            in_query = msrank_10k_test['query'] == test_query_id
            test_x = torch.as_tensor(msrank_10k_test.loc[in_query, 'feature_2':].values, dtype=torch.float32).to(device)
            test_y = torch.as_tensor(msrank_10k_test.loc[in_query, 'target'].values, dtype=torch.float32).to(device)

            total_pairs = len(test_x) * (len(test_x) - 1) // 2

            valid_pred = ranknet_model.predict(test_x).cpu().flatten()

            # Swapped-pair estimate derived from Kendall's tau: (1 - tau) * total_pairs / 2.
            # NOTE(review): kendalltau defaults to the tie-adjusted tau-b, so with tied
            # relevances this is an approximation rather than an exact count.
            valid_swapped_pairs = -(sts.kendalltau(test_y.cpu().numpy(), valid_pred.numpy()).statistic * total_pairs - total_pairs) // 2
            ndcg_score_ = ndcg_k(test_y.cpu().flatten(), valid_pred, gain_scheme='const', k=10)
            ndcg_scores.append(ndcg_score_)
            print(f"epoch: {epoch + 1}.\tNumber of swapped pairs: "
                  f"{valid_swapped_pairs}/{total_pairs}\t"
                  f"nDCG_at_10: {ndcg_score_:.4f}")

    print(f'Mean NDCG: {np.mean(ndcg_scores)}')

Epoch: 0
Train loss: 0.5657872904218946
epoch: 1.	Number of swapped pairs: 4608.0/9453	nDCG_at_10: 0.2531
epoch: 1.	Number of swapped pairs: 1550.0/4371	nDCG_at_10: 0.3187
epoch: 1.	Number of swapped pairs: 1832.0/3655	nDCG_at_10: 0.0881
epoch: 1.	Number of swapped pairs: 4735.0/10878	nDCG_at_10: 0.3027
epoch: 1.	Number of swapped pairs: 3109.0/7503	nDCG_at_10: 0.6334
epoch: 1.	Number of swapped pairs: 5050.0/14028	nDCG_at_10: 0.5545
epoch: 1.	Number of swapped pairs: 3579.0/7260	nDCG_at_10: 0.3395
epoch: 1.	Number of swapped pairs: 3492.0/9316	nDCG_at_10: 0.4362
epoch: 1.	Number of swapped pairs: 641.0/1711	nDCG_at_10: 0.3176
epoch: 1.	Number of swapped pairs: 2766.0/6555	nDCG_at_10: 0.0000
epoch: 1.	Number of swapped pairs: 4067.0/8646	nDCG_at_10: 0.1328
epoch: 1.	Number of swapped pairs: 1596.0/3570	nDCG_at_10: 0.2582
epoch: 1.	Number of swapped pairs: 8558.0/19503	nDCG_at_10: 0.2516
epoch: 1.	Number of swapped pairs: 3646.0/7875	nDCG_at_10: 0.0848
epoch: 1.	Number of swapped pairs:

In [31]:
# Standalone evaluation of the current model on every test query.
# Start from an empty list: appending to the `ndcg_scores` left over from the
# training cell would mix the last epoch's scores into the mean.
ndcg_scores = []

with torch.no_grad():
    # A dedicated loop variable keeps the `query_test` array from being overwritten.
    for test_query_id in msrank_10k_test['query'].unique():
        in_query = msrank_10k_test['query'] == test_query_id
        test_x = torch.as_tensor(msrank_10k_test.loc[in_query, 'feature_2':].values, dtype=torch.float32).to(device)
        test_y = torch.as_tensor(msrank_10k_test.loc[in_query, 'target'].values, dtype=torch.float32).to(device)

        total_pairs = len(test_x) * (len(test_x) - 1) // 2

        valid_pred = ranknet_model.predict(test_x).cpu().flatten()

        # Swapped-pair estimate derived from Kendall's tau: (1 - tau) * total_pairs / 2.
        valid_swapped_pairs = -(sts.kendalltau(test_y.cpu().numpy(), valid_pred.numpy()).statistic * total_pairs - total_pairs) // 2
        ndcg_score_ = ndcg_k(test_y.cpu().flatten(), valid_pred, gain_scheme='const', k=10)
        ndcg_scores.append(ndcg_score_)
        # `epoch` is whatever value the training loop left behind.
        print(f"epoch: {epoch + 1}.\tNumber of swapped pairs: "
              f"{valid_swapped_pairs}/{total_pairs}\t"
              f"nDCG_at_10: {ndcg_score_:.4f}")

print(f'Mean NDCG: {np.mean(ndcg_scores)}')

epoch: 1.	Number of swapped pairs: 4242.0/9453	nDCG_at_10: 0.3823
epoch: 1.	Number of swapped pairs: 1608.0/4371	nDCG_at_10: 0.3889
epoch: 1.	Number of swapped pairs: 1890.0/3655	nDCG_at_10: 0.2322
epoch: 1.	Number of swapped pairs: 5014.0/10878	nDCG_at_10: 0.3858
epoch: 1.	Number of swapped pairs: 3418.0/7503	nDCG_at_10: 0.3377
epoch: 1.	Number of swapped pairs: 4907.0/14028	nDCG_at_10: 0.4049
epoch: 1.	Number of swapped pairs: 3515.0/7260	nDCG_at_10: 0.2354
epoch: 1.	Number of swapped pairs: 3376.0/9316	nDCG_at_10: 0.5581
epoch: 1.	Number of swapped pairs: 611.0/1711	nDCG_at_10: 0.3101
epoch: 1.	Number of swapped pairs: 2731.0/6555	nDCG_at_10: 0.0000
epoch: 1.	Number of swapped pairs: 4213.0/8646	nDCG_at_10: 0.2755
epoch: 1.	Number of swapped pairs: 1305.0/3570	nDCG_at_10: 0.4272
epoch: 1.	Number of swapped pairs: 8367.0/19503	nDCG_at_10: 0.2839
epoch: 1.	Number of swapped pairs: 3232.0/7875	nDCG_at_10: 0.1878
epoch: 1.	Number of swapped pairs: 3534.0/7875	nDCG_at_10: 0.2985
epoch: 1

### Синтетические данные

In [32]:
def make_dataset(N_train, N_valid, vector_dim):
    """Generate a synthetic ranking problem with five relevance grades.

    Features are standard-normal vectors. The hidden score of an item is its
    dot product with a random weight vector plus standard-normal noise, and
    the relevance grade is the index of the score's bin.

    Args:
        N_train: number of training items.
        N_valid: number of validation items.
        vector_dim: number of features per item.

    Returns:
        ``(X_train, X_valid, ys_train_rel, ys_valid_rel)``; the relevance
        tensors have one column and hold grades 0..4 as floats.
    """
    fake_weights = torch.randn(vector_dim, 1)

    X_train = torch.randn(N_train, vector_dim)
    X_valid = torch.randn(N_valid, vector_dim)

    # Four bin edges give 5 relevance grades; [-1, 1] would give 3.
    bins = [-1, 0, 1, 2]

    def noisy_relevance(features):
        """Score the items, add noise and bucket the scores into grades."""
        scores = torch.mm(features, fake_weights)
        scores = scores + torch.randn_like(scores)
        return torch.Tensor(np.digitize(scores.detach().numpy(), bins=bins))

    # Train first, then valid: keeps the random draws in the same order.
    ys_train_rel = noisy_relevance(X_train)
    ys_valid_rel = noisy_relevance(X_valid)

    return X_train, X_valid, ys_train_rel, ys_valid_rel

In [33]:
# Size of the synthetic problem.
N_train = 1000
N_valid = 500

vector_dim = 100
# NOTE(review): `epochs` is reassigned in the RankNet training cell, and the
# RankNet DataLoaders below hard-code batch_size=32 instead of reading this value.
epochs = 2

batch_size = 16

X_train, X_valid, ys_train, ys_valid = make_dataset(N_train, N_valid, vector_dim)

In [34]:
# 1 if i-th score larger than j-th

# Pairwise targets over the synthetic training set:
# 1 if item i is more relevant than item j, 0 if less, 0.5 if equal.
# Only unordered pairs with i < j are generated.

comparison = []
ys_to_check = ys_train.flatten().numpy()

for i in range(len(ys_to_check)):
    for j in range(i + 1, len(ys_to_check)):
        rel_i, rel_j = ys_to_check[i], ys_to_check[j]
        if rel_i > rel_j:
            pair_target = 1  # i should be ranked above j
        elif rel_i < rel_j:
            pair_target = 0  # j should be ranked above i
        else:
            pair_target = 0.5  # same relevance
        comparison.append((i, j, pair_target))

print(f'Length of comparison: {len(comparison)}')
print(f'Classes balance:\n{pd.Series(np.array([x[2] for x in comparison])).value_counts()}')

comparison[:10]

Length of comparison: 499500
Classes balance:
0.5    191760
0.0    160154
1.0    147586
Name: count, dtype: int64


[(0, 1, 0),
 (0, 2, 0.5),
 (0, 3, 0),
 (0, 4, 0),
 (0, 5, 0.5),
 (0, 6, 0),
 (0, 7, 0.5),
 (0, 8, 0),
 (0, 9, 0),
 (0, 10, 0.5)]

In [35]:
# Sanity check: number of unordered pairs among 1000 items; matches the comparison length printed above.
(1000 * 999) / 2

499500.0

In [36]:
class SynthPairDataset(torch.utils.data.Dataset):
    """Dataset of item pairs described by ``(i, j, target)`` tuples.

    ``data`` is indexable by position (``data[i]``) and ``comparison`` is the
    sequence of pair tuples.
    """

    def __init__(self, data, comparison):
        super().__init__()
        self.comparison = comparison
        self.data = data

    def __len__(self):
        """Number of pairs."""
        return len(self.comparison)

    def __getitem__(self, index):
        """Return ``(features_i, features_j, target)`` as float32 tensors."""
        i, j, target = self.comparison[index]
        first_item = torch.as_tensor(self.data[i], dtype=torch.float32)
        second_item = torch.as_tensor(self.data[j], dtype=torch.float32)
        pair_target = torch.as_tensor(target, dtype=torch.float32)
        return first_item, second_item, pair_target

In [37]:
# Pair dataset over the synthetic training features, served in shuffled batches of 32 pairs.
synth_dataset = SynthPairDataset(X_train, comparison)
synth_dataloader = torch.utils.data.DataLoader(synth_dataset, batch_size=32, shuffle=True)

In [38]:
# Peek at one shuffled batch and stop; `first`, `second` and `tgt` are reused by the next cell.
for first, second, tgt in synth_dataloader:
    print(first)
    print(second)
    print(tgt)
    break

tensor([[-0.7833, -0.9256, -0.3572,  ..., -1.3975,  0.2452, -0.3351],
        [-0.0642,  1.2438,  0.1781,  ..., -0.1576,  0.6264,  0.9809],
        [ 0.0308, -0.6891,  0.8141,  ...,  0.9109, -1.0071,  0.4412],
        ...,
        [ 1.9322,  0.5748, -0.9847,  ..., -0.9142, -0.6096,  0.1959],
        [-0.4562,  0.3792, -0.6419,  ...,  1.8431,  0.3598, -0.5866],
        [-0.0494,  1.9040,  1.1757,  ..., -1.1783,  0.9214,  1.1035]])
tensor([[-5.8592e-02, -1.3301e+00,  1.5334e+00,  ..., -1.3745e-01,
          1.0013e+00,  5.8660e-01],
        [ 7.3646e-01,  3.6403e-01,  1.3213e+00,  ..., -5.2435e-02,
         -2.0775e+00,  1.5789e+00],
        [-1.1648e-01, -8.1707e-01, -5.2319e-01,  ..., -1.0099e+00,
          1.7143e-01,  5.4690e-01],
        ...,
        [-1.1360e+00, -1.3855e+00,  4.1111e-01,  ...,  3.7682e-01,
          1.0825e-03,  6.8818e-01],
        [ 1.6941e-01,  2.8234e-01, -1.2595e+00,  ..., -5.0628e-01,
         -1.3584e+00, -6.5198e-02],
        [ 6.5739e-01,  5.4291e-01, -5.

In [39]:
# test the model works

# Smoke test on the batch left over from the previous cell: untrained RankNet + BCE.
ranknet_model = RankNet(num_input_features=100)

criterion = torch.nn.BCELoss()

preds = ranknet_model(first, second)
# Reshape the targets into a column so they line up with the (batch, 1) predictions.
criterion(preds, tgt.view(-1, 1))

tensor(0.7355, grad_fn=<BinaryCrossEntropyBackward0>)

In [40]:
# Fresh model, loss and optimiser for the synthetic training run.
ranknet_model = RankNet(num_input_features=100, hidden_dim=16)
ranknet_model.to(device)

criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(ranknet_model.parameters())

# Rebuilds the same dataset/loader pair as the earlier cell.
synth_dataset = SynthPairDataset(X_train, comparison)
synth_dataloader = torch.utils.data.DataLoader(synth_dataset, batch_size=32, shuffle=True)

In [41]:
epochs = 1
# Number of training pairs (a sample count, not a batch count). Each batch
# loss is weighted by its batch size, so the weighted sum divided by this
# total is the mean loss per pair.
num_train_pairs = len(synth_dataset)

# Validation inputs do not change during training: compute them once instead
# of at every evaluation step.
total_pairs = len(X_valid) * (len(X_valid) - 1) // 2
X_valid_device = X_valid.to(device)

for epoch in range(epochs):
    print(f'Epoch: {epoch}')
    train_losses = []
    for it, (batch_x1, batch_x2, tgt) in enumerate(synth_dataloader):

        optimizer.zero_grad()
        batch_x1 = batch_x1.to(device)
        batch_x2 = batch_x2.to(device)
        tgt = tgt.to(device)

        batch_pred = ranknet_model(batch_x1, batch_x2)
        batch_loss = criterion(batch_pred, tgt.view(-1, 1))
        batch_loss.backward()

        optimizer.step()

        train_losses.append(batch_loss.item() * len(batch_x1))

        # Evaluate on the validation set every 1000 batches.
        if it % 1000 == 0:
            with torch.no_grad():
                valid_pred = ranknet_model.predict(X_valid_device).cpu().flatten()

                # Swapped-pair estimate derived from Kendall's tau: (1 - tau) * total_pairs / 2.
                valid_swapped_pairs = -(sts.kendalltau(ys_valid.numpy(), valid_pred.numpy()).statistic * total_pairs - total_pairs) // 2
                ndcg_score_ = ndcg_k(ys_valid.flatten(), valid_pred, gain_scheme='const', k=100)
                print(f"Number of swapped pairs: "
                      f"{valid_swapped_pairs}/{total_pairs}\t"
                f"nDCG_at_100: {ndcg_score_:.4f}")

    print(f'Train loss: {np.sum(train_losses) / num_train_pairs}')

Epoch: 0
Number of swapped pairs: 67424.0/124750	nDCG_at_100: 0.4230
Number of swapped pairs: 18891.0/124750	nDCG_at_100: 1.0000
Number of swapped pairs: 21212.0/124750	nDCG_at_100: 1.0000
Number of swapped pairs: 22988.0/124750	nDCG_at_100: 0.9907
Number of swapped pairs: 24157.0/124750	nDCG_at_100: 0.9907
Number of swapped pairs: 24407.0/124750	nDCG_at_100: 0.9904
Number of swapped pairs: 24976.0/124750	nDCG_at_100: 0.9827
Number of swapped pairs: 24986.0/124750	nDCG_at_100: 0.9865
Number of swapped pairs: 25162.0/124750	nDCG_at_100: 0.9863
Number of swapped pairs: 25264.0/124750	nDCG_at_100: 0.9864
Number of swapped pairs: 25568.0/124750	nDCG_at_100: 0.9865
Number of swapped pairs: 25708.0/124750	nDCG_at_100: 0.9863
Number of swapped pairs: 25790.0/124750	nDCG_at_100: 0.9844
Number of swapped pairs: 25788.0/124750	nDCG_at_100: 0.9847
Number of swapped pairs: 25749.0/124750	nDCG_at_100: 0.9847
Number of swapped pairs: 25911.0/124750	nDCG_at_100: 0.9848
Train loss: 0.29744707984299035

# ListNet

In [42]:
from itertools import combinations
import numpy as np

In [43]:
class ListNet(torch.nn.Module):
    """Listwise scorer: a one-hidden-layer MLP producing one logit per item."""

    def __init__(self, num_input_features, hidden_dim=10):
        """Build Linear -> ReLU -> Linear(hidden_dim, 1)."""
        super().__init__()
        self.hidden_dim = hidden_dim

        scorer_layers = [
            torch.nn.Linear(num_input_features, hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, 1),
        ]
        self.model = torch.nn.Sequential(*scorer_layers)

    def forward(self, input_1):
        """Return the raw score of every row of ``input_1``."""
        return self.model(input_1)


$$CE = -\sum_{j=1}^{N} P_y^i(j) \log\left(P_z^i(j)\right)$$

$$\text{Softmax}(x_{i}) = \frac{\exp(x_i)}{\sum_j \exp(x_j)}$$

In [44]:
def listnet_ce_loss(y_i, z_i):
    """ListNet cross-entropy between two top-one probability distributions.

    y_i: (n_i, 1) GT
    z_i: (n_i, 1) preds

    Both inputs are turned into distributions with a softmax over dim 0 and
    the result is ``-sum(P_y * log P_z)``.

    ``log P_z`` is computed with ``log_softmax``: taking ``log(softmax(z))``
    underflows to ``-inf`` for strongly separated scores and makes the loss
    infinite or NaN.
    """
    P_y_i = torch.softmax(y_i, dim=0)
    log_P_z_i = torch.log_softmax(z_i, dim=0)
    return -torch.sum(P_y_i * log_P_z_i)

def listnet_kl_loss(y_i, z_i):
    """KL divergence KL(P_y || P_z) between two top-one probability distributions.

    y_i: (n_i, 1) GT
    z_i: (n_i, 1) preds

    Both inputs are turned into distributions with a softmax over dim 0 and
    the result is ``sum(P_y * (log P_y - log P_z))``.

    The log-probabilities come from ``log_softmax``: the ratio form
    ``log(P_z / P_y)`` yields NaN or inf as soon as one of the probabilities
    underflows to zero.
    """
    log_P_y_i = torch.log_softmax(y_i, dim=0)
    log_P_z_i = torch.log_softmax(z_i, dim=0)
    P_y_i = torch.softmax(y_i, dim=0)
    return torch.sum(P_y_i * (log_P_y_i - log_P_z_i))


def make_dataset(N_train, N_valid, vector_dim):
    """Generate a synthetic ranking problem with five relevance grades.

    Features are standard-normal vectors. The hidden score of an item is its
    dot product with a random weight vector plus standard-normal noise, and
    the relevance grade is the index of the score's bin.

    Args:
        N_train: number of training items.
        N_valid: number of validation items.
        vector_dim: number of features per item.

    Returns:
        ``(X_train, X_valid, ys_train_rel, ys_valid_rel)``; the relevance
        tensors have one column and hold grades 0..4 as floats.
    """
    fake_weights = torch.randn(vector_dim, 1)

    X_train = torch.randn(N_train, vector_dim)
    X_valid = torch.randn(N_valid, vector_dim)

    # Four bin edges give 5 relevance grades; [-1, 1] would give 3.
    bins = [-1, 0, 1, 2]

    def noisy_relevance(features):
        """Score the items, add noise and bucket the scores into grades."""
        scores = torch.mm(features, fake_weights)
        scores = scores + torch.randn_like(scores)
        return torch.Tensor(np.digitize(scores.detach().numpy(), bins=bins))

    # Train first, then valid: keeps the random draws in the same order.
    ys_train_rel = noisy_relevance(X_train)
    ys_valid_rel = noisy_relevance(X_valid)

    return X_train, X_valid, ys_train_rel, ys_valid_rel

In [45]:
# Size of the synthetic problem and training schedule for ListNet.
N_train = 1000
N_valid = 500

vector_dim = 100
epochs = 2

# Each batch is treated as one list by the listwise loss in the training loop.
batch_size = 16

# Draws a fresh synthetic dataset, replacing the one used in the RankNet section.
X_train, X_valid, ys_train, ys_valid = make_dataset(N_train, N_valid, vector_dim)

# ListNet with its default hidden size, optimised by Adam with default hyper-parameters.
net = ListNet(num_input_features=vector_dim)
opt = torch.optim.Adam(net.parameters())


In [46]:
# Distinct relevance grades present in the training labels.
torch.unique(ys_train)

tensor([0., 1., 2., 3., 4.])

In [47]:
for epoch in range(epochs):
    # Reshuffle the training items at the start of every epoch.
    idx = torch.randperm(N_train)

    X_train = X_train[idx]
    ys_train = ys_train[idx]

    # Only full batches are visited: the trailing N_train % batch_size items
    # of each shuffled epoch are left out.
    for it in range(N_train // batch_size):
        batch_start = it * batch_size
        batch_X = X_train[batch_start: batch_start + batch_size]
        batch_ys = ys_train[batch_start: batch_start + batch_size]

        opt.zero_grad()
        batch_pred = net(batch_X)
        batch_loss = listnet_kl_loss(batch_ys, batch_pred)
#         batch_loss = listnet_ce_loss(batch_ys, batch_pred)
        # Every step builds a fresh graph, so it does not need to be retained.
        batch_loss.backward()
        opt.step()

        # Evaluate on the validation set every 10 batches.
        if it % 10 == 0:
            with torch.no_grad():
                valid_pred = net(X_valid)
                valid_swapped_pairs = num_swapped_pairs(ys_valid.flatten(),
                                                        valid_pred.flatten())
                # Named `valid_ndcg` so the `ndcg_score` function imported from
                # sklearn at the top of the notebook is not overwritten.
                valid_ndcg = ndcg(ys_valid.flatten(), valid_pred.flatten())
            print(f"epoch: {epoch + 1}.\tNumber of swapped pairs: "
                  f"{valid_swapped_pairs}/{N_valid * (N_valid - 1) // 2}\t"
                  f"nDCG: {valid_ndcg:.4f}")

epoch: 1.	Number of swapped pairs: 55083/124750	nDCG: 0.9127
epoch: 1.	Number of swapped pairs: 52399/124750	nDCG: 0.9272
epoch: 1.	Number of swapped pairs: 50034/124750	nDCG: 0.9373
epoch: 1.	Number of swapped pairs: 47562/124750	nDCG: 0.9448
epoch: 1.	Number of swapped pairs: 45282/124750	nDCG: 0.9522
epoch: 1.	Number of swapped pairs: 42893/124750	nDCG: 0.9595
epoch: 1.	Number of swapped pairs: 40969/124750	nDCG: 0.9654
epoch: 2.	Number of swapped pairs: 40608/124750	nDCG: 0.9664
epoch: 2.	Number of swapped pairs: 39256/124750	nDCG: 0.9703
epoch: 2.	Number of swapped pairs: 38130/124750	nDCG: 0.9732
epoch: 2.	Number of swapped pairs: 37106/124750	nDCG: 0.9758
epoch: 2.	Number of swapped pairs: 36152/124750	nDCG: 0.9780
epoch: 2.	Number of swapped pairs: 35198/124750	nDCG: 0.9804
epoch: 2.	Number of swapped pairs: 34187/124750	nDCG: 0.9826


# Task solution

In [48]:
import math

import numpy as np
import torch
from catboost.datasets import msrank_10k
from sklearn.preprocessing import StandardScaler

from typing import List


class ListNet(torch.nn.Module):
    """Two-layer perceptron that maps a feature vector to one ranking score.

    The network is Linear -> Dropout(0.3) -> ReLU -> Linear, stored in the
    ``model`` attribute as a ``torch.nn.Sequential``.

    Args:
        num_input_features: size of the last dimension of the input.
        hidden_dim: width of the hidden layer (also kept as ``self.hidden_dim``).
    """

    def __init__(self, num_input_features: int, hidden_dim: int):
        super().__init__()
        self.hidden_dim = hidden_dim

        # The layers are instantiated in the same order as they are applied,
        # so the sequence of random draws used for weight initialisation
        # under a fixed seed stays the same.
        layers = [
            torch.nn.Linear(num_input_features, hidden_dim),
            torch.nn.Dropout(0.3),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, 1),
        ]
        self.model = torch.nn.Sequential(*layers)

    def forward(self, input_1: torch.Tensor) -> torch.Tensor:
        """Return the raw (unnormalised) scores, one per input row."""
        return self.model(input_1)


class Solution:
    """Train and evaluate a ListNet ranker on the ``msrank_10k`` dataset.

    Construction loads the data, standardises the features inside every query
    group, and builds the model and an Adam optimiser. ``fit`` then runs the
    requested number of epochs and reports the mean NDCG@k on the test split
    after each one.

    Args:
        n_epochs: number of passes over the training queries made by ``fit``.
        listnet_hidden_dim: hidden layer width passed to ``ListNet``.
        lr: learning rate of the Adam optimiser.
        ndcg_top_k: cut-off ``k`` used for the NDCG@k evaluation metric.
    """

    def __init__(self, n_epochs: int = 5, listnet_hidden_dim: int = 30,
                 lr: float = 0.001, ndcg_top_k: int = 10):
        self._prepare_data()
        self.num_input_features = self.X_train.shape[1]
        self.ndcg_top_k = ndcg_top_k
        self.n_epochs = n_epochs

        self.model = self._create_model(
            self.num_input_features, listnet_hidden_dim)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)

    def _get_data(self) -> List[np.ndarray]:
        """Load ``msrank_10k`` and split each frame into features/labels/ids.

        Column 0 is used as the relevance label, column 1 as the query id and
        every remaining column as a feature.

        Returns:
            ``[X_train, y_train, query_ids_train, X_test, y_test,
            query_ids_test]``.
        """
        train_df, test_df = msrank_10k()

        X_train = train_df.drop([0, 1], axis=1).values
        y_train = train_df[0].values
        query_ids_train = train_df[1].values.astype(int)

        X_test = test_df.drop([0, 1], axis=1).values
        y_test = test_df[0].values
        query_ids_test = test_df[1].values.astype(int)

        return [X_train, y_train, query_ids_train, X_test, y_test, query_ids_test]

    def _prepare_data(self) -> None:
        """Populate the train/test tensors and query-id arrays on ``self``."""
        (X_train, y_train, self.query_ids_train,
            X_test, y_test, self.query_ids_test) = self._get_data()

        self.X_train = self._scale_features_in_query_groups(X_train, self.query_ids_train)
        self.X_test = self._scale_features_in_query_groups(X_test, self.query_ids_test)

        self.ys_train = torch.as_tensor(y_train, dtype=torch.float32)
        self.ys_test = torch.as_tensor(y_test, dtype=torch.float32)

    @staticmethod
    def _iter_query_groups(query_ids: np.ndarray):
        """Yield, per distinct query id (ascending), the row indices of that query."""
        for query in np.unique(query_ids):
            yield np.where(query_ids == query)[0]

    def _scale_features_in_query_groups(self, inp_feat_array: np.ndarray,
                                        inp_query_ids: np.ndarray) -> torch.Tensor:
        """Standardise features independently inside every query group.

        Args:
            inp_feat_array: 2-D feature matrix, one row per document.
            inp_query_ids: query id of every row of ``inp_feat_array``.

        Returns:
            A ``float32`` tensor with the same shape as ``inp_feat_array``.
        """
        # Always use a float buffer: inheriting an integer dtype from the
        # input would silently truncate the standardised values.
        scaled_features = np.zeros(inp_feat_array.shape, dtype=np.float64)

        for idxs in self._iter_query_groups(inp_query_ids):
            scaled_features[idxs] = StandardScaler().fit_transform(inp_feat_array[idxs])
        return torch.as_tensor(scaled_features, dtype=torch.float32)

    def _create_model(self, listnet_num_input_features: int,
                      listnet_hidden_dim: int) -> torch.nn.Module:
        """Build a ``ListNet`` with a fixed seed so initial weights are reproducible."""
        torch.manual_seed(0)
        net = ListNet(listnet_num_input_features, listnet_hidden_dim)
        return net

    def fit(self) -> List[float]:
        """Train for ``n_epochs`` epochs.

        Returns:
            The mean test NDCG@k measured after every epoch, in epoch order.
        """
        ndcgs = []
        for epoch in range(self.n_epochs):
            self._train_one_epoch()

            epoch_ndcg = self._eval_test_set()
            ndcgs.append(epoch_ndcg)
            print(f'Epoch: {epoch+1}. NDCG_{self.ndcg_top_k}: {epoch_ndcg}')
        return ndcgs

    def _calc_loss(self, batch_ys: torch.FloatTensor,
                   batch_pred: torch.FloatTensor) -> torch.FloatTensor:
        """ListNet cross-entropy between the label and score distributions.

        Both inputs are turned into distributions with a softmax over dim 0;
        the result is ``-sum(P_y * log(P_z))``.
        """
        P_y_i = torch.softmax(batch_ys, dim=0)
        # log_softmax instead of log(softmax(...)): the two-step form
        # underflows to log(0) = -inf when scores are far apart.
        log_P_z_i = torch.log_softmax(batch_pred, dim=0)
        return -torch.sum(P_y_i * log_P_z_i)

    def _train_one_epoch(self) -> None:
        """Run one optimiser step per training query (each query is one list)."""
        self.model.train()
        for query_idxs in self._iter_query_groups(self.query_ids_train):
            query_x = self.X_train[query_idxs]
            query_y = self.ys_train[query_idxs]

            # Iteration over mini-batches could be added here.
            self.optimizer.zero_grad()

            preds = self.model(query_x).flatten()
            loss = self._calc_loss(query_y, preds)
            loss.backward()
            self.optimizer.step()

    def _eval_test_set(self) -> float:
        """Return the NDCG@``ndcg_top_k`` averaged over all test queries."""
        self.model.eval()
        ndcgs = []
        with torch.no_grad():
            for query_idxs in self._iter_query_groups(self.query_ids_test):
                query_x = self.X_test[query_idxs]
                query_y = self.ys_test[query_idxs].flatten()

                preds = self.model(query_x).flatten()
                ndcgs.append(
                    self._ndcg_k(query_y, preds, ndcg_top_k=self.ndcg_top_k))
        return float(np.mean(ndcgs))

    @staticmethod
    def _dcg_k(ys_true: torch.Tensor, ys_pred: torch.Tensor,
               k: int, gain_scheme: str = 'exp2') -> float:
        """DCG of the ``k`` documents ranked highest by ``ys_pred``.

        Args:
            ys_true: 1-D relevance labels.
            ys_pred: 1-D scores; documents are ranked by descending score.
            k: number of top positions that contribute to the sum.
            gain_scheme: ``'const'`` (gain = label) or ``'exp2'``
                (gain = 2**label - 1).

        Raises:
            ValueError: if ``gain_scheme`` is not one of the two options.
        """
        _, sorted_ys_pred_idx = torch.sort(ys_pred, descending=True)
        top_rels = ys_true[sorted_ys_pred_idx][:k].float()

        if gain_scheme == 'const':
            gains = top_rels
        elif gain_scheme == 'exp2':
            gains = 2 ** top_rels - 1
        else:
            raise ValueError(f"unknown gain_scheme: {gain_scheme!r}")

        # Position i (0-based) is discounted by log2(i + 2).
        positions = torch.arange(top_rels.numel(), dtype=top_rels.dtype,
                                 device=top_rels.device)
        discounts = torch.log2(positions + 2.0)
        return float(torch.sum(gains / discounts))

    def _ndcg_k(self, ys_true: torch.Tensor, ys_pred: torch.Tensor,
                ndcg_top_k: int) -> float:
        """NDCG@k with the exponential gain ``2**rel - 1``.

        Returns 0.0 when the ideal DCG is zero (for instance when every label
        of the query is 0), instead of propagating a 0/0 NaN into the mean.
        """
        # Computed with the exponential gain formula.
        dcg_value = self._dcg_k(ys_true, ys_pred, ndcg_top_k, gain_scheme='exp2')
        perfect_dcg = self._dcg_k(ys_true, ys_true, ndcg_top_k, gain_scheme='exp2')

        if perfect_dcg == 0:
            return 0.0
        return dcg_value / perfect_dcg


In [49]:
solution = Solution(listnet_hidden_dim=30)

In [50]:
solution.fit()

Epoch: 1. NDCG_10: 0.41444405913352966
Epoch: 2. NDCG_10: 0.43271610140800476
Epoch: 3. NDCG_10: 0.4335741698741913
Epoch: 4. NDCG_10: 0.4383959472179413
Epoch: 5. NDCG_10: 0.43431270122528076


[0.41444406, 0.4327161, 0.43357417, 0.43839595, 0.4343127]