In [1]:
# !pip install catboost

In [1]:
import torch
import torchvision.utils

import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.metrics import ndcg_score
from scipy import stats as sts

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device

'cuda:0'

In [2]:
from math import log2
from torch import Tensor, sort

def num_swapped_pairs(ys_true: Tensor, ys_pred: Tensor) -> int:
    _, sorted_ys_true_idx = sort(ys_true, descending=True)

    sorted_preds_by_true = ys_pred[sort(ys_true, descending=True)[1]]
    count = 0

    for i in range(len(sorted_preds_by_true)):
        for j in range(i+1, len(sorted_preds_by_true)):
            if sorted_preds_by_true[i] < sorted_preds_by_true[j]:
                count += 1
    return count


def compute_gain(y_value: float, gain_scheme: str) -> float:
  assert gain_scheme in ['const', 'exp2']
  if gain_scheme == 'const':
      return y_value
  elif gain_scheme == 'exp2':
      return 2 ** y_value - 1


def dcg(ys_true: Tensor, ys_pred: Tensor, gain_scheme: str) -> float:
    dcg_value = 0
    _, sorted_ys_pred_idx = sort(ys_pred, descending=True)
    for i, rel in enumerate(ys_true[sorted_ys_pred_idx]):
        dcg_value += compute_gain(rel, gain_scheme=gain_scheme) / log2(i+2)
    return dcg_value


def ndcg(ys_true: Tensor, ys_pred: Tensor, gain_scheme: str = 'const') -> float:
    """
    https://en.wikipedia.org/wiki/Discounted_cumulative_gain#cite_note-4
    """
    dcg_value = dcg(ys_true, ys_pred, gain_scheme=gain_scheme)
    perfect_dcg = dcg(ys_true, ys_true, gain_scheme=gain_scheme)

    return dcg_value / perfect_dcg

# RankNet

Обучается ставить score выше для того, кто должен быть выше в списке. В итоге модель выдает величину, которую можно сравнить и выводить топ, может обучаться на данных одного ранга, тогда таргет = 0.5

$$C_{ij}=C(o_{ij})=-\bar{P_{ij}}log(P_{ij})-(1-\bar{P_{ij}})log(1-P_{ij})$$

$$o_{ij}=f(x_i)-f(x_j)$$

$$P_{ij}=\frac{e^{o_{ij}}}{1+e^{o_{ij}}}$$

$$\text{out}_{i} = \frac{1}{1 + e^{-\text{input}_{i}}}$$

In [3]:
class RankNet(torch.nn.Module):
    def __init__(self, num_input_features, hidden_dim=16):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.model = torch.nn.Sequential(
            torch.nn.Linear(num_input_features, self.hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(self.hidden_dim, 1),
        )

        self.out_activation = torch.nn.Sigmoid()

    def forward(self, input_1, input_2):
        logits_1 = self.predict(input_1)
        logits_2 = self.predict(input_2)

        logits_diff = logits_1 - logits_2
        out = self.out_activation(logits_diff)

        return out

    def predict(self, inp):
        logits = self.model(inp)
        return logits

In [4]:
ranknet_model = RankNet(num_input_features=10)

In [5]:
inp_1, inp_2 = torch.rand(4, 10), torch.rand(4, 10)
# batch_size x input_dim
inp_2

tensor([[0.0919, 0.9394, 0.2377, 0.3483, 0.1489, 0.9118, 0.6660, 0.8441, 0.2611,
         0.8819],
        [0.3265, 0.1266, 0.9855, 0.1064, 0.8750, 0.5784, 0.8128, 0.1228, 0.6015,
         0.2889],
        [0.8808, 0.6673, 0.2847, 0.2359, 0.8470, 0.1170, 0.3532, 0.5040, 0.0520,
         0.8451],
        [0.8741, 0.9857, 0.7272, 0.5581, 0.8266, 0.0216, 0.7462, 0.9887, 0.5326,
         0.1212]])

In [6]:
preds = ranknet_model(inp_1, inp_2)
preds

tensor([[0.4863],
        [0.4918],
        [0.4986],
        [0.4696]], grad_fn=<SigmoidBackward0>)

In [7]:
first_linear_layer = ranknet_model.model[0]

In [8]:
first_linear_layer.weight.grad

In [9]:
criterion = torch.nn.BCELoss()
loss = criterion(preds, torch.ones_like(preds))
loss.backward()

In [10]:
first_linear_layer.weight.grad

tensor([[-1.7947e-04,  5.5189e-03, -8.6106e-04, -6.7795e-03,  5.1461e-03,
         -1.9429e-03,  1.1229e-03,  3.2511e-03, -5.4011e-03, -4.3615e-03],
        [-1.5919e-02,  1.4320e-02,  6.2754e-03,  1.4060e-03,  1.9067e-03,
          2.2156e-02,  5.2619e-03,  6.3696e-03, -1.3810e-02, -2.4862e-03],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [-5.0858e-03, -4.3651e-03, -5.7997e-03, -9.0197e-03, -2.9486e-03,
         -5.2925e-03, -4.3932e-03, -4.4423e-03, -4.8907e-03, -7.0829e-03],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 4.5874e-03,  1.7782e-03,  1.3847e-02,  1.4946e-03,  1.2294e-02,
          8.1261e-03,  1.1420e-02,  1.7257e-03,  8.4518e-03,  4.0598e-03],
        [-3.6782e-04,  1.1311e-02, -1.7647e-03, -1.3894e-02,  1.0547e-02,
         -3.9818e-03,  2.3013e-0

In [11]:
ranknet_model.zero_grad()

## Попробуем обучить

In [12]:
from catboost.datasets import msrank_10k

msrank_10k_train, msrank_10k_test = msrank_10k()
msrank_10k_train = msrank_10k_train.rename(columns={0: 'target', 1: 'query'}).rename(columns={i: f'feature_{i}' for i in range(2, 138)})
msrank_10k_test = msrank_10k_test.rename(columns={0: 'target', 1: 'query'}).rename(columns={i: f'feature_{i}' for i in range(2, 138)})

for feature in msrank_10k_train.columns:
    msrank_10k_train[feature] = msrank_10k_train[feature].astype(float)

for feature in msrank_10k_test.columns:
    msrank_10k_test[feature] = msrank_10k_test[feature].astype(float)

msrank_10k_train.head()

Unnamed: 0,target,query,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_128,feature_129,feature_130,feature_131,feature_132,feature_133,feature_134,feature_135,feature_136,feature_137
0,2.0,1.0,3.0,3.0,0.0,0.0,3.0,1.0,1.0,0.0,...,62.0,11089534.0,2.0,116.0,64034.0,13.0,3.0,0.0,0.0,0.0
1,2.0,1.0,3.0,0.0,3.0,0.0,3.0,1.0,0.0,1.0,...,54.0,11089534.0,2.0,124.0,64034.0,1.0,2.0,0.0,0.0,0.0
2,0.0,1.0,3.0,0.0,2.0,0.0,3.0,1.0,0.0,0.666667,...,45.0,3.0,1.0,124.0,3344.0,14.0,67.0,0.0,0.0,0.0
3,2.0,1.0,3.0,0.0,3.0,0.0,3.0,1.0,0.0,1.0,...,56.0,11089534.0,13.0,123.0,63933.0,1.0,3.0,0.0,0.0,0.0
4,1.0,1.0,3.0,0.0,3.0,0.0,3.0,1.0,0.0,1.0,...,64.0,5.0,7.0,256.0,49697.0,1.0,13.0,0.0,0.0,0.0


### Надо отнормировать признаки перед подачей в нейросеть, нормировать надо по группам, так как разные признаки соответствуют разным запросам

In [13]:
msrank_10k_train['query'].unique()

array([1.000e+00, 1.600e+01, 3.100e+01, 4.600e+01, 6.100e+01, 7.600e+01,
       9.100e+01, 1.060e+02, 1.210e+02, 1.360e+02, 1.510e+02, 1.660e+02,
       1.810e+02, 1.960e+02, 2.110e+02, 2.260e+02, 2.410e+02, 2.560e+02,
       2.710e+02, 2.860e+02, 3.010e+02, 3.160e+02, 3.310e+02, 3.460e+02,
       3.610e+02, 3.760e+02, 3.910e+02, 4.060e+02, 4.210e+02, 4.360e+02,
       4.510e+02, 4.660e+02, 4.810e+02, 4.960e+02, 5.110e+02, 5.260e+02,
       5.410e+02, 5.560e+02, 5.710e+02, 5.860e+02, 6.010e+02, 6.160e+02,
       6.310e+02, 6.460e+02, 6.610e+02, 6.760e+02, 6.910e+02, 7.060e+02,
       7.210e+02, 7.360e+02, 7.510e+02, 7.660e+02, 7.810e+02, 7.960e+02,
       8.110e+02, 8.260e+02, 8.410e+02, 8.560e+02, 8.710e+02, 8.860e+02,
       9.010e+02, 9.160e+02, 9.310e+02, 9.460e+02, 9.610e+02, 9.760e+02,
       9.910e+02, 1.006e+03, 1.021e+03, 1.036e+03, 1.051e+03, 1.066e+03,
       1.081e+03, 1.096e+03, 1.111e+03, 1.126e+03, 1.141e+03, 1.156e+03,
       1.171e+03, 1.186e+03, 1.201e+03, 1.216e+03, 

In [14]:
from sklearn.preprocessing import StandardScaler

for query in msrank_10k_train['query'].unique():
    scaler = StandardScaler()
    msrank_10k_train.loc[msrank_10k_train['query'] == query, 'feature_2':] = scaler\
    .fit_transform(msrank_10k_train.loc[msrank_10k_train['query'] == query, 'feature_2':])

for query in msrank_10k_test['query'].unique():
    scaler = StandardScaler()
    msrank_10k_test.loc[msrank_10k_test['query'] == query, 'feature_2':] = scaler\
    .fit_transform(msrank_10k_test.loc[msrank_10k_test['query'] == query, 'feature_2':])

In [15]:
x_train = msrank_10k_train.drop(['target', 'query'], axis=1).values
y_train = msrank_10k_train['target'].values
query_train = msrank_10k_train['query'].values.astype(int)

x_test = msrank_10k_test.drop(['target', 'query'], axis=1).values
y_test = msrank_10k_test['target'].values
query_test = msrank_10k_test['query'].values.astype(int)

У нас есть датасет, в котором представлены запросы в 1 колонке, релевантности в 0 колонке, остальные колонки - это признаки. Необходимо для RankNet сделать такой датасет, что в рамках одного запроса мы будем видеть пары, если первый более релевантен, значит таргет 1, иначе 0.

In [16]:
np.unique(query_train)

array([   1,   16,   31,   46,   61,   76,   91,  106,  121,  136,  151,
        166,  181,  196,  211,  226,  241,  256,  271,  286,  301,  316,
        331,  346,  361,  376,  391,  406,  421,  436,  451,  466,  481,
        496,  511,  526,  541,  556,  571,  586,  601,  616,  631,  646,
        661,  676,  691,  706,  721,  736,  751,  766,  781,  796,  811,
        826,  841,  856,  871,  886,  901,  916,  931,  946,  961,  976,
        991, 1006, 1021, 1036, 1051, 1066, 1081, 1096, 1111, 1126, 1141,
       1156, 1171, 1186, 1201, 1216, 1231, 1246, 1261, 1276, 1291])

In [17]:
# 1 if i-th score larger than j-th

comparison = []

for query in tqdm(np.unique(query_train)):
    subset = msrank_10k_train[msrank_10k_train['query'] == query]
    indexes = subset.index

    for i, idx1 in enumerate(indexes):
        for idx2 in indexes[i:]:
            val1 = subset.loc[idx1, 'target']
            val2 = subset.loc[idx2, 'target']
            if val1 > val2:
                comparison.append((query, idx1, idx2, 1))  # i должен быть отранжирован выше j
            elif val1 < val2:
                comparison.append((query, idx1, idx2, 0))  # j должен быть отранжирован выше i
            else:
                comparison.append((query, idx1, idx2, 0.5))  # одинаковый ранг

print(f'Length of comparison: {len(comparison)}')
print(f'Classes balance:\n{pd.Series(np.array([x[3] for x in comparison])).value_counts()}')

comparison[:10]

100%|██████████████████████████████████████████████████████████████████████████████████| 87/87 [00:06<00:00, 13.24it/s]

Length of comparison: 750487
Classes balance:
0.5    333916
1.0    209955
0.0    206616
Name: count, dtype: int64





[(1, 0, 0, 0.5),
 (1, 0, 1, 0.5),
 (1, 0, 2, 1),
 (1, 0, 3, 0.5),
 (1, 0, 4, 1),
 (1, 0, 5, 1),
 (1, 0, 6, 1),
 (1, 0, 7, 0.5),
 (1, 0, 8, 1),
 (1, 0, 9, 1)]

In [18]:
# # 1 if i-th score larger than j-th
# # отдельно для каждого запроса

# comparisons = {}

# for query in tqdm(np.unique(query_train), position=0, leave=None):
#     subset = msrank_10k_train[msrank_10k_train['query'] == query]
#     indexes = subset.index
#     comparisons[query] = []

#     for i, idx1 in enumerate(indexes):
#         # for idx2 in indexes[i+1:]:
#         for idx2 in indexes:
#             val1 = subset.loc[idx1, 'target']
#             val2 = subset.loc[idx2, 'target']
#             if val1 > val2:
#                 comparisons[query].append((query, idx1, idx2, 1))  # i должен быть отранжирован выше j
#             elif val1 < val2:
#                 comparisons[query].append((query, idx1, idx2, 0))  # j должен быть отранжирован выше i
#             else:
#                 comparisons[query].append((query, idx1, idx2, 0.5))  # одинаковый ранг

#     # print(f'Length of comparison {query}: {len(comparisons[query])}')
#     # print(f'Classes balance:\n{pd.Series(np.array([x[3] for x in comparisons[query]])).value_counts()}')

# comparisons[query][:10]

In [19]:
msrank_10k_train.iloc[:10]

Unnamed: 0,target,query,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_128,feature_129,feature_130,feature_131,feature_132,feature_133,feature_134,feature_135,feature_136,feature_137
0,2.0,1.0,0.316064,4.817052,-2.175931,-0.300235,0.316064,0.316064,4.817052,-2.175932,...,0.890364,3.122498,-0.86291,-0.862552,0.770712,0.281923,-0.469119,-0.111758,-0.195935,-0.266225
1,2.0,1.0,0.316064,-0.234978,0.617055,-0.300235,0.316064,0.316064,-0.234978,0.617055,...,0.391122,3.122498,-0.86291,-0.862023,0.770712,-0.434689,-0.507108,-0.111758,-0.195935,-0.266225
2,0.0,1.0,0.316064,-0.234978,-0.31394,-0.300235,0.316064,0.316064,-0.234978,-0.31394,...,-0.170526,-0.320409,-0.938022,-0.862023,-2.197231,0.341641,1.962172,-0.111758,-0.195935,-0.266225
3,2.0,1.0,0.316064,-0.234978,0.617055,-0.300235,0.316064,0.316064,-0.234978,0.617055,...,0.515932,3.122498,-0.036682,-0.862089,0.765773,-0.434689,-0.469119,-0.111758,-0.195935,-0.266225
4,1.0,1.0,0.316064,-0.234978,0.617055,-0.300235,0.316064,0.316064,-0.234978,0.617055,...,1.015175,-0.320408,-0.487352,-0.853286,0.069585,-0.434689,-0.08923,-0.111758,-0.195935,-0.266225
5,1.0,1.0,0.316064,-0.234978,0.617055,-0.300235,0.316064,0.316064,-0.234978,0.617055,...,0.890364,-0.320408,-0.487352,-0.85633,0.080637,-0.195818,-0.013252,-0.111758,-0.195935,-0.266225
6,1.0,1.0,0.316064,-0.234978,0.617055,-0.300235,0.316064,0.316064,-0.234978,0.617055,...,0.890364,-0.320408,-0.487352,-0.853286,0.085527,-0.315254,-0.051241,-0.111758,-0.195935,-0.266225
7,2.0,1.0,0.316064,-0.234978,0.617055,-0.300235,0.316064,0.316064,-0.234978,0.617055,...,1.576823,-0.32041,0.263764,-0.862618,0.735697,-0.434689,-0.469119,-0.111758,-0.195935,-0.266225
8,1.0,1.0,0.316064,-0.234978,0.617055,-0.300235,0.316064,0.316064,-0.234978,0.617055,...,0.203906,3.122498,0.113541,-0.862023,0.737898,-0.434689,-0.469119,-0.111758,-0.195935,-0.266225
9,0.0,1.0,0.316064,-0.234978,0.617055,-0.300235,0.316064,0.316064,-0.234978,0.617055,...,0.01669,-0.320409,-0.938022,-0.834421,0.737898,-0.434689,-0.393141,-0.111758,-0.195935,-0.266225


In [20]:
class MSRankPairDataset(torch.utils.data.Dataset):
    def __init__(self, data, comparison):
        super().__init__()
        self.data = data
        self.comparison = comparison

    def __len__(self):
        return len(self.comparison)

    def __getitem__(self, index):
        query, i, j, target = self.comparison[index]
        x_i, x_j = self.data.loc[i, 'feature_2':], self.data.loc[j, 'feature_2':]

        return torch.as_tensor(query, dtype=torch.float32),\
        torch.as_tensor(x_i.values, dtype=torch.float32),\
        torch.as_tensor(x_j.values, dtype=torch.float32),\
        torch.as_tensor(target, dtype=torch.float32)

In [23]:
# msrank_dataset = MSRankPairDataset(msrank_10k_train, comparisons[query])
msrank_dataset = MSRankPairDataset(msrank_10k_train, comparison)
msrank_dataloader = torch.utils.data.DataLoader(msrank_dataset, batch_size=128, shuffle=True)

In [24]:
for query, first, second, tgt in msrank_dataloader:
    print(query)
    print(first)
    print(second)
    print(tgt)
    break

tensor([1.3600e+02, 6.1600e+02, 6.7600e+02, 6.9100e+02, 4.5100e+02, 7.6600e+02,
        2.5600e+02, 5.7100e+02, 1.9600e+02, 1.0810e+03, 2.5600e+02, 5.4100e+02,
        4.2100e+02, 1.0210e+03, 5.5600e+02, 6.1600e+02, 1.2310e+03, 1.9600e+02,
        4.6000e+01, 4.2100e+02, 7.2100e+02, 8.8600e+02, 1.1560e+03, 4.0600e+02,
        1.9600e+02, 1.1260e+03, 2.5600e+02, 8.5600e+02, 2.7100e+02, 4.2100e+02,
        1.0360e+03, 1.0510e+03, 5.7100e+02, 4.0600e+02, 4.9600e+02, 1.3600e+02,
        5.5600e+02, 1.0210e+03, 1.9600e+02, 4.8100e+02, 5.2600e+02, 6.4600e+02,
        9.9100e+02, 5.4100e+02, 5.7100e+02, 1.0810e+03, 6.3100e+02, 9.0100e+02,
        6.9100e+02, 4.2100e+02, 6.4600e+02, 5.8600e+02, 6.1600e+02, 5.4100e+02,
        5.2600e+02, 6.1600e+02, 9.0100e+02, 1.1260e+03, 5.7100e+02, 6.0100e+02,
        5.7100e+02, 5.5600e+02, 1.9600e+02, 5.7100e+02, 9.0100e+02, 6.1600e+02,
        2.1100e+02, 9.7600e+02, 7.0600e+02, 3.7600e+02, 1.2460e+03, 1.9600e+02,
        6.7600e+02, 1.9600e+02, 3.1000e+

In [25]:
# test the model works

ranknet_model = RankNet(num_input_features=136)

criterion = torch.nn.BCELoss()

preds = ranknet_model(first, second)
criterion(preds, tgt.view(-1, 1))

tensor(0.6978, grad_fn=<BinaryCrossEntropyBackward0>)

### Нужно пройтись по каждому запросу в обучении

In [26]:
# datasets = {}
# dataloaders = {}

# for query in np.unique(query_train):
#     subset = msrank_10k_train.loc[msrank_10k_train['query'] == query]
#     dataset = MSRankPairDataset(data=subset, comparison=comparisons[query])
#     datasets[query] = dataset
#     dataloaders[query] = torch.utils.data.DataLoader(dataset, batch_size=128, shuffle=True)

In [70]:
# случайная подвыборка тестового датасета

# idx = np.random.choice(msrank_10k_test.shape[0], 1000)
# test_x = torch.as_tensor(msrank_10k_test.loc[idx, 'feature_2':].values, dtype=torch.float32).to(device)
# test_y = torch.as_tensor(msrank_10k_test.loc[idx, 'target'].values, dtype=torch.float32).to(device)
# test_query = msrank_10k_test.loc[idx, 'query'].values

# N_valid = test_x.shape[0]

In [27]:
def compute_gain(y_value: float, gain_scheme: str) -> float:
  assert gain_scheme in ['const', 'exp2']
  if gain_scheme == 'const':
      return y_value
  elif gain_scheme == 'exp2':
      return 2 ** y_value - 1


def dcg_k(ys_true: Tensor, ys_pred: Tensor, gain_scheme: str, k: int) -> float:
    dcg_value = 0
    _, sorted_ys_pred_idx = sort(ys_pred, descending=True)
    for i, rel in enumerate(ys_true[sorted_ys_pred_idx][:k]):
        dcg_value += compute_gain(rel, gain_scheme=gain_scheme) / log2(i+2)
    return dcg_value


def ndcg_k(ys_true: Tensor, ys_pred: Tensor, gain_scheme: str = 'const', k=5) -> float:
    dcg_value = dcg_k(ys_true, ys_pred, gain_scheme=gain_scheme, k=k)
    perfect_dcg = dcg_k(ys_true, ys_true, gain_scheme=gain_scheme, k=k)

    return dcg_value / perfect_dcg

In [38]:
ranknet_model = RankNet(num_input_features=136, hidden_dim=32)
ranknet_model.to(device)

criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(ranknet_model.parameters())

In [39]:
epochs = 5
len_dataset = len(msrank_dataset)

for epoch in range(epochs):
    print(f'Epoch: {epoch}')
    train_losses = []
    # for query, msrank_dataloader in dataloaders.items():
    for it, (_, batch_x1, batch_x2, tgt) in enumerate(msrank_dataloader):

        optimizer.zero_grad()
        batch_x1 = batch_x1.to(device)
        batch_x2 = batch_x2.to(device)
        tgt = tgt.to(device)

        batch_pred = ranknet_model(batch_x1, batch_x2)
        batch_loss = criterion(batch_pred, tgt.view(-1, 1))
        batch_loss.backward()

        optimizer.step()

        train_losses.append(batch_loss.item() * len(batch_x1))

            
    print(f'Train loss: {np.sum(train_losses) / len_dataset}')
    
    ndcg_scores = []
    with torch.no_grad():
        for query_test in msrank_10k_test['query'].unique():
            test_x = torch.as_tensor(msrank_10k_test.loc[msrank_10k_test['query']==query_test, 'feature_2':].values, dtype=torch.float32).to(device)
            test_y = torch.as_tensor(msrank_10k_test.loc[msrank_10k_test['query']==query_test, 'target'].values, dtype=torch.float32).to(device)

            total_pairs = len(test_x) * (len(test_x) - 1) // 2
            
            valid_pred = ranknet_model.predict(test_x).cpu().flatten()
            
            valid_swapped_pairs = -(sts.kendalltau(test_y.cpu().numpy(), valid_pred.numpy()).statistic * total_pairs - total_pairs) // 2
            ndcg_score_ = ndcg_k(test_y.cpu().flatten(), valid_pred, gain_scheme='const', k=10)
            ndcg_scores.append(ndcg_score_)
            print(f"epoch: {epoch + 1}.\tNumber of swapped pairs: "
                  f"{valid_swapped_pairs}/{total_pairs}\t"
            f"nDCG_at_10: {ndcg_score_:.4f}")
        
    print(f'Mean NDCG: {np.mean(ndcg_scores)}')

Epoch: 0
Train loss: 0.6381915278817132
epoch: 1.	Number of swapped pairs: 4406.0/9453	nDCG_at_10: 0.3652
epoch: 1.	Number of swapped pairs: 1731.0/4371	nDCG_at_10: 0.4355
epoch: 1.	Number of swapped pairs: 1382.0/3655	nDCG_at_10: 0.5105
epoch: 1.	Number of swapped pairs: 5048.0/10878	nDCG_at_10: 0.2458
epoch: 1.	Number of swapped pairs: 3300.0/7503	nDCG_at_10: 0.5711
epoch: 1.	Number of swapped pairs: 5591.0/14028	nDCG_at_10: 0.5415
epoch: 1.	Number of swapped pairs: 3541.0/7260	nDCG_at_10: 0.3957
epoch: 1.	Number of swapped pairs: 3225.0/9316	nDCG_at_10: 0.5379
epoch: 1.	Number of swapped pairs: 705.0/1711	nDCG_at_10: 0.2631
epoch: 1.	Number of swapped pairs: 2700.0/6555	nDCG_at_10: 0.0886
epoch: 1.	Number of swapped pairs: 3483.0/8646	nDCG_at_10: 0.2317
epoch: 1.	Number of swapped pairs: 1892.0/3570	nDCG_at_10: 0.1662
epoch: 1.	Number of swapped pairs: 8656.0/19503	nDCG_at_10: 0.1516
epoch: 1.	Number of swapped pairs: 3410.0/7875	nDCG_at_10: 0.1640
epoch: 1.	Number of swapped pairs:

In [40]:
with torch.no_grad():
    for query_test in msrank_10k_test['query'].unique():
            test_x = torch.as_tensor(msrank_10k_test.loc[msrank_10k_test['query']==query_test, 'feature_2':].values, dtype=torch.float32).to(device)
            test_y = torch.as_tensor(msrank_10k_test.loc[msrank_10k_test['query']==query_test, 'target'].values, dtype=torch.float32).to(device)

            total_pairs = len(test_x) * (len(test_x) - 1) // 2
            
            valid_pred = ranknet_model.predict(test_x).cpu().flatten()
            
            valid_swapped_pairs = -(sts.kendalltau(test_y.cpu().numpy(), valid_pred.numpy()).statistic * total_pairs - total_pairs) // 2
            ndcg_score_ = ndcg_k(test_y.cpu().flatten(), valid_pred, gain_scheme='const', k=10)
            ndcg_scores.append(ndcg_score_)
            print(f"epoch: {epoch + 1}.\tNumber of swapped pairs: "
                  f"{valid_swapped_pairs}/{total_pairs}\t"
            f"nDCG_at_10: {ndcg_score_:.4f}")
        
print(f'Mean NDCG: {np.mean(ndcg_scores)}')

epoch: 5.	Number of swapped pairs: 4581.0/9453	nDCG_at_10: 0.3998
epoch: 5.	Number of swapped pairs: 1928.0/4371	nDCG_at_10: 0.4825
epoch: 5.	Number of swapped pairs: 1273.0/3655	nDCG_at_10: 0.3562
epoch: 5.	Number of swapped pairs: 5920.0/10878	nDCG_at_10: 0.2025
epoch: 5.	Number of swapped pairs: 3405.0/7503	nDCG_at_10: 0.4183
epoch: 5.	Number of swapped pairs: 6796.0/14028	nDCG_at_10: 0.3871
epoch: 5.	Number of swapped pairs: 3112.0/7260	nDCG_at_10: 0.4653
epoch: 5.	Number of swapped pairs: 3758.0/9316	nDCG_at_10: 0.3245
epoch: 5.	Number of swapped pairs: 707.0/1711	nDCG_at_10: 0.3703
epoch: 5.	Number of swapped pairs: 2669.0/6555	nDCG_at_10: 0.1894
epoch: 5.	Number of swapped pairs: 3387.0/8646	nDCG_at_10: 0.3169
epoch: 5.	Number of swapped pairs: 2111.0/3570	nDCG_at_10: 0.0000
epoch: 5.	Number of swapped pairs: 9245.0/19503	nDCG_at_10: 0.2560
epoch: 5.	Number of swapped pairs: 3348.0/7875	nDCG_at_10: 0.3336
epoch: 5.	Number of swapped pairs: 3610.0/7875	nDCG_at_10: 0.1420
epoch: 5

### Синтетические данные

In [41]:
def make_dataset(N_train, N_valid, vector_dim):
    fake_weights = torch.randn(vector_dim, 1)

    X_train = torch.randn(N_train, vector_dim)
    X_valid = torch.randn(N_valid, vector_dim)

    ys_train_score = torch.mm(X_train, fake_weights)
    ys_train_score += torch.randn_like(ys_train_score)

    ys_valid_score = torch.mm(X_valid, fake_weights)
    ys_valid_score += torch.randn_like(ys_valid_score)

#     bins = [-1, 1]  # 3 relevances
    bins = [-1, 0, 1, 2]  # 5 relevances
    ys_train_rel = torch.Tensor(
        np.digitize(ys_train_score.clone().detach().numpy(), bins=bins)
    )
    ys_valid_rel = torch.Tensor(
        np.digitize(ys_valid_score.clone().detach().numpy(), bins=bins)
    )

    return X_train, X_valid, ys_train_rel, ys_valid_rel

In [42]:
N_train = 1000
N_valid = 500

vector_dim = 100
epochs = 2

batch_size = 16

X_train, X_valid, ys_train, ys_valid = make_dataset(N_train, N_valid, vector_dim)

In [43]:
# 1 if i-th score larger than j-th

comparison = []
ys_to_check = ys_train.flatten().numpy()

for i, idx1 in enumerate(ys_to_check):
    for j, idx2 in enumerate(ys_to_check):
        if i >= j:
            continue
        if idx1 > idx2:
            comparison.append((i, j, 1))  # i должен быть отранжирован выше j
        elif idx1 < idx2:
            comparison.append((i, j, 0))  # j должен быть отранжирован выше i
        else:
            comparison.append((i, j, 0.5))  # одинаковый ранг

print(f'Length of comparison: {len(comparison)}')
print(f'Classes balance:\n{pd.Series(np.array([x[2] for x in comparison])).value_counts()}')

comparison[:10]

Length of comparison: 499500
Classes balance:
0.5    202335
0.0    154062
1.0    143103
Name: count, dtype: int64


[(0, 1, 1),
 (0, 2, 0.5),
 (0, 3, 1),
 (0, 4, 1),
 (0, 5, 0.5),
 (0, 6, 0.5),
 (0, 7, 1),
 (0, 8, 0.5),
 (0, 9, 1),
 (0, 10, 1)]

In [44]:
(1000 * 999) / 2

499500.0

In [45]:
class SynthPairDataset(torch.utils.data.Dataset):
    def __init__(self, data, comparison):
        super().__init__()
        self.data = data
        self.comparison = comparison

    def __len__(self):
        return len(self.comparison)

    def __getitem__(self, index):
        i, j, target = self.comparison[index]
        x_i, x_j = self.data[i], self.data[j]

        return torch.as_tensor(x_i, dtype=torch.float32),\
        torch.as_tensor(x_j, dtype=torch.float32),\
        torch.as_tensor(target, dtype=torch.float32)

In [46]:
synth_dataset = SynthPairDataset(X_train, comparison)
synth_dataloader = torch.utils.data.DataLoader(synth_dataset, batch_size=32, shuffle=True)

In [47]:
for first, second, tgt in synth_dataloader:
    print(first)
    print(second)
    print(tgt)
    break

tensor([[ 1.6010e-01, -3.5654e-01, -3.0648e-01,  ...,  1.5710e-01,
         -7.0745e-02,  2.3923e-01],
        [ 5.9007e-02, -2.0888e-01, -4.9197e-01,  ..., -4.6880e-02,
          1.0705e+00,  8.4041e-01],
        [ 2.6172e-01, -1.7634e-03,  6.1484e-01,  ..., -4.5866e-01,
         -1.3954e+00,  5.9419e-01],
        ...,
        [ 2.3660e-01,  5.8804e-02, -6.9478e-02,  ..., -5.6859e-01,
          1.0640e+00, -2.1690e+00],
        [ 8.1490e-01, -1.1789e+00, -1.2862e-01,  ...,  1.1786e+00,
          1.3115e-01,  1.0842e+00],
        [-7.7179e-01, -2.6162e-01, -3.7023e-02,  ...,  4.9731e-01,
         -4.6563e-02,  5.4360e-01]])
tensor([[-0.5818,  0.9190, -0.8213,  ...,  0.2124, -0.1681,  0.0421],
        [-1.0465,  0.0703,  0.6797,  ..., -1.7304, -0.3119,  0.7299],
        [-0.5743,  0.0293,  1.0397,  ..., -0.1790,  1.5311,  0.3444],
        ...,
        [ 0.0120,  0.3378,  0.0336,  ...,  0.2011, -0.7678, -0.8371],
        [-0.2948, -0.1830,  0.8458,  ...,  0.4807, -1.6280, -1.9756],
     

In [48]:
# test the model works

ranknet_model = RankNet(num_input_features=100)

criterion = torch.nn.BCELoss()

preds = ranknet_model(first, second)
criterion(preds, tgt.view(-1, 1))

tensor(0.7088, grad_fn=<BinaryCrossEntropyBackward0>)

In [49]:
ranknet_model = RankNet(num_input_features=100, hidden_dim=16)
ranknet_model.to(device)

criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(ranknet_model.parameters())

synth_dataset = SynthPairDataset(X_train, comparison)
synth_dataloader = torch.utils.data.DataLoader(synth_dataset, batch_size=32, shuffle=True)

In [50]:
epochs = 1
num_batches = len(synth_dataset)

for epoch in range(epochs):
    print(f'Epoch: {epoch}')
    train_losses = []
    for it, (batch_x1, batch_x2, tgt) in enumerate(synth_dataloader):

        optimizer.zero_grad()
        batch_x1 = batch_x1.to(device)
        batch_x2 = batch_x2.to(device)
        tgt = tgt.to(device)

        batch_pred = ranknet_model(batch_x1, batch_x2)
        batch_loss = criterion(batch_pred, tgt.view(-1, 1))
        batch_loss.backward()

        optimizer.step()

        train_losses.append(batch_loss.item() * len(batch_x1))

        if it % 1000 == 0:
            with torch.no_grad():
                total_pairs = len(X_valid) * (len(X_valid) - 1) // 2
                
                valid_pred = ranknet_model.predict(X_valid.to(device)).cpu().flatten()
                
                valid_swapped_pairs = -(sts.kendalltau(ys_valid.numpy(), valid_pred.numpy()).statistic * total_pairs - total_pairs) // 2
                ndcg_score_ = ndcg_k(ys_valid.flatten(), valid_pred, gain_scheme='const', k=100)
                print(f"Number of swapped pairs: "
                      f"{valid_swapped_pairs}/{total_pairs}\t"
                f"nDCG_at_100: {ndcg_score_:.4f}")
                
    print(f'Train loss: {np.sum(train_losses) / num_batches}')

Epoch: 0
Number of swapped pairs: 63783.0/124750	nDCG_at_100: 0.4547
Number of swapped pairs: 19038.0/124750	nDCG_at_100: 1.0000
Number of swapped pairs: 21343.0/124750	nDCG_at_100: 0.9980
Number of swapped pairs: 22901.0/124750	nDCG_at_100: 0.9782
Number of swapped pairs: 23531.0/124750	nDCG_at_100: 0.9805
Number of swapped pairs: 24154.0/124750	nDCG_at_100: 0.9744
Number of swapped pairs: 24366.0/124750	nDCG_at_100: 0.9777
Number of swapped pairs: 24758.0/124750	nDCG_at_100: 0.9700
Number of swapped pairs: 24924.0/124750	nDCG_at_100: 0.9649
Number of swapped pairs: 24961.0/124750	nDCG_at_100: 0.9630
Number of swapped pairs: 25069.0/124750	nDCG_at_100: 0.9691
Number of swapped pairs: 25242.0/124750	nDCG_at_100: 0.9647
Number of swapped pairs: 25100.0/124750	nDCG_at_100: 0.9619
Number of swapped pairs: 25039.0/124750	nDCG_at_100: 0.9701
Number of swapped pairs: 25192.0/124750	nDCG_at_100: 0.9697
Number of swapped pairs: 25121.0/124750	nDCG_at_100: 0.9623
Train loss: 0.3091944767968194


# ListNet

In [51]:
from itertools import combinations
import numpy as np

In [52]:
class ListNet(torch.nn.Module):
    def __init__(self, num_input_features, hidden_dim=10):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.model = torch.nn.Sequential(
            torch.nn.Linear(num_input_features, self.hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(self.hidden_dim, 1),
        )


    def forward(self, input_1):
        logits = self.model(input_1)
        return logits


$$CE = -\sum ^{N}_{j=1} (P_y^i(j) * log(P_z^i(j)))$$

$$\text{Softmax}(x_{i}) = \frac{\exp(x_i)}{\sum_j \exp(x_j)}$$

In [53]:
def listnet_ce_loss(y_i, z_i):
    """
    y_i: (n_i, 1) GT
    z_i: (n_i, 1) preds
    """

    P_y_i = torch.softmax(y_i, dim=0)
    P_z_i = torch.softmax(z_i, dim=0)
    return -torch.sum(P_y_i * torch.log(P_z_i))

def listnet_kl_loss(y_i, z_i):
    """
    y_i: (n_i, 1) GT
    z_i: (n_i, 1) preds
    """
    P_y_i = torch.softmax(y_i, dim=0)
    P_z_i = torch.softmax(z_i, dim=0)
    return -torch.sum(P_y_i * torch.log(P_z_i/P_y_i))


def make_dataset(N_train, N_valid, vector_dim):
    fake_weights = torch.randn(vector_dim, 1)

    X_train = torch.randn(N_train, vector_dim)
    X_valid = torch.randn(N_valid, vector_dim)

    ys_train_score = torch.mm(X_train, fake_weights)
    ys_train_score += torch.randn_like(ys_train_score)

    ys_valid_score = torch.mm(X_valid, fake_weights)
    ys_valid_score += torch.randn_like(ys_valid_score)

#     bins = [-1, 1]  # 3 relevances
    bins = [-1, 0, 1, 2]  # 5 relevances
    ys_train_rel = torch.Tensor(
        np.digitize(ys_train_score.clone().detach().numpy(), bins=bins)
    )
    ys_valid_rel = torch.Tensor(
        np.digitize(ys_valid_score.clone().detach().numpy(), bins=bins)
    )

    return X_train, X_valid, ys_train_rel, ys_valid_rel

In [54]:
N_train = 1000
N_valid = 500

vector_dim = 100
epochs = 2

batch_size = 16

X_train, X_valid, ys_train, ys_valid = make_dataset(N_train, N_valid, vector_dim)

net = ListNet(num_input_features=vector_dim)
opt = torch.optim.Adam(net.parameters())


In [55]:
torch.unique(ys_train)

tensor([0., 1., 2., 3., 4.])

In [56]:
for epoch in range(epochs):
    idx = torch.randperm(N_train)

    X_train = X_train[idx]
    ys_train = ys_train[idx]

    cur_batch = 0
    for it in range(N_train // batch_size):
        batch_X = X_train[cur_batch: cur_batch + batch_size]
        batch_ys = ys_train[cur_batch: cur_batch + batch_size]
        cur_batch += batch_size

        opt.zero_grad()
        if len(batch_X) > 0:
            batch_pred = net(batch_X)
            batch_loss = listnet_kl_loss(batch_ys, batch_pred)
#             batch_loss = listnet_ce_loss(batch_ys, batch_pred)
            batch_loss.backward(retain_graph=True)
            opt.step()

        if it % 10 == 0:
            with torch.no_grad():
                valid_pred = net(X_valid)
                valid_swapped_pairs = num_swapped_pairs(ys_valid.flatten(),
                                                        valid_pred.flatten())
                ndcg_score = ndcg(ys_valid.flatten(), valid_pred.flatten())
            print(f"epoch: {epoch + 1}.\tNumber of swapped pairs: "
                  f"{valid_swapped_pairs}/{N_valid * (N_valid - 1) // 2}\t"
                  f"nDCG: {ndcg_score:.4f}")

epoch: 1.	Number of swapped pairs: 65481/124750	nDCG: 0.8216
epoch: 1.	Number of swapped pairs: 62606/124750	nDCG: 0.8362
epoch: 1.	Number of swapped pairs: 59760/124750	nDCG: 0.8553
epoch: 1.	Number of swapped pairs: 57234/124750	nDCG: 0.8702
epoch: 1.	Number of swapped pairs: 54528/124750	nDCG: 0.8853
epoch: 1.	Number of swapped pairs: 51845/124750	nDCG: 0.9033
epoch: 1.	Number of swapped pairs: 49097/124750	nDCG: 0.9222
epoch: 2.	Number of swapped pairs: 48541/124750	nDCG: 0.9240
epoch: 2.	Number of swapped pairs: 46233/124750	nDCG: 0.9352
epoch: 2.	Number of swapped pairs: 44302/124750	nDCG: 0.9443
epoch: 2.	Number of swapped pairs: 42614/124750	nDCG: 0.9529
epoch: 2.	Number of swapped pairs: 40890/124750	nDCG: 0.9613
epoch: 2.	Number of swapped pairs: 39246/124750	nDCG: 0.9678
epoch: 2.	Number of swapped pairs: 37845/124750	nDCG: 0.9719


# Task solution

In [57]:
import math

import numpy as np
import torch
from catboost.datasets import msrank_10k
from sklearn.preprocessing import StandardScaler

from typing import List


class ListNet(torch.nn.Module):
    def __init__(self, num_input_features: int, hidden_dim: int):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.model = torch.nn.Sequential(
            torch.nn.Linear(num_input_features, hidden_dim),
            torch.nn.Dropout(0.3),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, 1)
        )

    def forward(self, input_1: torch.Tensor) -> torch.Tensor:
        logits = self.model(input_1)
        return logits


class Solution:
    def __init__(self, n_epochs: int = 5, listnet_hidden_dim: int = 30,
                 lr: float = 0.001, ndcg_top_k: int = 10):
        self._prepare_data()
        self.num_input_features = self.X_train.shape[1]
        self.ndcg_top_k = ndcg_top_k
        self.n_epochs = n_epochs

        self.model = self._create_model(
            self.num_input_features, listnet_hidden_dim)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)

    def _get_data(self) -> List[np.ndarray]:
        train_df, test_df = msrank_10k()

        X_train = train_df.drop([0, 1], axis=1).values
        y_train = train_df[0].values
        query_ids_train = train_df[1].values.astype(int)

        X_test = test_df.drop([0, 1], axis=1).values
        y_test = test_df[0].values
        query_ids_test = test_df[1].values.astype(int)

        return [X_train, y_train, query_ids_train, X_test, y_test, query_ids_test]

    def _prepare_data(self) -> None:
        (X_train, y_train, self.query_ids_train,
            X_test, y_test, self.query_ids_test) = self._get_data()
        
        self.X_train = self._scale_features_in_query_groups(X_train, self.query_ids_train)
        self.X_test = self._scale_features_in_query_groups(X_test, self.query_ids_test)

        self.ys_train = torch.as_tensor(y_train, dtype=torch.float32)
        self.ys_test = torch.as_tensor(y_test, dtype=torch.float32)

    def _scale_features_in_query_groups(self, inp_feat_array: np.ndarray,
                                        inp_query_ids: np.ndarray) -> np.ndarray:

        scaled_features = np.zeros_like(inp_feat_array)
        
        for query in np.unique(inp_query_ids):
            idxs = np.where(inp_query_ids == query)[0]
            scaled_features[idxs] = StandardScaler().fit_transform(inp_feat_array[idxs])
        return torch.as_tensor(scaled_features, dtype=torch.float32)

    def _create_model(self, listnet_num_input_features: int,
                      listnet_hidden_dim: int) -> torch.nn.Module:
        torch.manual_seed(0)
        net = ListNet(listnet_num_input_features, listnet_hidden_dim)
        return net

    def fit(self) -> List[float]:
        ndcgs = []
        for epoch in range(self.n_epochs):
            self._train_one_epoch()

            ndcg = self._eval_test_set()
            ndcgs.append(ndcg)
            print(f'Epoch: {epoch+1}. NDCG_{self.ndcg_top_k}: {ndcg}')
        return ndcgs

    def _calc_loss(self, batch_ys: torch.FloatTensor,
                   batch_pred: torch.FloatTensor) -> torch.FloatTensor:
        P_y_i = torch.softmax(batch_ys, dim=0)
        P_z_i = torch.softmax(batch_pred, dim=0)
        return -torch.sum(P_y_i * torch.log(P_z_i))

    def _train_one_epoch(self) -> None:
        self.model.train()
        for query in np.unique(self.query_ids_train):
            query_idxs = np.where(self.query_ids_train == query)[0]
            query_x = self.X_train[query_idxs]
            query_y = self.ys_train[query_idxs]
            
            # здесь можно добавить итерацию по батчам
            self.optimizer.zero_grad()
            
            preds = self.model(query_x).flatten()
            loss = self._calc_loss(query_y, preds)
            loss.backward()
            self.optimizer.step()

    def _eval_test_set(self) -> float:
        with torch.no_grad():
            self.model.eval()
            ndcgs = []
            for query in np.unique(self.query_ids_test):
                query_idxs = np.where(self.query_ids_test == query)[0]
                query_x = self.X_test[query_idxs]
                query_y = self.ys_test[query_idxs].flatten()
                
                preds = self.model(query_x).flatten()
                ndcg = self._ndcg_k(query_y, preds, ndcg_top_k=self.ndcg_top_k)

                ndcgs.append(ndcg)
            return np.mean(ndcgs)

    def _ndcg_k(self, ys_true: torch.Tensor, ys_pred: torch.Tensor,
                ndcg_top_k: int) -> float:
        
        def compute_gain(y_value: float, gain_scheme: str) -> float:
            assert gain_scheme in ['const', 'exp2']
            if gain_scheme == 'const':
                return y_value
            elif gain_scheme == 'exp2':
                return 2 ** y_value - 1
        
        
        def dcg_k(ys_true: Tensor, ys_pred: Tensor, gain_scheme: str, k: int) -> float:
            dcg_value = 0
            _, sorted_ys_pred_idx = sort(ys_pred, descending=True)
            for i, rel in enumerate(ys_true[sorted_ys_pred_idx][:k]):
                dcg_value += compute_gain(rel, gain_scheme=gain_scheme) / log2(i+2)
            return dcg_value

        # расчет по экспоненциальной формуле
        dcg_value = dcg_k(ys_true, ys_pred, gain_scheme='exp2', k=ndcg_top_k)
        perfect_dcg = dcg_k(ys_true, ys_true, gain_scheme='exp2', k=ndcg_top_k)
        
        return dcg_value / perfect_dcg


In [58]:
solution = Solution(listnet_hidden_dim=30)

In [59]:
solution.fit()

Epoch: 1. NDCG_10: 0.41444405913352966
Epoch: 2. NDCG_10: 0.43271610140800476
Epoch: 3. NDCG_10: 0.4335741698741913
Epoch: 4. NDCG_10: 0.4383959472179413
Epoch: 5. NDCG_10: 0.43431270122528076


[0.41444406, 0.4327161, 0.43357417, 0.43839595, 0.4343127]