In [1]:
# !pip install catboost

In [1]:
import torch
import torchvision.utils

import numpy as np
import pandas as pd
from tqdm import tqdm

from scipy import stats as sts

from utils import ndcg, num_swapped_pairs, compute_ideal_dcg

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device

'cuda:0'

# LambdaRank

Обучается ставить score выше для того, кто должен быть выше в списке. Для обучение нужно брать пары {i, j}, где i документ стоит выше, чем j. Для обучения нужно выбрать документ i и взять все пары, где он встречается, будь то {i, j} (где он выше) или {k, i} (где ниже).

$$C_{ij}=C(s_{i}-s_{j})=-\bar{P_{ij}}log(P_{ij})-(1-\bar{P_{ij}})log(1-P_{ij})=\frac{1}{2}(1-S_{ij})\sigma(s_{i}-s_{j})+log(1+e^{-\sigma(s_{i}-s_{j})})$$

$$\bar{P_{ij}}=\frac{1}{2}(1+S_{ij})$$

$$S_{ij}\in\{0;\pm1\}$$

In [2]:
class LambdaRank(torch.nn.Module):
    def __init__(self, num_input_features, hidden_dim=16):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.model = torch.nn.Sequential(
            torch.nn.Linear(num_input_features, self.hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(self.hidden_dim, 1),
        )

    def forward(self, inp):
        logits = self.model(inp)
        return logits

In [81]:
def compute_lambdas(y_true, y_pred, gain_scheme='exp2'):
    # рассчитаем нормировку, IdealDCG
    ideal_dcg = compute_ideal_dcg(y_true, gain_scheme=gain_scheme)
    try:
        N = 1 / ideal_dcg
    except ZeroDivisionError:
        N = 0
    
    # рассчитаем порядок документов согласно оценкам релевантности
    _, rank_order = torch.sort(y_true, descending=True, axis=0)
    rank_order += 1
    
    with torch.no_grad():
        # получаем все попарные разницы скоров в батче
        pos_pairs_score_diff = 1.0 + torch.exp((y_pred - y_pred.t()))
        
        # поставим разметку для пар, 1 если первый документ релевантнее
        # -1 если второй документ релевантнее
        Sij = compute_labels_in_batch(y_true)
        # посчитаем изменение gain из-за перестановок
        gain_diff = compute_gain_diff(y_true, gain_scheme)
        
        # посчитаем изменение знаменателей-дискаунтеров
        decay_diff = (1.0 / torch.log2(rank_order + 1.0)) - (1.0 / torch.log2(rank_order.t() + 1.0))
        # посчитаем непосредственное изменение nDCG
        delta_ndcg = torch.abs(N * gain_diff * decay_diff)
        # посчитаем лямбды
        lambda_update =  (0.5 * (1 - Sij) - 1 / pos_pairs_score_diff) * delta_ndcg
        lambda_update = torch.sum(lambda_update, dim=1, keepdim=True)
        
        return lambda_update
    
    
def compute_labels_in_batch(y_true):
    
    # разница релевантностей каждого с каждым объектом
    rel_diff = y_true - y_true.t()
    
    # 1 в этой матрице - объект более релевантен
    pos_pairs = (rel_diff > 0).type(torch.float32)
    
    # 1 тут - объект менее релевантен
    neg_pairs = (rel_diff < 0).type(torch.float32)
    Sij = pos_pairs - neg_pairs
    return Sij

def compute_gain_diff(y_true, gain_scheme):
    if gain_scheme == "exp2":
        gain_diff = torch.pow(2.0, y_true) - torch.pow(2.0, y_true.t())
    elif gain_scheme == "const":
        gain_diff = y_true - y_true.t()
    else:
        raise ValueError(f"{gain_scheme} method not supported")
    return gain_diff

In [4]:
lambda_model = LambdaRank(num_input_features=10)

In [5]:
inp = torch.rand(4, 10)
y_true = torch.Tensor([[1], [3], [2], [0]])
# batch_size x input_dim
inp

tensor([[0.3256, 0.1433, 0.6899, 0.0219, 0.1388, 0.9045, 0.3842, 0.9491, 0.6014,
         0.8825],
        [0.3464, 0.6013, 0.6932, 0.0778, 0.1991, 0.2231, 0.4087, 0.5377, 0.0096,
         0.9442],
        [0.5481, 0.0355, 0.2304, 0.8616, 0.3843, 0.7930, 0.0048, 0.5900, 0.0301,
         0.1524],
        [0.3828, 0.5160, 0.0010, 0.0387, 0.6079, 0.3299, 0.2769, 0.8888, 0.2703,
         0.8580]])

In [11]:
preds = lambda_model(inp)
print(ndcg(y_true, preds))
preds

0.6913328532777214


tensor([[-0.0494],
        [-0.1123],
        [-0.0916],
        [-0.0911]], grad_fn=<AddmmBackward0>)

In [12]:
lambdas = compute_lambdas(y_true, preds)

In [13]:
optimizer = torch.optim.Adam(lambda_model.parameters())

In [14]:
for i in range(100):
    optimizer.zero_grad()
    preds = lambda_model(inp)
    if i % 20 == 0:
        print(preds)
        print(ndcg(y_true, preds))
    lambdas = compute_lambdas(y_true, preds)

    preds.backward(lambdas/preds.shape[0])
    optimizer.step()

tensor([[-0.0494],
        [-0.1123],
        [-0.0916],
        [-0.0911]], grad_fn=<AddmmBackward0>)
0.6913328532777214
tensor([[-0.1465],
        [-0.1094],
        [-0.1025],
        [-0.1756]], grad_fn=<AddmmBackward0>)
0.9224945508080385
tensor([[-0.2130],
        [-0.0955],
        [-0.1035],
        [-0.2364]], grad_fn=<AddmmBackward0>)
1.0
tensor([[-0.2632],
        [-0.0565],
        [-0.1029],
        [-0.2844]], grad_fn=<AddmmBackward0>)
1.0
tensor([[-0.3216],
        [-0.0064],
        [-0.1022],
        [-0.3392]], grad_fn=<AddmmBackward0>)
1.0


## Попробуем обучить

In [39]:
from catboost.datasets import msrank_10k

msrank_10k_train, msrank_10k_test = msrank_10k()
msrank_10k_train = msrank_10k_train.rename(columns={0: 'target', 1: 'query'}).rename(columns={i: f'feature_{i}' for i in range(2, 138)})
msrank_10k_test = msrank_10k_test.rename(columns={0: 'target', 1: 'query'}).rename(columns={i: f'feature_{i}' for i in range(2, 138)})

for feature in msrank_10k_train.columns:
    msrank_10k_train[feature] = msrank_10k_train[feature].astype(float)

for feature in msrank_10k_test.columns:
    msrank_10k_test[feature] = msrank_10k_test[feature].astype(float)

msrank_10k_train.head()

Unnamed: 0,target,query,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_128,feature_129,feature_130,feature_131,feature_132,feature_133,feature_134,feature_135,feature_136,feature_137
0,2.0,1.0,3.0,3.0,0.0,0.0,3.0,1.0,1.0,0.0,...,62.0,11089534.0,2.0,116.0,64034.0,13.0,3.0,0.0,0.0,0.0
1,2.0,1.0,3.0,0.0,3.0,0.0,3.0,1.0,0.0,1.0,...,54.0,11089534.0,2.0,124.0,64034.0,1.0,2.0,0.0,0.0,0.0
2,0.0,1.0,3.0,0.0,2.0,0.0,3.0,1.0,0.0,0.666667,...,45.0,3.0,1.0,124.0,3344.0,14.0,67.0,0.0,0.0,0.0
3,2.0,1.0,3.0,0.0,3.0,0.0,3.0,1.0,0.0,1.0,...,56.0,11089534.0,13.0,123.0,63933.0,1.0,3.0,0.0,0.0,0.0
4,1.0,1.0,3.0,0.0,3.0,0.0,3.0,1.0,0.0,1.0,...,64.0,5.0,7.0,256.0,49697.0,1.0,13.0,0.0,0.0,0.0


### Надо отнормировать признаки перед подачей в нейросеть, нормировать надо по группам, так как разные признаки соответствуют разным запросам

In [40]:
from sklearn.preprocessing import StandardScaler

for query in msrank_10k_train['query'].unique():
    scaler = StandardScaler()
    msrank_10k_train.loc[msrank_10k_train['query'] == query, 'feature_2':] = scaler\
    .fit_transform(msrank_10k_train.loc[msrank_10k_train['query'] == query, 'feature_2':])

for query in msrank_10k_test['query'].unique():
    scaler = StandardScaler()
    msrank_10k_test.loc[msrank_10k_test['query'] == query, 'feature_2':] = scaler\
    .fit_transform(msrank_10k_test.loc[msrank_10k_test['query'] == query, 'feature_2':])

In [41]:
x_check = torch.as_tensor(msrank_10k_train.loc[msrank_10k_train['query'] == 1, 'feature_2':].values, dtype=torch.float32)
y_check = torch.as_tensor(msrank_10k_train.loc[msrank_10k_train['query'] == 1, 'target'].values, dtype=torch.float32).view(-1, 1)

In [42]:
# test the model works

model = LambdaRank(num_input_features=136)
optimizer = torch.optim.Adam(model.parameters())

In [43]:
for i in range(100):
    optimizer.zero_grad()
    preds = model(x_check)
    if i % 20 == 0:
        print(ndcg(y_check, preds))
    lambdas = compute_lambdas(y_check, preds)

    preds.backward(lambdas/preds.shape[0])
    optimizer.step()

0.5891392105347059
0.8938722611119746
0.9149902933369998
0.9232238948801027
0.9267892939268222


In [44]:
class MSRankDataset(torch.utils.data.Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, index):
        x = self.x[index]
        y = self.y[index]

        return torch.as_tensor(x, dtype=torch.float32), torch.as_tensor(y, dtype=torch.float32)

In [45]:
x = msrank_10k_train.loc[msrank_10k_train['query'] == 1, 'feature_2':].values
y = msrank_10k_train.loc[msrank_10k_train['query'] == 1, 'target'].values.reshape(-1, 1)

dataset = MSRankDataset(x, y)

In [46]:
# test the model works

model = LambdaRank(num_input_features=136)
optimizer = torch.optim.Adam(model.parameters())

dataloader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)

In [47]:
for i in range(100):
    optimizer.zero_grad()
    for batch_x, batch_y in dataloader:
        preds = model(batch_x)
        lambdas = compute_lambdas(batch_y, preds)
        preds.backward(lambdas/batch_x.shape[0])
        
    optimizer.step()

    if i % 20 == 0:
        with torch.no_grad():
            whole_preds = model(x_check)
            print(ndcg(y_check, whole_preds))

0.720676636856725
0.8651334896561893
0.8954911591404534
0.9077700753816907
0.9133565127549089


### Обучение модели

In [100]:
# train
train_datasets = {}
train_dataloaders = {}

for query in msrank_10k_train['query'].unique():
    subset = msrank_10k_train.loc[msrank_10k_train['query'] == query]
    subset_x = subset.loc[:, 'feature_2':].values
    subset_y = subset.loc[:, 'target'].values.reshape(-1, 1)
    
    dataset = MSRankDataset(subset_x, subset_y)
    train_datasets[query] = dataset
    train_dataloaders[query] = torch.utils.data.DataLoader(dataset, batch_size=128, shuffle=True)

# test
test_datasets = {}
test_dataloaders = {}

for query in msrank_10k_test['query'].unique():
    subset = msrank_10k_test.loc[msrank_10k_test['query'] == query]
    subset_x = subset.loc[:, 'feature_2':].values
    subset_y = subset.loc[:, 'target'].values.reshape(-1, 1)
    
    dataset = MSRankDataset(subset_x, subset_y)
    test_datasets[query] = dataset
    test_dataloaders[query] = torch.utils.data.DataLoader(dataset, batch_size=128, shuffle=True)

In [101]:
def score_model(model, dataloader):
    model.eval()

    preds = []
    y_true = []
    with torch.no_grad():
        for it, (batch_x, batch_y) in enumerate(dataloader):
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            
            preds.append(model(batch_x))
            y_true.append(batch_y)
    preds = torch.concatenate(preds, dim=0)
    y_true = torch.concatenate(y_true, dim=0)

    try:
        result = ndcg(y_true, preds, gain_scheme='exp2', k=10)
    except ZeroDivisionError:
        result = 0

    return result

In [102]:
model = LambdaRank(num_input_features=136, hidden_dim=32)
model.to(device)

optimizer = torch.optim.Adam(model.parameters())

In [103]:
# score untrained

train_ndcgs = []
test_ndcgs = []

print('########### TRAIN')
for query, msrank_dataloader in train_dataloaders.items():
    ndcg_query = score_model(model, msrank_dataloader)
    # print(f'#### QUERY={query}, NDCG={ndcg_query}')
    train_ndcgs.append(ndcg_query)

print('########### TEST')
for query, msrank_dataloader in test_dataloaders.items():
    ndcg_query = score_model(model, msrank_dataloader)
    # print(f'#### QUERY={query}, NDCG={ndcg_query}')
    test_ndcgs.append(ndcg_query)

print(f'MEAN TRAIN={np.mean(train_ndcgs)}')
print(f'MEAN TEST={np.mean(test_ndcgs)}')

########### TRAIN
########### TEST
MEAN TRAIN=0.10866858044176127
MEAN TEST=0.11949016964402541


In [104]:
epochs = 25
len_dataset = len(msrank_10k_train)

for epoch in range(epochs):
    model.train()
    print(f'Epoch: {epoch}')
    train_losses = []
    for query, msrank_dataloader in train_dataloaders.items():
        for it, (batch_x, batch_y) in enumerate(msrank_dataloader):
    
            optimizer.zero_grad()
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
    
            batch_pred = model(batch_x)
            lambdas = compute_lambdas(batch_y, batch_pred)
            batch_pred.backward(lambdas/batch_x.shape[0])
    
        optimizer.step()

    # check scores
    train_ndcgs = []
    test_ndcgs = []
    
    print('########### TRAIN')
    for query, msrank_dataloader in train_dataloaders.items():
        ndcg_query = score_model(model, msrank_dataloader)
        # print(f'#### QUERY={query}, NDCG={ndcg_query}')
        train_ndcgs.append(ndcg_query)
    
    print('########### TEST')
    for query, msrank_dataloader in test_dataloaders.items():
        ndcg_query = score_model(model, msrank_dataloader)
        # print(f'#### QUERY={query}, NDCG={ndcg_query}')
        test_ndcgs.append(ndcg_query)
    
    print(f'MEAN TRAIN={np.mean(train_ndcgs)}')
    print(f'MEAN TEST={np.mean(test_ndcgs)}')

Epoch: 0
########### TRAIN
########### TEST
MEAN TRAIN=0.40812127526588526
MEAN TEST=0.3452550271309377
Epoch: 1
########### TRAIN
########### TEST
MEAN TRAIN=0.4561624446357904
MEAN TEST=0.361557087164106
Epoch: 2
########### TRAIN
########### TEST
MEAN TRAIN=0.48806239120891987
MEAN TEST=0.3981795669297291
Epoch: 3
########### TRAIN
########### TEST
MEAN TRAIN=0.48414182749977436
MEAN TEST=0.39828112286589423
Epoch: 4
########### TRAIN
########### TEST
MEAN TRAIN=0.4731208490013138
MEAN TEST=0.39805250629393013
Epoch: 5
########### TRAIN
########### TEST
MEAN TRAIN=0.4803048019623369
MEAN TEST=0.41241678944021737
Epoch: 6
########### TRAIN
########### TEST
MEAN TRAIN=0.47805806366205916
MEAN TEST=0.41638812427488275
Epoch: 7
########### TRAIN
########### TEST
MEAN TRAIN=0.5018134056763615
MEAN TEST=0.4246640499767288
Epoch: 8
########### TRAIN
########### TEST
MEAN TRAIN=0.5117037513826276
MEAN TEST=0.4346634366644597
Epoch: 9
########### TRAIN
########### TEST
MEAN TRAIN=0.517401650