# LambdaRank Implementation in PyTorch

### Key formulation of LambdaRank

Formulation of pairwise ranking, for document $i$ and $j$ - Ranknet Loss function  

\begin{equation}
\begin{split}
L(y, s) &= \sum_{i=1}^{n}\sum_{j=1}^{n}\mathop{\mathbb{I}_{y_i > y_j}} \log_2(1 + e^{-\sigma(s_i - s_j)}) \\
& = \sum_{y_i > y_j} \log_2(1+e^{-\sigma(s_i - s_j)})
\end{split}
\end{equation}

#### Ranking Metrics - NDGC
\begin{equation}
\text{NDCG} = \frac{1}{\text{maxDCG}} \sum_{i=1}^{n} \frac{2^{y_i} - 1}{\log_2(1+i)} = \sum_{i=1}^{n}\frac{G_i}{D_i}
\end{equation}
where
\begin{equation}
G_i = \frac{2^{y_i} - 1}{\text{maxDCG}}, D_i = \log_2(1+i)
\end{equation}

- $G_i$ is the gain function
- $D_i$ is the discount functions
- $\text{maxDCG}$ is a constant factor per query

#### LambdaRank - Dynamically adjust the loss function during the training based on ranking metrics

Define the change of NDCG
\begin{equation}
\Delta\text{NDCG}(i,j) = |G_i - G_j||\frac{1}{D_i} -  \frac{1}{D_j}|
\end{equation}

Loss function
\begin{equation}
L(y,s) = \sum_{y_i>y_j}\Delta\text{NDCG}(i,j) log_2(1+e^{-\sigma(s_i-s_j)})
\end{equation}

In [2]:
import torch
import numpy as np

from torch.utils.data import Dataset, DataLoader

In [3]:
class MSLR10KDataset(Dataset):
    """MSLR 10K Pairs Dataset"""
    
    def __init__(self, path, mode="single"):
        """
        Args:
            path (str)
            mode (str), "single" or "pairs"
        """
        
        assert mode in ["single", "pairs"]
        
        print("Mode: %s" % mode)
        
        self.path = path
        self.mode = mode
        self.features = []
        self.labels = []
        self.query_ids = []
        
        # Generate dataset
        self._get_format_data(self.path)
        
        if mode == "pairs":
            self.pairs, self.scores, self.i_features, self.j_features, self.i_labels, self.j_labels = \
                self._get_pair_doc_data(self.labels, self.query_ids)
        

    def _get_format_data(self, data_path):
        """
        Extract data from data path
        Args:
            data_path (str): Path of the data file
        """
        
        print("Getting data from %s" % data_path)
        
        def _extract_features(toks):
            """Extract features from tokens (e.g. 1: 0 -> 0)"""
            features = []
            for tok in toks:
                features.append(float(tok.split(":")[1]))
            return features

        def _extract_query_data(tok):
            """Extract query features (e.g. qid: 10 -> 10)"""
            # qid
            query_features = [tok.split(":")[1]]
            return query_features
        
        with open(data_path, 'r', encoding='utf-8') as f:
            for line in f:
                data, _, comment = line.rstrip().partition("#")
                toks = data.split()

                self.labels.append(int(toks[0]))                  # label - The relevance score
                self.features.append(_extract_features(toks[2:]))    # doc features
                self.query_ids.append(_extract_query_data(toks[1]))  # qid
                
    def _get_pair_doc_data(self, y_train, query_id):
        """
        Get pairs data
        Args:
            y_train (list): List of relevance score
            query_id (list): List of query_id
        """
        pairs = []
        scores = []
        i_features = []
        j_features = []
        i_labels = []
        j_labels = []

        for i in range(0, len(query_id) - 1):
            for j in range(i + 1, len(query_id)):

                # Make sure the documents are for the same query id
                if query_id[i][0] != query_id[j][0]:
                    break

                if y_train[i] > y_train[j]:
                    pairs.append((i, j))
                    i_features.append(self.features[i])
                    j_features.append(self.features[j])
                    i_labels.append(y_train[i])
                    j_labels.append(y_train[j])
                    scores.append(1)
                elif y_train[i] < y_train[j]:
                    pairs.append((j, i))
                    i_features.append(self.features[j])
                    j_features.append(self.features[i])
                    i_labels.append(y_train[j])
                    j_labels.append(y_train[i])
                    scores.append(1)
                else:
                    pairs.append((i, j))
                    i_features.append(self.features[i])
                    j_features.append(self.features[j])
                    i_labels.append(y_train[i])
                    j_labels.append(y_train[j])
                    scores.append(0)

        return pairs, scores, i_features, j_features, i_labels, j_labels
    
    def __len__(self):
        return len(self.query_ids)
    
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        if self.mode == "pairs":
            sample = {"pairs": self.pairs[idx],
                      "i_features": torch.tensor(np.array(self.i_features[idx])),
                      "j_features": torch.tensor(np.array(self.j_features[idx])),
                      "i_label": torch.tensor(np.array(self.i_labels[idx])),
                      "j_label": torch.tensor(np.array(self.j_labels[idx])),
                      "scores": torch.tensor(self.scores[idx])}
        elif self.mode == "single":
            sample = {"idx": torch.tensor(np.array(idx), dtype=torch.float),
                      "features": torch.tensor(np.array(self.features[idx]), dtype=torch.float),
                      "label": torch.tensor(np.array(self.labels[idx]), dtype=torch.float)}
        else:
            raise ValueError("Mode should be either single or pairs")
        
        return sample

In [4]:
dataset = MSLR10KDataset(path="./data/MSLR-WEB10K/Fold1/train.txt", mode="single")

Mode: single
Getting data from ./data/MSLR-WEB10K/Fold1/train.txt


In [5]:
len(dataset)

723412

In [6]:
pairs_dataset = MSLR10KDataset(path="./data/MSLR-WEB10K/Fold1/train.txt", mode="pairs")

Mode: pairs
Getting data from ./data/MSLR-WEB10K/Fold1/train.txt


In [7]:
len(pairs_dataset)

723412

In [8]:
pairs_dataset[0]

{'pairs': (0, 1),
 'i_features': tensor([ 3.0000e+00,  3.0000e+00,  0.0000e+00,  0.0000e+00,  3.0000e+00,
          1.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00,
          1.5600e+02,  4.0000e+00,  0.0000e+00,  7.0000e+00,  1.6700e+02,
          6.9313e+00,  2.2077e+01,  1.9673e+01,  2.2255e+01,  6.9266e+00,
          3.0000e+00,  3.0000e+00,  0.0000e+00,  0.0000e+00,  6.0000e+00,
          1.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00,  2.0000e+00,
          1.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00,  2.0000e+00,
          1.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00,  2.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          1.9231e-02,  7.5000e-01,  0.0000e+00,  0.0000e+00,  3.5928e-02,
          6.4100e-03,  2.5000e-01,  0.0000e+00,  0.0000e+00,  1.1976e-02,
          6.4100e-03,  2.5000e-01,  0.0000e+00,  0.0000e+00,  1.1976e-02,
          6.4100e-03,  2.5000e-01,  0.0000e+00,  0.0000e+00,  1.1976e-02,
      

In [9]:
dataloader = DataLoader(dataset, batch_size=8,
                        shuffle=True, num_workers=4)

In [10]:
NUM_FEATURES = len(dataset[0]['features'])

In [11]:
for i_batch, sample_batched in enumerate(dataloader):
    print(sample_batched["features"].size())
    break

torch.Size([8, 136])


## Setup the model

Reference: https://github.com/airalcorn2/RankNet/blob/master/lambdarank.py

In [12]:
import torch.nn as nn
import torch.optim as optim

In [13]:
class RankNet(nn.Module):
    """Pairwise Ranking Ranknet"""
    
    def __init__(self, num_features, hidden_size_1=32, hidden_size_2=16):
        
        super().__init__()
        
        self.model = nn.Sequential(nn.Linear(num_features, hidden_size_1),
                                   nn.Dropout(0.5),
                                   nn.ReLU(),
                                   nn.Linear(hidden_size_1, hidden_size_2),
                                   nn.Dropout(0.5),
                                   nn.ReLU(),
                                   nn.Linear(hidden_size_2, 1))
        self.output = nn.Sigmoid()
        
    def forward(self, input_i, input_j):
        si = self.model(input_i)
        sj = self.model(input_j)
        diff = si - sj
        prob = self.output(diff)
        return prob
    
    def predict(self, x):
        return self.model(x)

In [14]:
ranknet = RankNet(NUM_FEATURES)

In [16]:
# Calculate scores
doc_features = dataset[0]['features']
doc_scores = ranknet.predict(doc_features.float())

In [17]:
doc_scores

tensor([-104588.7656], grad_fn=<AddBackward0>)

In [59]:
def rank_tensors(t):
    rank = torch.zeros(8)
    sorted_values, idx = torch.sort(t, dim=0, descending=True)
    rank[idx] = 1 + torch.arange(t.size()[0]).float().view(-1, 1)
    print(rank.shape)
    return sorted_values, idx, rank

In [57]:
def calc_dcg(scores, labels):
    """
    Calculate DCG
    
    Args:
        scores (torch.Tensor)
    """
    sorted_scores, rank = rank_tensors(scores)
    relevance = labels
    
    print(sorted_scores)
    print(rank)
    print(relevance)
    
    nom = (2 ** relevance) - 1
    denom = torch.log2(rank.float() + 1)
    
    return torch.sum(nom/denom)

In [79]:
for i_batch, sample_batched in enumerate(dataloader):
    doc_features = sample_batched['features']
    doc_relevance_labels = sample_batched['label']
    doc_scores = ranknet.predict(doc_features)
    
    print("doc_scores -> ", doc_scores.flatten())
    print("doc_relevance_labels -> ", doc_relevance_labels.flatten())
    
    # 1. Get the ranking of the scores
    sorted_scores, sorted_scores_idx, _ = rank_tensors(doc_scores)
    scores_ranking = 1 + torch.arange(doc_scores.size()[0])
    sorted_labels = doc_relevance_labels[sorted_scores_idx]
    print("sorted_scores -> ", sorted_scores.flatten())
    print("sorted_labels -> ", sorted_labels.flatten())
    print("scores_ranking ->", scores_ranking.flatten())
    
    dcg = torch.sum(((2 ** sorted_labels) - 1) / torch.log2(1 + scores_ranking.float().view(-1, 1)))
    print("dcg -> ", dcg)
    
    # Max dcg
    sorted_true_labels, sorted_true_labels_idx, _ = rank_tensors(doc_relevance_labels.view(-1, 1))
    print("sorted_true_labels -> ", sorted_true_labels.flatten())
    print("sorted_true_labels_idx -> ", sorted_true_labels_idx.flatten())
    print("scores_ranking ->", scores_ranking.flatten())
    
    max_dcg = torch.sum(((2 ** sorted_true_labels) - 1) / torch.log2(1 + scores_ranking.float().view(-1, 1)))
    print("max_dcg -> ", max_dcg)
    
    ndcg = dcg / max_dcg
    print("ndcg -> ", ndcg)
    
    break

doc_scores ->  tensor([ 4.1295e+03,  1.3244e+03,  1.2197e-01, -3.7664e+01,  7.1668e+02,
        -1.4218e+02, -5.2687e+02,  5.0729e+02], grad_fn=<AsStridedBackward>)
doc_relevance_labels ->  tensor([1., 1., 0., 0., 1., 0., 1., 0.])
torch.Size([8])
sorted_scores ->  tensor([ 4.1295e+03,  1.3244e+03,  7.1668e+02,  5.0729e+02,  1.2197e-01,
        -3.7664e+01, -1.4218e+02, -5.2687e+02], grad_fn=<AsStridedBackward>)
sorted_labels ->  tensor([1., 1., 1., 0., 0., 0., 0., 1.])
scores_ranking -> tensor([1, 2, 3, 4, 5, 6, 7, 8])
dcg ->  tensor(2.4464)
torch.Size([8])
sorted_true_labels ->  tensor([1., 1., 1., 1., 0., 0., 0., 0.])
sorted_true_labels_idx ->  tensor([0, 1, 4, 6, 2, 3, 5, 7])
scores_ranking -> tensor([1, 2, 3, 4, 5, 6, 7, 8])
max_dcg ->  tensor(2.5616)
ndcg ->  tensor(0.9550)


In [None]:
def delta_ndcg(i, j, i_labels, j_labels):
    """
    Calculate the delta of NDCG
    Args:
        i: i scores
        j: j scores
    """
    pass