# LambdaRank Implementation in PyTorch

### Key formulation of LambdaRank

Pairwise ranking formulation for documents $i$ and $j$ - the RankNet loss function  

\begin{equation}
\begin{split}
L(y, s) &= \sum_{i=1}^{n}\sum_{j=1}^{n}\mathop{\mathbb{I}_{y_i > y_j}} \log_2(1 + e^{-\sigma(s_i - s_j)}) \\
& = \sum_{y_i > y_j} \log_2(1+e^{-\sigma(s_i - s_j)})
\end{split}
\end{equation}

#### Ranking Metrics - NDCG
\begin{equation}
\text{NDCG} = \frac{1}{\text{maxDCG}} \sum_{i=1}^{n} \frac{2^{y_i} - 1}{\log_2(1+i)} = \sum_{i=1}^{n}\frac{G_i}{D_i}
\end{equation}
where
\begin{equation}
G_i = \frac{2^{y_i} - 1}{\text{maxDCG}}, D_i = \log_2(1+i)
\end{equation}

- $G_i$ is the gain function
- $D_i$ is the discount function
- $\text{maxDCG}$ is a constant factor per query

#### LambdaRank - Dynamically adjust the loss function during the training based on ranking metrics

Define the change of NDCG
\begin{equation}
\Delta\text{NDCG}(i,j) = |G_i - G_j||\frac{1}{D_i} -  \frac{1}{D_j}|
\end{equation}

Loss function
\begin{equation}
L(y,s) = \sum_{y_i>y_j}\Delta\text{NDCG}(i,j) \log_2(1+e^{-\sigma(s_i-s_j)})
\end{equation}

In [11]:
import torch
import numpy as np

from torch.utils.data import Dataset, DataLoader

In [9]:
class MSLR10KDataset(Dataset):
    """Pairwise training samples built from an MSLR-WEB10K style ranking file.

    Each data line is expected to look like
    ``<label> qid:<id> <k>:<v> <k>:<v> ... [# comment]``.
    Documents of the same query must be contiguous in the file: pairing for
    a document stops at the first following document with a different qid.

    Attributes (plain Python lists, filled at construction time):
        features: Feature vector of every document, in file order.
        labels: Integer relevance label of every document.
        query_ids: One-element list ``[qid]`` per document (qid kept as str).
        pairs: ``(i, j)`` document indices of every sample.
        scores: 1 when document i is more relevant than document j, 0 on a tie.
        i_features: Feature vector of the "i" document of every pair.
        j_features: Feature vector of the "j" document of every pair.
    """

    def __init__(self, path):
        """
        Load the file and build every within-query document pair.

        Args:
            path (str): Path of the data file
        """
        self.path = path
        self.features = []
        self.labels = []
        self.query_ids = []

        # Generate dataset
        self._get_format_data(self.path)
        self.pairs, self.scores, self.i_features, self.j_features = \
            self._get_pair_doc_data(self.labels, self.query_ids)

    def _get_format_data(self, data_path):
        """
        Parse the data file into self.labels, self.features and self.query_ids.

        Args:
            data_path (str): Path of the data file
        """
        print(f"Getting data from {data_path}")

        with open(data_path, 'r', encoding='utf-8') as f:
            for line in f:
                # Anything after '#' is a trailing comment and is ignored.
                data = line.partition("#")[0]
                toks = data.split()
                if not toks:
                    # Blank or comment-only line: nothing to parse.
                    continue

                label, qid_tok, *feature_toks = toks
                # label - The relevance score
                self.labels.append(int(label))
                # doc features (e.g. "1:0.5" -> 0.5)
                self.features.append(
                    [float(tok.split(":")[1]) for tok in feature_toks])
                # qid (e.g. "qid:10" -> ["10"])
                self.query_ids.append([qid_tok.split(":")[1]])

    def _get_pair_doc_data(self, y_train, query_id):
        """
        Build all document pairs that belong to the same query.

        The more relevant document is always placed on the "i" side, so the
        score is 1 for an ordered pair and 0 when both labels are equal.

        Args:
            y_train (list): List of relevance score
            query_id (list): List of query_id (one-element lists)

        Returns:
            tuple: (pairs, scores, i_features, j_features), four parallel lists
        """
        pairs = []
        scores = []
        i_features = []
        j_features = []

        n_docs = len(query_id)
        for i in range(n_docs - 1):
            for j in range(i + 1, n_docs):

                # Make sure the documents are for the same query id
                if query_id[i][0] != query_id[j][0]:
                    break

                # Put the more relevant document first; ties keep file order.
                if y_train[i] < y_train[j]:
                    first, second = j, i
                else:
                    first, second = i, j

                pairs.append((first, second))
                i_features.append(self.features[first])
                j_features.append(self.features[second])
                scores.append(0 if y_train[i] == y_train[j] else 1)

        return pairs, scores, i_features, j_features

    def __len__(self):
        # One sample per pair: __getitem__ indexes the pair lists, so the
        # length must be the number of pairs, not the number of documents.
        return len(self.pairs)

    def __getitem__(self, idx):
        """
        Return the idx-th pair as a dict with keys "pairs", "i_features",
        "j_features" and "scores".
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # float32 matches the default parameter dtype of torch.nn layers;
        # float64 inputs are rejected by a float32 nn.Linear.
        sample = {"pairs": self.pairs[idx],
                  "i_features": torch.tensor(self.i_features[idx],
                                             dtype=torch.float32),
                  "j_features": torch.tensor(self.j_features[idx],
                                             dtype=torch.float32),
                  "scores": torch.tensor(self.scores[idx])}

        return sample

In [5]:
# Load the Fold1 training split; the constructor parses the file and builds
# the pairwise samples.
dataset = MSLR10KDataset(path="./data/MSLR-WEB10K/Fold1/train.txt")

Getting data from ./data/MSLR-WEB10K/Fold1/train.txt


In [14]:
# Length of a document feature vector, read from the "i" side of the first sample.
NUM_FEATURES = len(dataset[0]['i_features'])

## Setup the model

Reference: https://github.com/airalcorn2/RankNet/blob/master/lambdarank.py

In [7]:
import torch.nn as nn
import torch.optim as optim

In [8]:
class RankNet(nn.Module):
    """Pairwise RankNet scorer.

    A small feed-forward network maps a document feature vector to a single
    score. The forward pass turns the score gap between two documents into
    a value in (0, 1) with a sigmoid.
    """

    def __init__(self, num_features, hidden_size_1=32, hidden_size_2=16):
        """
        Args:
            num_features (int): Size of a document feature vector
            hidden_size_1 (int): Width of the first hidden layer
            hidden_size_2 (int): Width of the second hidden layer
        """
        super().__init__()

        # Layer order is kept as-is so the state_dict keys stay
        # model.0 / model.3 / model.6.
        layers = [
            nn.Linear(num_features, hidden_size_1),
            nn.Dropout(0.5),
            nn.ReLU(),
            nn.Linear(hidden_size_1, hidden_size_2),
            nn.Dropout(0.5),
            nn.ReLU(),
            nn.Linear(hidden_size_2, 1),
        ]
        self.model = nn.Sequential(*layers)
        self.output = nn.Sigmoid()

    def forward(self, input_i, input_j):
        """Return sigmoid(s_i - s_j) for the two document inputs."""
        score_gap = self.model(input_i) - self.model(input_j)
        return self.output(score_gap)

    def predict(self, x):
        """Return the raw score of the scoring network (no sigmoid)."""
        return self.model(x)

In [16]:
# Build the scorer with the default hidden sizes (32 and 16).
ranknet = RankNet(NUM_FEATURES)

In [None]:
# Calculate scores: take the "i" document of the first sample and run it
# through the raw scoring network (predict applies no sigmoid).
doc_features = dataset[0]['i_features']
doc_scores = ranknet.predict(doc_features)

In [None]:
# Calculate document rank.
# sort() returns the sorting permutation, not the rank of each document, so
# the permutation is inverted with a second argsort. Ranks are 1-based and the
# highest score gets rank 1, which is what D_i = log2(1 + i) expects.
# dim=0 sorts across documents for both (n,) and (n, 1) score tensors.
sorted_scores, order = doc_scores.sort(dim=0, descending=True)
rank = order.argsort(dim=0) + 1

In [None]:
# Discount function D_i = log2(1 + i).
# NOTE(review): this assumes `rank` holds the 1-based rank position of each
# document (best document = 1) - confirm against the cell that defines it.
D_fcn = torch.log2(1 + rank)

In [None]:
def dcg(scores):
    """
    Calculate DCG
    
    Args:
        scores (torch.Tensor)
    """
    sorted_scores, rank = scores.sort()
    
    nom = (2 ** scores) - 1
    denom = torch.log2(rank + 1)
    
    