In [1]:
# Michael A. Alcorn (malcorn@redhat.com)
# A (slightly modified) implementation of LamdaRank as described in [1].
#   [1] https://papers.nips.cc/paper/2971-learning-to-rank-with-nonsmooth-cost-functions.pdf
#   [2] https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/MSR-TR-2010-82.pdf

import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [266]:
# (C) Mathieu Blondel, November 2013
# License: BSD 3 clause

import numpy as np



def dcg_score(y_true, y_score, k=10, gains="exponential"):
    """Discounted cumulative gain (DCG) at rank k
    Parameters
    ----------
    y_true : array-like, shape = [n_samples]
        Ground truth (true relevance labels).
    y_score : array-like, shape = [n_samples]
        Predicted scores.
    k : int
        Rank.
    gains : str
        Whether gains should be "exponential" (default) or "linear".
    Returns
    -------
    DCG @k : float
    """
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])

    if gains == "exponential":
        gains = 2 ** y_true - 1
    elif gains == "linear":
        gains = y_true
    else:
        raise ValueError("Invalid gains option.")

    # highest rank is 1 so +2 instead of +1
    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gains / discounts)


def ndcg_score(y_true, y_score, k=10, gains="exponential"):
    """Normalized discounted cumulative gain (NDCG) at rank k
    Parameters
    ----------
    y_true : array-like, shape = [n_samples]
        Ground truth (true relevance labels).
    y_score : array-like, shape = [n_samples]
        Predicted scores.
    k : int
        Rank.
    gains : str
        Whether gains should be "exponential" (default) or "linear".
    Returns
    -------
    NDCG @k : float
    """
    best = dcg_score(y_true, y_true, k, gains)
    actual = dcg_score(y_true, y_score, k, gains)
    return actual / best

In [265]:
#ndcg_score([1,2,3],[1,2,3])

In [8]:
# Data.
input_dim = 46
data_file = 'data/MQ2007/Fold1/train.txt'
data_dir = 'data/MQ2007/Fold1/q_json'
data_meta_csv = "{}/metafile.csv".format(data_dir)

feats_to_drop = ['doc_id','inc','prob','qid','y']

In [9]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [10]:
# couple of days
# day that you recieve the offer 
# email: 
# 2 weeks
# location: santa-calara, austin

In [11]:
#label_tensor = torch.tensor([1,0,0,0,1])

In [12]:
#(label_tensor==0).sum().item()

In [13]:
class RANKNET_DS(Dataset):
    """Document Ranking Dataset."""
    def __init__(self, csv_file, root_dir, transform=None):
        """
        Args:
            text_file (string): Path to the txt file with q_id.
            root_dir (string): Directory with all the query_details.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.meta_file = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform
        self.feats_to_drop = feats_to_drop

    def __len__(self):
        return len(self.meta_file)

    def __getitem__(self, idx):
        q_fname = os.path.join(self.root_dir,str(self.meta_file.iloc[idx]['qid']))
        q_data = pd.read_csv("{}.csv".format(q_fname))
        if self.transform:
            return self.transform(q_data,self.feats_to_drop)
            
        return q_data

In [14]:
ranknet_ds = RANKNET_DS(data_meta_csv,data_dir,transform)
label,data = ranknet_ds[13]

NameError: name 'transform' is not defined

In [249]:
def transform(q_sample,cols_to_drop):
    """
        input dataframe
        transforms datafram into tensor
    """
    
    label_tensor = torch.tensor(q_sample['y'].values)
    n_rel = (label_tensor!=0).sum().item()
    data_tensor = torch.tensor(q_sample[q_sample.columns.difference(cols_to_drop)].values).float()
    return label_tensor,data_tensor,n_rel
    
class DOC_RANK(Dataset):
    """Document Ranking Dataset."""
    def __init__(self, csv_file, root_dir, transform=None):
        """
        Args:
            text_file (string): Path to the txt file with q_id.
            root_dir (string): Directory with all the query_details.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.meta_file = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform
        self.feats_to_drop = feats_to_drop

    def __len__(self):
        return len(self.meta_file)

    def __getitem__(self, idx):
        q_fname = os.path.join(self.root_dir,str(self.meta_file.iloc[idx]['qid']))
        q_data = pd.read_csv("{}.csv".format(q_fname))
        if self.transform:
            return self.transform(q_data,self.feats_to_drop)
            
        return q_data

In [223]:
# label,data = dataset[13]
# print(label)

# Model

In [251]:
# Model.
model = torch.nn.Sequential(
    nn.Linear(input_dim, 128),
    nn.ReLU(),
    nn.Linear(128, 64),
    nn.ReLU(),
    nn.Linear(64, 32),
    nn.ReLU(),
    nn.Linear(32, 1))

model = model.to(device)

In [253]:
dataset = DOC_RANK(data_meta_csv,data_dir,transform)
optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001)

In [291]:
def idcg(n_rel):
    # Assuming binary relevance.
    nums = np.ones(n_rel)
    denoms = np.log2(np.arange(n_rel) + 1 + 1)
    return (nums / denoms).sum()

for epoch in range(3):
    for labels,docs,n_rel in dataset:
        doc_scores = model(docs)
        n_docs = len(labels)
        n_irr = n_docs- n_rel
        #forward_compute
        doc_scores = model(docs)

        # Document ranks.
        (sorted_scores, sorted_idxs) = doc_scores.sort(dim = 0, descending = True)
        doc_ranks = torch.zeros(n_docs,dtype=torch.float).to(device)
        doc_ranks[sorted_idxs] = 1.0 + torch.arange(n_docs).view((n_docs, 1)).float().to(device)
        doc_ranks = doc_ranks.view((n_docs, 1))

        # # Compute lambdas.
        diffs = doc_scores[:n_rel] - doc_scores[n_rel:].view(n_irr)
        exped = diffs.exp()
        # # See equation (6) in [2].
        N = 1 / idcg(n_rel)
        #print('idgc =',N)
        ndcg_diffs = (1 / (1 + doc_ranks[:n_rel])).log2() - (1 / (1 + doc_ranks[n_rel:])).log2().view(n_irr)
        lamb_updates = -1 / (1 + exped) * N * ndcg_diffs.abs()
        # # See section 6.1 in [1], but lambdas have opposite signs from [2].
        lambs = torch.zeros((n_docs, 1)).to(device)
        lambs[:n_rel] -= lamb_updates.sum(dim = 1, keepdim = True)
        lambs[n_rel:] += lamb_updates.sum(dim = 0, keepdim = True).t()

        if i%100==0:
            print(ndcg_score([1]*n_rel+[2]*n_irr,doc_ranks.squeeze(1).tolist()))
        # # Accumulate lambda scaled gradients.
        model.zero_grad()
        doc_scores.backward(lambs)
        optimizer.step()

RuntimeError: The size of tensor a (0) must match the size of tensor b (40) at non-singleton dimension 0

In [262]:
n_irr

31

In [190]:
sorted_idxs.squeeze()

tensor([38,  8, 29, 10, 13, 22, 15,  1, 21, 26, 32, 24, 39,  4, 14, 19, 34, 28,
        12, 33,  7, 37, 30, 16,  9,  3, 11,  2,  5, 36, 18, 35,  6, 23, 31, 25,
        20,  0, 27, 17])

In [10]:
1+torch.arange(n_docs).view((n_docs, 1))

tensor([[ 1],
        [ 2],
        [ 3],
        [ 4],
        [ 5],
        [ 6],
        [ 7],
        [ 8],
        [ 9],
        [10],
        [11],
        [12],
        [13],
        [14],
        [15],
        [16],
        [17],
        [18],
        [19],
        [20]])