In [1]:
# A (slightly modified) implementation of LamdaRank as described in [1].
#   [1] https://papers.nips.cc/paper/2971-learning-to-rank-with-nonsmooth-cost-functions.pdf
#   [2] https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/MSR-TR-2010-82.pdf

import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [2]:
# (C) Mathieu Blondel, November 2013
# License: BSD 3 clause

import numpy as np



def dcg_score(y_true, y_score, k=10, gains="exponential"):
    """Discounted cumulative gain (DCG) at rank k
    Parameters
    ----------
    y_true : array-like, shape = [n_samples]
        Ground truth (true relevance labels).
    y_score : array-like, shape = [n_samples]
        Predicted scores.
    k : int
        Rank.
    gains : str
        Whether gains should be "exponential" (default) or "linear".
    Returns
    -------
    DCG @k : float
    """
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])

    if gains == "exponential":
        gains = 2 ** y_true - 1
    elif gains == "linear":
        gains = y_true
    else:
        raise ValueError("Invalid gains option.")

    # highest rank is 1 so +2 instead of +1
    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gains / discounts)


def ndcg_score(y_true, y_score, k=10, gains="exponential"):
    """Normalized discounted cumulative gain (NDCG) at rank k
    Parameters
    ----------
    y_true : array-like, shape = [n_samples]
        Ground truth (true relevance labels).
    y_score : array-like, shape = [n_samples]
        Predicted scores.
    k : int
        Rank.
    gains : str
        Whether gains should be "exponential" (default) or "linear".
    Returns
    -------
    NDCG @k : float
    """
    best = dcg_score(y_true, y_true, k, gains)
    actual = dcg_score(y_true, y_score, k, gains)
    return actual / best

In [3]:
#ndcg_score([1,2,3],[1,2,3])

In [4]:
# Data.
input_dim = 46
data_file = 'data/MQ2007/Fold1/train.txt'
data_dir = 'data/MQ2007/Fold1/q_json'
data_meta_csv = "{}/metafile.csv".format(data_dir)

feats_to_drop = ['doc_id','inc','prob','qid','y']

In [5]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [6]:
# couple of days
# day that you recieve the offer 
# email: 
# 2 weeks
# location: santa-calara, austin

In [11]:
#label_tensor = torch.tensor([1,0,0,0,1])

In [12]:
#(label_tensor==0).sum().item()

In [9]:
class RANKNET_DS(Dataset):
    """Document Ranking Dataset."""
    def __init__(self, csv_file, root_dir, transform=None):
        """
        Args:
            text_file (string): Path to the txt file with q_id.
            root_dir (string): Directory with all the query_details.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.meta_file = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform
        self.feats_to_drop = feats_to_drop

    def __len__(self):
        return len(self.meta_file)

    def __getitem__(self, idx):
        q_fname = os.path.join(self.root_dir,str(self.meta_file.iloc[idx]['qid']))
        q_data = pd.read_csv("{}.csv".format(q_fname))
        if self.transform:
            return self.transform(q_data,self.feats_to_drop)
            
        return q_data

In [10]:
def transform(q_sample,cols_to_drop):
    """
        input dataframe
        transforms datafram into tensor
    """
    
    label_tensor = torch.tensor(q_sample['y'].values)
    n_rel = (label_tensor!=0).sum().item()
    data_tensor = torch.tensor(q_sample[q_sample.columns.difference(cols_to_drop)].values).float()
    return label_tensor,data_tensor,n_rel
    
class DOC_RANK(Dataset):
    """Document Ranking Dataset."""
    def __init__(self, csv_file, root_dir, transform=None):
        """
        Args:
            text_file (string): Path to the txt file with q_id.
            root_dir (string): Directory with all the query_details.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.meta_file = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform
        self.feats_to_drop = feats_to_drop

    def __len__(self):
        return len(self.meta_file)

    def __getitem__(self, idx):
        q_fname = os.path.join(self.root_dir,str(self.meta_file.iloc[idx]['qid']))
        q_data = pd.read_csv("{}.csv".format(q_fname))
        if self.transform:
            return self.transform(q_data,self.feats_to_drop)
            
        return q_data

In [17]:
ranknet_ds = RANKNET_DS(data_meta_csv,data_dir,transform)
label,dataset,n_rel= ranknet_ds[0]

# Model

In [18]:
# Model.
model = torch.nn.Sequential(
    nn.Linear(input_dim, 128),
    nn.ReLU(),
    nn.Linear(128, 64),
    nn.ReLU(),
    nn.Linear(64, 32),
    nn.ReLU(),
    nn.Linear(32, 1))

model = model.to(device)

In [19]:
dataset = DOC_RANK(data_meta_csv,data_dir,transform)
optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001)

In [24]:
def idcg(n_rel):
    # Assuming binary relevance.
    nums = np.ones(n_rel)
    denoms = np.log2(np.arange(n_rel) + 1 + 1)
    return (nums / denoms).sum()

for epoch in range(3):
    for labels,docs,n_rel in dataset:
        doc_scores = model(docs)
        n_docs = len(labels)
        n_irr = n_docs- n_rel
        if n_rel==0:
            continue
        #forward_compute
        doc_scores = model(docs)
       

        # Document ranks.
        (sorted_scores, sorted_idxs) = doc_scores.sort(dim = 0, descending = True)
        doc_ranks = torch.zeros(n_docs,dtype=torch.float).to(device)
        doc_ranks[sorted_idxs] = 1.0 + torch.arange(n_docs).view((n_docs, 1)).float().to(device)
        doc_ranks = doc_ranks.view((n_docs, 1))

        # # Compute lambdas.
        diffs = doc_scores[:n_rel] - doc_scores[n_rel:].view(n_irr)
        exped = diffs.exp()
        # # See equation (6) in [2].
        N = 1 / idcg(n_rel)
        #print('idgc =',N)
        ndcg_diffs = (1 / (1 + doc_ranks[:n_rel])).log2() - (1 / (1 + doc_ranks[n_rel:])).log2().view(n_irr)
        lamb_updates = -1 / (1 + exped) * N * ndcg_diffs.abs()
        # # See section 6.1 in [1], but lambdas have opposite signs from [2].
        lambs = torch.zeros((n_docs, 1)).to(device)
        lambs[:n_rel] -= lamb_updates.sum(dim = 1, keepdim = True)
        lambs[n_rel:] += lamb_updates.sum(dim = 0, keepdim = True).t()

        if epoch%100==0:
            print(ndcg_score([1]*n_rel+[2]*n_irr,doc_ranks.squeeze(1).tolist()))
        # # Accumulate lambda scaled gradients.
        model.zero_grad()
        doc_scores.backward(lambs)
        optimizer.step()

0.45086673773561947
0.8108582970019788
0.9074250374176304
0.8532721558012799
0.6498600945867898
0.589088768612251
0.6486218623221718
0.9368077570869662
0.7965100773735325
0.9477344873475498
0.8122418648457493
0.43893943504566835
1.0
0.9134166588977729
0.544610794872989
0.703935114791163
0.7686992054208691
0.37962081462451813
1.0
0.7219595910056631
0.7453597752277844
0.566547889864676
0.6241410282065163
0.8551595247651802
1.0
0.901447006056365
0.8532721558012799
0.8481064169964802
0.7091923051268172
0.4237902969274442
0.7430580004525728
0.7606971932189103
0.7642678977605577
0.8088720434739953
0.7182833344196091
0.955830517697074
0.8340611153182703
0.9432379215722527
0.7799082337019199
0.4639594030703841
0.47481757731527635
0.4929605712017434
0.45298473672387823
0.811178322335156
0.8340611153182703
0.42590829591570295
0.8698739994728926
0.5076288161633667
1.0
0.6481044710831649
0.6593007041766438
0.44879108889881736
0.6350677584671
0.5821032065260975
0.8532721558012799
0.7144097119277254

0.9134166588977729
0.8532721558012799
0.6873332711195502
0.7245949812329288
0.7081287626162236
0.5694969487670277
0.6431637888166242
0.6381210717697404
0.8254612838486164
0.7606971932189103
0.9095430364058891
0.3333333333333333
0.8169680738235195
0.8645073774981995
0.38224261473290666
0.3333333333333333
0.5263705276152086
0.5250842331427514
0.38559884598578353
0.6607868248473735
1.0
0.7182833344196091
1.0
1.0
0.48317417546358005
0.9368077570869662
0.45922519590939903
0.5817271694072151
0.540114229097692
0.8340611153182703
0.4434655591399216
0.4066972554326934
0.9095430364058891
0.8437289933844668
0.6651975125779157
0.9134166588977729
0.7013133146827744
0.9402879873227596
0.4342648940640067
0.6975049503058763
0.640742871878129
0.5752317468069429
0.5210914684875841
0.3333333333333333
0.9368077570869662
0.3333333333333333
0.955830517697074
0.5926538996677988
0.7625122742569328
0.7606971932189103
0.6350677584671
0.654356660362087
0.8010066431488297
0.6813552397582848
0.9477344873475498
1.0

KeyboardInterrupt: 