In [1]:
#   [1] https://papers.nips.cc/paper/2971-learning-to-rank-with-nonsmooth-cost-functions.pdf
#   [2] https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/MSR-TR-2010-82.pdf

import numpy as np
import pandas as pd
import os
import torch
import math
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader


In [None]:

# (C) Mathieu Blondel, November 2013
# License: BSD 3 clause
# metrics
import numpy as np



def dcg_score(y_true, y_score, k=10, gains="exponential"):
    """Discounted cumulative gain (DCG) of a ranking, truncated at rank k.

    Documents are ordered by decreasing ``y_score``; the labels of the first
    ``k`` of them are converted to gains and every gain is divided by
    ``log2(rank + 1)``, with ranks counted from 1.

    Parameters
    ----------
    y_true : array-like, shape = [n_samples]
        Ground truth (true relevance labels).
    y_score : array-like, shape = [n_samples]
        Predicted scores that induce the ranking.
    k : int
        Number of top-ranked entries taken into account.
    gains : str
        "exponential" (default) uses ``2 ** label - 1``; "linear" uses the
        label itself.

    Returns
    -------
    DCG @k : float

    Raises
    ------
    ValueError
        If ``gains`` is neither "exponential" nor "linear".
    """
    # Indices of the k best-scored entries, best first.
    top_k = np.argsort(y_score)[::-1][:k]
    relevance = np.take(y_true, top_k)

    if gains == "exponential":
        gain_values = 2 ** relevance - 1
    elif gains == "linear":
        gain_values = relevance
    else:
        raise ValueError("Invalid gains option.")

    # Ranks start at 1, so the entry at 0-based position i is discounted by log2(i + 2).
    log_arguments = np.arange(2, len(relevance) + 2)
    return (gain_values / np.log2(log_arguments)).sum()


def ndcg_score(y_true, y_score, k=10, gains="exponential"):
    """Normalized discounted cumulative gain (NDCG) at rank k

    The DCG of the ranking induced by ``y_score`` is divided by the DCG of the
    ranking induced by the labels themselves.

    Parameters
    ----------
    y_true : array-like, shape = [n_samples]
        Ground truth (true relevance labels).
    y_score : array-like, shape = [n_samples]
        Predicted scores.
    k : int
        Rank.
    gains : str
        Whether gains should be "exponential" (default) or "linear".

    Returns
    -------
    NDCG @k : float
        ``nan`` when both DCG values are 0 (e.g. every label is 0), because
        the ratio is then 0 / 0. Callers are expected to filter such queries.
    """
    ideal = dcg_score(y_true, y_true, k, gains)
    achieved = dcg_score(y_true, y_score, k, gains)
    # The division result is unchanged (nan for 0 / 0); errstate only silences
    # the NumPy RuntimeWarning that would otherwise be printed for every query
    # without a relevant document.
    with np.errstate(divide="ignore", invalid="ignore"):
        return achieved / ideal

In [3]:
def dcg_score(y_true, y_score, k=10, gains="exponential"):
    """Discounted cumulative gain (DCG) of a ranking, truncated at rank k.

    Documents are ordered by decreasing ``y_score``; the labels of the first
    ``k`` of them are converted to gains and every gain is divided by
    ``log2(rank + 1)``, with ranks counted from 1.

    Parameters
    ----------
    y_true : array-like, shape = [n_samples]
        Ground truth (true relevance labels).
    y_score : array-like, shape = [n_samples]
        Predicted scores that induce the ranking.
    k : int
        Number of top-ranked entries taken into account.
    gains : str
        "exponential" (default) uses ``2 ** label - 1``; "linear" uses the
        label itself.

    Returns
    -------
    DCG @k : float

    Raises
    ------
    ValueError
        If ``gains`` is neither "exponential" nor "linear".
    """
    # Indices of the k best-scored entries, best first.
    top_k = np.argsort(y_score)[::-1][:k]
    relevance = np.take(y_true, top_k)

    if gains == "exponential":
        gain_values = 2 ** relevance - 1
    elif gains == "linear":
        gain_values = relevance
    else:
        raise ValueError("Invalid gains option.")

    # Ranks start at 1, so the entry at 0-based position i is discounted by log2(i + 2).
    log_arguments = np.arange(2, len(relevance) + 2)
    return (gain_values / np.log2(log_arguments)).sum()


def ndcg_score(y_true, y_score, k=10, gains="exponential"):
    """Normalized discounted cumulative gain (NDCG) at rank k

    The DCG of the ranking induced by ``y_score`` is divided by the DCG of the
    ranking induced by the labels themselves.

    Parameters
    ----------
    y_true : array-like, shape = [n_samples]
        Ground truth (true relevance labels).
    y_score : array-like, shape = [n_samples]
        Predicted scores.
    k : int
        Rank.
    gains : str
        Whether gains should be "exponential" (default) or "linear".

    Returns
    -------
    NDCG @k : float
        ``nan`` when both DCG values are 0 (e.g. every label is 0), because
        the ratio is then 0 / 0. Callers are expected to filter such queries.
    """
    ideal = dcg_score(y_true, y_true, k, gains)
    achieved = dcg_score(y_true, y_score, k, gains)
    # The division result is unchanged (nan for 0 / 0); errstate only silences
    # the NumPy RuntimeWarning that would otherwise be printed for every query
    # without a relevant document.
    with np.errstate(divide="ignore", invalid="ignore"):
        return achieved / ideal


In [4]:
# Data.
# Width of one document's feature vector; used as the input size of the network.
input_dim = 46

# Raw training file plus the per-query directories of the train and test splits.
data_file = "data/MQ2007/Fold1/train.txt"
data_train_dir, data_test_dir = (
    "data/MQ2007/Fold1/train_json",
    "data/MQ2007/Fold1/test_json",
)

# Each split directory carries a metafile.csv that the datasets read their query ids from.
data_train_meta_csv, data_test_meta_csv = (
    "{}/metafile.csv".format(split_dir)
    for split_dir in (data_train_dir, data_test_dir)
)

# Non-feature columns. They are handed to the transform functions, which
# currently select `feature_cols` instead of dropping these.
feats_to_drop = ["doc_id", "inc", "prob", "qid", "y"]

# Feature Cols

In [5]:
# Feature columns are named "1" .. "46" (as strings).
feature_cols = list(map(str, range(1, 47)))

In [6]:
# Prefer the first CUDA device when one is available, otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")


In [7]:
def transform(q_sample,cols_to_drop):
    """
        Convert one document row into a pair of tensors.

        q_sample: row that can be indexed with 'y' (the label) and with the
            module-level list `feature_cols` (the feature values).
        cols_to_drop: accepted for interface compatibility; not used here.

        Returns a dict with
            'y'    -- the label, passed through int() before becoming a tensor
            'data' -- the selected feature values as a float32 tensor
    """
    label = torch.tensor(int(q_sample['y']))

    # Go through float64 first so object-typed row values become numeric,
    # then down-cast to float32 for the network.
    feature_values = q_sample[feature_cols].values.astype('float')
    features = torch.tensor(feature_values).float()

    return {'y': label, 'data': features}

class RANKNET_TRAIN_DS(Dataset):
    """Pairwise document-ranking dataset for training.

    One item corresponds to one query listed in the metadata csv. Fetching an
    item reads that query's csv file and returns two of its rows (drawn
    uniformly, with replacement) converted by the transform.
    """
    def __init__(self, csv_file, root_dir, transform=None):
        """
        Args:
            csv_file (string): Path to the metadata csv; it must provide a 'qid' column.
            root_dir (string): Directory holding one "<qid>.csv" file per query.
            transform (callable, optional): Called as ``transform(row, feats_to_drop)``
                on each sampled row. When None, the module-level ``transform``
                function is used.
        """
        self.meta_file = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform
        # Module-level list of non-feature columns, forwarded to the transform.
        self.feats_to_drop = feats_to_drop

    def __len__(self):
        """Number of queries listed in the metadata csv."""
        return len(self.meta_file)

    def __getitem__(self, idx):
        """Return ``{'doc1': ..., 'doc2': ...}`` for two random documents of query ``idx``."""
        q_fname = os.path.join(self.root_dir, str(self.meta_file.iloc[idx]['qid']))
        q_data = pd.read_csv("{}.csv".format(q_fname))

        # Draw the pair with torch's RNG rather than np.random: DataLoader seeds
        # torch separately in every worker process, whereas NumPy's global state
        # is copied as-is into forked workers, which makes all workers (and every
        # epoch) replay the same random draws.
        i1, i2 = torch.randint(len(q_data), (2,)).tolist()
        z1, z2 = q_data.iloc[i1], q_data.iloc[i2]

        # Honour the transform given to the constructor; previously it was stored
        # but the module-level function was always called.
        to_tensors = self.transform if self.transform is not None else transform
        return {
            'doc1': to_tensors(z1, self.feats_to_drop),
            'doc2': to_tensors(z2, self.feats_to_drop),
        }
    
    


In [8]:
def transform_ls(q_sample_ls,cols_to_drop):
    """
        Convert a list of document rows into stacked tensors.

        q_sample_ls: rows that can each be indexed with 'y' (the label) and with
            the module-level list `feature_cols` (the feature values).
        cols_to_drop: accepted for interface compatibility; not used here.

        Returns a dict with
            'y'    -- tensor holding one label per row
            'data' -- float32 tensor holding one feature vector per row
    """
    label_values = [row['y'] for row in q_sample_ls]
    label_tensor_ls = torch.tensor(np.asarray(label_values))

    feature_values = [row[feature_cols].values.astype('float') for row in q_sample_ls]
    data_tensor_ls = torch.tensor(np.asarray(feature_values)).float()

    return {'y': label_tensor_ls, 'data': data_tensor_ls}

class RANKNET_TEST_DS(Dataset):
    """Per-query document-ranking dataset for evaluation.

    One item corresponds to one query listed in the metadata csv. Fetching an
    item reads that query's csv file and returns all of its rows converted by
    the transform.
    """
    def __init__(self, csv_file, root_dir, transform=None):
        """
        Args:
            csv_file (string): Path to the metadata csv; it must provide a 'qid' column.
            root_dir (string): Directory holding one "<qid>.csv" file per query.
            transform (callable, optional): Called as ``transform(rows, feats_to_drop)``
                with the list of all rows of the query. When None, the
                module-level ``transform_ls`` function is used.
        """
        self.meta_file = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform
        # Module-level list of non-feature columns, forwarded to the transform.
        self.feats_to_drop = feats_to_drop

    def __len__(self):
        """Number of queries listed in the metadata csv."""
        return len(self.meta_file)

    def __getitem__(self, idx):
        """Return the transformed rows of query ``idx``.

        An out-of-range ``idx`` raises IndexError from ``iloc``; plain
        ``for item in dataset`` loops rely on that to terminate.
        """
        q_fname = os.path.join(self.root_dir, str(self.meta_file.iloc[idx]['qid']))
        q_data = pd.read_csv("{}.csv".format(q_fname))
        z_ls = [q_data.iloc[i] for i in range(len(q_data))]

        # Honour the transform given to the constructor; previously it was stored
        # but the module-level function was always called.
        to_tensors = self.transform if self.transform is not None else transform_ls
        return to_tensors(z_ls, self.feats_to_drop)

In [9]:
# Number of document pairs per training batch.
batch_size = 64

# One dataset per split; each reads its metafile.csv when constructed.
ranknet_train_ds = RANKNET_TRAIN_DS(
    csv_file=data_train_meta_csv,
    root_dir=data_train_dir,
    transform=transform,
)
ranknet_test_ds = RANKNET_TEST_DS(
    csv_file=data_test_meta_csv,
    root_dir=data_test_dir,
    transform=transform_ls,
)

# Only the training split is batched; the test dataset is iterated directly.
train_dataloader = DataLoader(
    dataset=ranknet_train_ds,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
)

In [10]:
#ranknet_test_ds[0]['data'].

# Rank Net Model

In [11]:
# Model.
class RankNet(nn.Module):
    """Pairwise ranking network.

    A four-layer MLP (input_dim -> 128 -> 64 -> 32 -> 1, ReLU in between) maps
    a document's feature vector to one score. ``forward`` scores two batches
    of documents and returns the per-pair ranking loss; ``predict`` returns
    the raw scores for evaluation.
    """
    def __init__(self,input_dim):
        """
        Args:
            input_dim (int): Number of features per document.
        """
        super(RankNet, self).__init__()
        self.l1 = nn.Linear(input_dim, 128)
        self.l2 = nn.Linear(128, 64)
        self.l3 = nn.Linear(64, 32)
        self.l4 = nn.Linear(32, 1)

    def single_forward(self,x):
        """Score documents: x of shape (n, input_dim) -> scores of shape (n, 1)."""
        hidden = F.relu(self.l1(x))
        hidden = F.relu(self.l2(hidden))
        hidden = F.relu(self.l3(hidden))
        return self.l4(hidden)

    def forward(self, x_i, x_j, t_i, t_j):
        """Per-pair ranking loss for document pairs (i, j).

        Args:
            x_i, x_j: feature batches of shape (batch, input_dim).
            t_i, t_j: relevance labels of shape (batch,).

        Returns:
            Tensor of shape (batch,) holding, with d = s_i - s_j and
            S = sign(t_i - t_j) in {-1, 0, 1},
            ``(1 - S) * d / 2 + log(1 + exp(-d))``.
        """
        s_diff = (self.single_forward(x_i) - self.single_forward(x_j)).squeeze(1)

        # Derive S directly from the label difference, on the scores' device and
        # dtype. The earlier torch.zeros(size=...) buffer always lived on the CPU
        # in float32 and broke as soon as the scores were elsewhere.
        S_ij = torch.sign((t_i - t_j).to(device=s_diff.device, dtype=s_diff.dtype))

        # softplus(-d) equals log(1 + exp(-d)) but does not overflow to inf when
        # d is a large negative number.
        return (1 - S_ij) * s_diff / 2.0 + F.softplus(-s_diff)

    def predict(self,x):
        """Return the scores of ``single_forward`` for the documents in x."""
        return self.single_forward(x)
    


In [12]:
# Scoring network sized for the configured feature count, trained with Adam.
model = RankNet(input_dim)
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)

In [13]:
def train_step(model):
    """Train for one pass over the module-level `train_dataloader`.

    Every batch provides two document batches ('doc1' and 'doc2'), each with
    'data' and 'y'. The model returns a per-pair loss, whose mean is
    back-propagated and applied with the module-level `optimizer`.

    Returns the average of the per-batch mean losses.
    """
    batch_losses = []
    for batch in train_dataloader:
        first_doc, second_doc = batch['doc1'], batch['doc2']
        pair_losses = model(
            first_doc['data'], second_doc['data'], first_doc['y'], second_doc['y']
        )
        mean_loss = pair_losses.mean()
        batch_losses.append(mean_loss.item())

        # Clear stale gradients, back-propagate, then update the weights.
        model.zero_grad()
        mean_loss.backward()
        optimizer.step()

    return sum(batch_losses) / len(batch_losses)
        
def test_step(model):
    """Evaluate `model` on the module-level `ranknet_test_ds`.

    Each item of the dataset holds the labels ('y') and features ('data') of
    all documents of one query. The documents are scored with
    ``model.predict`` and the query's NDCG is computed with ``ndcg_score``.
    Queries whose NDCG is nan are skipped.

    Returns the mean NDCG over the remaining queries, or nan when no query
    produced a defined NDCG.
    """
    ndcg_ls = []
    # Scoring only: skip autograd bookkeeping instead of building a graph for
    # every query and detaching afterwards.
    with torch.no_grad():
        for sample in ranknet_test_ds:
            label, data = sample['y'], sample['data']
            pred_ar = model.predict(data).squeeze(1).detach().numpy()
            label_ar = label.detach().numpy()
            ndcg_s = ndcg_score(label_ar, pred_ar)
            if not math.isnan(ndcg_s):
                ndcg_ls.append(ndcg_s)

    # Avoid a ZeroDivisionError when every query was skipped (or there were none).
    if not ndcg_ls:
        return float("nan")
    return sum(ndcg_ls) / len(ndcg_ls)

# Training schedule: total number of epochs and how often to evaluate.
n_epochs = 1000
eval_every = 10

for epoch in range(n_epochs):
    epoch_train_loss = train_step(model)
    print("Epoch: {} Train Loss: {}".format(epoch,epoch_train_loss))
    if epoch % eval_every == 0:
        # test_step returns the mean NDCG over the test queries, so label it as
        # NDCG (it was printed as "DCG"). The variable name is left unchanged
        # in case later cells read it.
        epoch_test_dcg = test_step(model)
        print("Epoch: {} Test NDCG: {}".format(epoch,epoch_test_dcg))
        print("--"*50)


Epoch: 0 Train Loss: 0.693467378616333




Epoch: 0 Test DCG: 0.2804546738575022
----------------------------------------------------------------------------------------------------
Epoch: 1 Train Loss: 0.6933716833591461
Epoch: 2 Train Loss: 0.692994948476553
Epoch: 3 Train Loss: 0.6925122886896133
Epoch: 4 Train Loss: 0.6924650147557259
Epoch: 5 Train Loss: 0.6925134211778641
Epoch: 6 Train Loss: 0.6916909851133823
Epoch: 7 Train Loss: 0.6915219016373158
Epoch: 8 Train Loss: 0.6913166753947735
Epoch: 9 Train Loss: 0.6916309185326099
Epoch: 10 Train Loss: 0.6907091774046421
Epoch: 10 Test DCG: 0.5197877513504553
----------------------------------------------------------------------------------------------------
Epoch: 11 Train Loss: 0.6906305216252804
Epoch: 12 Train Loss: 0.6896536350250244
Epoch: 13 Train Loss: 0.6896859966218472
Epoch: 14 Train Loss: 0.6880596876144409
Epoch: 15 Train Loss: 0.686484944075346
Epoch: 16 Train Loss: 0.6878590956330299
Epoch: 17 Train Loss: 0.6851712130010128
Epoch: 18 Train Loss: 0.68708818778

Process Process-389:
Process Process-391:
Process Process-392:
Traceback (most recent call last):
Traceback (most recent call last):
Process Process-390:
  File "/Users/jawa/anaconda3/envs/vqa/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/Users/jawa/anaconda3/envs/vqa/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/Users/jawa/anaconda3/envs/vqa/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/Users/jawa/anaconda3/envs/vqa/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/Users/jawa/anaconda3/envs/vqa/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/jawa/anaconda3/envs/vqa/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 106, in _worker_loop
    samples = collate_

  File "/Users/jawa/anaconda3/envs/vqa/lib/python3.7/site-packages/pandas/core/internals.py", line 5048, in _interleaved_dtype
    dtype = find_common_type([b.dtype for b in blocks])
  File "/Users/jawa/anaconda3/envs/vqa/lib/python3.7/site-packages/pandas/core/dtypes/cast.py", line 1141, in find_common_type
    return np.find_common_type(types, [])
  File "/Users/jawa/anaconda3/envs/vqa/lib/python3.7/site-packages/numpy/core/numerictypes.py", line 1002, in find_common_type
    scalar_types = [dtype(x) for x in scalar_types]
KeyboardInterrupt


KeyboardInterrupt: 