In [1]:
# Mount Google Drive inside the Colab runtime (this prompts for an authorization
# code) and change into the homework folder, which holds the local modules
# (dataset.py, ranking.py, evaluate.py) imported further down.
from google.colab import drive
drive.mount('/content/drive')
% cd drive/My Drive/hw3/

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive
/content/drive/My Drive/hw3


In [2]:
!ls

dataset				       HW3-output.ipynb   ranking_example.py
datasetbinarized_purged_querynorm.npz  ir1_2020_hw3.pdf   ranking.py
dataset.py			       PairwiseLTR.ipynb  try
evaluate.py			       pointwise_LTR.py   untitled
example.py			       __pycache__	  Untitled.ipynb


In [0]:
import itertools
import math
import os

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from scipy import stats
from torch.utils.data import Dataset, TensorDataset, DataLoader

import dataset
import evaluate as evl
import ranking as rnk



# Use the GPU when PyTorch can see one; `device` is read as a global by the
# model and evaluation helpers defined below.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Seed PyTorch's global RNG.
torch.manual_seed(0)

class RankNet(torch.nn.Module):
    """Scoring network: one ReLU hidden layer followed by a single linear output unit."""

    def __init__(self, n_feature, n_hidden):
        super().__init__()
        self.hidden = torch.nn.Linear(n_feature, n_hidden)
        self.output = torch.nn.Linear(n_hidden, 1)

    def forward(self, x1):
        """Map the feature vectors in `x1` to scores (last dimension of size 1)."""
        activated = torch.relu(self.hidden(x1))
        return self.output(activated)

class Model():
    """Bundle a RankNet scorer (placed on `device`) with the SGD optimizer that updates it."""

    def __init__(self, n_feature, n_hidden, learning_rate):
        network = RankNet(n_feature, n_hidden)
        self.ranknet = network.to(device)
        self.optimizer = torch.optim.SGD(
            self.ranknet.parameters(),
            lr=learning_rate,
        )

def eval_model(model, data_fold):
    """Score every document of `data_fold` and report loss and ranking metrics.

    Args:
        model: object exposing the scoring network as `model.ranknet`.
        data_fold: data split exposing `feature_matrix`, `label_vector`,
            `num_queries()` and `query_range(qid)`.

    Returns:
        (loss, scores): `loss` is the pairwise loss averaged over all queries
        with at least two documents (a Python float, 0.0 when there is no such
        query); `scores` is whatever `evl.evaluate` returns for the predictions.
    """
    model.ranknet.eval()
    with torch.no_grad():
        x = torch.from_numpy(data_fold.feature_matrix).float().to(device)
        # view(-1) instead of squeeze(): stays 1-D even for a single document.
        output = model.ranknet(x).view(-1)
        labels = torch.from_numpy(np.asarray(data_fold.label_vector))

        # The previous loss expression referenced undefined pair scores; the
        # pairwise loss is only defined within a query, so average it per query.
        query_losses = []
        for qid in range(data_fold.num_queries()):
            start, end = data_fold.query_range(qid)
            n_docs = end - start
            if n_docs < 2:
                continue
            query_loss = pairwiseloss(output[start:end], labels[start:end], n_docs)
            query_losses.append(float(query_loss))

        loss = float(np.mean(query_losses)) if query_losses else 0.0
        scores = evl.evaluate(data_fold, output.cpu().numpy())

    return loss, scores


def load_dataset():
    """Load fold 0 and build a per-query training loader.

    Returns:
        (data, train_loader): `data` is the fold object returned by the
        `dataset` module; iterating `train_loader` yields one
        `(features, labels)` pair of float tensors per training query that has
        at least two documents, in shuffled order.
    """
    data = dataset.get_dataset().get_data_folds()[0]
    data.read_data()

    features = torch.from_numpy(data.train.feature_matrix).float()
    labels = torch.from_numpy(np.asarray(data.train.label_vector)).float()

    # Document pairs are only meaningful inside one query and are formed on the
    # fly by the pairwise loss. Materialising every pair over the whole training
    # set (the previous approach) mixes queries and is quadratic in corpus size.
    per_query = []
    for qid in range(data.train.num_queries()):
        start, end = data.train.query_range(qid)
        if end - start < 2:
            continue  # a single document produces no pair
        per_query.append((features[start:end], labels[start:end]))

    # batch_size=None disables automatic batching, so every item is yielded
    # as-is: one query per iteration, whatever its number of documents.
    train_loader = DataLoader(dataset=per_query, batch_size=None, shuffle=True)

    return data, train_loader
       

def plot_ndcg_loss(losses, ndcgs):
    """Plot the recorded loss and NDCG curves, save the figure, then display it.

    Args:
        losses: sequence of loss values, one per evaluation point.
        ndcgs: sequence of NDCG values, same length as `losses`.
    """
    x = np.arange(len(losses))
    fig, ax = plt.subplots()

    ax.plot(x, losses, label='Loss')
    ax.plot(x, ndcgs, label='NDCG')
    ax.set_xlabel("Batch % 2000")
    ax.set_ylabel("Score")
    # NOTE(review): title and file name still say "Pointwise" although this
    # notebook trains a pairwise model — confirm before renaming the output.
    ax.set_title("Pointwise LTR")
    ax.legend(loc='upper center')

    # Save before show(): once the figure has been shown it may already be
    # closed, and a later pyplot savefig would write an empty canvas.
    fig.savefig('Pointwise_LTR_plot.png')
    plt.show()

    
def train_batch(train_data, y_batch, model):
    """Run pairwise (RankNet) updates and return the updated model.

    Args:
        train_data: either
            * a data split exposing `num_queries()`, `query_size(qid)`,
              `query_range(qid)`, `feature_matrix` and `label_vector` — one
              optimizer step is taken per query with at least two documents and
              `y_batch` is ignored; or
            * the feature tensor/array of the documents of a single query, with
              `y_batch` holding their labels — a single step is taken.
        y_batch: labels for the single-query form (unused otherwise).
        model: object exposing `ranknet` and `optimizer`.

    Returns:
        The same `model` object.
    """
    model.ranknet.train()

    if hasattr(train_data, "num_queries"):
        # num_queries() takes no query id; passing `qid` here used the loop
        # variable before it was assigned.
        for qid in range(train_data.num_queries()):
            if train_data.query_size(qid) < 2:
                continue
            s_i, e_i = train_data.query_range(qid)
            _ranknet_step(model,
                          train_data.feature_matrix[s_i:e_i],
                          train_data.label_vector[s_i:e_i])
    else:
        _ranknet_step(model, train_data, y_batch)

    return model


def _ranknet_step(model, features, labels):
    """Take one optimizer step on the documents of a single query."""
    features = torch.as_tensor(features).float().to(device)
    labels = torch.as_tensor(labels).to(device)
    n_docs = features.shape[0]
    if n_docs < 2:
        return  # no pair can be formed

    model.optimizer.zero_grad()
    output = model.ranknet(features)
    loss = pairwiseloss(output, labels, n_docs)
    loss.backward()
    model.optimizer.step()
    
def pairwiseloss(predictedvals, values, n_docs=None, sigma=1.0):
    """Mean RankNet cost over all document pairs of one query.

    For a pair (i, j) with score difference d = s_i - s_j and
    S_ij = sign(label_i - label_j) the cost is

        C_ij = 0.5 * (1 - S_ij) * sigma * d + log(1 + exp(-sigma * d))

    where `sigma` is a scalar scale, not the sigmoid function.

    Args:
        predictedvals: tensor holding one score per document (any shape).
        values: relevance labels with the same number of elements.
        n_docs: unused; accepted for backward compatibility. The number of
            pairs is derived from `predictedvals`.
        sigma: scale applied to the score difference.

    Returns:
        Scalar tensor with the mean cost over all pairs (tied pairs included).
        With fewer than two documents a zero that is still attached to the
        graph is returned instead of NaN.
    """
    scores = predictedvals.reshape(-1).float()
    labels = torch.as_tensor(values).reshape(-1).to(scores.device).float()

    n = scores.shape[0]
    if n < 2:
        return scores.sum() * 0.0

    # Upper-triangle indices enumerate every unordered pair (i < j) once.
    pair_idx = torch.triu_indices(n, n, offset=1, device=scores.device)
    first, second = pair_idx[0], pair_idx[1]

    diff = sigma * (scores[first] - scores[second])
    S = torch.sign(labels[first] - labels[second])

    # softplus(-x) == log(1 + exp(-x)), computed without overflow.
    C_T = 0.5 * (1 - S) * diff + torch.nn.functional.softplus(-diff)

    return C_T.mean()

def hyperparam_search():
    """Grid-search learning rate and hidden size on validation NDCG.

    Each configuration is trained until validation NDCG drops below the value
    of the previous epoch (or 300 epochs have passed).

    Returns:
        dict with keys "learning_rate", "n_hidden" and "epoch", where "epoch"
        is the 0-based index of the epoch that reached the best NDCG.

    Raises:
        RuntimeError: if no configuration reached a validation NDCG above 0.
    """
    # hyper-parameters
    epochs = 300
    learning_rates = [10**-1, 10**-2, 10**-3, 10**-4]
    n_hiddens = [100, 150, 200, 250, 300, 350, 400]
    data, train_loader = load_dataset()

    best_ndcg = 0
    best_params = None  # stays None until some epoch beats best_ndcg
    for learning_rate in learning_rates:
        for n_hidden in n_hiddens:

            print("\nTesting learning_rate = {} and n_hidden = {}".format(learning_rate, n_hidden))
            model = Model(data.num_features, n_hidden, learning_rate)

            last_ndcg = 0
            for epoch in range(epochs):

                model.ranknet.train()
                for x_batch, y_batch in train_loader:
                    model = train_batch(x_batch, y_batch, model)
                loss, scores = eval_model(model, data.validation)

                ndcg = scores["ndcg"][0]
                print("Epoch: {}, ndcg: {}".format(epoch, ndcg))

                # Early stopping: give up on this configuration as soon as the
                # validation NDCG gets worse.
                if ndcg < last_ndcg:
                    break
                last_ndcg = ndcg
                if ndcg > best_ndcg:
                    best_ndcg = ndcg
                    best_params = {"learning_rate": learning_rate, "n_hidden": n_hidden, "epoch": epoch}
                    print("Best parameters:", best_params)

    if best_params is None:
        raise RuntimeError("hyperparam_search: no configuration reached an NDCG above 0")
    return best_params
    
    
def train_best(best_params):
    """Retrain a model with the hyper-parameters chosen by `hyperparam_search`.

    Args:
        best_params: dict with keys "epoch" (0-based index of the best epoch),
            "n_hidden" and "learning_rate".

    Returns:
        (ndcgs, losses, model): validation NDCG and loss recorded every 2000
        batches, and the trained model.
    """
    # "epoch" is a 0-based index, so reaching it takes index + 1 epochs;
    # using the index itself trained for zero epochs when the best epoch was 0.
    epochs = best_params["epoch"] + 1
    n_hidden = best_params["n_hidden"]
    learning_rate = best_params["learning_rate"]

    # load data
    data, train_loader = load_dataset()
    model = Model(data.num_features, n_hidden, learning_rate)

    losses, ndcgs = [], []
    for epoch in range(epochs):
        for eval_count, (x_batch, y_batch) in enumerate(train_loader, start=1):
            model = train_batch(x_batch, y_batch, model)
            if eval_count % 2000 == 0:
                loss, scores = eval_model(model, data.validation)
                losses.append(loss)
                ndcgs.append(scores["ndcg"][0])

        # Evaluate at the end of the epoch so the report below never relies on
        # a stale (or, with fewer than 2000 batches, undefined) `scores`.
        _, scores = eval_model(model, data.validation)
        print("Epoch: {}, ndcg: {}".format(epoch, scores["ndcg"][0]))

    return ndcgs, losses, model


def get_distributions(model):
    """Summarise predicted scores and true labels by mean and standard deviation.

    Reloads fold 0, scores the validation and test splits with `model.ranknet`
    and returns a dict with the keys "val_mean", "val_std", "test_mean",
    "test_std", "actual_mean" and "actual_std"; the "actual" entries are
    computed over the labels of the train, validation and test splits together.
    """
    data = dataset.get_dataset().get_data_folds()[0]
    data.read_data()
    model.ranknet.eval()

    def _predict(split):
        # Scores of one split as a squeezed NumPy array.
        inputs = torch.from_numpy(split.feature_matrix).float().to(device)
        return model.ranknet(inputs).detach().cpu().numpy().squeeze()

    val = _predict(data.validation)
    test = _predict(data.test)
    actual = np.concatenate([
        data.train.label_vector,
        data.validation.label_vector,
        data.test.label_vector,
    ])

    return {
        "val_mean": np.mean(val),
        "val_std": np.std(val),
        "test_mean": np.mean(test),
        "test_std": np.std(test),
        "actual_mean": np.mean(actual),
        "actual_std": np.std(actual),
    }



# End-to-end driver: search hyper-parameters, retrain, plot, summarise, test.
# In a notebook `__name__` is "__main__", so this runs when the cell executes
# and the names assigned here (e.g. `data`, `model`) become kernel globals.
if __name__ == "__main__":
    # Determine the best hyper-parameters on the validation split.
    best_params = hyperparam_search()
    # Train a fresh model with those hyper-parameters.
    ndcgs, losses, model = train_best(best_params)
    # Plot the recorded NDCG and loss curves.
    plot_ndcg_loss(losses, ndcgs)
    # Mean/std of predicted scores and of the true labels.
    distributions = get_distributions(model)
    # Performance on the test split (fold 0 is loaded again here).
    data = dataset.get_dataset().get_data_folds()[0]
    data.read_data()
    loss, scores = eval_model(model, data.test)




In [0]:
# NOTE(review): `evaluate` is imported in this notebook as a local module
# (evaluate.py appears in the homework folder listing) — confirm that a package
# of the same name from the package index is really meant to be installed.
pip install evaluate

In [0]:

!ls
# from google.colab import drive
# drive.mount('/content/drive/MyDrive/hw3')

bin					   etc	  opt	 sys
boot					   home   proc	 tensorflow-2.1.0
content					   lib	  root	 tmp
datalab					   lib32  run	 tools
dev					   lib64  sbin	 usr
dlib-19.18.0-cp27-cp27mu-linux_x86_64.whl  media  srv	 var
dlib-19.18.0-cp36-cp36m-linux_x86_64.whl   mnt	  swift


In [0]:
import dataset
import ranking as rnk
import evaluate as evl
import numpy as np
import torch
import matplotlib.pyplot as plt
from scipy import stats
from torch.utils.data import Dataset, TensorDataset, DataLoader

# Use the GPU when PyTorch can see one; `device` is read as a global below.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Seed PyTorch's global RNG.
torch.manual_seed(0)

class RankNet(torch.nn.Module):
    """Two-layer scorer applied with shared weights to both documents of a pair."""

    def __init__(self, n_feature, n_hidden):
        super().__init__()
        self.hidden = torch.nn.Linear(n_feature, n_hidden)
        self.output = torch.nn.Linear(n_hidden, 1)

    def forward(self, x1, x2):
        """Score `x1` and `x2` with the same layers and return `(si, sj)`."""
        si, sj = (
            self.output(torch.relu(self.hidden(batch)))
            for batch in (x1, x2)
        )
        return si, sj


# NOTE(review): this class is an unfinished draft. The constructor body is a
# per-query training loop and never assigns `self.net` or `self.optimizer`,
# although eval_model/get_distributions in this cell read `model.net`.
# `n_feature`, `n_hidden` and `learning_rate` are accepted but never used.
class Model():
    def __init__(self, n_feature, n_hidden, n_q, learning_rate):
        # `n_q` is iterated as a collection of query ids.
        for query_id in n_q:
            # NOTE(review): `data` is not a parameter or attribute; it must
            # exist as a global for this line to run.
            start_index, end_index = data.train.query_range(query_id)
            
            # NOTE(review): `target_feature` is defined in this cell as a
            # module-level function, not as a method of this class.
            x1, x2, target = self.target_feature(start_index, end_index)

            # NOTE(review): `optimizer` is never created in this scope.
            optimizer.zero_grad()
            # maybe another way
            try:
                # NOTE(review): `forward` is called on the RankNet class itself,
                # not on an instance, so `x1` is bound to `self`.
                si, sj = RankNet.forward(x1, x2)
            except:
                # NOTE(review): the bare except hides every error, and `qid` is
                # undefined here (the loop variable is `query_id`).
                print("Query with one document, skipping this query", qid)
                continue
                
            # Get pairs
            # Per-pair loss built from the sigmoid of the score difference and
            # the pair target.
            loss = torch.FloatTensor([0.5]) * (torch.FloatTensor([1]) - target) * ((si - sj).sigmoid()).view(
                si.size(0)) + torch.log(torch.FloatTensor([1]) + torch.exp(-((si - sj).sigmoid().view(si.size(0)))))

            loss.sum().backward()
            optimizer.step()

            # Every 100th query id: report the loss and validation metrics.
            if not query_id % 100:
                print(f"Data: {query_id}, with a loss of {loss.sum()}")
                # NOTE(review): `self.model`, `self.evaluate`,
                # `self.use_early_stop` and `self.normal_ndcg` are not set
                # anywhere in this class.
                all_scores = self.model(torch.FloatTensor(data.validation.feature_matrix)).view(-1).detach().numpy()
                metrics = self.evaluate(data.validation, all_scores)
                ndcg = metrics['ndcg']
                arr = metrics['relevant rank']
                if self.use_early_stop:
                    # Sliding window of recent NDCG values; a negative
                    # regression slope over the window triggers the break.
                    self.normal_ndcg.append(ndcg[0])
                    self.normal_ndcg.pop(0)
                    x = list(range(0, len(self.normal_ndcg)))
                    slope, *_ = stats.linregress(x, self.normal_ndcg)
                    if slope < 0:
                        print(f"negative slope, early stopping activated slope of: {slope}")
                        break

                print(f"the ndcg is {ndcg} the arr is {arr}")

        # NOTE(review): `model` is not defined inside the constructor.
        all_scores = model.model(torch.FloatTensor(data.train.feature_matrix)).view(-1).detach().numpy()
        metrics = model.evaluate(data.train, all_scores)
        ndcg = metrics['ndcg']
        print(f"Train ndcg is {ndcg}")

# Commented-out earlier constructor body (network, MSE criterion, SGD optimizer):
#         self.net = Net(n_feature, n_hidden).to(device)
#         self.criterion = torch.nn.MSELoss(reduction='mean')
#         self.optimizer = torch.optim.SGD(self.net.parameters(), lr=learning_rate)


def eval_model(model, data_fold):
    """Score `data_fold` with `model.net` and return `(loss, scores)`.

    `loss` is the mean squared difference between the predicted scores and the
    labels; `scores` is the result of `evl.evaluate` for the predictions.
    NOTE(review): `model.net` is called with a single feature batch — confirm it
    matches the network defined in this cell.
    """
    with torch.no_grad():
        features = torch.from_numpy(data_fold.feature_matrix).float().to(device)
        targets = data_fold.label_vector
        model.net.eval()

        predictions = model.net(features).detach().cpu().numpy().squeeze()

        squared_error = np.square(predictions - targets)
        loss = np.mean(squared_error)
        scores = evl.evaluate(data_fold, np.asarray(predictions))

    return loss, scores


# def load_dataset():
#     data = dataset.get_dataset().get_data_folds()[0]
#     data.read_data()

#     train_x = torch.from_numpy(data.train.feature_matrix).float()
#     train_y = torch.from_numpy(data.train.label_vector).float()

#     train_set = TensorDataset(train_x, train_y)
#     train_loader = DataLoader(dataset=train_set, batch_size=32, shuffle=True)

#     return data, train_loader
       
    
def target_feature(self, start_index, end_index):
    """Build every document pair for the training rows [start_index, end_index).

    Reads the module-level `data`. `self` is not used; it is kept so existing
    call sites keep working.

    Returns:
        (x1, x2, target): float tensors with one row/entry per pair (i, j),
        i < j. `x1`/`x2` hold the feature vectors of the first/second document
        and `target` is S_ij: 1 if document i has the higher label, -1 if the
        lower, 0 on a tie. With fewer than two documents all three are empty.
    """
    documents = np.asarray(data.train.feature_matrix[start_index:end_index], dtype=np.float32)
    labels = np.asarray(data.train.label_vector[start_index:end_index], dtype=np.float32)

    # All unordered pairs (i < j), in the same order itertools.combinations
    # would produce them.
    first, second = np.triu_indices(len(documents), k=1)

    x1 = torch.from_numpy(documents[first])
    x2 = torch.from_numpy(documents[second])
    target = torch.from_numpy(np.sign(labels[first] - labels[second]))
    return x1, x2, target


def plot_ndcg_loss(losses, ndcgs):
    """Plot the recorded loss and NDCG curves, save the figure, then display it.

    Args:
        losses: sequence of loss values, one per evaluation point.
        ndcgs: sequence of NDCG values, same length as `losses`.
    """
    x = np.arange(len(losses))
    fig, ax = plt.subplots()

    ax.plot(x, losses, label='Loss')
    ax.plot(x, ndcgs, label='NDCG')
    ax.set_xlabel("Batch % 2000")
    ax.set_ylabel("Score")
    # NOTE(review): title and file name still say "Pointwise" although this
    # notebook trains a pairwise model — confirm before renaming the output.
    ax.set_title("Pointwise LTR")
    ax.legend(loc='upper center')

    # Save before show(): once the figure has been shown it may already be
    # closed, and a later pyplot savefig would write an empty canvas.
    fig.savefig('Pointwise_LTR_plot.png')
    plt.show()

    
# def train_batch(x_batch, y_batch, model):
#     model.net.train()
#     x_batch = x_batch.to(device)
#     y_batch = y_batch.to(device)
           
#     output = model.net(x_batch)
#     if output.size() != y_batch.size():
#         y_batch = y_batch.view(-1, 1)
#     loss = model.criterion(y_batch, output)
    
#     loss.backward()
#     model.optimizer.step()
#     model.optimizer.zero_grad() 
    
#     return model
    
       
def hyperparam_search():
    """Grid-search learning rate and hidden size on validation NDCG.

    Returns:
        dict with keys "learning_rate", "n_hidden" and "epoch", where "epoch"
        is the 0-based index of the epoch that reached the best NDCG.

    Raises:
        RuntimeError: if no configuration reached a validation NDCG above 0.
    """
    # hyper-parameters
    epochs = 300
    learning_rates = [10**-1, 10**-2, 10**-3, 10**-4]
    n_hiddens = [100, 150, 200, 250, 300, 350, 400]

    # Load fold 0 the same way get_distributions does; target_feature() builds
    # document pairs and cannot stand in for the dataset.
    data = dataset.get_dataset().get_data_folds()[0]
    data.read_data()

    best_ndcg = 0
    best_params = None  # stays None until some epoch beats best_ndcg
    # Every query id, starting at 0 (range(1, ...) silently dropped query 0).
    num_q = list(range(data.train.num_queries()))
    for learning_rate in learning_rates:
        for n_hidden in n_hiddens:

            print("\nTesting learning_rate = {} and n_hidden = {}".format(learning_rate, n_hidden))

            last_ndcg = 0
            for epoch in range(epochs):
                # NOTE(review): Model currently runs its training pass inside
                # the constructor, so each epoch builds a new Model and training
                # restarts from scratch — revisit once Model has a train method.
                model = Model(data.num_features, n_hidden, n_q=num_q, learning_rate=learning_rate)
                loss, scores = eval_model(model, data.validation)

                ndcg = scores["ndcg"][0]
                print("Epoch: {}, ndcg: {}".format(epoch, ndcg))

                # Early stopping: give up on this configuration as soon as the
                # validation NDCG gets worse.
                if ndcg < last_ndcg:
                    break
                last_ndcg = ndcg
                if ndcg > best_ndcg:
                    best_ndcg = ndcg
                    best_params = {"learning_rate": learning_rate, "n_hidden": n_hidden, "epoch": epoch}
                    print("Best parameters:", best_params)

    if best_params is None:
        raise RuntimeError("hyperparam_search: no configuration reached an NDCG above 0")
    return best_params
    
    
def train_best(best_params):
    """Retrain a model with the hyper-parameters chosen by `hyperparam_search`.

    Args:
        best_params: dict with keys "epoch" (0-based index of the best epoch),
            "n_hidden" and "learning_rate".

    Returns:
        (ndcgs, losses, model): validation NDCG and loss recorded every 2000
        batches, and the trained model.
    """
    # "epoch" is a 0-based index, so reaching it takes index + 1 epochs.
    epochs = best_params["epoch"] + 1
    n_hidden = best_params["n_hidden"]
    learning_rate = best_params["learning_rate"]

    # load data
    # NOTE(review): load_dataset and train_batch only exist as commented-out
    # code in this cell; this relies on definitions left in the kernel by an
    # earlier cell.
    data, train_loader = load_dataset()
    # Model in this cell takes the query ids as its third argument.
    model = Model(data.num_features, n_hidden,
                  n_q=list(range(data.train.num_queries())),
                  learning_rate=learning_rate)

    losses, ndcgs = [], []
    for epoch in range(epochs):
        for eval_count, (x_batch, y_batch) in enumerate(train_loader, start=1):
            model = train_batch(x_batch, y_batch, model)
            if eval_count % 2000 == 0:
                loss, scores = eval_model(model, data.validation)
                losses.append(loss)
                ndcgs.append(scores["ndcg"][0])

        # Evaluate at the end of the epoch so the report below never relies on
        # a stale (or, with fewer than 2000 batches, undefined) `scores`.
        _, scores = eval_model(model, data.validation)
        print("Epoch: {}, ndcg: {}".format(epoch, scores["ndcg"][0]))

    return ndcgs, losses, model


def get_distributions(model):
    """Summarise predicted scores and true labels by mean and standard deviation.

    Reloads fold 0, scores the validation and test splits with `model.net` and
    returns a dict with the keys "val_mean", "val_std", "test_mean",
    "test_std", "actual_mean" and "actual_std"; the "actual" entries are
    computed over the labels of the train, validation and test splits together.
    """
    data = dataset.get_dataset().get_data_folds()[0]
    data.read_data()
    model.net.eval()

    def _predict(split):
        # Scores of one split as a squeezed NumPy array.
        inputs = torch.from_numpy(split.feature_matrix).float().to(device)
        return model.net(inputs).detach().cpu().numpy().squeeze()

    val = _predict(data.validation)
    test = _predict(data.test)
    actual = np.concatenate([
        data.train.label_vector,
        data.validation.label_vector,
        data.test.label_vector,
    ])

    return {
        "val_mean": np.mean(val),
        "val_std": np.std(val),
        "test_mean": np.mean(test),
        "test_std": np.std(test),
        "actual_mean": np.mean(actual),
        "actual_std": np.std(actual),
    }



# End-to-end driver: search hyper-parameters, retrain, plot, summarise, test.
# In a notebook `__name__` is "__main__", so this runs when the cell executes.
if __name__ == "__main__":
    # Determine the best hyper-parameters on the validation split.
    best_params = hyperparam_search()
    # Train a fresh model with those hyper-parameters.
    ndcgs, losses, model = train_best(best_params)
    # Plot the recorded NDCG and loss curves.
    plot_ndcg_loss(losses, ndcgs)
    # Mean/std of predicted scores and of the true labels.
    distributions = get_distributions(model)
    # Performance on the test split (fold 0 is loaded again here).
    data = dataset.get_dataset().get_data_folds()[0]
    data.read_data()
    loss, scores = eval_model(model, data.test)



In [0]:
class Ranknet():
  '''
  Pairwise RankNet trained one query at a time.

  The training and feature-building methods read the module-level `data`
  object (train/validation splits); `evaluate` delegates to `evl.evaluate`.
  '''

  def __init__(self, num_features, hidden_units=10, device="cpu", lr = 0.005, layers = 2, early_stopping = 6, use_early_stop=False, sigma=1.0):
    '''
    arguments:
      num_features = size of input vector (features size)
      hidden_units = width of the widest hidden layer
      device = torch device the network lives on
      lr = learning rate for Adam
      layers = number of Linear layers (1, 2, 3 or 4)
      early_stopping = length of the NDCG window used for early stopping
      use_early_stop = stop training when the NDCG trend turns negative
      sigma = scalar scale of the score difference in the RankNet cost
    '''
    self.num_features = num_features
    self.use_early_stop = use_early_stop
    self.lr = lr
    self.sigma = sigma
    assert layers in [1,2,3,4], "layers need to be 1,2,3 or 4"

    # Hidden widths per depth; every hidden Linear is followed by
    # Dropout -> BatchNorm1d -> LeakyReLU, and a final Linear maps to 1 score.
    half = int(hidden_units/2)
    hidden_widths = {
        1: [],
        2: [hidden_units],
        3: [half, hidden_units],
        4: [half, hidden_units, half],
    }[layers]
    modules, width_in = [], num_features
    for width in hidden_widths:
      modules += [
          nn.Linear(width_in, width),
          nn.Dropout(0.5),
          nn.BatchNorm1d(width),
          nn.LeakyReLU(0.2, inplace=True),
      ]
      width_in = width
    modules.append(nn.Linear(width_in, 1))
    self.model = nn.Sequential(*modules).to(device)

    self.normal_ndcg = [0 for _ in range(early_stopping)]
    self.early_stopping = early_stopping
    self.output_sig = nn.Sigmoid()
    self.device=device
    for mod in self.model:
      if isinstance(mod, nn.Linear):
        nn.init.normal_(mod.weight, mean=2, std=2)
        mod.bias.data.fill_(0)

  def forward(self, input_1, input_2):
    '''
    Does one forward pass using two inputs and returns both score tensors.
    '''
    si = self.model(input_1.float().to(self.device))
    sj = self.model(input_2.float().to(self.device))

    return si, sj

  def _pair_loss(self, si, sj, target):
    '''
    RankNet cost per pair:
      0.5 * (1 - S_ij) * sigma * (s_i - s_j) + log(1 + exp(-sigma * (s_i - s_j)))
    sigma is a scalar scale, not the sigmoid function. Returns one value per pair.
    '''
    diff = self.sigma * (si - sj).view(-1)
    target = target.to(diff.device)
    # softplus(-x) == log(1 + exp(-x)), computed without overflow.
    return 0.5 * (1 - target) * diff + nn.functional.softplus(-diff)

  def _score_split(self, data_split):
    '''
    Scores every document of a split in eval mode (dropout off, batch-norm
    running statistics) without tracking gradients. Returns a 1-D numpy array
    and restores the previous train/eval mode.
    '''
    was_training = self.model.training
    self.model.eval()
    with torch.no_grad():
      features = torch.from_numpy(data_split.feature_matrix).float().to(self.device)
      scores = self.model(features).view(-1).cpu().numpy()
    self.model.train(was_training)
    return scores

  def get_feature_vectors_target_train(self, s_i, e_i):
    '''
    Builds all document pairs (i < j) for the training rows [s_i, e_i).
    Returns (input_1, input_2, target) float tensors; target is S_ij:
    1 if document i has the higher label, -1 if lower, 0 on a tie.
    '''
    docs = np.asarray(data.train.feature_matrix[s_i:e_i], dtype=np.float32)
    labels = np.asarray(data.train.label_vector[s_i:e_i], dtype=np.float32)

    # Same pair order as itertools.combinations(range(n), 2).
    first, second = np.triu_indices(len(docs), k=1)

    input_1 = torch.from_numpy(docs[first])
    input_2 = torch.from_numpy(docs[second])
    target = torch.from_numpy(np.sign(labels[first] - labels[second]))
    return input_1, input_2, target

  def train(self, n_epochs):
    '''
    Trains for n_epochs passes over the training queries, one Adam step per
    query. Every 100th query id the validation metrics are printed and, when
    use_early_stop is set, training stops once the regression slope over the
    recent NDCG window is negative.
    '''
    os.makedirs('loss_folder', exist_ok=True)

    optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

    # BatchNorm1d cannot normalise a single sample in training mode, so
    # networks containing it need at least two pairs per step.
    has_batchnorm = any(isinstance(mod, nn.BatchNorm1d) for mod in self.model)
    min_pairs = 2 if has_batchnorm else 1

    stop_training = False
    for epoch in range(n_epochs):
        # Query ids start at 0; range(1, ...) silently skipped the first query.
        for qid in range(data.train.num_queries()):
            s_i, e_i = data.train.query_range(qid)

            input_1, input_2, target = self.get_feature_vectors_target_train(s_i, e_i)
            if input_1.shape[0] < min_pairs:
              print("Too few documents to train on, skipping this query", qid)
              continue

            self.model.train()
            optimizer.zero_grad()
            si, sj = self.forward(input_1, input_2)
            loss = self._pair_loss(si, sj, target)

            loss.sum().backward()
            optimizer.step()

            if not qid%100:
              print(f"Data: {qid}, with a loss of {loss.sum()}")
              all_scores = self._score_split(data.validation)
              metrics = self.evaluate(data.validation, all_scores)
              ndcg = metrics['ndcg']
              arr = metrics['relevant rank']
              if self.use_early_stop:
                # Sliding window of the most recent NDCG values.
                self.normal_ndcg.append(ndcg[0])
                self.normal_ndcg.pop(0)
                x = list(range(0, len(self.normal_ndcg)))
                slope, *_ = stats.linregress(x, self.normal_ndcg)
                if slope < 0:
                  print(f"negative slope, early stopping activated slope of: {slope}")
                  stop_training = True
                  break

              print(f"the ndcg is {ndcg} the arr is {arr}")
        if stop_training:
          # Leave the epoch loop too; a bare break only ended the current epoch.
          break

    all_scores = self._score_split(data.train)
    metrics = self.evaluate(data.train, all_scores)
    ndcg = metrics['ndcg']
    print(f"Train ndcg is {ndcg}")

  def predict(self, data_split):
      '''
      Returns the model scores for every document of data_split as a numpy
      array of shape (num_docs, 1).
      '''
      return self._score_split(data_split).reshape(-1, 1)

  def evaluate(self, data_split, all_scores):
    '''
    function to evaluate the model
    '''
    metrics = evl.evaluate(data_split, all_scores, False)
    return metrics




In [0]:
from scipy import stats
sped_ndcg, sped_arr = [], []
class SpedUpRanknet():
  '''
  RankNet variant that scores all documents of a query in a single forward
  pass and forms the pairs inside the loss.

  The training methods read the module-level `data` object and append the
  validation metrics to the module-level lists `sped_ndcg` and `sped_arr`.
  '''

  def __init__(self, num_features, hidden_units=10, device="cpu", lr = 0.05, layers = 1,  early_stopping=15, sigma=1.0):
    '''
    arguments:
      num_features = size of input vector (features size)
      hidden_units = width of the widest hidden layer
      device = torch device the network lives on
      lr = learning rate for Adam
      layers = number of Linear layers (1, 2, 3 or 4)
      early_stopping = length of the NDCG window used for early stopping
      sigma = scalar scale of the score difference in the RankNet cost
    '''
    self.normal_ndcg = [0 for _ in range(early_stopping)]
    self.num_features = num_features
    self.lr = lr
    self.sigma = sigma
    assert layers in [1,2,3,4], "layers need to be 1,2,3 or 4"

    # Hidden widths per depth; every hidden Linear is followed by
    # Dropout -> BatchNorm1d -> LeakyReLU, and a final Linear maps to 1 score.
    half = int(hidden_units/2)
    hidden_widths = {
        1: [],
        2: [hidden_units],
        3: [half, hidden_units],
        4: [half, hidden_units, half],
    }[layers]
    modules, width_in = [], num_features
    for width in hidden_widths:
      modules += [
          nn.Linear(width_in, width),
          nn.Dropout(0.5),
          nn.BatchNorm1d(width),
          nn.LeakyReLU(0.2, inplace=True),
      ]
      width_in = width
    modules.append(nn.Linear(width_in, 1))
    self.model = nn.Sequential(*modules)

    self.output_sig = nn.Sigmoid()
    self.device=device
    self.model.to(device)
    for mod in self.model:
      if isinstance(mod, nn.Linear):
        nn.init.normal_(mod.weight, mean=2, std=2)
        mod.bias.data.fill_(0)

  def forward(self, input_1):
    '''
    Does one forward pass and returns one score per input row.
    '''
    si = self.model(input_1.float().to(self.device))

    return si

  def _score_split(self, data_split):
    '''
    Scores every document of a split in eval mode (dropout off, batch-norm
    running statistics) without tracking gradients. Returns a 1-D numpy array
    and restores the previous train/eval mode.
    '''
    was_training = self.model.training
    self.model.eval()
    with torch.no_grad():
      features = torch.from_numpy(data_split.feature_matrix).float().to(self.device)
      scores = self.model(features).view(-1).cpu().numpy()
    self.model.train(was_training)
    return scores

  def fast_loss(self, forward_output, first_doc, last_doc, labels=None):
    '''
    Summed RankNet cost over all non-tied document pairs of one query:
      0.5 * (1 - S_ij) * sigma * (s_i - s_j) + log(1 + exp(-sigma * (s_i - s_j)))
    sigma is a scalar scale, not the sigmoid function.

    arguments:
      forward_output = scores of the query's documents
      first_doc, last_doc = row range of the query in the training split
      labels = optional labels for those documents; when omitted they are read
               from data.train.label_vector[first_doc:last_doc]
    returns a tensor of shape (1,) that stays attached to the graph.
    '''
    scores = forward_output.view(-1)
    if labels is None:
      labels = data.train.label_vector[first_doc:last_doc]
    labels = torch.as_tensor(np.asarray(labels), dtype=torch.float32, device=scores.device)

    # Upper-triangle indices enumerate every unordered pair (i < j) once.
    n_docs = scores.shape[0]
    pair_idx = torch.triu_indices(n_docs, n_docs, offset=1, device=scores.device)
    first, second = pair_idx[0], pair_idx[1]

    S_ij = torch.sign(labels[first] - labels[second])
    diff = self.sigma * (scores[first] - scores[second])
    # softplus(-x) == log(1 + exp(-x)), computed without overflow.
    pair_losses = 0.5 * (1 - S_ij) * diff + nn.functional.softplus(-diff)

    # Tied pairs (S_ij == 0) do not contribute.
    return pair_losses[S_ij != 0].sum().view(1)

  def train(self, n_epochs):
    '''
    Trains for n_epochs passes over the training queries, one Adam step per
    query. Every 100th query id the validation metrics are recorded and
    training stops once the regression slope over the recent NDCG window is
    negative.
    '''
    os.makedirs('loss_folder', exist_ok=True)

    optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

    stop_training = False
    for epoch in range(n_epochs):
        # Query ids start at 0; range(1, ...) silently skipped the first query.
        for qid in range(data.train.num_queries()):
            s_i, e_i = data.train.query_range(qid)
            if e_i - s_i < 2:
              # No pair can be formed (and BatchNorm1d rejects a single sample).
              print("Query with one document, skipping this query", qid)
              continue

            self.model.train()
            optimizer.zero_grad()
            input_1 = torch.FloatTensor(data.train.feature_matrix[s_i:e_i])
            si = self.forward(input_1)

            loss = self.fast_loss(si, s_i, e_i)

            loss.sum().backward()
            optimizer.step()

            if not qid%100:
              print(f"Data: {qid}, with a loss of {loss.sum()}")
              all_scores = self._score_split(data.validation)
              metrics = self.evaluate(data.validation, all_scores)
              ndcg = metrics['ndcg']
              arr = metrics['relevant rank']
              sped_ndcg.append(ndcg)
              sped_arr.append(arr)
              print(f"the ndcg is {ndcg} the arr is {arr}")
              # Sliding window of the most recent NDCG values.
              self.normal_ndcg.append(ndcg[0])
              self.normal_ndcg.pop(0)
              x = list(range(0, len(self.normal_ndcg)))
              slope, *_ = stats.linregress(x, self.normal_ndcg)
              if slope < 0:
                print(f"negative slope, early stopping activated slope of: {slope}")
                stop_training = True
                break
        if stop_training:
          # Leave the epoch loop too; a bare break only ended the current epoch.
          break

    all_scores = self._score_split(data.train)
    metrics = self.evaluate(data.train, all_scores)
    ndcg = metrics['ndcg']
    print(f"Train ndcg is {ndcg}")

  def predict(self, data_split):
      '''
      Returns the model scores for every document of data_split as a numpy
      array of shape (num_docs, 1).
      '''
      return self._score_split(data_split).reshape(-1, 1)

  def evaluate(self, data_split, all_scores):
    '''
    function to evaluate the model
    '''
    metrics = evl.evaluate(data_split, all_scores, False)
    return metrics

# Build a single-layer (linear) model and train it for one pass over the
# training queries.
# NOTE(review): `data` is not created in this cell; elsewhere in the notebook it
# is only assigned inside the `__main__` sections of earlier cells, so on a
# fresh kernel this cell fails unless one of those has run to completion.
sped_model = SpedUpRanknet(num_features=data.num_features, hidden_units=10, lr = 0.05, layers = 1, device='cpu')
# print("here")
sped_model.train(n_epochs=1)