In [None]:
# MOUNT GOOGLE DRIVE
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Setup - remote processing
Run this section if using remote processing (provided by Google)

In [None]:
%matplotlib inline

# IMPORT LIBRARIES
import sys
import os
import numpy as np
import pandas as pd
import math
from scipy.optimize import curve_fit
import random
import glob
import subprocess
import matplotlib.pyplot as plt
from scipy.integrate import simps
from scipy.stats import norm
import os

import scipy

In [None]:
# full path deleted for anonymity, replace with correct full path

# Ensure that Python looks in correct place for local modules
# DIR is the project root: the cells below build data, script and output paths from it,
# and it is appended to sys.path so that the local `utils` package can be imported.
# NOTE(review): this is an absolute Google Drive path; it must be edited per user/environment.
DIR = '/content/drive/MyDrive/CP_CLF/SharedCode/EMNLP23_code/submitted/emnlp23_supplementary_files/code/'
# set the full working directory path
# NOTE(review): `path` holds the same string as DIR; keep the two in sync when editing.
path = '/content/drive/MyDrive/CP_CLF/SharedCode/EMNLP23_code/submitted/emnlp23_supplementary_files/code/'

sys.path.append(DIR)

# IMPORT OWN FUNCTIONS
from utils.read_data_fns import *
from utils.eval_fns import *
from utils.inhomogeneous_pp_fns import *

In [None]:
import warnings
warnings.filterwarnings('ignore')

# Classification Utils Functions



In [None]:
# import NLP libraries

# model selection & hyper-parameter tuning
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid

#modeling
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection, preprocessing, linear_model

# feature selection
from sklearn.feature_extraction.text import TfidfVectorizer

#evaluation
from sklearn import metrics

## train-test classification with TF-IDF

In [None]:
# function to train clf on examined sample docs and apply it to unexamined docs
def run_classification_model_tfidf(query_id, n_samp_docs, n_docs, labels, sample_rel_list, features, clf_name, imbalance_handle):
    """Fit logistic regression on the examined prefix of a ranking and apply it to the rest.

    The first ``n_samp_docs`` rows of ``features`` / ``labels`` are the training
    set; rows ``n_samp_docs:n_docs`` are the test set. ``features`` is used as
    given (it is expected to be a pre-computed feature matrix such as TF-IDF).

    ``imbalance_handle`` selects the class weighting:
      * ``'cost_sensitive'``        -> ``class_weight='balanced'``
      * ``'cost_sensitive_manual'`` -> majority class weighted by the
        minority/majority count ratio of the training labels, minority class
        weighted 1
      * anything else               -> no class weighting

    ``sample_rel_list`` is accepted but not used here.

    Returns whatever ``train_model_threshold`` returns for ``clf_name``: a pair
    ``(accuracy_or_score_sum, predictions)``.
    """
    # Examined documents train the model; the remaining ranked documents are scored.
    xtrain = features[:n_samp_docs]
    ytrain = labels[:n_samp_docs]
    xtest = features[n_samp_docs:n_docs]
    ytest = labels[n_samp_docs:n_docs]

    # Class counts in the training sample (labels are summed, so relevant == 1).
    n_pos = sum(ytrain)
    n_neg = len(ytrain) - n_pos

    lr_kwargs = dict(solver='lbfgs', random_state=0, C=1.0, max_iter=10000)

    if imbalance_handle == 'cost_sensitive':
        lr_kwargs['class_weight'] = 'balanced'
    elif imbalance_handle == 'cost_sensitive_manual':
        # Down-weight whichever class is the majority in this sample.
        if n_pos >= n_neg:
            lr_kwargs['class_weight'] = {1: n_neg / n_pos, 0: 1}
        else:
            lr_kwargs['class_weight'] = {0: n_pos / n_neg, 1: 1}

    clf = linear_model.LogisticRegression(**lr_kwargs)

    accuracy, predictions = train_model_threshold(query_id, clf_name, clf, xtrain, ytrain, xtest, ytest)
    return accuracy, predictions


In [None]:
# train clf and get predictions
def train_model_threshold(topic_id, clf_name, classifier, feature_vector_train, label, feature_vector_test, test_y, is_neural_net=False):
    """Fit ``classifier`` and return its predictions on the test features.

    The decision threshold is read from the notebook-level ``selected_threshold``.
    ``topic_id`` and ``is_neural_net`` are accepted but not used.

    Return value depends on ``clf_name``:
      * ``'LR_scores'``: ``(int(sum of class-1 probabilities), probabilities)``
      * ``'LR_scores_threshold'``: probabilities below the threshold are replaced
        by 0; returns ``(int(sum of kept scores), list of kept scores)``
      * otherwise: boolean labels (probability >= threshold) and the accuracy
        against ``test_y`` as a percentage, returned as ``(accuracy, labels)``
    """
    classifier.fit(feature_vector_train, label)

    # Notebook-level threshold setting (only read here, never modified).
    cutoff = selected_threshold

    # Column 1 of predict_proba holds the class-1 probabilities.
    pos_scores = classifier.predict_proba(feature_vector_test)[:, 1]

    if clf_name == 'LR_scores':
        return int(np.sum(pos_scores)), pos_scores

    if clf_name == 'LR_scores_threshold':
        kept_scores = [0 if score < cutoff else score for score in pos_scores]
        return int(np.sum(kept_scores)), kept_scores

    hard_labels = (pos_scores >= cutoff).astype(bool)
    acc = metrics.accuracy_score(test_y, hard_labels) * 100
    return acc, hard_labels


## load text

In [None]:
#load text data for all topics
def load_text(dataset_name):
    """Load the tokenised-text CSV for ``dataset_name`` from ``DIR/data/text/``.

    Supported names: 'CLEF2019', 'CLEF2018', 'CLEF2017', 'CLEF_all', 'TR',
    'Legal', 'RCV1_all', 'RCV1'. CLEF files are read with the default index and
    carry their text in ``title_abstract_TOKENS``; the other files are read with
    ``index_col=0`` and carry their text in ``content_toknize``. The text column
    is cast to ``str`` (needed by ``TfidfVectorizer``).

    Returns:
        pandas.DataFrame with the text column cast to str.

    Raises:
        ValueError: if ``dataset_name`` is not one of the supported names
            (previously this surfaced as an UnboundLocalError on return).
    """
    clef_col = 'title_abstract_TOKENS'
    content_col = 'content_toknize'

    # dataset name -> (csv file name, text column, extra read_csv keyword arguments)
    dataset_specs = {
        'CLEF2019': ('CLEF2019_df_all.csv', clef_col, {}),
        'CLEF2018': ('CLEF2018_df_all.csv', clef_col, {}),
        'CLEF2017': ('CLEF2017_df_all.csv', clef_col, {}),
        'CLEF_all': ('CLEF_all_years_df_all.csv', clef_col, {}),
        'TR': ('tr_test_text_toknized.csv', content_col, {'index_col': 0}),
        'Legal': ('legal_test_text_toknized.csv', content_col, {'index_col': 0}),
        'RCV1_all': ('rcv1_text_toknized.csv', content_col, {'index_col': 0}),
        'RCV1': ('rcv1_text_toknized_selected_45_0.2.csv', content_col, {'index_col': 0}),
    }

    # Validate first so a typo in the dataset name fails fast with a clear message.
    if dataset_name not in dataset_specs:
        raise ValueError(
            f"Unknown dataset_name {dataset_name!r}; expected one of {sorted(dataset_specs)}"
        )

    csv_name, text_col, read_kwargs = dataset_specs[dataset_name]
    csv_path = os.path.join(DIR, 'data', 'text', csv_name)

    text_df = pd.read_csv(csv_path, **read_kwargs)
    # .astype(str).values needed for the TfidfVectorizer
    text_df[text_col] = text_df[text_col].astype(str).values
    return text_df


# make dic of list of text features (title_abstract_TOKENS) of ranked docs for each queryid
def make_rank_text_dic_with_filter(doc_rank_dic, text_df, dataset=None):
    """Build, per query, the list of document texts in ranking order.

    Args:
        doc_rank_dic: dict mapping query id -> list of ranked doc ids. For CLEF
            datasets, doc ids with no text are removed from these lists in place.
        text_df: DataFrame returned by ``load_text``.
        dataset: dataset name; when None (default) the notebook-level
            ``dataset_name`` is used, as before.

    Behaviour by dataset:
        * 'TR' / 'RCV1': ``text_df`` is reindexed by the integer-cast doc ids and
          ``content_toknize`` is returned as strings.
        * 'Legal': ``text_df`` is reindexed by the doc ids as given.
        * anything else (CLEF): rows are filtered on ``topicid == query_id`` and
          texts are looked up by ``pmid``; ranked docs without text are reported
          and dropped from ``doc_rank_dic``.

    Returns:
        (doc_rank_dic, rank_text_dic), where ``rank_text_dic[query_id][i]`` is the
        text of ``doc_rank_dic[query_id][i]``.
    """
    active_dataset = dataset_name if dataset is None else dataset
    shared_collection = active_dataset in ('TR', 'Legal', 'RCV1')

    rank_text_dic = {}

    for query_id, doc_ids in doc_rank_dic.items():

        print(query_id)

        if shared_collection:
            # Same document collection for every topic. Reindex into a *local*
            # frame: overwriting text_df here would leave later topics looking
            # up their documents in the first topic's subset only.
            if active_dataset == 'Legal':
                # str index & doc ids
                ranked_df = text_df.reindex(doc_ids)
                rank_text_dic[query_id] = list(ranked_df['content_toknize'])
            else:
                # int index with str doc ids
                ranked_df = text_df.reindex([int(doc_id) for doc_id in doc_ids])
                # .astype(str) needed for RCV1
                rank_text_dic[query_id] = list(ranked_df['content_toknize'].astype(str))
            continue

        # CLEF datasets: keep only the current topic's records.
        topic_df = text_df[text_df['topicid'] == query_id]

        # pmid -> text, keeping the first occurrence of each pmid.
        text_by_pmid = {}
        for pmid, text in zip(topic_df['pmid'].tolist(),
                              topic_df['title_abstract_TOKENS'].tolist()):
            text_by_pmid.setdefault(pmid, text)

        kept_ids = []
        kept_texts = []
        for doc_id in doc_ids:
            pmid = int(doc_id)  # ranked ids are strings, pmids are ints
            if pmid in text_by_pmid:
                kept_ids.append(doc_id)
                kept_texts.append(text_by_pmid[pmid])
            else:
                print(f"{doc_id} not in text list of {query_id}")

        # Filter after the loop: removing from the list while iterating over it
        # skips the element following each removal and misaligns ids and texts.
        doc_ids[:] = kept_ids
        rank_text_dic[query_id] = kept_texts

    return doc_rank_dic, rank_text_dic


In [None]:
# LOAD RUN DATA with CLEF text
def load_run_data_with_text(run, dataset_name):
  """Load a run file plus the matching document texts.

  Reads ``DIR/run``, builds the per-query ranking with ``make_rank_dic``, loads
  the text table for ``dataset_name`` and aligns it with the ranking via
  ``make_rank_text_dic_with_filter``. The relevance lists are built afterwards
  from the (possibly filtered) ranking using the notebook-level ``query_rel_dic``.

  Returns:
      (doc_rank_dic, rank_rel_dic, rank_text_dic)
  """
  with open(os.path.join(DIR, run), 'r') as run_file:
    run_lines = list(run_file)

  # ranked doc ids per query id
  ranked_docs = make_rank_dic(run_lines)

  # text features of all topics, aligned to the ranking
  corpus_df = load_text(dataset_name)
  ranked_docs, ranked_texts = make_rank_text_dic_with_filter(ranked_docs, corpus_df)

  # relevance lists must be built after the ranking has been filtered
  ranked_rels = make_rank_rel_dic(query_rel_dic, ranked_docs)

  return ranked_docs, ranked_rels, ranked_texts

# Setup - shared
Code to set up the stopping methods (implemented to be independent of the processing run)

In [None]:
# Setup evaluation directory
# Directory for evaluation output files
EVALDIR = os.path.join(DIR, 'tar_eval_out/')
# Create the directory if it does not exist yet; exist_ok avoids the
# check-then-create race of testing os.path.exists first.
os.makedirs(EVALDIR, exist_ok=True)

In [None]:
# LOAD TOPIC RELEVANCE DATA
def load_rel_data(qrels):
  """Read the qrels file ``DIR/qrels`` and build the relevance dictionary.

  Returns:
      (qrel_fname, query_rel_dic): the full path of the qrels file and the
      result of ``make_rel_dic`` on its lines.
  """
  qrel_fname = os.path.join(DIR, qrels)
  with open(qrel_fname, 'r') as qrel_file:
    qrel_lines = list(qrel_file)

  # relevant doc ids per query id
  return qrel_fname, make_rel_dic(qrel_lines)

In [None]:
# LOAD RUN DATA
def load_run_data(run):
  """Read the run file ``DIR/run`` and build ranking and relevance dictionaries.

  Uses the notebook-level ``query_rel_dic`` to derive the relevance lists.

  Returns:
      (doc_rank_dic, rank_rel_dic)
  """
  with open(os.path.join(DIR, run), 'r') as run_file:
    run_lines = list(run_file)

  # ranked doc ids per query id
  ranked_docs = make_rank_dic(run_lines)
  # relevance of each ranked doc per query id
  ranked_rels = make_rank_rel_dic(query_rel_dic, ranked_docs)

  return ranked_docs, ranked_rels

### Set Parameters

In [None]:
# SET POISSON PROCESS/COX PROCESS PARAMETERS
# Sample proportions examined by the stopping methods: start at `alpha` and grow by `beta`.
# NOTE(review): np.arange with a float step can include or exclude the end point depending
# on rounding; confirm that the last value is 1.0 (a value above 1 would make the sample
# larger than the ranking).
alpha = 0.025
beta = 0.025
sample_props = list(np.arange(alpha, (1+beta), beta).round(3))

n_windows = 10  # number of windows to make from sample

# SET EXPERIMENTAL PARAMETERS
des_recalls = [.9999, 0.95, 0.9, 0.8, 0.7] # desired recalls to experiment over
des_probs = [0.95, 0.8] # desired confidences to experiment over

# Probability threshold read by train_model_threshold when turning scores into labels
selected_threshold = 0.5 # optimised (models default)

min_doc_in_sample = 10 # min number of docs that must be in the sample to proceed with the pp algorithm
min_rel_in_sample = 20 # min number of relevant docs that must be in the sample to proceed with the algorithm
min_rel_in_sample_hold = 20 # initial value; min_rel_in_sample is reset to this at the start of every topic
min_rel_in_sample_type = 'dynamic' # 'dynamic' lowers min_rel_in_sample as the sample grows

### run_tar_eval

In [None]:
# Updated version of function that returns MACRO average (as summarised by tar_eval)
# Function to evaluate output file and return scores for range of metrics

# Runs tar_eval script and parses output
# Uses same metrics as Li and Kanoulas:
# 1) recall
# 2) cost (== percentage effort)
# 3) relative error (absolute diff between recall achieved and target recall)
# 4) loss_er (from tar_eval)
# 5) reliability (%age of times desired recall is achieved)
def run_tar_eval(qrel_fname, out_fname, des_recall):
    """Run ``scripts/tar_eval.py`` on an output file and macro-average its per-topic scores.

    The script is executed as ``python <DIR>/scripts/tar_eval.py qrel_fname out_fname``.
    Every non-empty output line is split into ``topic key value``; lines for the
    topic ``ALL`` are ignored, and a ``topic_id`` key starts a fresh record for
    that topic.

    Per topic the following are derived:
      * recall      = rels_found / num_rels
      * cost        = num_shown / num_docs
      * reliability = 1 if recall >= des_recall else 0
      * loss_er     = value reported by the script
      * rel_error   = |recall - des_recall| / des_recall

    Returns:
        (recall, cost, reliability, loss_er, rel_error): the mean over topics of
        each metric, each formatted as a string with three decimals.
    """
    tar_eval_script = os.path.join(DIR, 'scripts/tar_eval.py')

    raw_output = subprocess.check_output(['python', tar_eval_script, qrel_fname, out_fname])
    report = raw_output.decode(encoding='utf-8')

    # Collect the key/value pairs reported for each individual topic.
    per_topic = {}
    for row in report.split('\n'):
        if row == '':
            continue
        tid, key, val = row.split()
        if tid == 'ALL':
            continue
        if key == 'topic_id':
            per_topic[tid] = {}
        per_topic[tid][key] = val

    recalls, costs, reliabilities, loss_ers, rel_errors = [], [], [], [], []
    for stats in per_topic.values():
        recall = float(stats['rels_found']) / float(stats['num_rels'])
        recalls.append(recall)

        cost = float(stats['num_shown']) / float(stats['num_docs'])
        costs.append(cost)

        reliabilities.append(1 if recall >= des_recall else 0)

        # loss_er is available directly (still a string at this point)
        loss_ers.append(stats['loss_er'])

        rel_errors.append(np.abs(recall - des_recall) / des_recall)

    def macro_mean(values):
        # mean over topics, rendered with three decimals
        return "{:.3f}".format(np.array(values).astype(float).mean())

    recall_all = macro_mean(recalls)
    cost_all = macro_mean(costs)
    reliability_all = macro_mean(reliabilities)
    loss_er_all = macro_mean(loss_ers)
    rel_error_all = macro_mean(rel_errors)

    return recall_all, cost_all, reliability_all, loss_er_all, rel_error_all

### run_oracle

In [None]:
# Function to run ORACLE METHOD (OR)
def run_oracle_method(des_recall, topics_list):
    """Oracle stopping method: stop at the rank where ``des_recall`` is first reached.

    For each topic in ``topics_list`` the relevance list from the notebook-level
    ``rank_rel_dic`` is used to find the rank of the
    ``ceil(des_recall * n_relevant)``-th relevant document, and every ranked
    document up to and including that rank is written to
    ``EVALDIR/<dataset_name>_Oracle_<des_recall>.txt``. The file is then scored
    with ``run_tar_eval``.

    Topics for which the required number of relevant documents is zero (no
    relevant documents at all, or a target that rounds to zero) contribute no
    lines to the output file.

    Returns:
        (recall, cost, reliability, loss_er, rel_err) as returned by ``run_tar_eval``.
    """
    out_fname = os.path.join(EVALDIR, dataset_name + "_Oracle_" + str(des_recall) + ".txt")

    # The context manager guarantees the file is closed (and flushed) before it
    # is evaluated, even if a topic raises part-way through.
    with open(out_fname, "w") as out_f:
        for query_id in topics_list:
            rel_list = rank_rel_dic[query_id]  # list binary rel of ranked docs
            rel_doc_idxs = np.where(np.array(rel_list) == 1)[0]
            oracle_n_rel = math.ceil(len(rel_doc_idxs) * des_recall)

            # Nothing needs to be examined; indexing with oracle_n_rel - 1 == -1
            # would fail on an empty array or silently pick the LAST relevant doc.
            if oracle_n_rel <= 0:
                continue

            oracle_idx = rel_doc_idxs[oracle_n_rel - 1]

            ranked_docs = doc_rank_dic[query_id]
            for i in range(oracle_idx + 1):
                out_f.write(f"{query_id}\tAF\t{ranked_docs[i]}\t{i + 1}\t{-i}\tmyrun\n")

    # Evaluate results
    recall, cost, reliability, loss_er, rel_err = run_tar_eval(qrel_fname, out_fname, des_recall)

    return recall, cost, reliability, loss_er, rel_err


### Rate Functions

In [None]:


# Functions encoding relevant distribution models
# Exponential model
def exp_model_func(x, a, k): # x = vector x values
    """Exponential rate model: ``a * exp(-k * x)``."""
    decay = np.exp(-k*x)
    return a*decay

# Power law
def power_model_func(x, a, k): # x = vector x values
    """Power-law rate model: ``a * x ** k``."""
    powered = x**k
    return a*powered

# AP Prior distribution
def apprior_model_func(x, a): # x = vector x values
    """AP-prior rate model: ``a * (n_docs / x)``, with ``n_docs`` read from the notebook globals."""
    scaled_inverse_rank = n_docs / x
    return a * scaled_inverse_rank

# hyperbolic model
def hyperbolic_model_func(x, a, b, k):
    """Hyperbolic rate model: ``a / (1 + b*k*x) ** (1/b)``."""
    denominator = (1.0+b*k*x)**(1.0/b)
    return a/denominator

# Integral of model functions
def model_integral(a, k, n_docs, model):
    """Closed-form integral of the rate model selected by ``model``.

    * ``"E"``: ``(a / -k) * (exp(-k * n_docs) - 1)``
    * ``"P"``: ``(a / (k + 1)) * (n_docs ** (k + 1) - 1)``
    * ``"A"``: ``a * n_docs / (n_docs * log(n_docs) - lgamma(n_docs + 1))``

    Any other ``model`` value yields ``None``.
    """
    if model == "E":
        growth = np.exp(-k*n_docs) - 1
        return (a/-k)*growth

    if model == "P":
        exponent = k+1
        return (a/exponent)*(n_docs**exponent - 1)

    if model == "A":
        normaliser = (n_docs * math.log(n_docs)) - math.lgamma(n_docs + 1)
        return a * (n_docs / normaliser)

    return None




# Integral of model functions
def model_integral_b(a, b, k, n_docs, model):
    """Closed-form integral of the rate model selected by ``model``.

    * ``"E"``: ``(a / -k) * (exp(-k * n_docs) - 1)``
    * ``"P"``: ``(a / (k + 1)) * (n_docs ** (k + 1) - 1)``
    * ``"A"``: ``a * n_docs / (n_docs * log(n_docs) - lgamma(n_docs + 1))``
    * ``"H"``: integral over ``[0, n_docs]`` of the hyperbolic rate
      ``a / (1 + b*k*x) ** (1/b)``:
        - ``b == 1`` (harmonic): ``(a / k) * log(k * n_docs + 1)``
        - otherwise: ``a / (k * (b - 1)) * ((1 + b*k*n_docs) ** (1 - 1/b) - 1)``

    ``b`` is only used by the ``"H"`` model. Any other ``model`` value yields ``None``.
    """
    if model == "E":
        return (a/-k)*(np.exp(-k*n_docs)-1)

    if model == "P":
        return (a/(k+1))*(n_docs**(k+1)-1)

    if model == "A":
        return a * (n_docs / ((n_docs * math.log(n_docs)) - math.lgamma(n_docs + 1)))

    if model == "H":
        if b == 1:
            # hyperbolic becomes harmonic
            return (a/k) * np.log((k*n_docs)+1)

        # Substituting u = 1 + b*k*x gives a/(b*k) * u**(1-1/b) / (1-1/b), i.e. a
        # prefactor of a / (k*(b-1)). The divisor must be parenthesised
        # (a/k*(b-1) evaluates as (a/k)*(b-1)), and the lower limit u = 1
        # contributes the "- 1", which also makes this branch converge to the
        # harmonic one as b -> 1.
        upper = np.power(1 + (b*k*n_docs), 1 - (1/b))
        return (a / (k*(b-1))) * (upper - 1)

    return None

### Run Point Process

In [None]:
def run_point_process_GridSearch(des_recall, des_prob, topics_list, process_type, model):
    """Run the point-process stopping method over ``topics_list`` and evaluate it.

    For every topic the examined sample grows through the proportions in the
    notebook-level ``sample_props``. At each step with enough documents and
    enough relevant documents in the sample, a rate curve is fitted to windowed
    relevance counts; if the fit error is below the global ``RMSE`` the number
    of relevant documents in the unexamined part is predicted with
    ``predict_n_rel``. Growth stops once ``des_recall`` of the predicted total
    has already been found in the sample (or the proportions are exhausted).
    The first ``n_samp_docs`` ranked documents of each topic are written to
    ``EVALDIR/<dataset_name>_<process_type>_<model>_<des_recall>_<des_prob>.txt``
    and the file is scored with ``run_tar_eval``.

    Reads the notebook-level names ``dataset_name``, ``EVALDIR``,
    ``doc_rank_dic``, ``rank_rel_dic``, ``sample_props``, ``n_windows``,
    ``min_doc_in_sample``, ``min_rel_in_sample_type``, ``qrel_fname`` and
    ``RMSE``; writes the globals ``n_docs`` and ``min_rel_in_sample``.

    Returns:
        (recall, cost, reliability, loss_er, rel_err) from ``run_tar_eval``.

    NOTE(review): invalid arguments return the 3-tuple ``(0, 0, 0)`` while the
    normal path returns five values - a caller unpacking five values will fail.
    NOTE(review): only model "P" has point-extraction / curve-fit code and only
    process type "IP" assigns ``pred_n_rel``; the other values accepted by the
    argument check appear to reach unassigned names - confirm before using them.
    """

    # des_recall: desired recall
    # des_prob: confidence in des_recall
    # topics_list: list of topics to process
    # process_type: type of point process (either "IP" or "CX")
    # model: model for rate function (either "P": power model; "E":exponential model)

    # print(f"point process: type {process_type}, model {model}")

    # n_docs variable needs to be made global so it can be accessed by
    # curve fit and integral functions
    global n_docs
    #global min_rel_in_sample

    global RMSE,min_rel_in_sample,alpha, beta
    global min_rel_in_sample_hold # keep value


    # Keep count of figures drawn
    figure_count = 1

    # Do check that input is valid
    if not ((process_type == "IP" or process_type == "CX") and
            (model == "P" or model == "E" or model == "A" or model == "H")):
        #print("Incorrect arguments to run_point_process - exiting\n")
        return 0, 0, 0

    # Create output file
    out_fname = dataset_name+"_"+str(process_type)+"_"+str(model)+"_"+str(des_recall)+"_"+str(des_prob)+".txt"
    out_fname = os.path.join(EVALDIR, out_fname)
    # NOTE(review): the file is not opened with a context manager, so it stays open if an
    # exception escapes the loop below.
    out_f = open(out_fname, "w+")  #  Create a new file if it doesn't exist


    global actual_predicted_df
     # empty df before each topic for each run

    for query_id in topics_list:
        min_rel_in_sample = min_rel_in_sample_hold # 27-7-22 update for each topic

        #print('*************** query_id: ' , query_id)

        # EXTRACT COUNTS AND REL LISTS
        n_docs = len(doc_rank_dic[query_id])  # total n. docs in topic
        rel_list = rank_rel_dic[query_id]  # list binary rel of ranked docs

        # Initialise count of documents in sample
        n_samp_docs = int(round(n_docs*sample_props[0]))

        windows_end_point = 0
        pred_stop_n = n_docs
        i = 0



        #print(i , len(sample_props), pred_stop_n ,n_samp_docs)
        # Grow the sample until the proportions run out or the predicted stopping
        # point no longer exceeds the current sample size.
        while (i < len(sample_props)) and (pred_stop_n > n_samp_docs):
          #print(" Entered while (i < len(sample_props)) and (pred_stop_n > n_samp_docs)")
          #print('i: ', i, 'sample_props: ', sample_props[i] )



          # Check that enough relevant documents have been observed
          n_samp_docs = int(round(n_docs*sample_props[i]))
          sample_rel_list = rel_list[0:n_samp_docs]  # chunk of rel list examined in sample
          # print(f"Sample: {sample_props[i]} - rel found {np.sum(sample_rel_list)}")


          # check min docs in sample
          if n_samp_docs < min_doc_in_sample:
            #print("docs in sample too little: " , n_samp_docs)
            i = i + 1
            continue # skip to next iteration


          # calculate all actual vs. predicted results
          n_unobserved_docs = n_docs - n_samp_docs
          n_rel = np.sum(rel_list)
          n_rel_at_end_samp = np.sum(sample_rel_list)
          n_rel_unobserved = n_rel - n_rel_at_end_samp

          min_rel_in_sample_flag = 0 #determine if min_rel_in_sample achieved
          if (np.sum(sample_rel_list) >= min_rel_in_sample):

                #print("min_rel_in_sample : " , min_rel_in_sample)
                #print("n_rel_at_end_samp: ", n_rel_at_end_samp)
                min_rel_in_sample_flag = 1 #set to 1 if min_rel_in_sample achieved
                # print("Running point process")
                sample_prop = sample_props[i]
                # print(f"sample_prop {sample_prop}")

                n_samp_docs = int(round(n_docs*sample_props[i]))
                sample_rel_list = rel_list[0:n_samp_docs]  # chunk of rel list examined in sample




                # get points
                windows = make_windows(n_windows, n_samp_docs)
                window_size = windows[0][1]


                # calculate points that will be used to fit curve
                # NOTE(review): x and y are only assigned for model "P".
                if model == "P":
                    x,y = get_points_power(windows, window_size, sample_rel_list)


                y5=y[5:] # every point from index 5 onwards (the last 5 when there are 10 points)


                # try to fit curve
                good_curve_fit = 0
                # All-zero values from index 5 onwards end processing for this topic.
                if sum(y5) == 0 and n_rel_at_end_samp >= min_rel_in_sample: # 27-7-22 check available min_rel_in_sample
                  break # stop for this topic
                else:
                  try:
                        if model == "P":
                          p0 = [0.1, 0.001 ]  # initialise curve parameters
                          opt, pcov = curve_fit(power_model_func, x, y, p0)  # fit curve
                          good_curve_fit = 1

                          #print(opt)
                          #print(pcov)

                          # Compute residuals and draw graphs as sanity check
                          p1 = opt[0]
                          p2 = opt[1]
                          residuals = np.array(y - power_model_func(x,p1, p2))
                          diff = np.max(y) - np.min(y)
                          fres = sum(residuals**2) / diff
                          #print(f"Norm RMSE: {fres}")
                  # Any failure in the fit is swallowed; good_curve_fit then decides
                  # whether this sample size is used (it is set before the residual
                  # computation, so it may already be 1 if that computation failed).
                  except Exception as error:
                      pass
                      # e = str(error)
                      # print(e)


                # Run point process
                if(good_curve_fit == 1):
                    # get y-values for fitted curve
                    if model == "P":
                      a, k = opt
                      y2 = power_model_func(x, a, k)

                    n_rel_at_end_samp = np.sum(sample_rel_list)

                    # Check error in curve fit: sum of squared residuals divided by the range of y
                    if model == "P":
                        predicted_y = power_model_func(x, a, k)

                    residuals = np.array(y - predicted_y)
                    diff = np.max(y) - np.min(y)
                    norm_rmse = sum(residuals**2) / diff



                    counting_process_flag = 0 # determine if counting process applied or not


                    if(norm_rmse < RMSE):
                        counting_process_flag = 1 # set to 1 if counting process applied

                        # Run point process (Inhomogenous Poisson or Cox Proc.)
                        # Inhom Poisson process
                        if process_type == "IP":
                            # expected number of relevant docs between the end of the sample and the end of the ranking
                            mu = model_integral(a, k, n_docs, model) - model_integral(a, k, n_samp_docs, model)

                            pred_unobserved = predict_n_rel(des_prob, n_unobserved_docs, mu) # 18-4-22 update
                            pred_n_rel = n_rel_at_end_samp + pred_unobserved
                            # print(f"pred_n_rel: {pred_n_rel} (n_rel_at_end_samp: {n_rel_at_end_samp} pred_unobserved: {pred_unobserved})")


                        # Target already reached inside the sample: setting pred_stop_n to the
                        # number of relevant docs found (never above n_samp_docs) ends the while loop.
                        des_n_rel = des_recall*pred_n_rel
                        if des_n_rel <= n_rel_at_end_samp:
                            pred_stop_n = n_rel_at_end_samp



          # decrease needed min_rel_in_sample while increasing sample size ## 27-7-22
          if min_rel_in_sample_type == 'dynamic':
            if (min_rel_in_sample > 0):
              #min_rel_in_sample = int(min_rel_in_sample_hold - (sample_props[i]*min_rel_in_sample_hold)) # by % instead of fixed number
              min_rel_in_sample = int(min_rel_in_sample - (sample_props[i]*min_rel_in_sample)) # by % instead of fixed number


          i += 1  # increase sample proportion size


        # Write output file: every ranked document inside the final sample
        for i in range(n_samp_docs):
            out_f.write(f"{query_id}\tAF\t{doc_rank_dic[query_id][i]}\t{i + 1}\t{-i}\tmyrun\n")

    # Compute results
    out_f.close()
    # recall, acc, perc_eff_saved = run_tar_eval(qrel_fname, out_fname, des_recall)
    recall, cost, reliability, loss_er, rel_err = run_tar_eval(qrel_fname, out_fname, des_recall)

    return recall, cost, reliability, loss_er, rel_err



In [None]:
def run_point_process_GridSearch_with_classification(des_recall, des_prob, topics_list, process_type, model, clf_name, clf):
    """Run the classifier-assisted point-process stopping rule over a list of topics.

    For every topic the ranking is examined in growing samples (one per entry of
    the global ``sample_props``).  Once a sample is large enough and holds at
    least ``min_rel_in_sample`` relevant documents, a classifier trained on the
    sample labels the unexamined documents, a rate curve is fitted to the
    resulting relevance counts and, if the fit error is below the global
    ``RMSE``, an inhomogeneous Poisson process estimates the total number of
    relevant documents.  Examination stops once the sample already holds
    ``des_recall`` of that estimate.  The examined prefix of each ranking is
    written to a run file in ``EVALDIR`` which is then scored by ``run_tar_eval``.

    Args:
        des_recall: desired recall.
        des_prob: confidence in ``des_recall``.
        topics_list: list of topics to process.
        process_type: type of point process ("IP" or "CX").  Only "IP" produces
            a prediction in this function; with "CX" no early stop is attempted.
        model: model for the rate function.  "P" (power) and "E" (exponential)
            are fitted; "A" and "H" pass validation but no curve fit is
            implemented for them here, so they never trigger an early stop via
            the point process.
        clf_name: classifier name, forwarded to ``run_classification_model_tfidf``
            and used in the output file name.
        clf: not used in the body; kept so existing callers keep working.

    Returns:
        ``(recall, cost, reliability, loss_er, rel_err)`` as returned by
        ``run_tar_eval``, or five zeros when the arguments are invalid.

    Module-level names read: ``doc_rank_dic``, ``rank_rel_dic``, ``rank_text_dic``,
    ``sample_props``, ``min_doc_in_sample``, ``n_windows``, ``dataset_name``,
    ``dataset_imbalance_handle``, ``EVALDIR``, ``qrel_fname``, ``RMSE``,
    ``min_rel_in_sample_hold``, ``min_rel_in_sample_type``.
    Module-level names written: ``n_docs``, ``min_rel_in_sample``.
    """
    # n_docs needs to be global so it can be accessed by the curve fit and
    # integral functions (per the original author's note).
    global n_docs
    global RMSE, min_rel_in_sample, min_rel_in_sample_hold

    # Do check that input is valid
    if process_type not in ("IP", "CX") or model not in ("P", "E", "A", "H"):
        # Same arity as the normal return: callers build a 5-column result row.
        return 0, 0, 0, 0, 0

    # Rate function to fit; None for the models that have no fit implemented here.
    if model == "E":
        rate_func = exp_model_func
    elif model == "P":
        rate_func = power_model_func
    else:
        rate_func = None

    # Create output file
    out_fname = "PP_CLF_"+clf_name+"_"+dataset_name+"_"+str(process_type)+"_"+str(model)+"_"+str(des_recall)+"_"+str(des_prob)+".txt"
    out_fname = os.path.join(EVALDIR, out_fname)

    # The context manager closes the run file even if a topic raises.
    with open(out_fname, "w+") as out_f:  # creates the file if it doesn't exist

        for query_id in topics_list:

            # Restart the (possibly shrinking) threshold for each topic
            min_rel_in_sample = min_rel_in_sample_hold

            # EXTRACT COUNTS AND REL LISTS
            n_docs = len(doc_rank_dic[query_id])   # total n. docs in topic
            rel_list = rank_rel_dic[query_id]      # binary relevance of ranked docs
            text_list = rank_text_dic[query_id]    # text of ranked docs
            tfidf_list = TfidfVectorizer(min_df=2).fit_transform(text_list)  # tf-idf features

            # Initialise count of documents in sample
            n_samp_docs = int(round(n_docs*sample_props[0]))
            pred_stop_n = n_docs
            i = 0

            # With classification the whole set cannot be used for training: some
            # documents must remain as the test set, hence sample_props[i] != 1.
            while (i < len(sample_props)) and (pred_stop_n > n_samp_docs) and sample_props[i] != 1:

                n_samp_docs = int(round(n_docs*sample_props[i]))
                sample_rel_list = rel_list[0:n_samp_docs]  # chunk of rel list examined in sample

                # Sample too small: move straight to the next proportion
                # (the dynamic threshold below is deliberately not touched).
                if n_samp_docs < min_doc_in_sample:
                    i += 1
                    continue

                n_unobserved_docs = n_docs - n_samp_docs
                n_rel_at_end_samp = np.sum(sample_rel_list)

                # Only model the topic once enough relevant documents were observed
                if n_rel_at_end_samp >= min_rel_in_sample:

                    if n_rel_at_end_samp == len(sample_rel_list):
                        # Only one class in the sample: a classifier cannot be
                        # trained, so use the sample windows and original labels.
                        windows = make_windows(n_windows, n_samp_docs)
                        updated_rel_list = rel_list.copy()
                    else:
                        # Train on the sample, predict the remainder, and replace
                        # the unexamined labels with the predictions (on a copy,
                        # so rel_list itself stays untouched).
                        _, predictions = run_classification_model_tfidf(query_id, n_samp_docs, n_docs, rel_list, sample_rel_list, tfidf_list, clf_name, dataset_imbalance_handle)
                        updated_rel_list = rel_list.copy()
                        updated_rel_list[n_samp_docs:n_docs] = predictions
                        windows = make_windows(n_windows, n_docs)  # training and test parts
                    window_size = windows[0][1]

                    # Points that will be used to fit the curve
                    if model == "P":
                        x, y = get_points_power(windows, window_size, updated_rel_list)
                    else:  # "E", "A" or "H"
                        x, y = get_points(windows, window_size, updated_rel_list)

                    # No relevant documents in any window after the first five:
                    # stop for this topic.
                    if sum(y[5:]) == 0 and n_rel_at_end_samp >= min_rel_in_sample:
                        break

                    # Try to fit the curve; a failed fit is deliberately ignored
                    # and simply means no prediction for this sample size.
                    opt = None
                    if rate_func is not None:
                        try:
                            opt, _ = curve_fit(rate_func, x, y, [0.1, 0.001])
                        except Exception:
                            opt = None

                    if opt is not None:
                        a, k = opt

                        # Fit error: sum of squared residuals divided by the range of y
                        predicted_y = rate_func(x, a, k)
                        residuals = np.array(y - predicted_y)
                        diff = np.max(y) - np.min(y)
                        norm_rmse = sum(residuals**2) / diff

                        if norm_rmse < RMSE:
                            # Inhomogeneous Poisson process.  The stop check lives
                            # inside this branch because it needs pred_n_rel, which
                            # only exists for "IP".
                            if process_type == "IP":
                                mu = model_integral(a, k, n_docs, model) - model_integral(a, k, n_samp_docs, model)
                                pred_unobserved = predict_n_rel(des_prob, n_unobserved_docs, mu)
                                pred_n_rel = n_rel_at_end_samp + pred_unobserved

                                des_n_rel = des_recall*pred_n_rel
                                if des_n_rel <= n_rel_at_end_samp:
                                    # This value is <= n_samp_docs, which ends the while loop
                                    pred_stop_n = n_rel_at_end_samp

                # Decrease the needed min_rel_in_sample while the sample grows
                # (by a percentage instead of a fixed number)
                if min_rel_in_sample_type == 'dynamic':
                    if min_rel_in_sample > 0:
                        min_rel_in_sample = int(min_rel_in_sample - (sample_props[i]*min_rel_in_sample))

                i += 1  # increase sample proportion size

            # Write the examined prefix of the ranking to the run file
            for rank_idx in range(n_samp_docs):
                out_f.write(f"{query_id}\tAF\t{doc_rank_dic[query_id][rank_idx]}\t{rank_idx + 1}\t{-rank_idx}\tmyrun\n")

    # Compute results (the run file is closed at this point)
    recall, cost, reliability, loss_er, rel_err = run_tar_eval(qrel_fname, out_fname, des_recall)

    return recall, cost, reliability, loss_er, rel_err



# Experiments Functions

### run OR

In [None]:
# for quick results run ip-p only
# Function to call stopping approaches and collect results together
def run_sp_approaches_OR(des_recall, des_prob):
    """Score the oracle stopping method for one (recall, confidence) setting.

    Reads the module-level ``doc_rank_dic`` and ``dataset_name``.

    Args:
        des_recall: desired recall, passed to ``run_oracle_method``.
        des_prob: desired confidence; only used to build the run name.

    Returns:
        A one-entry dict mapping the run name
        ``<dataset_name>_<des_prob>-<des_recall>-OR`` to whatever
        ``run_oracle_method`` returns.
    """
    # Topic order comes from make_topics_list (per the original comment this
    # sorts topics by number of documents).
    ordered_topics = make_topics_list(doc_rank_dic, 1)

    # ORACLE METHOD
    print("Running oracle method")
    oracle_key = "".join([dataset_name, '_', str(des_prob), '-', str(des_recall), '-OR'])

    return {oracle_key: run_oracle_method(des_recall, ordered_topics)}


# RUN EXPERIMENTS
def run_experiments_OR():
    """Run the oracle method for every configured confidence/recall pair.

    Iterates over the module-level ``des_probs`` and ``des_recalls``, collects
    the scores returned by ``run_sp_approaches_OR`` and displays the running
    table after each setting.

    Returns:
        A DataFrame indexed by run name (sorted descending) with the columns
        recall, cost, reliability, loss_er, rel_err, dataset, des_prob and
        des_recall.  If nothing is run, an empty frame with the five score
        columns is returned.
    """
    result_columns = ['recall', 'cost', 'reliability', 'loss_er', 'rel_err']
    df = pd.DataFrame(columns=result_columns)

    for prob in des_probs:
        print("Confidence level: ", prob)
        for recall in des_recalls:
            print(f"recall: {recall}")
            results_dict = run_sp_approaches_OR(recall, prob)
            df_tmp = pd.DataFrame.from_dict(results_dict, orient="index", columns=result_columns)

            df_tmp['dataset'] = dataset_name
            df_tmp['des_prob'] = prob
            df_tmp['des_recall'] = recall

            # DataFrame.append was removed in pandas 2.0; pd.concat is the
            # supported replacement.  The empty seed frame is skipped so it
            # cannot influence the dtypes of the result.
            df = df_tmp if df.empty else pd.concat([df, df_tmp])
            df = df.sort_index(ascending=False)
            display(df)

    return df


## run GS

In [None]:
def set_point_process_GridSearch(des_prob,des_recall,topics_list,run_score_dic, pp, m, r, relv, a, b):
    """Publish one grid point as module-level settings and score it.

    The point-process runner takes its tuning values from module-level globals,
    so this function writes them before calling ``run_point_process_GridSearch``
    and stores the returned scores in ``run_score_dic`` under the run name
    ``<dataset_name>_<des_prob>-<des_recall>-<pp>-<m>``.

    Args:
        des_prob: desired confidence.
        des_recall: desired recall.
        topics_list: topics to process.
        run_score_dic: dict updated in place with the scores of this run.
        pp: point process type (e.g. "IP").
        m: rate-function model (e.g. "P").
        r: threshold stored in the global ``RMSE``.
        relv: minimum number of relevant documents in a sample, or the string
            'dynamic20-sample' for a threshold that starts at 20 and is handled
            as 'dynamic' by the runner.
        a: first sample proportion (global ``alpha``).
        b: step between sample proportions (global ``beta``).

    Returns:
        None.
    """
    global min_rel_in_sample_type
    global model, RMSE, min_rel_in_sample, alpha, beta
    # These two were previously assigned as locals, so the values derived from
    # the grid never left this function.  The classification runner reads
    # min_rel_in_sample_hold as a global and the runners index sample_props.
    # NOTE(review): confirm run_point_process_GridSearch expects the same globals.
    global min_rel_in_sample_hold, sample_props

    if relv == 'dynamic20-sample':
        min_rel_in_sample_type = 'dynamic'
        relv = 20  # starting threshold for the dynamic scheme
    else:
        min_rel_in_sample_type = 'static'

    point_process = pp
    model = m
    RMSE = r
    min_rel_in_sample = relv
    min_rel_in_sample_hold = relv
    alpha = a
    beta = b
    # Sample proportions alpha, alpha+beta, ... rounded to 3 decimals
    sample_props = list(np.arange(alpha, (1+beta), beta).round(3))

    print("Running-" +point_process + '-' +model)

    run_name = dataset_name + '_'+ str(des_prob)+'-'+str(des_recall)+'-'+point_process+'-'+model

    print(run_name)
    run_score_dic[run_name] = run_point_process_GridSearch(des_recall, des_prob, topics_list, point_process, model)




# Function to call stopping approaches and collect results together
def run_sp_approaches_GridSearch(des_recall, des_prob):
    """Score every point of the module-level ``grid`` for one setting.

    Args:
        des_recall: desired recall.
        des_prob: desired confidence.

    Returns:
        Dict filled by ``set_point_process_GridSearch``, one entry per run name.
    """
    scores_by_run = {}  # filled in place by set_point_process_GridSearch

    # Per the original comment, make_topics_list sorts topics by number of documents
    ordered_topics = make_topics_list(doc_rank_dic, 1)

    for combo in grid:
        set_point_process_GridSearch(
            des_prob, des_recall, ordered_topics, scores_by_run,
            combo['point_process'], combo['model'], combo['RMSE'],
            combo['min_rel_in_sample'], combo['alpha'], combo['beta'],
        )

    return scores_by_run

# RUN EXPERIMENTS
def run_experiments_GridSearch():
    """Run the grid-search point-process experiments for the current dataset.

    Iterates over the module-level ``des_probs`` and ``des_recalls``, collects
    the scores returned by ``run_sp_approaches_GridSearch`` and displays the
    running table after each setting.

    Returns:
        A DataFrame indexed by run name (sorted descending) with the columns
        recall, cost, reliability, loss_er, rel_err, dataset, des_prob and
        des_recall.  If nothing is run, an empty frame with the five score
        columns is returned.
    """
    print(dataset_name)
    result_columns = ['recall', 'cost', 'reliability', 'loss_er', 'rel_err']
    df = pd.DataFrame(columns=result_columns)

    for prob in des_probs:
        print("Confidence level: ", prob)
        for recall in des_recalls:
            print(f"recall: {recall}")
            results_dict = run_sp_approaches_GridSearch(recall, prob)
            df_tmp = pd.DataFrame.from_dict(results_dict, orient="index", columns=result_columns)

            # format paper table
            df_tmp['dataset'] = dataset_name
            df_tmp['des_prob'] = prob
            df_tmp['des_recall'] = recall

            # DataFrame.append was removed in pandas 2.0; pd.concat is the
            # supported replacement.  The empty seed frame is skipped so it
            # cannot influence the dtypes of the result.
            df = df_tmp if df.empty else pd.concat([df, df_tmp])
            df = df.sort_index(ascending=False)
            display(df)

    return df




## run GS with Classification

In [None]:
from sklearn.model_selection import ParameterGrid

# Placeholder value; set_point_process_GridSearch_clf overwrites it with
# 'dynamic' or 'static' before each run.
min_rel_in_sample_type = 'xx'

# Single-point search space for the classification runs.
# NOTE(review): run_point_process_GridSearch_with_classification only fits
# models "E" and "P"; with model "H" no curve is fitted - confirm this is intended.
hyperparameters_space = dict(
    point_process=['IP'],
    model=['H'],
    RMSE=[0.1],
    min_rel_in_sample=['dynamic20-sample'],
    alpha=[0.025],
    beta=[0.025],
)

grid = ParameterGrid(hyperparameters_space)

In [None]:
def set_point_process_GridSearch_clf(des_prob,des_recall,clf_name, topics_list,run_score_dic, pp, m, r, relv, a, b):
    """Publish one grid point as module-level settings and score it with a classifier.

    ``run_point_process_GridSearch_with_classification`` takes its tuning values
    from module-level globals, so this function writes them before calling it
    and stores the returned scores in ``run_score_dic`` under the run name
    ``<dataset_name>_<des_prob>-<des_recall>-<pp>-<m>-<clf_name>``.

    Args:
        des_prob: desired confidence.
        des_recall: desired recall.
        clf_name: classifier name; passed to the runner as both ``clf_name``
            and ``clf``.
        topics_list: topics to process.
        run_score_dic: dict updated in place with the scores of this run.
        pp: point process type (e.g. "IP").
        m: rate-function model (e.g. "P").
        r: threshold stored in the global ``RMSE``.
        relv: minimum number of relevant documents in a sample, or the string
            'dynamic20-sample' for a threshold that starts at 20 and is handled
            as 'dynamic' by the runner.
        a: first sample proportion (global ``alpha``).
        b: step between sample proportions (global ``beta``).

    Returns:
        None.
    """
    global min_rel_in_sample_type
    global model, RMSE, min_rel_in_sample, alpha, beta
    # These two were previously assigned as locals, so the values derived from
    # the grid never reached the runner, which reads min_rel_in_sample_hold and
    # sample_props as module-level globals.
    global min_rel_in_sample_hold, sample_props

    if relv == 'dynamic20-sample':
        min_rel_in_sample_type = 'dynamic'
        relv = 20  # starting threshold for the dynamic scheme
    else:
        min_rel_in_sample_type = 'static'

    point_process = pp
    model = m
    RMSE = r
    min_rel_in_sample = relv
    min_rel_in_sample_hold = relv
    alpha = a
    beta = b
    # Sample proportions alpha, alpha+beta, ... rounded to 3 decimals
    sample_props = list(np.arange(alpha, (1+beta), beta).round(3))

    print("Running-" +point_process + '-' +model)

    run_name = dataset_name + '_'+ str(des_prob)+'-'+str(des_recall)+'-'+point_process+'-'+model+"-"+clf_name

    print(run_name)
    run_score_dic[run_name] = run_point_process_GridSearch_with_classification(des_recall, des_prob, topics_list, point_process, model, clf_name, clf_name)



# Function to call stopping approaches and collect results together
def run_sp_approaches_clf(des_recall, des_prob, clf_name):
    """Score every point of the module-level ``grid`` with the given classifier.

    Args:
        des_recall: desired recall.
        des_prob: desired confidence.
        clf_name: classifier name forwarded to ``set_point_process_GridSearch_clf``.

    Returns:
        Dict filled by ``set_point_process_GridSearch_clf``, one entry per run name.
    """
    scores_by_run = {}  # filled in place by set_point_process_GridSearch_clf

    # Per the original comment, make_topics_list sorts topics by number of documents
    ordered_topics = make_topics_list(doc_rank_dic, 1)

    for combo in grid:
        set_point_process_GridSearch_clf(
            des_prob, des_recall, clf_name, ordered_topics, scores_by_run,
            combo['point_process'], combo['model'], combo['RMSE'],
            combo['min_rel_in_sample'], combo['alpha'], combo['beta'],
        )

    return scores_by_run

# RUN EXPERIMENTS
def run_experiments_clf(clf_name):
    """Run the classifier-assisted grid-search experiments for the current dataset.

    Iterates over the module-level ``des_probs`` and ``des_recalls``, collects
    the scores returned by ``run_sp_approaches_clf`` and displays the running
    table after each setting.

    Args:
        clf_name: classifier name forwarded to ``run_sp_approaches_clf``.

    Returns:
        A DataFrame indexed by run name (sorted descending) with the columns
        recall, cost, reliability, loss_er, rel_err, dataset, des_prob and
        des_recall.  If nothing is run, an empty frame with the five score
        columns is returned.
    """
    print(dataset_name)
    result_columns = ['recall', 'cost', 'reliability', 'loss_er', 'rel_err']
    df = pd.DataFrame(columns=result_columns)

    for prob in des_probs:
        print("Confidence level: ", prob)
        for recall in des_recalls:
            print(f"recall: {recall}")
            results_dict = run_sp_approaches_clf(recall, prob, clf_name)
            df_tmp = pd.DataFrame.from_dict(results_dict, orient="index", columns=result_columns)

            # format paper table
            df_tmp['dataset'] = dataset_name
            df_tmp['des_prob'] = prob
            df_tmp['des_recall'] = recall

            # DataFrame.append was removed in pandas 2.0; pd.concat is the
            # supported replacement.  The empty seed frame is skipped so it
            # cannot influence the dtypes of the result.
            df = df_tmp if df.empty else pd.concat([df, df_tmp])
            df = df.sort_index(ascending=False)
            display(df)

    return df


# OR for CLEF & TREC 0.7, 0.8, 0.9

In [None]:
# Accumulator for the oracle (OR) runs: each dataset cell below adds its results here
df_all = pd.DataFrame() # all runs in one df

In [None]:
EVALDIR = os.path.join(DIR, 'experiments_output/OR/tar_eval_out/')    # Directory for evaluation output files
# Create the directory (and any missing parents) if it does not exist yet;
# exist_ok avoids the separate existence check and its race window.
os.makedirs(EVALDIR, exist_ok=True)



### CLEF2017

In [None]:

dataset_name = 'CLEF2017'

# Relevance judgements: use the same qrel list as the ranking loaded below
qrels = "data/qrels/CLEF2017_qrels.txt"
qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

run = "data/rankings/clef2017_autotar_ranking.txt"
doc_rank_dic, rank_rel_dic = load_run_data(run)

# SET EXPERIMENTAL PARAMETERS
des_recalls = [0.7, 0.8, 0.9]  # desired recalls to experiment over
des_probs = [0.95]  # desired confidences to experiment over

df = run_experiments_OR()

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement
df_all = pd.concat([df_all, df])  # append all results together



Number of topics: 42
Confidence level:  0.95
recall: 0.7
Running oracle method


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
CLEF2017_0.95-0.7-OR,0.744,0.034,1.0,0.074,0.063,CLEF2017,0.95,0.7


recall: 0.8
Running oracle method


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
CLEF2017_0.95-0.8-OR,0.834,0.043,1.0,0.033,0.042,CLEF2017,0.95,0.8
CLEF2017_0.95-0.7-OR,0.744,0.034,1.0,0.074,0.063,CLEF2017,0.95,0.7


recall: 0.9
Running oracle method


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
CLEF2017_0.95-0.9-OR,0.923,0.057,1.0,0.009,0.026,CLEF2017,0.95,0.9
CLEF2017_0.95-0.8-OR,0.834,0.043,1.0,0.033,0.042,CLEF2017,0.95,0.8
CLEF2017_0.95-0.7-OR,0.744,0.034,1.0,0.074,0.063,CLEF2017,0.95,0.7


### CLEF2018

In [None]:
dataset_name = 'CLEF2018'

# Relevance judgements: use the same qrel list as the ranking loaded below
qrels = "data/qrels/CLEF2018_qrels.txt"
qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

run = "data/rankings/clef2018_autotar_ranking.txt"
doc_rank_dic, rank_rel_dic = load_run_data(run)

# SET EXPERIMENTAL PARAMETERS
des_recalls = [0.7, 0.8, 0.9]  # desired recalls to experiment over
des_probs = [0.95]  # desired confidences to experiment over

df = run_experiments_OR()

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement
df_all = pd.concat([df_all, df])  # append all results together



Number of topics: 30
Confidence level:  0.95
recall: 0.7
Running oracle method


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
CLEF2018_0.95-0.7-OR,0.715,0.042,1.0,0.082,0.021,CLEF2018,0.95,0.7


recall: 0.8
Running oracle method


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
CLEF2018_0.95-0.8-OR,0.812,0.051,1.0,0.037,0.015,CLEF2018,0.95,0.8
CLEF2018_0.95-0.7-OR,0.715,0.042,1.0,0.082,0.021,CLEF2018,0.95,0.7


recall: 0.9
Running oracle method


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
CLEF2018_0.95-0.9-OR,0.912,0.067,1.0,0.01,0.013,CLEF2018,0.95,0.9
CLEF2018_0.95-0.8-OR,0.812,0.051,1.0,0.037,0.015,CLEF2018,0.95,0.8
CLEF2018_0.95-0.7-OR,0.715,0.042,1.0,0.082,0.021,CLEF2018,0.95,0.7


### CLEF2019

In [None]:
dataset_name = 'CLEF2019'

# Relevance judgements: use the same qrel list as the ranking loaded below
qrels = "data/qrels/CLEF2019_qrels.txt"
qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

run = "data/rankings/clef2019_autotar_ranking.txt"
doc_rank_dic, rank_rel_dic = load_run_data(run)

# SET EXPERIMENTAL PARAMETERS
des_recalls = [0.7, 0.8, 0.9]  # desired recalls to experiment over
des_probs = [0.95]  # desired confidences to experiment over

df = run_experiments_OR()

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement
df_all = pd.concat([df_all, df])  # append all results together




Number of topics: 31
Confidence level:  0.95
recall: 0.7
Running oracle method


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
CLEF2019_0.95-0.7-OR,0.731,0.047,1.0,0.078,0.044,CLEF2019,0.95,0.7


recall: 0.8
Running oracle method


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
CLEF2019_0.95-0.8-OR,0.83,0.057,1.0,0.035,0.037,CLEF2019,0.95,0.8
CLEF2019_0.95-0.7-OR,0.731,0.047,1.0,0.078,0.044,CLEF2019,0.95,0.7


recall: 0.9
Running oracle method


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
CLEF2019_0.95-0.9-OR,0.929,0.071,1.0,0.011,0.033,CLEF2019,0.95,0.9
CLEF2019_0.95-0.8-OR,0.83,0.057,1.0,0.035,0.037,CLEF2019,0.95,0.8
CLEF2019_0.95-0.7-OR,0.731,0.047,1.0,0.078,0.044,CLEF2019,0.95,0.7


### TREC-Legal

In [None]:

dataset_name = 'Legal'

# Relevance judgements: use the same qrel list as the ranking loaded below
qrels = "data/qrels/TREC_Legal_qrels.txt"
qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

run = "data/rankings/legal_test_autotar_ranking.txt"
doc_rank_dic, rank_rel_dic = load_run_data(run)

# SET EXPERIMENTAL PARAMETERS
des_recalls = [0.7, 0.8, 0.9]  # desired recalls to experiment over
des_probs = [0.95]  # desired confidences to experiment over

df = run_experiments_OR()

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement
df_all = pd.concat([df_all, df])  # append all results together



Number of topics: 4
Confidence level:  0.95
recall: 0.7
Running oracle method


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
Legal_0.95-0.7-OR,0.7,0.019,1.0,0.09,0.0,Legal,0.95,0.7


recall: 0.8
Running oracle method


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
Legal_0.95-0.8-OR,0.801,0.026,1.0,0.04,0.001,Legal,0.95,0.8
Legal_0.95-0.7-OR,0.7,0.019,1.0,0.09,0.0,Legal,0.95,0.7


recall: 0.9
Running oracle method


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
Legal_0.95-0.9-OR,0.9,0.04,1.0,0.01,0.0,Legal,0.95,0.9
Legal_0.95-0.8-OR,0.801,0.026,1.0,0.04,0.001,Legal,0.95,0.8
Legal_0.95-0.7-OR,0.7,0.019,1.0,0.09,0.0,Legal,0.95,0.7


### TREC-TR

In [None]:

dataset_name = 'TR'

# Relevance judgements: use the same qrel list as the ranking loaded below
qrels = "data/qrels/TREC_TR_Test_qrels.txt"
qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

run = "data/rankings/tr_test_autotar_ranking.txt"
doc_rank_dic, rank_rel_dic = load_run_data(run)

# SET EXPERIMENTAL PARAMETERS
des_recalls = [0.7, 0.8, 0.9]  # desired recalls to experiment over
des_probs = [0.95]  # desired confidences to experiment over

df = run_experiments_OR()

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement
df_all = pd.concat([df_all, df])  # append all results together



Number of topics: 34
Confidence level:  0.95
recall: 0.7
Running oracle method


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
TR_0.95-0.7-OR,0.702,0.003,1.0,0.089,0.003,TR,0.95,0.7


recall: 0.8
Running oracle method


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
TR_0.95-0.8-OR,0.802,0.004,1.0,0.039,0.003,TR,0.95,0.8
TR_0.95-0.7-OR,0.702,0.003,1.0,0.089,0.003,TR,0.95,0.7


recall: 0.9
Running oracle method


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
TR_0.95-0.9-OR,0.902,0.005,1.0,0.01,0.003,TR,0.95,0.9
TR_0.95-0.8-OR,0.802,0.004,1.0,0.039,0.003,TR,0.95,0.8
TR_0.95-0.7-OR,0.702,0.003,1.0,0.089,0.003,TR,0.95,0.7


### RCV1 selected 45 / 0.2

In [None]:

dataset_name = 'RCV1'

# Relevance judgements: use the same qrel list as the ranking loaded below
qrels = "data/qrels/rcv1_qrels_selected_45_0.2.txt"
qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

run = "data/rankings/RCV1_selected_45_0.2_ranking.txt"
doc_rank_dic, rank_rel_dic = load_run_data(run)

# SET EXPERIMENTAL PARAMETERS
des_recalls = [0.7, 0.8, 0.9]  # desired recalls to experiment over
des_probs = [0.95]  # desired confidences to experiment over

df = run_experiments_OR()

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement
df_all = pd.concat([df_all, df])  # append all results together


Number of topics: 45
Confidence level:  0.95
recall: 0.7
Running oracle method


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
RCV1_0.95-0.7-OR,0.701,0.008,1.0,0.089,0.002,RCV1,0.95,0.7


recall: 0.8
Running oracle method


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
RCV1_0.95-0.8-OR,0.801,0.01,1.0,0.04,0.001,RCV1,0.95,0.8
RCV1_0.95-0.7-OR,0.701,0.008,1.0,0.089,0.002,RCV1,0.95,0.7


recall: 0.9
Running oracle method


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
RCV1_0.95-0.9-OR,0.901,0.016,1.0,0.01,0.001,RCV1,0.95,0.9
RCV1_0.95-0.8-OR,0.801,0.01,1.0,0.04,0.001,RCV1,0.95,0.8
RCV1_0.95-0.7-OR,0.701,0.008,1.0,0.089,0.002,RCV1,0.95,0.7


In [None]:
# Save all oracle results, tagging every row with the method name
df_all['Model'] = 'OR'
# os.path.join (as used for EVALDIR) does not depend on DIR ending with a slash
df_all.to_csv(os.path.join(DIR, 'experiments_output/df_all_OR.csv'))


In [None]:
# Display the accumulated oracle results
df_all

Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall,Model
CLEF2017_0.95-0.9-OR,0.923,0.057,1.0,0.009,0.026,CLEF2017,0.95,0.9,OR
CLEF2017_0.95-0.8-OR,0.834,0.043,1.0,0.033,0.042,CLEF2017,0.95,0.8,OR
CLEF2017_0.95-0.7-OR,0.744,0.034,1.0,0.074,0.063,CLEF2017,0.95,0.7,OR
CLEF2018_0.95-0.9-OR,0.912,0.067,1.0,0.01,0.013,CLEF2018,0.95,0.9,OR
CLEF2018_0.95-0.8-OR,0.812,0.051,1.0,0.037,0.015,CLEF2018,0.95,0.8,OR
CLEF2018_0.95-0.7-OR,0.715,0.042,1.0,0.082,0.021,CLEF2018,0.95,0.7,OR
CLEF2019_0.95-0.9-OR,0.929,0.071,1.0,0.011,0.033,CLEF2019,0.95,0.9,OR
CLEF2019_0.95-0.8-OR,0.83,0.057,1.0,0.035,0.037,CLEF2019,0.95,0.8,OR
CLEF2019_0.95-0.7-OR,0.731,0.047,1.0,0.078,0.044,CLEF2019,0.95,0.7,OR
Legal_0.95-0.9-OR,0.9,0.04,1.0,0.01,0.0,Legal,0.95,0.9,OR


# ******* EMNLP results *******

## IP-P

In [None]:
# Search space for the IP-P runs: inhomogeneous Poisson process ("IP") with the
# power-model rate function ("P"); every option has a single value, so the
# grid below holds exactly one parameter combination.
from sklearn.model_selection import ParameterGrid

hyperparameters_space = dict(
    point_process=['IP'],
    model=['P'],
    RMSE=[0.1],
    min_rel_in_sample=['dynamic20-sample'],
    alpha=[0.025],
    beta=[0.025],
)

grid = ParameterGrid(hyperparameters_space)

In [None]:
# Fresh accumulator for this section: each dataset cell below adds its results here
df_all = pd.DataFrame() # all runs in one df


In [None]:
EVALDIR = os.path.join(DIR, 'experiments_output/CP/tar_eval_out/')    # Directory for evaluation output files
# Create the directory (and any missing parents) if it does not exist yet;
# exist_ok avoids the separate existence check and its race window.
os.makedirs(EVALDIR, exist_ok=True)


### CLEF2017

In [None]:

dataset_name = 'CLEF2017'

# Relevance judgements: use the same qrel list as the ranking loaded below
qrels = "data/qrels/CLEF2017_qrels.txt"
qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

run = "data/rankings/clef2017_autotar_ranking.txt"
doc_rank_dic, rank_rel_dic = load_run_data(run)

# SET EXPERIMENTAL PARAMETERS
des_recalls = [0.7, 0.8, 0.9]  # desired recalls to experiment over
des_probs = [0.95]  # desired confidences to experiment over

df = run_experiments_GridSearch()

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement
df_all = pd.concat([df_all, df])  # append all results together



Number of topics: 42
CLEF2017
Confidence level:  0.95
recall: 0.7
Running-IP-P
CLEF2017_0.95-0.7-IP-P


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
CLEF2017_0.95-0.7-IP-P,1.0,0.255,1.0,0.028,0.428,CLEF2017,0.95,0.7


recall: 0.8
Running-IP-P
CLEF2017_0.95-0.8-IP-P


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
CLEF2017_0.95-0.8-IP-P,1.0,0.265,1.0,0.029,0.25,CLEF2017,0.95,0.8
CLEF2017_0.95-0.7-IP-P,1.0,0.255,1.0,0.028,0.428,CLEF2017,0.95,0.7


recall: 0.9
Running-IP-P
CLEF2017_0.95-0.9-IP-P


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
CLEF2017_0.95-0.9-IP-P,1.0,0.281,1.0,0.03,0.111,CLEF2017,0.95,0.9
CLEF2017_0.95-0.8-IP-P,1.0,0.265,1.0,0.029,0.25,CLEF2017,0.95,0.8
CLEF2017_0.95-0.7-IP-P,1.0,0.255,1.0,0.028,0.428,CLEF2017,0.95,0.7


### CLEF2018

In [None]:

dataset_name = 'CLEF2018'

# Relevance judgements: use the same qrel list as the ranking loaded below
qrels = "data/qrels/CLEF2018_qrels.txt"
qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

run = "data/rankings/clef2018_autotar_ranking.txt"
doc_rank_dic, rank_rel_dic = load_run_data(run)

# SET EXPERIMENTAL PARAMETERS
des_recalls = [0.7, 0.8, 0.9]  # desired recalls to experiment over
des_probs = [0.95]  # desired confidences to experiment over

df = run_experiments_GridSearch()

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement
df_all = pd.concat([df_all, df])  # append all results together




Number of topics: 30
CLEF2018
Confidence level:  0.95
recall: 0.7
Running-IP-P
CLEF2018_0.95-0.7-IP-P


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
CLEF2018_0.95-0.7-IP-P,1.0,0.277,1.0,0.021,0.428,CLEF2018,0.95,0.7


recall: 0.8
Running-IP-P
CLEF2018_0.95-0.8-IP-P


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
CLEF2018_0.95-0.8-IP-P,1.0,0.287,1.0,0.023,0.25,CLEF2018,0.95,0.8
CLEF2018_0.95-0.7-IP-P,1.0,0.277,1.0,0.021,0.428,CLEF2018,0.95,0.7


recall: 0.9
Running-IP-P
CLEF2018_0.95-0.9-IP-P


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
CLEF2018_0.95-0.9-IP-P,1.0,0.293,1.0,0.024,0.111,CLEF2018,0.95,0.9
CLEF2018_0.95-0.8-IP-P,1.0,0.287,1.0,0.023,0.25,CLEF2018,0.95,0.8
CLEF2018_0.95-0.7-IP-P,1.0,0.277,1.0,0.021,0.428,CLEF2018,0.95,0.7


### CLEF2019

In [None]:

dataset_name = 'CLEF2019'

# Relevance judgements: use the same qrel list as the ranking loaded below
qrels = "data/qrels/CLEF2019_qrels.txt"
qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

run = "data/rankings/clef2019_autotar_ranking.txt"
doc_rank_dic, rank_rel_dic = load_run_data(run)

# SET EXPERIMENTAL PARAMETERS
des_recalls = [0.7, 0.8, 0.9]  # desired recalls to experiment over
des_probs = [0.95]  # desired confidences to experiment over

df = run_experiments_GridSearch()

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement
df_all = pd.concat([df_all, df])  # append all results together


Number of topics: 31
CLEF2019
Confidence level:  0.95
recall: 0.7
Running-IP-P
CLEF2019_0.95-0.7-IP-P


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
CLEF2019_0.95-0.7-IP-P,0.999,0.276,1.0,0.046,0.427,CLEF2019,0.95,0.7


recall: 0.8
Running-IP-P
CLEF2019_0.95-0.8-IP-P


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
CLEF2019_0.95-0.8-IP-P,0.999,0.279,1.0,0.046,0.249,CLEF2019,0.95,0.8
CLEF2019_0.95-0.7-IP-P,0.999,0.276,1.0,0.046,0.427,CLEF2019,0.95,0.7


recall: 0.9
Running-IP-P
CLEF2019_0.95-0.9-IP-P


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
CLEF2019_0.95-0.9-IP-P,0.999,0.283,1.0,0.047,0.11,CLEF2019,0.95,0.9
CLEF2019_0.95-0.8-IP-P,0.999,0.279,1.0,0.046,0.249,CLEF2019,0.95,0.8
CLEF2019_0.95-0.7-IP-P,0.999,0.276,1.0,0.046,0.427,CLEF2019,0.95,0.7


### TREC-Legal

In [None]:

dataset_name = 'Legal'

# Relevance judgements: use the same qrel list as the ranking loaded below
qrels = "data/qrels/TREC_Legal_qrels.txt"
qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

run = "data/rankings/legal_test_autotar_ranking.txt"
doc_rank_dic, rank_rel_dic = load_run_data(run)

# SET EXPERIMENTAL PARAMETERS
des_recalls = [0.7, 0.8, 0.9]  # desired recalls to experiment over
des_probs = [0.95]  # desired confidences to experiment over

df = run_experiments_GridSearch()

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement
df_all = pd.concat([df_all, df])  # append all results together


Number of topics: 4
Legal
Confidence level:  0.95
recall: 0.7
Running-IP-P
Legal_0.95-0.7-IP-P


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
Legal_0.95-0.7-IP-P,1.0,0.287,1.0,0.001,0.428,Legal,0.95,0.7


recall: 0.8
Running-IP-P
Legal_0.95-0.8-IP-P


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
Legal_0.95-0.8-IP-P,1.0,0.338,1.0,0.001,0.249,Legal,0.95,0.8
Legal_0.95-0.7-IP-P,1.0,0.287,1.0,0.001,0.428,Legal,0.95,0.7


recall: 0.9
Running-IP-P
Legal_0.95-0.9-IP-P


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
Legal_0.95-0.9-IP-P,1.0,0.425,1.0,0.002,0.111,Legal,0.95,0.9
Legal_0.95-0.8-IP-P,1.0,0.338,1.0,0.001,0.249,Legal,0.95,0.8
Legal_0.95-0.7-IP-P,1.0,0.287,1.0,0.001,0.428,Legal,0.95,0.7


### TREC-TR

In [None]:

# Run the IP-P experiments on TREC-TR and add the results to df_all.
# NOTE(review): run_experiments_GridSearch() takes no arguments, so it presumably
# reads the module-level names assigned in this cell — confirm against its definition.
dataset_name = 'TR'

# relevance judgements (use the same qrel list as their rankings)
qrels = "data/qrels/TREC_TR_Test_qrels.txt"
qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

# ranking file for this dataset
run = "data/rankings/tr_test_autotar_ranking.txt"
doc_rank_dic, rank_rel_dic = load_run_data(run)

# SET EXPERIMENTAL PARAMETERS
des_recalls = [0.7, 0.8, 0.9]  # desired recalls to experiment over
des_probs = [0.95]  # desired confidences to experiment over

df = run_experiments_GridSearch()

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call
df_all = pd.concat([df_all, df])  # append all results together


Number of topics: 34
TR
Confidence level:  0.95
recall: 0.7
Running-IP-P
TR_0.95-0.7-IP-P


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
TR_0.95-0.7-IP-P,1.0,0.052,1.0,0.0,0.428,TR,0.95,0.7


recall: 0.8
Running-IP-P
TR_0.95-0.8-IP-P


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
TR_0.95-0.8-IP-P,1.0,0.056,1.0,0.0,0.25,TR,0.95,0.8
TR_0.95-0.7-IP-P,1.0,0.052,1.0,0.0,0.428,TR,0.95,0.7


recall: 0.9
Running-IP-P
TR_0.95-0.9-IP-P


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
TR_0.95-0.9-IP-P,1.0,0.059,1.0,0.0,0.111,TR,0.95,0.9
TR_0.95-0.8-IP-P,1.0,0.056,1.0,0.0,0.25,TR,0.95,0.8
TR_0.95-0.7-IP-P,1.0,0.052,1.0,0.0,0.428,TR,0.95,0.7


### RCV1 - Selected 45 cat / 0.2 sample

In [None]:

# Run the IP-P experiments on RCV1 (selected 45 categories, 0.2 sample) and add the results to df_all.
# NOTE(review): run_experiments_GridSearch() takes no arguments, so it presumably
# reads the module-level names assigned in this cell — confirm against its definition.
dataset_name = 'RCV1'

# relevance judgements (use the same qrel list as their rankings)
qrels = "data/qrels/rcv1_qrels_selected_45_0.2.txt"
qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

# ranking file for this dataset
run = "data/rankings/RCV1_selected_45_0.2_ranking.txt"
doc_rank_dic, rank_rel_dic = load_run_data(run)

# SET EXPERIMENTAL PARAMETERS
des_recalls = [0.7, 0.8, 0.9]  # desired recalls to experiment over
des_probs = [0.95]  # desired confidences to experiment over

df = run_experiments_GridSearch()

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call
df_all = pd.concat([df_all, df])  # append all results together


Number of topics: 45
RCV1
Confidence level:  0.95
recall: 0.7
Running-IP-P
RCV1_0.95-0.7-IP-P


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
RCV1_0.95-0.7-IP-P,0.998,0.134,1.0,0.0,0.425,RCV1,0.95,0.7


recall: 0.8
Running-IP-P
RCV1_0.95-0.8-IP-P


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
RCV1_0.95-0.8-IP-P,0.998,0.154,1.0,0.0,0.248,RCV1,0.95,0.8
RCV1_0.95-0.7-IP-P,0.998,0.134,1.0,0.0,0.425,RCV1,0.95,0.7


recall: 0.9
Running-IP-P
RCV1_0.95-0.9-IP-P


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
RCV1_0.95-0.9-IP-P,0.999,0.193,1.0,0.001,0.11,RCV1,0.95,0.9
RCV1_0.95-0.8-IP-P,0.998,0.154,1.0,0.0,0.248,RCV1,0.95,0.8
RCV1_0.95-0.7-IP-P,0.998,0.134,1.0,0.0,0.425,RCV1,0.95,0.7


In [None]:
# save all: tag every accumulated row with this section's model name, then write the table to CSV
df_all['Model'] = 'CP'
# os.path.join yields a valid path whether or not DIR ends with a separator
df_all.to_csv(os.path.join(DIR, 'experiments_output/df_all_cp.csv'))

In [None]:
# display the accumulated results table (the cell's last expression is rendered as its output)
df_all

Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall,Model
CLEF2017_0.95-0.9-IP-P,1.0,0.281,1.0,0.03,0.111,CLEF2017,0.95,0.9,CP
CLEF2017_0.95-0.8-IP-P,1.0,0.265,1.0,0.029,0.25,CLEF2017,0.95,0.8,CP
CLEF2017_0.95-0.7-IP-P,1.0,0.255,1.0,0.028,0.428,CLEF2017,0.95,0.7,CP
CLEF2018_0.95-0.9-IP-P,1.0,0.293,1.0,0.024,0.111,CLEF2018,0.95,0.9,CP
CLEF2018_0.95-0.8-IP-P,1.0,0.287,1.0,0.023,0.25,CLEF2018,0.95,0.8,CP
CLEF2018_0.95-0.7-IP-P,1.0,0.277,1.0,0.021,0.428,CLEF2018,0.95,0.7,CP
CLEF2019_0.95-0.9-IP-P,0.999,0.283,1.0,0.047,0.11,CLEF2019,0.95,0.9,CP
CLEF2019_0.95-0.8-IP-P,0.999,0.279,1.0,0.046,0.249,CLEF2019,0.95,0.8,CP
CLEF2019_0.95-0.7-IP-P,0.999,0.276,1.0,0.046,0.427,CLEF2019,0.95,0.7,CP
Legal_0.95-0.9-IP-P,1.0,0.425,1.0,0.002,0.111,Legal,0.95,0.9,CP


## IP-P-CLF

In [None]:
from sklearn.model_selection import ParameterGrid

# NOTE(review): 'xx' looks like a placeholder value — confirm how it is consumed downstream
min_rel_in_sample_type = 'xx'

# Search space for this section. Every hyper-parameter lists exactly one
# candidate value, so the resulting grid holds a single configuration.
hyperparameters_space = dict(
    point_process=['IP'],
    model=['P'],
    RMSE=[0.1],
    min_rel_in_sample=['dynamic20-sample'],
    alpha=[0.025],
    beta=[0.025],
)

grid = ParameterGrid(hyperparameters_space)

In [None]:
# start this section with an empty accumulator: all runs are collected in one df
df_all = pd.DataFrame()
# imbalance-handling mode for this section
# NOTE(review): presumably read as a global by run_experiments_clf — confirm
dataset_imbalance_handle = 'cost_sensitive_manual'

In [None]:
# Directory for evaluation output files
EVALDIR = os.path.join(DIR, 'experiments_output/CP_ClassLabel/tar_eval_out/')
# Create the directory (and any missing parents) if it does not exist yet;
# exist_ok=True replaces the separate existence check and its check-then-create race.
os.makedirs(EVALDIR, exist_ok=True)


### CLEF2017

In [None]:

# Run the IP-P + classifier ('LR') experiments on CLEF2017 and add the results to df_all.
# NOTE(review): run_experiments_clf() receives only the classifier name, so it presumably
# reads the module-level names assigned in this cell — confirm against its definition.
dataset_name = 'CLEF2017'

# relevance judgements (use the same qrel list as their rankings)
qrels = "data/qrels/CLEF2017_qrels.txt"
qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

# ranking file for this dataset, loaded together with the document text
run = "data/rankings/clef2017_autotar_ranking.txt"
doc_rank_dic, rank_rel_dic, rank_text_dic = load_run_data_with_text(run, dataset_name)

# SET EXPERIMENTAL PARAMETERS
des_recalls = [0.7, 0.8, 0.9]  # desired recalls to experiment over
des_probs = [0.95]  # desired confidences to experiment over

df = run_experiments_clf('LR')

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call
df_all = pd.concat([df_all, df])  # append all results together



Number of topics: 42
CD008081
CD007394
CD007427
CD008054
CD008643
CD008782
CD009020
CD009135
CD009519
CD009551
CD009579
CD009591
CD009647
CD009786
CD010023
CD010173
CD010276
CD010339
CD010386
CD010409
CD010438
CD010632
CD010633
CD010653
CD010705
CD011134
CD011549
CD011975
CD011984
CD012019
CLEF2017
Confidence level:  0.95
recall: 0.7
Running-IP-P
CLEF2017_0.95-0.7-IP-P-LR


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
CLEF2017_0.95-0.7-IP-P-LR,0.988,0.15,1.0,0.018,0.412,CLEF2017,0.95,0.7


recall: 0.8
Running-IP-P
CLEF2017_0.95-0.8-IP-P-LR


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
CLEF2017_0.95-0.8-IP-P-LR,0.989,0.152,1.0,0.019,0.236,CLEF2017,0.95,0.8
CLEF2017_0.95-0.7-IP-P-LR,0.988,0.15,1.0,0.018,0.412,CLEF2017,0.95,0.7


recall: 0.9
Running-IP-P
CLEF2017_0.95-0.9-IP-P-LR


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
CLEF2017_0.95-0.9-IP-P-LR,0.989,0.153,1.0,0.019,0.098,CLEF2017,0.95,0.9
CLEF2017_0.95-0.8-IP-P-LR,0.989,0.152,1.0,0.019,0.236,CLEF2017,0.95,0.8
CLEF2017_0.95-0.7-IP-P-LR,0.988,0.15,1.0,0.018,0.412,CLEF2017,0.95,0.7


### CLEF2018

In [None]:

# Run the IP-P + classifier ('LR') experiments on CLEF2018 and add the results to df_all.
# NOTE(review): run_experiments_clf() receives only the classifier name, so it presumably
# reads the module-level names assigned in this cell — confirm against its definition.
dataset_name = 'CLEF2018'

# relevance judgements (use the same qrel list as their rankings)
qrels = "data/qrels/CLEF2018_qrels.txt"
qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

# ranking file for this dataset, loaded together with the document text
run = "data/rankings/clef2018_autotar_ranking.txt"
doc_rank_dic, rank_rel_dic, rank_text_dic = load_run_data_with_text(run, dataset_name)

# SET EXPERIMENTAL PARAMETERS
des_recalls = [0.7, 0.8, 0.9]  # desired recalls to experiment over
des_probs = [0.95]  # desired confidences to experiment over

df = run_experiments_clf('LR')

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call
df_all = pd.concat([df_all, df])  # append all results together




Number of topics: 30
CD008122
CD008587
CD008759
CD008892
CD009175
CD009263
CD009694
CD010213
CD010296
CD010502
CD010657
CD010680
CD010864
CD011053
CD011126
CD011420
CD011431
CD011515
CD011602
CD011686
CD011912
CD011926
CD012009
CD012010
CD012083
CD012165
CD012179
CD012216
CD012281
CD012599
CLEF2018
Confidence level:  0.95
recall: 0.7
Running-IP-P
CLEF2018_0.95-0.7-IP-P-LR


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
CLEF2018_0.95-0.7-IP-P-LR,0.982,0.135,1.0,0.01,0.402,CLEF2018,0.95,0.7


recall: 0.8
Running-IP-P
CLEF2018_0.95-0.8-IP-P-LR


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
CLEF2018_0.95-0.8-IP-P-LR,0.983,0.137,1.0,0.01,0.229,CLEF2018,0.95,0.8
CLEF2018_0.95-0.7-IP-P-LR,0.982,0.135,1.0,0.01,0.402,CLEF2018,0.95,0.7


recall: 0.9
Running-IP-P
CLEF2018_0.95-0.9-IP-P-LR


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
CLEF2018_0.95-0.9-IP-P-LR,0.983,0.137,1.0,0.01,0.092,CLEF2018,0.95,0.9
CLEF2018_0.95-0.8-IP-P-LR,0.983,0.137,1.0,0.01,0.229,CLEF2018,0.95,0.8
CLEF2018_0.95-0.7-IP-P-LR,0.982,0.135,1.0,0.01,0.402,CLEF2018,0.95,0.7


### CLEF2019

In [None]:


# Run the IP-P + classifier ('LR') experiments on CLEF2019 and add the results to df_all.
# NOTE(review): run_experiments_clf() receives only the classifier name, so it presumably
# reads the module-level names assigned in this cell — confirm against its definition.
dataset_name = 'CLEF2019'

# relevance judgements (use the same qrel list as their rankings)
qrels = "data/qrels/CLEF2019_qrels.txt"
qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

# ranking file for this dataset, loaded together with the document text
run = "data/rankings/clef2019_autotar_ranking.txt"
doc_rank_dic, rank_rel_dic, rank_text_dic = load_run_data_with_text(run, dataset_name)

# SET EXPERIMENTAL PARAMETERS
des_recalls = [0.7, 0.8, 0.9]  # desired recalls to experiment over
des_probs = [0.95]  # desired confidences to experiment over

df = run_experiments_clf('LR')

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call
df_all = pd.concat([df_all, df])  # append all results together



Number of topics: 31
CD006468
CD000996
CD001261
CD004414
CD007867
CD009069
CD009642
CD010038
CD010239
CD010558
CD010753
CD011140
CD011571
CD011768
CD011977
CD012069
CD012164
CD012342
CD012455
CD012551
CD012661
CD011558
CD011787
CD008874
CD009044
CD011686
CD012080
CD012233
CD012567
CD012669
CD012768
CLEF2019
Confidence level:  0.95
recall: 0.7
Running-IP-P
CLEF2019_0.95-0.7-IP-P-LR


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
CLEF2019_0.95-0.7-IP-P-LR,0.996,0.212,1.0,0.037,0.423,CLEF2019,0.95,0.7


recall: 0.8
Running-IP-P
CLEF2019_0.95-0.8-IP-P-LR


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
CLEF2019_0.95-0.8-IP-P-LR,0.996,0.216,1.0,0.037,0.245,CLEF2019,0.95,0.8
CLEF2019_0.95-0.7-IP-P-LR,0.996,0.212,1.0,0.037,0.423,CLEF2019,0.95,0.7


recall: 0.9
Running-IP-P
CLEF2019_0.95-0.9-IP-P-LR


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
CLEF2019_0.95-0.9-IP-P-LR,0.996,0.221,1.0,0.038,0.107,CLEF2019,0.95,0.9
CLEF2019_0.95-0.8-IP-P-LR,0.996,0.216,1.0,0.037,0.245,CLEF2019,0.95,0.8
CLEF2019_0.95-0.7-IP-P-LR,0.996,0.212,1.0,0.037,0.423,CLEF2019,0.95,0.7


### TREC-Legal

In [None]:

# Run the IP-P + classifier ('LR') experiments on TREC-Legal and add the results to df_all.
# NOTE(review): run_experiments_clf() receives only the classifier name, so it presumably
# reads the module-level names assigned in this cell — confirm against its definition.
dataset_name = 'Legal'

# relevance judgements (use the same qrel list as their rankings)
qrels = "data/qrels/TREC_Legal_qrels.txt"
qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

# ranking file for this dataset, loaded together with the document text
run = "data/rankings/legal_test_autotar_ranking.txt"
doc_rank_dic, rank_rel_dic, rank_text_dic = load_run_data_with_text(run, dataset_name)

# SET EXPERIMENTAL PARAMETERS
des_recalls = [0.7, 0.8, 0.9]  # desired recalls to experiment over
des_probs = [0.95]  # desired confidences to experiment over

df = run_experiments_clf('LR')

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call
df_all = pd.concat([df_all, df])  # append all results together



Number of topics: 4
303
304
Legal
Confidence level:  0.95
recall: 0.7
Running-IP-P
Legal_0.95-0.7-IP-P-LR


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
Legal_0.95-0.7-IP-P-LR,0.972,0.088,1.0,0.001,0.389,Legal,0.95,0.7


recall: 0.8
Running-IP-P
Legal_0.95-0.8-IP-P-LR


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
Legal_0.95-0.8-IP-P-LR,0.972,0.088,1.0,0.001,0.215,Legal,0.95,0.8
Legal_0.95-0.7-IP-P-LR,0.972,0.088,1.0,0.001,0.389,Legal,0.95,0.7


recall: 0.9
Running-IP-P
Legal_0.95-0.9-IP-P-LR


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
Legal_0.95-0.9-IP-P-LR,0.972,0.088,1.0,0.001,0.08,Legal,0.95,0.9
Legal_0.95-0.8-IP-P-LR,0.972,0.088,1.0,0.001,0.215,Legal,0.95,0.8
Legal_0.95-0.7-IP-P-LR,0.972,0.088,1.0,0.001,0.389,Legal,0.95,0.7


### TREC-TR

In [None]:
# Run the IP-P + classifier ('LR') experiments on TREC-TR and add the results to df_all.
# NOTE(review): run_experiments_clf() receives only the classifier name, so it presumably
# reads the module-level names assigned in this cell — confirm against its definition.
dataset_name = 'TR'

# relevance judgements (use the same qrel list as their rankings)
qrels = "data/qrels/TREC_TR_Test_qrels.txt"
qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

# ranking file for this dataset, loaded together with the document text
run = "data/rankings/tr_test_autotar_ranking.txt"
doc_rank_dic, rank_rel_dic, rank_text_dic = load_run_data_with_text(run, dataset_name)

# SET EXPERIMENTAL PARAMETERS
des_recalls = [0.7, 0.8, 0.9]  # desired recalls to experiment over
des_probs = [0.95]  # desired confidences to experiment over

df = run_experiments_clf('LR')

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call
df_all = pd.concat([df_all, df])  # append all results together


Number of topics: 34
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
TR
Confidence level:  0.95
recall: 0.7
Running-IP-P
TR_0.95-0.7-IP-P-LR


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
TR_0.95-0.7-IP-P-LR,0.999,0.027,1.0,0.0,0.427,TR,0.95,0.7


recall: 0.8
Running-IP-P
TR_0.95-0.8-IP-P-LR


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
TR_0.95-0.8-IP-P-LR,0.999,0.027,1.0,0.0,0.248,TR,0.95,0.8
TR_0.95-0.7-IP-P-LR,0.999,0.027,1.0,0.0,0.427,TR,0.95,0.7


recall: 0.9
Running-IP-P
TR_0.95-0.9-IP-P-LR


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
TR_0.95-0.9-IP-P-LR,0.999,0.028,1.0,0.0,0.11,TR,0.95,0.9
TR_0.95-0.8-IP-P-LR,0.999,0.027,1.0,0.0,0.248,TR,0.95,0.8
TR_0.95-0.7-IP-P-LR,0.999,0.027,1.0,0.0,0.427,TR,0.95,0.7


### RCV1 - Selected 45 cat / 0.2 sample

In [None]:

# Run the IP-P + classifier ('LR') experiments on RCV1 (selected 45 categories, 0.2 sample)
# and add the results to df_all.
# NOTE(review): run_experiments_clf() receives only the classifier name, so it presumably
# reads the module-level names assigned in this cell — confirm against its definition.
dataset_name = 'RCV1'
# Overrides the imbalance-handling mode for this dataset. The assignment is global,
# so the value persists for any cell executed after this one.
dataset_imbalance_handle = 'na'

# relevance judgements (use the same qrel list as their rankings)
qrels = "data/qrels/rcv1_qrels_selected_45_0.2.txt"
qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

# ranking file for this dataset, loaded together with the document text
run = "data/rankings/RCV1_selected_45_0.2_ranking.txt"
doc_rank_dic, rank_rel_dic, rank_text_dic = load_run_data_with_text(run, dataset_name)

# SET EXPERIMENTAL PARAMETERS
des_recalls = [0.7, 0.8, 0.9]  # desired recalls to experiment over
des_probs = [0.95]  # desired confidences to experiment over

df = run_experiments_clf('LR')

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call
df_all = pd.concat([df_all, df])  # append all results together



Number of topics: 45
ALG
ASIA
BELG
BUL
BURMA
C12
C182
C33
CUBA
CZREP
E12
E513
EEC
FIN
GENV
GPRO
GUAT
I1300003
I14000
I21000
I22100
I24000
I32200
I32830
I3302021
I35102
I36400
I41000
I41300
I42600
I42900
I65100
I81501
I8150211
I81502
I82000
I83100
INDON
ISRAEL
MCDNIA
MEX
POL
RWANDA
SWED
TAIWAN
RCV1
Confidence level:  0.95
recall: 0.7
Running-IP-P
RCV1_0.95-0.7-IP-P-LR


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
RCV1_0.95-0.7-IP-P-LR,0.969,0.036,1.0,0.002,0.385,RCV1,0.95,0.7


recall: 0.8
Running-IP-P
RCV1_0.95-0.8-IP-P-LR


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
RCV1_0.95-0.8-IP-P-LR,0.969,0.036,1.0,0.002,0.212,RCV1,0.95,0.8
RCV1_0.95-0.7-IP-P-LR,0.969,0.036,1.0,0.002,0.385,RCV1,0.95,0.7


recall: 0.9
Running-IP-P
RCV1_0.95-0.9-IP-P-LR


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
RCV1_0.95-0.9-IP-P-LR,0.972,0.038,0.956,0.002,0.082,RCV1,0.95,0.9
RCV1_0.95-0.8-IP-P-LR,0.969,0.036,1.0,0.002,0.212,RCV1,0.95,0.8
RCV1_0.95-0.7-IP-P-LR,0.969,0.036,1.0,0.002,0.385,RCV1,0.95,0.7


In [None]:
# save all: tag every accumulated row with this section's model name, then write the table to CSV
df_all['Model'] = 'CP_ClassLabel'
# os.path.join yields a valid path whether or not DIR ends with a separator
df_all.to_csv(os.path.join(DIR, 'experiments_output/df_all_cp_lbl.csv'))

In [None]:
# display the accumulated results table (the cell's last expression is rendered as its output)
df_all

Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall,Model
CLEF2017_0.95-0.9-IP-P-LR,0.989,0.153,1.0,0.019,0.098,CLEF2017,0.95,0.9,CP_ClassLabel
CLEF2017_0.95-0.8-IP-P-LR,0.989,0.152,1.0,0.019,0.236,CLEF2017,0.95,0.8,CP_ClassLabel
CLEF2017_0.95-0.7-IP-P-LR,0.988,0.15,1.0,0.018,0.412,CLEF2017,0.95,0.7,CP_ClassLabel
CLEF2018_0.95-0.9-IP-P-LR,0.983,0.137,1.0,0.01,0.092,CLEF2018,0.95,0.9,CP_ClassLabel
CLEF2018_0.95-0.8-IP-P-LR,0.983,0.137,1.0,0.01,0.229,CLEF2018,0.95,0.8,CP_ClassLabel
CLEF2018_0.95-0.7-IP-P-LR,0.982,0.135,1.0,0.01,0.402,CLEF2018,0.95,0.7,CP_ClassLabel
CLEF2019_0.95-0.9-IP-P-LR,0.996,0.221,1.0,0.038,0.107,CLEF2019,0.95,0.9,CP_ClassLabel
CLEF2019_0.95-0.8-IP-P-LR,0.996,0.216,1.0,0.037,0.245,CLEF2019,0.95,0.8,CP_ClassLabel
CLEF2019_0.95-0.7-IP-P-LR,0.996,0.212,1.0,0.037,0.423,CLEF2019,0.95,0.7,CP_ClassLabel
Legal_0.95-0.9-IP-P-LR,0.972,0.088,1.0,0.001,0.08,Legal,0.95,0.9,CP_ClassLabel


## IP-P-CLF-SCR-THRSH

In [None]:
from sklearn.model_selection import ParameterGrid

# NOTE(review): 'xx' looks like a placeholder value — confirm how it is consumed downstream
min_rel_in_sample_type = 'xx'

# Search space for this section. Every hyper-parameter lists exactly one
# candidate value, so the resulting grid holds a single configuration.
hyperparameters_space = dict(
    point_process=['IP'],
    model=['P'],
    RMSE=[0.1],
    min_rel_in_sample=['dynamic20-sample'],
    alpha=[0.025],
    beta=[0.025],
)

grid = ParameterGrid(hyperparameters_space)

In [None]:
# start this section with an empty accumulator: all runs are collected in one df
df_all = pd.DataFrame()
# imbalance-handling mode for this section
# NOTE(review): presumably read as a global by run_experiments_clf — confirm
dataset_imbalance_handle = 'cost_sensitive_manual'

In [None]:
# Directory for evaluation output files
EVALDIR = os.path.join(DIR, 'experiments_output/CP_CLassScore/tar_eval_out/')
# Create the directory (and any missing parents) if it does not exist yet;
# exist_ok=True replaces the separate existence check and its check-then-create race.
os.makedirs(EVALDIR, exist_ok=True)


### CLEF2017

In [None]:

# Run the IP-P + classifier ('LR_scores_threshold') experiments on CLEF2017 and add the results to df_all.
# NOTE(review): run_experiments_clf() receives only the classifier name, so it presumably
# reads the module-level names assigned in this cell — confirm against its definition.
dataset_name = 'CLEF2017'

# relevance judgements (use the same qrel list as their rankings)
qrels = "data/qrels/CLEF2017_qrels.txt"
qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

# ranking file for this dataset, loaded together with the document text
run = "data/rankings/clef2017_autotar_ranking.txt"
doc_rank_dic, rank_rel_dic, rank_text_dic = load_run_data_with_text(run, dataset_name)

# SET EXPERIMENTAL PARAMETERS
des_recalls = [0.7, 0.8, 0.9]  # desired recalls to experiment over
des_probs = [0.95]  # desired confidences to experiment over

df = run_experiments_clf('LR_scores_threshold')

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call
df_all = pd.concat([df_all, df])  # append all results together




Number of topics: 42
CD008081
CD007394
CD007427
CD008054
CD008643
CD008782
CD009020
CD009135
CD009519
CD009551
CD009579
CD009591
CD009647
CD009786
CD010023
CD010173
CD010276
CD010339
CD010386
CD010409
CD010438
CD010632
CD010633
CD010653
CD010705
CD011134
CD011549
CD011975
CD011984
CD012019
CLEF2017
Confidence level:  0.95
recall: 0.7
Running-IP-P
CLEF2017_0.95-0.7-IP-P-LR_scores_threshold


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
CLEF2017_0.95-0.7-IP-P-LR_scores_threshold,0.988,0.147,1.0,0.018,0.411,CLEF2017,0.95,0.7


recall: 0.8
Running-IP-P
CLEF2017_0.95-0.8-IP-P-LR_scores_threshold


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
CLEF2017_0.95-0.8-IP-P-LR_scores_threshold,0.989,0.152,1.0,0.019,0.236,CLEF2017,0.95,0.8
CLEF2017_0.95-0.7-IP-P-LR_scores_threshold,0.988,0.147,1.0,0.018,0.411,CLEF2017,0.95,0.7


recall: 0.9
Running-IP-P
CLEF2017_0.95-0.9-IP-P-LR_scores_threshold


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
CLEF2017_0.95-0.9-IP-P-LR_scores_threshold,0.989,0.152,1.0,0.019,0.098,CLEF2017,0.95,0.9
CLEF2017_0.95-0.8-IP-P-LR_scores_threshold,0.989,0.152,1.0,0.019,0.236,CLEF2017,0.95,0.8
CLEF2017_0.95-0.7-IP-P-LR_scores_threshold,0.988,0.147,1.0,0.018,0.411,CLEF2017,0.95,0.7


### CLEF2018

In [None]:

# Run the IP-P + classifier ('LR_scores_threshold') experiments on CLEF2018 and add the results to df_all.
# NOTE(review): run_experiments_clf() receives only the classifier name, so it presumably
# reads the module-level names assigned in this cell — confirm against its definition.
dataset_name = 'CLEF2018'

# relevance judgements (use the same qrel list as their rankings)
qrels = "data/qrels/CLEF2018_qrels.txt"
qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

# ranking file for this dataset, loaded together with the document text
run = "data/rankings/clef2018_autotar_ranking.txt"
doc_rank_dic, rank_rel_dic, rank_text_dic = load_run_data_with_text(run, dataset_name)

# SET EXPERIMENTAL PARAMETERS
des_recalls = [0.7, 0.8, 0.9]  # desired recalls to experiment over
des_probs = [0.95]  # desired confidences to experiment over

df = run_experiments_clf('LR_scores_threshold')

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call
df_all = pd.concat([df_all, df])  # append all results together




Number of topics: 30
CD008122
CD008587
CD008759
CD008892
CD009175
CD009263
CD009694
CD010213
CD010296
CD010502
CD010657
CD010680
CD010864
CD011053
CD011126
CD011420
CD011431
CD011515
CD011602
CD011686
CD011912
CD011926
CD012009
CD012010
CD012083
CD012165
CD012179
CD012216
CD012281
CD012599
CLEF2018
Confidence level:  0.95
recall: 0.7
Running-IP-P
CLEF2018_0.95-0.7-IP-P-LR_scores_threshold


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
CLEF2018_0.95-0.7-IP-P-LR_scores_threshold,0.981,0.134,1.0,0.01,0.402,CLEF2018,0.95,0.7


recall: 0.8
Running-IP-P
CLEF2018_0.95-0.8-IP-P-LR_scores_threshold


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
CLEF2018_0.95-0.8-IP-P-LR_scores_threshold,0.982,0.136,1.0,0.01,0.228,CLEF2018,0.95,0.8
CLEF2018_0.95-0.7-IP-P-LR_scores_threshold,0.981,0.134,1.0,0.01,0.402,CLEF2018,0.95,0.7


recall: 0.9
Running-IP-P
CLEF2018_0.95-0.9-IP-P-LR_scores_threshold


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
CLEF2018_0.95-0.9-IP-P-LR_scores_threshold,0.983,0.137,1.0,0.01,0.092,CLEF2018,0.95,0.9
CLEF2018_0.95-0.8-IP-P-LR_scores_threshold,0.982,0.136,1.0,0.01,0.228,CLEF2018,0.95,0.8
CLEF2018_0.95-0.7-IP-P-LR_scores_threshold,0.981,0.134,1.0,0.01,0.402,CLEF2018,0.95,0.7


### CLEF2019

In [None]:

# Run the IP-P + classifier ('LR_scores_threshold') experiments on CLEF2019 and add the results to df_all.
# NOTE(review): run_experiments_clf() receives only the classifier name, so it presumably
# reads the module-level names assigned in this cell — confirm against its definition.
dataset_name = 'CLEF2019'

# relevance judgements (use the same qrel list as their rankings)
qrels = "data/qrels/CLEF2019_qrels.txt"
qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

# ranking file for this dataset, loaded together with the document text
run = "data/rankings/clef2019_autotar_ranking.txt"
doc_rank_dic, rank_rel_dic, rank_text_dic = load_run_data_with_text(run, dataset_name)

# SET EXPERIMENTAL PARAMETERS
des_recalls = [0.7, 0.8, 0.9]  # desired recalls to experiment over
des_probs = [0.95]  # desired confidences to experiment over

df = run_experiments_clf('LR_scores_threshold')

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call
df_all = pd.concat([df_all, df])  # append all results together



Number of topics: 31
CD006468
CD000996
CD001261
CD004414
CD007867
CD009069
CD009642
CD010038
CD010239
CD010558
CD010753
CD011140
CD011571
CD011768
CD011977
CD012069
CD012164
CD012342
CD012455
CD012551
CD012661
CD011558
CD011787
CD008874
CD009044
CD011686
CD012080
CD012233
CD012567
CD012669
CD012768
CLEF2019
Confidence level:  0.95
recall: 0.7
Running-IP-P
CLEF2019_0.95-0.7-IP-P-LR_scores_threshold


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
CLEF2019_0.95-0.7-IP-P-LR_scores_threshold,0.994,0.207,1.0,0.036,0.42,CLEF2019,0.95,0.7


recall: 0.8
Running-IP-P
CLEF2019_0.95-0.8-IP-P-LR_scores_threshold


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
CLEF2019_0.95-0.8-IP-P-LR_scores_threshold,0.996,0.213,1.0,0.037,0.245,CLEF2019,0.95,0.8
CLEF2019_0.95-0.7-IP-P-LR_scores_threshold,0.994,0.207,1.0,0.036,0.42,CLEF2019,0.95,0.7


recall: 0.9
Running-IP-P
CLEF2019_0.95-0.9-IP-P-LR_scores_threshold


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
CLEF2019_0.95-0.9-IP-P-LR_scores_threshold,0.996,0.221,1.0,0.038,0.107,CLEF2019,0.95,0.9
CLEF2019_0.95-0.8-IP-P-LR_scores_threshold,0.996,0.213,1.0,0.037,0.245,CLEF2019,0.95,0.8
CLEF2019_0.95-0.7-IP-P-LR_scores_threshold,0.994,0.207,1.0,0.036,0.42,CLEF2019,0.95,0.7


### TREC-Legal

In [None]:

# Run the IP-P + classifier ('LR_scores_threshold') experiments on TREC-Legal and add the results to df_all.
# NOTE(review): run_experiments_clf() receives only the classifier name, so it presumably
# reads the module-level names assigned in this cell — confirm against its definition.
dataset_name = 'Legal'

# relevance judgements (use the same qrel list as their rankings)
qrels = "data/qrels/TREC_Legal_qrels.txt"
qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

# ranking file for this dataset, loaded together with the document text
run = "data/rankings/legal_test_autotar_ranking.txt"
doc_rank_dic, rank_rel_dic, rank_text_dic = load_run_data_with_text(run, dataset_name)

# SET EXPERIMENTAL PARAMETERS
des_recalls = [0.7, 0.8, 0.9]  # desired recalls to experiment over
des_probs = [0.95]  # desired confidences to experiment over

df = run_experiments_clf('LR_scores_threshold')

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call
df_all = pd.concat([df_all, df])  # append all results together



Number of topics: 4
303
304
Legal
Confidence level:  0.95
recall: 0.7
Running-IP-P
Legal_0.95-0.7-IP-P-LR_scores_threshold


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
Legal_0.95-0.7-IP-P-LR_scores_threshold,0.963,0.075,1.0,0.002,0.376,Legal,0.95,0.7


recall: 0.8
Running-IP-P
Legal_0.95-0.8-IP-P-LR_scores_threshold


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
Legal_0.95-0.8-IP-P-LR_scores_threshold,0.972,0.088,1.0,0.001,0.215,Legal,0.95,0.8
Legal_0.95-0.7-IP-P-LR_scores_threshold,0.963,0.075,1.0,0.002,0.376,Legal,0.95,0.7


recall: 0.9
Running-IP-P
Legal_0.95-0.9-IP-P-LR_scores_threshold


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
Legal_0.95-0.9-IP-P-LR_scores_threshold,0.972,0.088,1.0,0.001,0.08,Legal,0.95,0.9
Legal_0.95-0.8-IP-P-LR_scores_threshold,0.972,0.088,1.0,0.001,0.215,Legal,0.95,0.8
Legal_0.95-0.7-IP-P-LR_scores_threshold,0.963,0.075,1.0,0.002,0.376,Legal,0.95,0.7


### TREC-TR

In [None]:

# Run the IP-P + classifier ('LR_scores_threshold') experiments on TREC-TR and add the results to df_all.
# NOTE(review): run_experiments_clf() receives only the classifier name, so it presumably
# reads the module-level names assigned in this cell — confirm against its definition.
dataset_name = 'TR'

# relevance judgements (use the same qrel list as their rankings)
qrels = "data/qrels/TREC_TR_Test_qrels.txt"
qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

# ranking file for this dataset, loaded together with the document text
run = "data/rankings/tr_test_autotar_ranking.txt"
doc_rank_dic, rank_rel_dic, rank_text_dic = load_run_data_with_text(run, dataset_name)

# SET EXPERIMENTAL PARAMETERS
des_recalls = [0.7, 0.8, 0.9]  # desired recalls to experiment over
des_probs = [0.95]  # desired confidences to experiment over

df = run_experiments_clf('LR_scores_threshold')

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call
df_all = pd.concat([df_all, df])  # append all results together


Number of topics: 34
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
TR
Confidence level:  0.95
recall: 0.7
Running-IP-P
TR_0.95-0.7-IP-P-LR_scores_threshold


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
TR_0.95-0.7-IP-P-LR_scores_threshold,0.999,0.027,1.0,0.0,0.427,TR,0.95,0.7


recall: 0.8
Running-IP-P
TR_0.95-0.8-IP-P-LR_scores_threshold


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
TR_0.95-0.8-IP-P-LR_scores_threshold,0.999,0.027,1.0,0.0,0.248,TR,0.95,0.8
TR_0.95-0.7-IP-P-LR_scores_threshold,0.999,0.027,1.0,0.0,0.427,TR,0.95,0.7


recall: 0.9
Running-IP-P
TR_0.95-0.9-IP-P-LR_scores_threshold


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
TR_0.95-0.9-IP-P-LR_scores_threshold,0.999,0.028,1.0,0.0,0.11,TR,0.95,0.9
TR_0.95-0.8-IP-P-LR_scores_threshold,0.999,0.027,1.0,0.0,0.248,TR,0.95,0.8
TR_0.95-0.7-IP-P-LR_scores_threshold,0.999,0.027,1.0,0.0,0.427,TR,0.95,0.7


### RCV1 - Selected 45 cat / 0.2 sample

In [None]:

# Run the IP-P + classifier ('LR_scores_threshold') experiments on RCV1
# (selected 45 categories, 0.2 sample) and add the results to df_all.
# NOTE(review): run_experiments_clf() receives only the classifier name, so it presumably
# reads the module-level names assigned in this cell — confirm against its definition.
dataset_name = 'RCV1'
# Overrides the imbalance-handling mode for this dataset. The assignment is global,
# so the value persists for any cell executed after this one.
dataset_imbalance_handle = 'na'

# relevance judgements (use the same qrel list as their rankings)
qrels = "data/qrels/rcv1_qrels_selected_45_0.2.txt"
qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

# ranking file for this dataset, loaded together with the document text
run = "data/rankings/RCV1_selected_45_0.2_ranking.txt"
doc_rank_dic, rank_rel_dic, rank_text_dic = load_run_data_with_text(run, dataset_name)

# SET EXPERIMENTAL PARAMETERS
des_recalls = [0.7, 0.8, 0.9]  # desired recalls to experiment over
des_probs = [0.95]  # desired confidences to experiment over

df = run_experiments_clf('LR_scores_threshold')

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call
df_all = pd.concat([df_all, df])  # append all results together



Number of topics: 45
ALG
ASIA
BELG
BUL
BURMA
C12
C182
C33
CUBA
CZREP
E12
E513
EEC
FIN
GENV
GPRO
GUAT
I1300003
I14000
I21000
I22100
I24000
I32200
I32830
I3302021
I35102
I36400
I41000
I41300
I42600
I42900
I65100
I81501
I8150211
I81502
I82000
I83100
INDON
ISRAEL
MCDNIA
MEX
POL
RWANDA
SWED
TAIWAN
RCV1
Confidence level:  0.95
recall: 0.7
Running-IP-P
RCV1_0.95-0.7-IP-P-LR_scores_threshold


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
RCV1_0.95-0.7-IP-P-LR_scores_threshold,0.969,0.036,1.0,0.002,0.385,RCV1,0.95,0.7


recall: 0.8
Running-IP-P
RCV1_0.95-0.8-IP-P-LR_scores_threshold


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
RCV1_0.95-0.8-IP-P-LR_scores_threshold,0.969,0.036,1.0,0.002,0.212,RCV1,0.95,0.8
RCV1_0.95-0.7-IP-P-LR_scores_threshold,0.969,0.036,1.0,0.002,0.385,RCV1,0.95,0.7


recall: 0.9
Running-IP-P
RCV1_0.95-0.9-IP-P-LR_scores_threshold


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall
RCV1_0.95-0.9-IP-P-LR_scores_threshold,0.969,0.036,0.933,0.002,0.083,RCV1,0.95,0.9
RCV1_0.95-0.8-IP-P-LR_scores_threshold,0.969,0.036,1.0,0.002,0.212,RCV1,0.95,0.8
RCV1_0.95-0.7-IP-P-LR_scores_threshold,0.969,0.036,1.0,0.002,0.385,RCV1,0.95,0.7


In [None]:
# save all: tag every accumulated row with this section's model name, then write the table to CSV
df_all['Model'] = 'CP_CLassScore'
# os.path.join yields a valid path whether or not DIR ends with a separator
df_all.to_csv(os.path.join(DIR, 'experiments_output/df_all_cp_scrs.csv'))

In [None]:
# display the accumulated results table (the cell's last expression is rendered as its output)
df_all

Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset,des_prob,des_recall,Model
CLEF2017_0.95-0.9-IP-P-LR_scores_threshold,0.989,0.152,1.0,0.019,0.098,CLEF2017,0.95,0.9,CP_CLassScore
CLEF2017_0.95-0.8-IP-P-LR_scores_threshold,0.989,0.152,1.0,0.019,0.236,CLEF2017,0.95,0.8,CP_CLassScore
CLEF2017_0.95-0.7-IP-P-LR_scores_threshold,0.988,0.147,1.0,0.018,0.411,CLEF2017,0.95,0.7,CP_CLassScore
CLEF2018_0.95-0.9-IP-P-LR_scores_threshold,0.983,0.137,1.0,0.01,0.092,CLEF2018,0.95,0.9,CP_CLassScore
CLEF2018_0.95-0.8-IP-P-LR_scores_threshold,0.982,0.136,1.0,0.01,0.228,CLEF2018,0.95,0.8,CP_CLassScore
CLEF2018_0.95-0.7-IP-P-LR_scores_threshold,0.981,0.134,1.0,0.01,0.402,CLEF2018,0.95,0.7,CP_CLassScore
CLEF2019_0.95-0.9-IP-P-LR_scores_threshold,0.996,0.221,1.0,0.038,0.107,CLEF2019,0.95,0.9,CP_CLassScore
CLEF2019_0.95-0.8-IP-P-LR_scores_threshold,0.996,0.213,1.0,0.037,0.245,CLEF2019,0.95,0.8,CP_CLassScore
CLEF2019_0.95-0.7-IP-P-LR_scores_threshold,0.994,0.207,1.0,0.036,0.42,CLEF2019,0.95,0.7,CP_CLassScore
Legal_0.95-0.9-IP-P-LR_scores_threshold,0.972,0.088,1.0,0.001,0.08,Legal,0.95,0.9,CP_CLassScore


# Prepare Excess & Final Results Table

In [None]:
# Build the final results table: load the saved per-model results, compute each
# run's cost "excess" relative to df_all_OR, then merge, select columns and sort.

# load results
_results_dir = os.path.join(DIR, 'experiments_output')
df_all_OR = pd.read_csv(os.path.join(_results_dir, 'df_all_OR.csv'))  # NOTE(review): presumably the oracle baseline — confirm
df_all_cp = pd.read_csv(os.path.join(_results_dir, 'df_all_cp.csv'))
df_all_cp_lbl = pd.read_csv(os.path.join(_results_dir, 'df_all_cp_lbl.csv'))
df_all_cp_scrs = pd.read_csv(os.path.join(_results_dir, 'df_all_cp_scrs.csv'))

# calculate excess = (cost - OR cost) / (1 - OR cost)
# The arithmetic aligns rows by index label (the default RangeIndex from read_csv),
# so all four CSVs must list the runs in the same order.
# NOTE(review): verify that df_all_OR.csv was written in the same dataset/recall order.
_or_cost = df_all_OR['cost']
for _frame in (df_all_cp, df_all_cp_lbl, df_all_cp_scrs):
    _frame['excess'] = (_frame['cost'] - _or_cost) / (1 - _or_cost)

# merge all
df_all_paper = pd.concat([df_all_cp, df_all_cp_lbl, df_all_cp_scrs], ignore_index=True)

# get selected columns
df_all_paper = df_all_paper[['des_recall', 'dataset', 'Model', 'recall', 'reliability', 'cost', 'excess']]

# Prefix two labels with '_' so that an ascending sort places them after the
# upper-case names, to be sorted as the paper. A non-inplace replace avoids
# mutating a column-selected frame.
df_all_paper = df_all_paper.replace({"RCV1": "_RCV1", "CP_CLassScore": "_CP_CLassScore"})

# sort results: highest desired recall first, then dataset and model ascending
df_all_paper_sorted = df_all_paper.sort_values(['des_recall', 'dataset', 'Model'], ascending=[False, True, True])

df_all_paper_sorted = df_all_paper_sorted.round(3)

# display paper table
df_all_paper_sorted

Unnamed: 0,des_recall,dataset,Model,recall,reliability,cost,excess
0,0.9,CLEF2017,CP,1.0,1.0,0.281,0.238
18,0.9,CLEF2017,CP_ClassLabel,0.989,1.0,0.153,0.102
36,0.9,CLEF2017,_CP_CLassScore,0.989,1.0,0.152,0.101
3,0.9,CLEF2018,CP,1.0,1.0,0.293,0.242
21,0.9,CLEF2018,CP_ClassLabel,0.983,1.0,0.137,0.075
39,0.9,CLEF2018,_CP_CLassScore,0.983,1.0,0.137,0.075
6,0.9,CLEF2019,CP,0.999,1.0,0.283,0.228
24,0.9,CLEF2019,CP_ClassLabel,0.996,1.0,0.221,0.161
42,0.9,CLEF2019,_CP_CLassScore,0.996,1.0,0.221,0.161
9,0.9,Legal,CP,1.0,1.0,0.425,0.401
