# Setup - remote processing
Run this section if using remote processing (provided by Google) 

In [1]:

# MOUNT GOOGLE DRIVE 
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
%matplotlib inline

# IMPORT LIBRARIES
import sys
import os
import numpy as np
import pandas as pd
import math
from scipy.optimize import curve_fit
import random
import glob
import subprocess
import matplotlib.pyplot as plt
from scipy.integrate import simps
from scipy.stats import norm
import os

In [3]:
PARENT_DIR = '/content/drive/MyDrive/ReemSharedPhD/SharedCode/TOIS_paper_code/'

# Ensure that Python looks in correct place for local modules 
DIR = PARENT_DIR

# set/create the full working directory/subdirectory
path = PARENT_DIR + 'results/figs/'
if not os.path.exists(path):
    os.makedirs(path)

# reset to main directory
path = PARENT_DIR

sys.path.append(DIR)

# IMPORT OWN FUNCTIONS
from utils.read_data_fns import *
from utils.target_method_fns import *  
from utils.knee_method_fns import *  
from utils.eval_fns import *
from utils.inhomogeneous_pp_fns import *   

In [4]:
import warnings
warnings.filterwarnings('ignore')


# Setup - local processing 


**Execute using local compute** 
(See https://research.google.com/colaboratory/local-runtimes.html)

Start notebook: 

jupyter notebook --NotebookApp.allow_origin='https://colab.research.google.com' --port=8888 --NotebookApp.port_retries=0

In [None]:
'''
# IMPORT LIBRARIES
import sys
import os
import numpy as np
import pandas as pd
import math
from scipy.optimize import curve_fit
import random
import glob
import subprocess
import matplotlib.pyplot as plt
from scipy.integrate import simps
from scipy.stats import norm
import os
'''

In [None]:
'''
PARENT_DIR = 'C:/Users/reemb/content/drive/MyDrive/ReemSharedPhD/SharedCode/TOIS_paper_code/'

# Ensure that Python looks in correct place for local modules 
DIR = PARENT_DIR

sys.path.append(DIR)

# IMPORT OWN FUNCTIONS
from utils.read_data_fns import *
from utils.eval_fns import *

# IMPORT OWN FUNCTIONS
from utils.read_data_fns import *
from utils.target_method_fns import *  
from utils.knee_method_fns import *  
from utils.eval_fns import *
from utils.inhomogeneous_pp_fns import *   
'''

# Setup - shared
Code to setup stopping methods (implemented to be independent of processing run) 

In [None]:
# Setup evaluation directory
EVALDIR = os.path.join(DIR, 'tar_eval_out/')    # Directory for evaluation output files
# Check whether dircetory exists and create it if not
if not os.path.exists(EVALDIR):
    os.makedirs(EVALDIR)

In [None]:
# LOAD TOPIC RELEVANCE DATA
def load_rel_data(qrels): 
  qrel_fname =  os.path.join(DIR, qrels)
  with open(qrel_fname, 'r') as infile:
      qrels_data = infile.readlines()    
  query_rel_dic = make_rel_dic(qrels_data) # make dictionary of list of docids relevant to each queryid

  #print("Number of topics:", len(query_rel_dic))

  return qrel_fname, query_rel_dic

In [None]:
# LOAD RUN DATA
def load_run_data(run): 
  run_fname = os.path.join(DIR, run)
  with open(run_fname, 'r') as infile: 
    run_data = infile.readlines()
  doc_rank_dic = make_rank_dic(run_data)  # make dictionary of ranked docids for each queryid
  rank_rel_dic = make_rank_rel_dic(query_rel_dic,doc_rank_dic) # make dic of list relevances of ranked docs for each queryid

  #return doc_rank_dic, rank_rel_dic, rank_text_dic
  return doc_rank_dic, rank_rel_dic

### Set Parameters

In [None]:
# SET POISSON PROCESS/COX PROCESS PARAMETERS
# dynamically create list with beta increment
alpha = 0.025
beta = 0.025
sample_props = list(np.arange(alpha, (1+beta), beta).round(3)) 
#sample_props[-1] = 0.999

n_windows = 10  # number of windows to make from sample

# SET KNEE METHOD PARAMETERS
knee_rho = 6 # knee method rho 

# SET EXPERIMENTAL PARAMETERS 
des_recalls = [ 0.95, 0.9, 0.8, 0.7] # desired recalls to experiment over
des_probs = [0.95, 0.8] # desired confidences to experiment over

selected_threshold = 0.5 # models default
#selected_threshold = 0.55 # optimized

min_doc_in_sample = 10 # min number docs must be in sample to proceed with pp algorithm 
min_rel_in_sample = 20 # min number rel docs must be in initial sample to proceed with algorithm 
min_rel_in_sample_hold = 20 # hold value
min_rel_in_sample_type = 'dynamic'

detailed_results_flag = 1

###run_tar_eval_macro

In [None]:
# Updated version of function that returns MICRO average (as summarised by tar_eval)
# Function to evaluate output file and return scores for range of metrics

# Runs tar_eval script and parses output
# Uses same metrics as Li and Kanoulas: 
# 1) recall
# 2) cost (== percentage effort)
# 3) relative error (absolute diff between recall achived and target recall)
# 4) loss_er (from tar_eval)
# 5) reliability (%age of times desired recall is achieved)
def run_tar_eval(qrel_fname, out_fname, des_recall): 

      # Location of script
      script = os.path.join(DIR, 'scripts/tar_eval.py')

      # Run tar_eval script
      # print(f"{script} {qrel_fname} {out_fname}")
      ret = subprocess.check_output(['python', script, qrel_fname, out_fname])
      # ret = subprocess.check_output([' tail -27 '], shell=True, input=ret)
      ret = ret.decode(encoding='utf-8')

      #print(ret)

      # Parse eval script output
      teval_dict = {}   # Summary results (computed across all topics)
      recalls = []      # Recall for each topic 
      for line in ret.split('\n'):
          if line != '':
            tid, key, val = line.split()
            #print(f"tid: {tid}, key: {key}, val: {val}")
            if tid != 'ALL':
                if key == 'topic_id':
                    teval_dict[tid] = {}
                teval_dict[tid][key] = val


      # print(teval_dict)

      recalls = []
      costs = []
      reliabilities = []
      loss_ers = []
      rel_errors = []
      topic_dfs = []
      for tid in teval_dict:
          # Compute recall (rels_found / num_rel)
          recall = float(teval_dict[tid]['rels_found']) / float(teval_dict[tid]['num_rels'])
          recalls.append(recall)
          
          # cost (num_shown / num_docs)
          cost = float(teval_dict[tid]['num_shown']) / float(teval_dict[tid]['num_docs'])
          costs.append(cost)

          # reliability (number for which recall >= des_recall)
          if recall >= des_recall: 
            reliability = 1
          else:
            reliability = 0
          reliabilities.append(reliability)

          # loss_er -- available directly
          loss_ers.append(teval_dict[tid]['loss_er'])

          # rel_error -- diff between 
          rel_error = np.abs(recall - des_recall) / des_recall
          rel_errors.append(rel_error)

      

      # Compute mean and (optionall also std_dev)
      recalls = np.array(recalls)
      recall_all = "{:.3f}".format(recalls.mean())
      
      costs = np.array(costs)
      cost_all = "{:.3f}".format(costs.mean())

      reliabilities = np.array(reliabilities)
      reliability = "{:.3f}".format(reliabilities.mean())
      reliability_all = "{:.3f}".format(reliabilities.mean())

      loss_ers = np.array(loss_ers).astype(float)
      loss_er_all = "{:.3f}".format(loss_ers.mean())

      rel_errors = np.array(rel_errors)
      rel_error_all = "{:.3f}".format(rel_errors.mean())

      
      # Optionally print out std dev of scores with mean
      # ddof=1 for sample std like pd.describe
      if detailed_results_flag == 1: 
        recall_all = "{}±{:.3f}".format(recall_all, recalls.std(ddof=1))
        cost_all = "{}±{:.3f}".format(cost_all, costs.std(ddof=1))
        reliability_all = "{}±{:.3f}".format(reliability_all, reliabilities.std(ddof=1))
        loss_er_all = "{}±{:.3f}".format(loss_er_all, loss_ers.std(ddof=1))
        rel_error_all = "{}±{:.3f}".format(rel_error_all, rel_errors.std(ddof=1))

      #return recall_all, cost_all, reliability_all, loss_er_all, rel_error_all 
      return recall_all, cost_all, reliability, loss_er_all, rel_error_all # no need for reliability std (0 or 1)

In [None]:
# Function to run ORACLE METHOD (OR)
def run_oracle_method(des_recall, topics_list):
    # print('run_oracle_method')
    # Create output file 
    out_fname = dataset_name+ "_Oracle_"+str(des_recall)+".txt"
    out_fname = os.path.join(EVALDIR, out_fname)
    out_f = open(out_fname, "w+")  #  Create a new file if it doesn't exist   
 
    for query_id in topics_list:
        rel_list = rank_rel_dic[query_id]  # list binary rel of ranked docs 
        rel_doc_idxs = np.where(np.array(rel_list) == 1)[0]
        oracle_n_rel = math.ceil(len(rel_doc_idxs)*des_recall)
        oracle_idx = rel_doc_idxs[oracle_n_rel-1]

        # Write output file 
        for i in range(oracle_idx + 1):
               out_f.write(f"{query_id}\tAF\t{doc_rank_dic[query_id][i]}\t{i + 1}\t{-i}\tmyrun\n")

    # Evaluate results 
    out_f.close()
    # recall, acc, perc_eff_saved = run_tar_eval(qrel_fname, out_fname, des_recall)
    recall, cost, reliability, loss_er, rel_err = run_tar_eval(qrel_fname, out_fname, des_recall)

    # return recall, acc, perc_eff_saved
    return recall, cost, reliability, loss_er, rel_err
    

In [None]:
# Function to run target method 
def run_target_method(des_recall, des_prob, topics_list): 
    # Setup dictionary for scores 
    # own_score_dic = {} 

    # Create output file 
    out_fname = "TAR_"+str(des_recall)+"_"+str(des_prob)+".txt"
    out_fname = os.path.join(EVALDIR, out_fname)
    out_f = open(out_fname, "w+")  #  Create a new file if it doesn't exist   
    # print(f"Creating file {out_fname}")

    # COUNT TOTAL DOCUMENTS RANKED (FOR PERCENTAGE EFFORT SAVED METRIC)
    # total_effort = len(run_data)

    for query_id in topics_list:
        # print(f"query_id: {query_id}")
        # score_dic[query_id] = []      

        # EXTRACT COUNTS AND REL LISTS
        total_docs = len(doc_rank_dic[query_id])  # total n. docs in topic
        rel_list = rank_rel_dic[query_id]  # list binary rel of ranked docs 
        # print(f"run_target_method total_docs: {total_docs}")

        # Run target method 
        random.seed(1)
        target_size = get_target_size(des_recall, des_prob)
        target_list, examined_list = make_target_set(rel_list, total_docs, target_size)  # get target sample and list all docs examined
        tar_stop_n = get_stopping_target(target_list, total_docs, target_size)  # stopping point
        all_examined_idxs = get_all_target_examined_idxs(examined_list, tar_stop_n)  # list of every doc examined during method
        # print(f"all_examined_idxs: {all_examined_idxs}")
        # print(f"total_docs: {total_docs}; examined: {len(all_examined_idxs)}")

        # Write output file 
        # Make note of whether doc examined (AF = yes; NS = no)
        for i in range(total_docs):
            if i in all_examined_idxs:
                out_f.write(f"{query_id}\tAF\t{doc_rank_dic[query_id][i]}\t{i + 1}\t{-i}\tmyrun\n") 
        #for i in range(total_docs):
        #    if i in all_examined_idxs:
        #        examined_flag = "AF"
        #    else: 
        #        examined_flag = "NS"
        #    out_f.write(f"{query_id}\t{examined_flag}\t{doc_rank_dic[query_id][i]}\t{i + 1}\t{-i}\tmyrun\n") 

    # Close output file
    out_f.close()

    # recall, acc, perc_eff_saved = run_tar_eval(qrel_fname, out_fname, des_recall)
    recall, cost, reliability, loss_er, rel_err = run_tar_eval(qrel_fname, out_fname, des_recall)

    # return recall, acc, perc_eff_saved
    return recall, cost, reliability, loss_er, rel_err


### Rate Functions

In [None]:
# Functions encoding relevant distribution models
# Exponential model 
def exp_model_func(x, a, k): # x = vector x values
    return a*np.exp(-k*x)  

# Power law
def power_model_func(x, a, k): # x = vector x values
    return a*x**k 

# AP Prior distribution
def apprior_model_func(x, a): # x = vector x values
    # print(f"apprior_model_func: n_docs: {n_docs}")
    return a * (n_docs / x)

# hyperbolic model
def hyperbolic_model_func(x, a, b, k):
    return a/((1.0+b*k*x)**(1.0/b))

# Integral of model functions
def model_integral(a, k, n_docs, model):
    if model == "E":
        return (a/-k)*(np.exp(-k*n_docs)-1) 
    elif model == "P":
        return (a/(k+1))*(n_docs**(k+1)-1)  
    elif model == "A":
        mu = a * (n_docs / ((n_docs * math.log(n_docs)) - math.lgamma(n_docs + 1)))
        return mu

# Integral of model functions
def model_integral_b(a, b, k, n_docs, model):

    is_harmonic = 0

    if b == 1:
      is_harmonic = 1 # hyperbolic becomes harmonic

    if model == "E":
        return (a/-k)*(np.exp(-k*n_docs)-1) 
    elif model == "P":
        return (a/(k+1))*(n_docs**(k+1)-1)  
    elif model == "A":
        mu = a * (n_docs / ((n_docs * math.log(n_docs)) - math.lgamma(n_docs + 1)))
        return mu
    elif model == "H":
        if is_harmonic:
          mu = (a/k) * np.log((k*n_docs)+1)
          return mu
        else:
          mu =  (a/k*(b-1)) * np.power((1+(b*k*n_docs)), 1 - (1/b))
          return mu

### Run Point Process

In [None]:
# run the PP stopping algorithm
def run_point_process(des_recall, des_prob, topics_list, process_type, model):

    # des_recall: desired recall
    # des_prob: confidence in des_recall
    # topics_list: list of topics to process
    # process_type: type of point process (either "IP" or "CX")
    # model: model for rate function (either "P": power law; "E":exponential; "A": AP Prior; "H": Hyperbolic)   

    # print(f"point process: type {process_type}, model {model}")

    # n_docs variable needs to be made global so it can be accessed by 
    # curve fit and integral functions 
    global n_docs  

    global RMSE,min_rel_in_sample,alpha, beta
    global min_rel_in_sample_hold # keep value


    # Do check that input is valid 
    if not ((process_type == "IP" or process_type == "CX") and 
            (model == "P" or model == "E" or model == "A" or model == "H")):
        #print("Incorrect arguments to run_point_process - exiting\n")
        return 0, 0, 0 

    # Create output file 
    out_fname = dataset_name+"_"+str(process_type)+"_"+str(model)+"_"+str(des_recall)+"_"+str(des_prob)+".txt"
    out_fname = os.path.join(EVALDIR, out_fname)
    out_f = open(out_fname, "w+")  #  Create a new file if it doesn't exist 


    for query_id in topics_list:
        min_rel_in_sample = min_rel_in_sample_hold # update for each topic

        # print(query_id)         
        # score_dic[query_id] = []  
 
        # EXTRACT COUNTS AND REL LISTS
        n_docs = len(doc_rank_dic[query_id])  # total n. docs in topic
        rel_list = rank_rel_dic[query_id]  # list binary rel of ranked docs 
 
        # Initialise count of documents in sample 
        n_samp_docs = int(round(n_docs*sample_props[0]))  
 
        windows_end_point = 0
        pred_stop_n = n_docs
        


        i = 0 # sample iteration
        #print(i , len(sample_props), pred_stop_n ,n_samp_docs)
        while (i < len(sample_props)) and (pred_stop_n > n_samp_docs):
          
          min_rel_in_sample_flag = 'NA' #determine if min_rel_in_sample achieved
          counting_process_flag = 'NA' # determine if counting process applied or not
          
          pred_unobserved = -9999 # create variable with dummy value
          pred_n_rel = -9999 # create variable with dummy value
          des_n_rel = -9999 # create variable with dummy value
          norm_rmse = -9999 # create variable with dummy value
          x = []
          y = []
          predicted_y = []

          # Check that enough relevant documents have been observed
          n_samp_docs = int(round(n_docs*sample_props[i]))  
          sample_rel_list = rel_list[0:n_samp_docs]  # chunk of rel list examined in sample
          # print(f"Sample: {sample_props[i]} - rel found {np.sum(sample_rel_list)}")


          # check min docs in sample >= n_windows
          if n_samp_docs < min_doc_in_sample:
            #print("docs in sample too little: " , n_samp_docs)
            i = i + 1
            continue # skip to next iteration

          #outside inner if inside bigger loop
          # calculate all actual vs. predicted results
          n_unobserved_docs = n_docs - n_samp_docs
          n_rel = np.sum(rel_list)
          n_rel_at_end_samp = np.sum(sample_rel_list)
          n_rel_unobserved = n_rel - n_rel_at_end_samp  
                    
          min_rel_in_sample_flag = 0 #determine if min_rel_in_sample achieved

          if (np.sum(sample_rel_list) >= min_rel_in_sample):

                #print("min_rel_in_sample : " , min_rel_in_sample)
                #print("n_rel_at_end_samp: ", n_rel_at_end_samp)
                min_rel_in_sample_flag = 1 #set to 1 if min_rel_in_sample achieved
                # print("Running point process")
                sample_prop = sample_props[i]
                # print(f"sample_prop {sample_prop}")

                n_samp_docs = int(round(n_docs*sample_props[i]))
                sample_rel_list = rel_list[0:n_samp_docs]  # chunk of rel list examined in sample


                # get points
                windows = make_windows(n_windows, n_samp_docs)
                #windows = make_windows(n_windows_dynamic, sample_prop, n_docs) 
                window_size = windows[0][1]

                # calculate points that will be used to fit curve
                # !! To do -- check sampling is appropriate for APPrior
                if model == "E" or model == "A" or model == "H":
                    x,y = get_points(windows, window_size, sample_rel_list)  
                elif model == "P":
                  
                    x,y = get_points_power(windows, window_size, sample_rel_list)

                #print(f"x: {x}\ny: {y}")

                y5=y[5:] # include last 5 points only to check early stopping

                # try to fit curve
                good_curve_fit = 0
                if sum(y5) == 0 and n_rel_at_end_samp >= min_rel_in_sample: # check last 5 points relevance & available min_rel_in_sample 

                  #print('sum y5 == 0')
                  break # stop for this topic

                else:
                  try:
                        if model == "E":                    
                          p0 = [0.1, 0.001 ]  # initialise curve parameters
                          opt, pcov = curve_fit(exp_model_func, x, y, p0)  # fit curve
                          good_curve_fit = 1

                          #print(opt)
                          #print(pcov)

                          # Compute residuals and draw graphs as sanity check
                          p1 = opt[0]
                          p2 = opt[1]
                          residuals = np.array(y - exp_model_func(x,p1, p2))
                          diff = np.max(y) - np.min(y)
                          fres = sum(residuals**2) / diff
                          #print(f"Norm RMSE: {fres}") 


                        elif model == "P":
                          p0 = [0.1, 0.001 ]  # initialise curve parameters
                          opt, pcov = curve_fit(power_model_func, x, y, p0)  # fit curve
                          good_curve_fit = 1

                          #print(opt)
                          #print(pcov)

                          # Compute residuals and draw graphs as sanity check
                          p1 = opt[0]
                          p2 = opt[1]
                          residuals = np.array(y - power_model_func(x,p1, p2))
                          diff = np.max(y) - np.min(y)
                          fres = sum(residuals**2) / diff
                          #print(f"Norm RMSE: {fres}")    


                        elif model == "A":
                          #print("Trying to fit APPrior model")
                          opt, pcov = curve_fit(apprior_model_func, x, y)  # fit curve
                          good_curve_fit = 1

                          # Compute residuals 
                          p1 = opt[0]
                          residuals = np.array(y - apprior_model_func(x,p1))
                          diff = np.max(y) - np.min(y)
                          fres = sum(residuals**2) / diff
                          #print(f"Norm RMSE: {fres}")

                        elif model == "H":
                          # plot H (bound)
                          a = max(y) # get the max to use as a value   
                          #a = sum(y) / len(y) #try avrg  
                          p0 = [a,0.5, 0.001]  # initialise curve parameters
                          opt, pcov = curve_fit(hyperbolic_model_func, x, y,  p0, bounds=((-np.inf, 0,-np.inf), (np.inf, 1, np.inf)))  # fit curve 
                          good_curve_fit = 1

                          #print('Hyperbolic Fit Curve-fitted Variables (0<b<1): a='+str(opt[0])+', b='+str(opt[1])+', k='+str(opt[2]))
                          #print(opt)
                          #print(pcov)

                        

                          # Compute residuals and draw graphs as sanity check
                          p1 = opt[0]
                          p2 = opt[1] 
                          p3 = opt[2]
                          residuals = np.array(y - hyperbolic_model_func(x,p1, p2, p3))
 

                          diff = np.max(y) - np.min(y)
                          fres = sum(residuals**2) / diff
                          #print(f"Norm RMSE: {fres}") 


                  except Exception as error: 
                      pass
                      # e = str(error)
                      # print(e)              
            

                # Run point process 
                if(good_curve_fit == 1):
                    # get y-values for fitted curve                    
                    if model == "E":
                      a, k = opt
                      y2 =   exp_model_func(x, a, k) 
                    elif model == "P":
                      a, k = opt
                      y2 = power_model_func(x, a, k) 
                    elif model == "A":
                      a = opt
                      k = 0
                      y2 = apprior_model_func(x, a)
                    elif model == "H":
                      a, b, k = opt 
                      y2 = hyperbolic_model_func(x, a, b, k) 

                    # print(f"y2: {y2}")

                    n_rel_at_end_samp = np.sum(sample_rel_list)

                    # Check error in curve fit (using normalised RMSE)
                    if model == "E":
                        predicted_y =   exp_model_func(x, a, k)
                    elif model == "P":
                        predicted_y = power_model_func(x, a, k)
                    elif model == "A":
                        predicted_y = apprior_model_func(x, a)
                    elif model == "H":
                        predicted_y = hyperbolic_model_func(x, a, b , k)

                    residuals = np.array(y - predicted_y)
                    diff = np.max(y) - np.min(y)
                    norm_rmse = sum(residuals**2) / diff
                    

                    counting_process_flag = 0 # determine if counting process applied or not

                    if(norm_rmse < RMSE):
                        counting_process_flag = 1 # set to 1 if counting process applied

                        # Run point process (Inhomogenous Poisson or Cox Proc.)
                        # Inhom Poisson process
                        if process_type == "IP":
                            if model == 'H':  
                              #print(a,b,k,n_docs, n_samp_docs, model)
                              mu = model_integral_b(a,b, k, n_docs, model) - model_integral_b(a,b, k, n_samp_docs, model)
                            else:
                              mu = model_integral(a, k, n_docs, model) - model_integral(a, k, n_samp_docs, model)

                            pred_unobserved = predict_n_rel(des_prob, n_unobserved_docs, mu) # 18-4-22 update
                            pred_n_rel = n_rel_at_end_samp + pred_unobserved
                            # print(f"pred_n_rel: {pred_n_rel} (n_rel_at_end_samp: {n_rel_at_end_samp} pred_unobserved: {pred_unobserved})")

                        # Cox process
                        elif process_type == "CX":
                            # Sample points from normal distribution; generate probability + predicted
                            # value from Poisson Process
                            norm_samples = np.linspace(norm.ppf(0.01), norm.ppf(0.99), 100)
                            vals = []

                            # Standard deviation errors on curve fit parameters
                            perr = np.sqrt(np.diag(pcov)) 
                            for sample in norm_samples:
                                # Set fit error in parameter values; need to add one and subtract other
                                a_val = a + sample*perr[0]
                                if(model == "A"):  # Set dummy value for model
                                    k_val = 0      # with single parameter
                                else:
                                    k_val = k - sample*perr[1] 
                                # integral of model_func
                                if model == 'H':   
                                  b_val = b + sample*perr[2] 
                                  mu = model_integral_b(a_val, b_val , k_val, n_docs, model) - model_integral_b(a_val, b_val, k_val, n_samp_docs, model)
                                else:
                                  mu = model_integral(a_val, k_val, n_docs, model) - model_integral(a_val, k_val, n_samp_docs, model)

                                #pred_unobserved = predict_n_rel(des_prob, n_docs, mu)
                                pred_unobserved = predict_n_rel(des_prob, n_unobserved_docs, mu) # 18-4-22 update
                                pred_n_rel = n_rel_at_end_samp + pred_unobserved
                                # print(f"pred_n_rel: {pred_n_rel} (n_rel_at_end_samp: {n_rel_at_end_samp} pred_unobserved: {pred_unobserved})")
                                vals.append(norm.pdf(sample) * pred_n_rel)   # predict max number rel docs (using poisson cdf)

                            # Integrate over samples to produce final prediction
                            pred_n_rel = simps(vals, norm_samples)

                        des_n_rel = des_recall*pred_n_rel
                        if des_n_rel <= n_rel_at_end_samp:
                            pred_stop_n = n_rel_at_end_samp    



          # calculate all actual vs. predicted results
          n_unobserved_docs = n_docs - n_samp_docs
          n_rel = np.sum(rel_list)
          n_rel_at_end_samp = np.sum(sample_rel_list)
          n_rel_unobserved = n_rel - n_rel_at_end_samp  

          
          # decrease needed min_rel_in_sample while increasing sample size
          if min_rel_in_sample_type == 'dynamic':
            if (min_rel_in_sample > 0):
              min_rel_in_sample = int(min_rel_in_sample - (sample_props[i]*min_rel_in_sample)) # by % instead of fixed number

          
          i += 1  # increase sample proportion size
 
        # Write output file 
        for i in range(n_samp_docs):
            out_f.write(f"{query_id}\tAF\t{doc_rank_dic[query_id][i]}\t{i + 1}\t{-i}\tmyrun\n")

    # Compute results
    out_f.close()
    recall, cost, reliability, loss_er, rel_err = run_tar_eval(qrel_fname, out_fname, des_recall)

    return recall, cost, reliability, loss_er, rel_err



#Experiments Functions

## run OR

In [None]:
# Function to call stopping approaches and collect results together
def run_sp_approaches(des_recall, des_prob):
 
    # PREPARE SCORING DICTIONARIES
    run_score_dic = {}   # Final dict storing results for each approach, returned by fn
    
    topics_list = make_topics_list(doc_rank_dic,1)  # sort topics by no docs
    

    # ORACLE METHOD
    print("Running oracle method")  
    model = 'OR'

    run_name = dataset_name + '_'+ str(des_prob)+'-'+str(des_recall)+'-OR'

    run_score_dic[run_name] = run_oracle_method(des_recall, topics_list)


    return run_score_dic


# RUN EXPERIMENTS
def run_experiments(): 

  df = pd.DataFrame(columns=['recall', 'cost', 'reliability', 'loss_er','rel_err'])
  for prob in des_probs: 
    print("Confidence level: ", prob)
    for recall in des_recalls: 
      print(f"recall: {recall}")
      results_dict = {}
      results_dict = run_sp_approaches(recall, prob)
      df_tmp = pd.DataFrame.from_dict(results_dict, orient="index",  columns=['recall', 'cost', 'reliability', 'loss_er', 'rel_err'])
      
      # format paper table
      df_tmp['dataset_name'] = dataset_name
      df_tmp['des_prob'] = prob
      df_tmp['des_recall'] = recall

      df = df.append(df_tmp) # append all results togather
      df = df.sort_index(ascending=False)
      #display(df) 


  return df


## set parameters space (GS)

In [None]:
# run GridSearch with training datasets
from sklearn.model_selection import ParameterGrid

min_rel_in_sample_type = 'xx'

hyperparameters_space = {'point_process':['IP', 'CX'], 
                        'model':['P', 'E', 'H', 'A'], 
                        'RMSE':[0.05, 0.1, 0.15],
                        'min_rel_in_sample':[10, 20, 'dynamic20-sample'],
                        'alpha':[0.025],
                        'beta':[0.025]}

grid = ParameterGrid(hyperparameters_space)

##run

In [None]:
def set_point_process_GridSearch(des_prob,des_recall,topics_list,run_score_dic, pp, m, r, relv, a, b):

    global min_rel_in_sample_type
    
    if relv == 'dynamic20-sample':
      min_rel_in_sample_type = 'dynamic'
      relv = 20 #update relv
    else:
      min_rel_in_sample_type = 'static'

    global model,RMSE,min_rel_in_sample, alpha, beta
    point_process = pp
    model = m
    RMSE = r
    min_rel_in_sample = relv
    min_rel_in_sample_hold = relv
    alpha = a
    beta = b
    sample_props = list(np.arange(alpha, (1+beta), beta).round(3)) 


    #print("Running-" +point_process + '-' +model)
    run_name = dataset_name + '_'+ str(des_prob)+'-'+str(des_recall)+'-'+point_process+'-'+model

    
    print("Running-" , run_name)
    run_score_dic[run_name] = run_point_process(des_recall, des_prob, topics_list, point_process, model)

    return
    




# for quick results run ip-p only
# Function to call stopping approaches and collect results together
def run_sp_approaches_GridSearch(des_recall, des_prob):
 
    # PREPARE SCORING DICTIONARIES
    run_score_dic = {}   # Final dict storing results for each approach, returned by fn
    
    topics_list = make_topics_list(doc_rank_dic,1)  # sort topics by no docs
  

    for params in grid:
      set_point_process_GridSearch(des_prob,des_recall,topics_list,run_score_dic,params['point_process'], params['model'], params['RMSE'], params['min_rel_in_sample'], params['alpha'], params['beta'])


    return run_score_dic

# RUN EXPERIMENTS
def run_experiments_GridSearch(): 
  print(dataset_name)
  #define outside func in order to access later
  df = pd.DataFrame(columns=['recall', 'cost', 'reliability', 'loss_er','rel_err'])


  for prob in des_probs: 
    print("Confidence level: ", prob)
    for recall in des_recalls: 
      print(f"recall: {recall}")
      results_dict = {}
      results_dict = run_sp_approaches_GridSearch(recall, prob)
      df_tmp = pd.DataFrame.from_dict(results_dict, orient="index", columns=['recall', 'cost', 'reliability', 'loss_er','rel_err'])

      # format paper table
      df_tmp['dataset_name'] = dataset_name
      df_tmp['des_prob'] = prob
      df_tmp['des_recall'] = recall
 
      df = df.append(df_tmp) # append all results togather
      #df = df.sort_index(ascending=False)
      #display(df)


  return df




## get target size

In [None]:
import math
def get_target_size(des_recall, des_prob):
    if des_recall == 1.0:
      des_recall = 0.9999 #31-8-22 by Reem:update des_recall to avoide devision by zero
      #des_recall = 0.99 #1-9-22 by Reem:update des_recall to avoide devision by zero - solve target TREC

    num = -1*math.log(1 - des_prob)
    denom = 1 - des_recall
    t = math.ceil(num/denom)
    print('target size: ',t)
    return t


get_target_size(1.0,0.95)
get_target_size(0.999,0.95)
get_target_size(0.99,0.95)
get_target_size(0.95,0.95)
get_target_size(0.90,0.95)

In [None]:
get_target_size(1.0,0.95)
get_target_size(0.999,0.95)
get_target_size(0.99,0.95)
get_target_size(0.95,0.95)
get_target_size(0.90,0.95)

target size:  29958
target size:  2996
target size:  300
target size:  60
target size:  30


30

#Results

#OR results

In [None]:
df_all = pd.DataFrame() # all runs in one df

## CLEF2017

In [None]:

dataset_name = 'CLEF2017'


qrels = "data/qrels/CLEF2017_qrels.txt" # use the same qrel list as their rankings


qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

run = "data/rankings/clef2017_autotar_ranking.txt"

doc_rank_dic, rank_rel_dic = load_run_data(run)



# SET EXPERIMENTAL PARAMETERS 

des_recalls = [1.0, 0.9, 0.8] # desired recalls to experiment over

des_probs = [0.95] # desired confidences to experiment over


df = run_experiments()

df_all = df_all.append(df) # append all results togather

display(df_all)

df.to_latex()

df_all.to_csv(path+'results/df_all_OR.csv')
df_all.to_csv(path+'results/df_all_OR_CI_win_encoding.csv', encoding='Windows-1252') 


Number of topics: 42
Confidence level:  0.95
recall: 1.0
Running oracle method
recall: 0.9
Running oracle method
recall: 0.8
Running oracle method


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset_name,des_prob,des_recall
CLEF2017_0.95-1.0-OR,1.000±0.000,0.133±0.194,1.0,0.005±0.007,0.000±0.000,CLEF2017,0.95,1.0
CLEF2017_0.95-0.9-OR,0.923±0.036,0.057±0.061,1.0,0.009±0.006,0.026±0.039,CLEF2017,0.95,0.9
CLEF2017_0.95-0.8-OR,0.834±0.067,0.043±0.048,1.0,0.033±0.014,0.042±0.084,CLEF2017,0.95,0.8


##CLEF2018

In [None]:

dataset_name = 'CLEF2018'


qrels = "data/qrels/CLEF2018_qrels.txt" # use the same qrel list as their rankings


qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

run = "data/rankings/clef2018_autotar_ranking.txt"

doc_rank_dic, rank_rel_dic = load_run_data(run)



# SET EXPERIMENTAL PARAMETERS 

des_recalls = [ 0.8, 0.9, 1.0] # desired recalls to experiment over
des_probs = [0.95] # desired confidences to experiment over



df = run_experiments()

df_all = df_all.append(df) # append all results togather

display(df_all)

df.to_latex()

df_all.to_csv(path+'results/df_all_OR.csv')
df_all.to_csv(path+'results/df_all_OR_CI_win_encoding.csv', encoding='Windows-1252') 

Number of topics: 30
Confidence level:  0.95
recall: 0.8
Running oracle method
recall: 0.9
Running oracle method
recall: 1.0
Running oracle method


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset_name,des_prob,des_recall
CLEF2017_0.95-1.0-OR,1.000±0.000,0.133±0.194,1.0,0.005±0.007,0.000±0.000,CLEF2017,0.95,1.0
CLEF2017_0.95-0.9-OR,0.923±0.036,0.057±0.061,1.0,0.009±0.006,0.026±0.039,CLEF2017,0.95,0.9
CLEF2017_0.95-0.8-OR,0.834±0.067,0.043±0.048,1.0,0.033±0.014,0.042±0.084,CLEF2017,0.95,0.8
CLEF2018_0.95-1.0-OR,1.000±0.000,0.161±0.207,1.0,0.007±0.013,0.000±0.000,CLEF2018,0.95,1.0
CLEF2018_0.95-0.9-OR,0.912±0.019,0.067±0.064,1.0,0.010±0.004,0.013±0.021,CLEF2018,0.95,0.9
CLEF2018_0.95-0.8-OR,0.812±0.016,0.051±0.054,1.0,0.037±0.006,0.015±0.020,CLEF2018,0.95,0.8


##CLEF2019

In [None]:

dataset_name = 'CLEF2019'


qrels = "data/qrels/CLEF2019_qrels.txt" # use the same qrel list as their rankings


qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

run = "data/rankings/clef2019_autotar_ranking.txt"

doc_rank_dic, rank_rel_dic = load_run_data(run)


# SET EXPERIMENTAL PARAMETERS 

des_recalls = [ 0.8, 0.9, 1.0] # desired recalls to experiment over
des_probs = [0.95] # desired confidences to experiment over



df = run_experiments()

df_all = df_all.append(df) # append all results togather

display(df_all)

df.to_latex()

df_all.to_csv(path+'results/df_all_OR.csv')
df_all.to_csv(path+'results/df_all_OR_CI_win_encoding.csv', encoding='Windows-1252') 

Number of topics: 31
Confidence level:  0.95
recall: 0.8
Running oracle method
recall: 0.9
Running oracle method
recall: 1.0
Running oracle method


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset_name,des_prob,des_recall
CLEF2017_0.95-1.0-OR,1.000±0.000,0.133±0.194,1.0,0.005±0.007,0.000±0.000,CLEF2017,0.95,1.0
CLEF2017_0.95-0.9-OR,0.923±0.036,0.057±0.061,1.0,0.009±0.006,0.026±0.039,CLEF2017,0.95,0.9
CLEF2017_0.95-0.8-OR,0.834±0.067,0.043±0.048,1.0,0.033±0.014,0.042±0.084,CLEF2017,0.95,0.8
CLEF2018_0.95-1.0-OR,1.000±0.000,0.161±0.207,1.0,0.007±0.013,0.000±0.000,CLEF2018,0.95,1.0
CLEF2018_0.95-0.9-OR,0.912±0.019,0.067±0.064,1.0,0.010±0.004,0.013±0.021,CLEF2018,0.95,0.9
CLEF2018_0.95-0.8-OR,0.812±0.016,0.051±0.054,1.0,0.037±0.006,0.015±0.020,CLEF2018,0.95,0.8
CLEF2019_0.95-1.0-OR,1.000±0.000,0.116±0.123,1.0,0.009±0.017,0.000±0.000,CLEF2019,0.95,1.0
CLEF2019_0.95-0.9-OR,0.929±0.037,0.071±0.082,1.0,0.011±0.013,0.033±0.041,CLEF2019,0.95,0.9
CLEF2019_0.95-0.8-OR,0.830±0.050,0.057±0.069,1.0,0.035±0.015,0.037±0.062,CLEF2019,0.95,0.8


##TREC-TR

In [None]:

dataset_name = 'TR'


qrels = "data/qrels/TREC_TR_Test_qrels.txt" # use the same qrel list as their rankings


qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

run = "data/rankings/tr_test_autotar_ranking.txt"

doc_rank_dic, rank_rel_dic = load_run_data(run)



# SET EXPERIMENTAL PARAMETERS 

des_recalls = [ 0.8, 0.9, 1.0] # desired recalls to experiment over
des_probs = [0.95] # desired confidences to experiment over


df = run_experiments()

df_all = df_all.append(df) # append all results togather

display(df_all)

df.to_latex()

df_all.to_csv(path+'results/df_all_OR.csv')
df_all.to_csv(path+'results/df_all_OR_CI_win_encoding.csv', encoding='Windows-1252') 

Number of topics: 34
Confidence level:  0.95
recall: 0.8
Running oracle method
recall: 0.9
Running oracle method
recall: 1.0
Running oracle method


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset_name,des_prob,des_recall
CLEF2017_0.95-1.0-OR,1.000±0.000,0.133±0.194,1.0,0.005±0.007,0.000±0.000,CLEF2017,0.95,1.0
CLEF2017_0.95-0.9-OR,0.923±0.036,0.057±0.061,1.0,0.009±0.006,0.026±0.039,CLEF2017,0.95,0.9
CLEF2017_0.95-0.8-OR,0.834±0.067,0.043±0.048,1.0,0.033±0.014,0.042±0.084,CLEF2017,0.95,0.8
CLEF2018_0.95-1.0-OR,1.000±0.000,0.161±0.207,1.0,0.007±0.013,0.000±0.000,CLEF2018,0.95,1.0
CLEF2018_0.95-0.9-OR,0.912±0.019,0.067±0.064,1.0,0.010±0.004,0.013±0.021,CLEF2018,0.95,0.9
CLEF2018_0.95-0.8-OR,0.812±0.016,0.051±0.054,1.0,0.037±0.006,0.015±0.020,CLEF2018,0.95,0.8
CLEF2019_0.95-1.0-OR,1.000±0.000,0.116±0.123,1.0,0.009±0.017,0.000±0.000,CLEF2019,0.95,1.0
CLEF2019_0.95-0.9-OR,0.929±0.037,0.071±0.082,1.0,0.011±0.013,0.033±0.041,CLEF2019,0.95,0.9
CLEF2019_0.95-0.8-OR,0.830±0.050,0.057±0.069,1.0,0.035±0.015,0.037±0.062,CLEF2019,0.95,0.8
TR_0.95-1.0-OR,1.000±0.000,0.043±0.108,1.0,0.000±0.000,0.000±0.000,TR,0.95,1.0


##TREC-Legal

In [None]:

dataset_name = 'Legal'


qrels = "data/qrels/TREC_Legal_qrels.txt" # use the same qrel list as their rankings


qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

run = "data/rankings/legal_test_autotar_ranking.txt"

doc_rank_dic, rank_rel_dic = load_run_data(run)



# SET EXPERIMENTAL PARAMETERS 

des_recalls = [ 0.8, 0.9, 1.0] # desired recalls to experiment over
des_probs = [0.95] # desired confidences to experiment over


df = run_experiments()

df_all = df_all.append(df) # append all results togather

display(df_all)

df.to_latex()

df_all.to_csv(path+'results/df_all_OR.csv')
df_all.to_csv(path+'results/df_all_OR_CI_win_encoding.csv', encoding='Windows-1252') 

Number of topics: 4
Confidence level:  0.95
recall: 0.8
Running oracle method
recall: 0.9
Running oracle method
recall: 1.0
Running oracle method


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset_name,des_prob,des_recall
CLEF2017_0.95-1.0-OR,1.000±0.000,0.133±0.194,1.0,0.005±0.007,0.000±0.000,CLEF2017,0.95,1.0
CLEF2017_0.95-0.9-OR,0.923±0.036,0.057±0.061,1.0,0.009±0.006,0.026±0.039,CLEF2017,0.95,0.9
CLEF2017_0.95-0.8-OR,0.834±0.067,0.043±0.048,1.0,0.033±0.014,0.042±0.084,CLEF2017,0.95,0.8
CLEF2018_0.95-1.0-OR,1.000±0.000,0.161±0.207,1.0,0.007±0.013,0.000±0.000,CLEF2018,0.95,1.0
CLEF2018_0.95-0.9-OR,0.912±0.019,0.067±0.064,1.0,0.010±0.004,0.013±0.021,CLEF2018,0.95,0.9
CLEF2018_0.95-0.8-OR,0.812±0.016,0.051±0.054,1.0,0.037±0.006,0.015±0.020,CLEF2018,0.95,0.8
CLEF2019_0.95-1.0-OR,1.000±0.000,0.116±0.123,1.0,0.009±0.017,0.000±0.000,CLEF2019,0.95,1.0
CLEF2019_0.95-0.9-OR,0.929±0.037,0.071±0.082,1.0,0.011±0.013,0.033±0.041,CLEF2019,0.95,0.9
CLEF2019_0.95-0.8-OR,0.830±0.050,0.057±0.069,1.0,0.035±0.015,0.037±0.062,CLEF2019,0.95,0.8
TR_0.95-1.0-OR,1.000±0.000,0.043±0.108,1.0,0.000±0.000,0.000±0.000,TR,0.95,1.0


# IP-H with all CI (0.8 to 0.2)

In [None]:
from sklearn.model_selection import ParameterGrid

hyperparameters_space = {'point_process':['IP'], 
                        'model':['H'], 
                        'RMSE':[0.1],
                        'min_rel_in_sample':['dynamic20-sample'],
                        'alpha':[0.025],
                        'beta':[0.025]}

grid = ParameterGrid(hyperparameters_space)

In [None]:
df_all = pd.DataFrame() # all runs in one df


### CLEF2017

In [None]:


dataset_name = 'CLEF2017'


qrels = "data/qrels/CLEF2017_qrels.txt" # use the same qrel list as their rankings


qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

run = "data/rankings/clef2017_autotar_ranking.txt"

doc_rank_dic, rank_rel_dic = load_run_data(run)



# SET EXPERIMENTAL PARAMETERS 

des_recalls = [1.0, 0.9, 0.8] # desired recalls to experiment over

des_probs = [0.8, 0.6, 0.4, 0.2] # desired confidences to experiment over




df = run_experiments_GridSearch()

df = df.sort_index(ascending=False)
df = df.sort_values(by=['des_recall'], ascending=False)
display(df)

df_all = df_all.append(df) # append all results togather

df.to_latex()

df_all.to_csv(path+'results/df_all_CI.csv')
df_all.to_csv(path+'results/df_all_CI_win_encoding.csv', encoding='Windows-1252') 


Number of topics: 42
CLEF2017
Confidence level:  0.8
recall: 1.0
Running- CLEF2017_0.8-1.0-IP-H
recall: 0.9
Running- CLEF2017_0.8-0.9-IP-H
recall: 0.8
Running- CLEF2017_0.8-0.8-IP-H
Confidence level:  0.6
recall: 1.0
Running- CLEF2017_0.6-1.0-IP-H
recall: 0.9
Running- CLEF2017_0.6-0.9-IP-H
recall: 0.8
Running- CLEF2017_0.6-0.8-IP-H
Confidence level:  0.4
recall: 1.0
Running- CLEF2017_0.4-1.0-IP-H
recall: 0.9
Running- CLEF2017_0.4-0.9-IP-H
recall: 0.8
Running- CLEF2017_0.4-0.8-IP-H
Confidence level:  0.2
recall: 1.0
Running- CLEF2017_0.2-1.0-IP-H
recall: 0.9
Running- CLEF2017_0.2-0.9-IP-H
recall: 0.8
Running- CLEF2017_0.2-0.8-IP-H


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset_name,des_prob,des_recall
CLEF2017_0.8-1.0-IP-H,0.970±0.095,0.202±0.169,0.867,0.035±0.042,0.030±0.095,CLEF2017,0.8,1.0
CLEF2017_0.6-1.0-IP-H,0.969±0.095,0.185±0.143,0.8,0.033±0.043,0.031±0.095,CLEF2017,0.6,1.0
CLEF2017_0.4-1.0-IP-H,0.969±0.095,0.180±0.140,0.767,0.033±0.043,0.031±0.095,CLEF2017,0.4,1.0
CLEF2017_0.2-1.0-IP-H,0.968±0.095,0.171±0.135,0.767,0.032±0.043,0.032±0.095,CLEF2017,0.2,1.0
CLEF2017_0.8-0.9-IP-H,0.955±0.119,0.144±0.114,0.9,0.036±0.056,0.123±0.076,CLEF2017,0.8,0.9
CLEF2017_0.6-0.9-IP-H,0.954±0.119,0.143±0.116,0.9,0.036±0.056,0.122±0.076,CLEF2017,0.6,0.9
CLEF2017_0.4-0.9-IP-H,0.953±0.119,0.140±0.114,0.867,0.036±0.056,0.122±0.076,CLEF2017,0.4,0.9
CLEF2017_0.2-0.9-IP-H,0.950±0.121,0.138±0.114,0.867,0.037±0.055,0.123±0.074,CLEF2017,0.2,0.9
CLEF2017_0.8-0.8-IP-H,0.949±0.120,0.137±0.113,0.9,0.037±0.055,0.231±0.056,CLEF2017,0.8,0.8
CLEF2017_0.6-0.8-IP-H,0.947±0.123,0.135±0.112,0.867,0.037±0.056,0.231±0.057,CLEF2017,0.6,0.8


### CLEF2018

In [None]:

dataset_name = 'CLEF2018'


qrels = "data/qrels/CLEF2018_qrels.txt" # use the same qrel list as their rankings


qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

run = "data/rankings/clef2018_autotar_ranking.txt"

doc_rank_dic, rank_rel_dic = load_run_data(run)



# SET EXPERIMENTAL PARAMETERS 

des_recalls = [1.0, 0.9, 0.8] # desired recalls to experiment over

des_probs = [0.8, 0.6, 0.4, 0.2] # desired confidences to experiment over



df = run_experiments_GridSearch()

df = df.sort_index(ascending=False)
df = df.sort_values(by=['des_recall'], ascending=False)
display(df)

df_all = df_all.append(df) # append all results togather

df.to_latex()

df_all.to_csv(path+'results/df_all_CI.csv')
df_all.to_csv(path+'results/df_all_CI_win_encoding.csv', encoding='Windows-1252') 
 

###CLEF2019

In [None]:

dataset_name = 'CLEF2019'


qrels = "data/qrels/CLEF2019_qrels.txt" # use the same qrel list as their rankings


qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

run = "data/rankings/clef2019_autotar_ranking.txt"

doc_rank_dic, rank_rel_dic = load_run_data(run)



# SET EXPERIMENTAL PARAMETERS 

des_recalls = [1.0, 0.9, 0.8] # desired recalls to experiment over

des_probs = [0.8, 0.6, 0.4, 0.2] # desired confidences to experiment over



df = run_experiments_GridSearch()

df = df.sort_index(ascending=False)
df = df.sort_values(by=['des_recall'], ascending=False)
display(df)

df_all = df_all.append(df) # append all results togather

df.to_latex()

df_all.to_csv(path+'results/df_all_CI.csv')
df_all.to_csv(path+'results/df_all_CI_win_encoding.csv', encoding='Windows-1252') 


### TREC-TR

In [None]:

dataset_name = 'TR'


qrels = "data/qrels/TREC_TR_Test_qrels.txt" # use the same qrel list as their rankings


qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

run = "data/rankings/tr_test_autotar_ranking.txt"

doc_rank_dic, rank_rel_dic = load_run_data(run)



# SET EXPERIMENTAL PARAMETERS 

des_recalls = [1.0, 0.9, 0.8] # desired recalls to experiment over

des_probs = [0.8, 0.6, 0.4, 0.2] # desired confidences to experiment over



df = run_experiments_GridSearch()

df = df.sort_index(ascending=False)
df = df.sort_values(by=['des_recall'], ascending=False)
display(df)

df_all = df_all.append(df) # append all results togather

df.to_latex()

df_all.to_csv(path+'results/df_all_CI.csv')
df_all.to_csv(path+'results/df_all_CI_win_encoding.csv', encoding='Windows-1252') 


### TREC-Legal

In [None]:

dataset_name = 'Legal'


qrels = "data/qrels/TREC_Legal_qrels.txt" # use the same qrel list as their rankings


qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

run = "data/rankings/legal_test_autotar_ranking.txt"

doc_rank_dic, rank_rel_dic = load_run_data(run)



# SET EXPERIMENTAL PARAMETERS 

des_recalls = [1.0, 0.9, 0.8] # desired recalls to experiment over

des_probs = [0.8, 0.6, 0.4, 0.2] # desired confidences to experiment over


df = run_experiments_GridSearch()

df = df.sort_index(ascending=False)
df = df.sort_values(by=['des_recall'], ascending=False)
display(df)

df_all = df_all.append(df) # append all results togather

df.to_latex()

df_all.to_csv(path+'results/df_all_CI.csv')
df_all.to_csv(path+'results/df_all_CI_win_encoding.csv', encoding='Windows-1252') 


# IP/CX all rates results (CI 0.95- Target 0.9)

In [None]:
# set model parameters space
from sklearn.model_selection import ParameterGrid

hyperparameters_space = {'point_process':['IP', 'CX'], 
                        'model':['P', 'H', 'E', 'A'], 
                        'RMSE':[0.1],
                        'min_rel_in_sample':['dynamic20-sample'],
                        'alpha':[0.025],
                        'beta':[0.025]}

grid = ParameterGrid(hyperparameters_space)

In [None]:
df_all = pd.DataFrame() # all runs in one df


### CLEF2017

In [None]:

dataset_name = 'CLEF2017'


qrels = "data/qrels/CLEF2017_qrels.txt" # use the same qrel list as their rankings


qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

run = "data/rankings/clef2017_autotar_ranking.txt"

doc_rank_dic, rank_rel_dic = load_run_data(run)



# SET EXPERIMENTAL PARAMETERS 
des_recalls = [0.9] # desired recalls to experiment over

des_probs = [0.95] # desired confidences to experiment over



df = run_experiments_GridSearch()

df_all = df_all.append(df) # append all results togather
df_all = df_all.sort_values(by=['dataset_name'], ascending=True)
display(df_all)

# save all
df_all.to_csv(path+'results/df_all_rates.csv')
df_all.to_csv(path+'results/df_all_rates_win_encoding.csv', encoding='Windows-1252') 


Number of topics: 42
CLEF2017
Confidence level:  0.95
recall: 0.9
Running- CLEF2017_0.95-0.9-IP-P
Running- CLEF2017_0.95-0.9-CX-P
Running- CLEF2017_0.95-0.9-IP-H
Running- CLEF2017_0.95-0.9-CX-H
Running- CLEF2017_0.95-0.9-IP-E
Running- CLEF2017_0.95-0.9-CX-E
Running- CLEF2017_0.95-0.9-IP-A
Running- CLEF2017_0.95-0.9-CX-A


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset_name,des_prob,des_recall
CLEF2017_0.95-0.9-IP-P,1.000±0.000,0.281±0.226,1.0,0.030±0.031,0.111±0.000,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-CX-P,1.000±0.000,0.278±0.220,1.0,0.030±0.031,0.111±0.000,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-IP-H,0.955±0.119,0.147±0.114,0.9,0.036±0.055,0.123±0.076,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-CX-H,0.951±0.137,0.172±0.140,0.9,0.044±0.071,0.130±0.096,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-IP-E,0.984±0.031,0.154±0.113,0.967,0.022±0.032,0.094±0.034,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-CX-E,0.984±0.031,0.153±0.112,0.967,0.022±0.032,0.094±0.034,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-IP-A,0.994±0.017,0.205±0.153,1.0,0.025±0.032,0.105±0.019,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-CX-A,0.994±0.017,0.205±0.153,1.0,0.025±0.032,0.105±0.019,CLEF2017,0.95,0.9


### CLEF2018

In [None]:

dataset_name = 'CLEF2018'


qrels = "data/qrels/CLEF2018_qrels.txt" # use the same qrel list as their rankings


qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

run = "data/rankings/clef2018_autotar_ranking.txt"

doc_rank_dic, rank_rel_dic = load_run_data(run)


# SET EXPERIMENTAL PARAMETERS 
des_recalls = [0.9] # desired recalls to experiment over

des_probs = [0.95] # desired confidences to experiment over




df = run_experiments_GridSearch()

df_all = df_all.append(df) # append all results togather
df_all = df_all.sort_values(by=['dataset_name'], ascending=True)
display(df_all)

# save all
df_all.to_csv(path+'results/df_all_rates.csv')
df_all.to_csv(path+'results/df_all_rates_win_encoding.csv', encoding='Windows-1252') 


Number of topics: 30
CLEF2018
Confidence level:  0.95
recall: 0.9
Running- CLEF2018_0.95-0.9-IP-P
Running- CLEF2018_0.95-0.9-CX-P
Running- CLEF2018_0.95-0.9-IP-H
Running- CLEF2018_0.95-0.9-CX-H
Running- CLEF2018_0.95-0.9-IP-E
Running- CLEF2018_0.95-0.9-CX-E
Running- CLEF2018_0.95-0.9-IP-A
Running- CLEF2018_0.95-0.9-CX-A


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset_name,des_prob,des_recall
CLEF2017_0.95-0.9-IP-P,1.000±0.000,0.281±0.226,1.0,0.030±0.031,0.111±0.000,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-CX-P,1.000±0.000,0.278±0.220,1.0,0.030±0.031,0.111±0.000,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-IP-H,0.955±0.119,0.147±0.114,0.9,0.036±0.055,0.123±0.076,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-CX-H,0.951±0.137,0.172±0.140,0.9,0.044±0.071,0.130±0.096,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-IP-E,0.984±0.031,0.154±0.113,0.967,0.022±0.032,0.094±0.034,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-CX-E,0.984±0.031,0.153±0.112,0.967,0.022±0.032,0.094±0.034,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-IP-A,0.994±0.017,0.205±0.153,1.0,0.025±0.032,0.105±0.019,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-CX-A,0.994±0.017,0.205±0.153,1.0,0.025±0.032,0.105±0.019,CLEF2017,0.95,0.9
CLEF2018_0.95-0.9-IP-P,1.000±0.001,0.293±0.213,1.0,0.024±0.026,0.111±0.001,CLEF2018,0.95,0.9
CLEF2018_0.95-0.9-CX-P,1.000±0.001,0.292±0.211,1.0,0.024±0.026,0.111±0.001,CLEF2018,0.95,0.9


###CLEF2019

In [None]:

dataset_name = 'CLEF2019'


qrels = "data/qrels/CLEF2019_qrels.txt" # use the same qrel list as their rankings


qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

run = "data/rankings/clef2019_autotar_ranking.txt"

doc_rank_dic, rank_rel_dic = load_run_data(run)


# SET EXPERIMENTAL PARAMETERS 
des_recalls = [0.9] # desired recalls to experiment over

des_probs = [0.95] # desired confidences to experiment over




df = run_experiments_GridSearch()

df_all = df_all.append(df) # append all results togather
df_all = df_all.sort_values(by=['dataset_name'], ascending=True)
display(df_all)

# save all
df_all.to_csv(path+'results/df_all_rates.csv')
df_all.to_csv(path+'results/df_all_rates_win_encoding.csv', encoding='Windows-1252') 

Number of topics: 31
CLEF2019
Confidence level:  0.95
recall: 0.9
Running- CLEF2019_0.95-0.9-IP-P
Running- CLEF2019_0.95-0.9-CX-P
Running- CLEF2019_0.95-0.9-IP-H
Running- CLEF2019_0.95-0.9-CX-H
Running- CLEF2019_0.95-0.9-IP-E
Running- CLEF2019_0.95-0.9-CX-E
Running- CLEF2019_0.95-0.9-IP-A
Running- CLEF2019_0.95-0.9-CX-A


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset_name,des_prob,des_recall
CLEF2017_0.95-0.9-IP-P,1.000±0.000,0.281±0.226,1.0,0.030±0.031,0.111±0.000,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-CX-P,1.000±0.000,0.278±0.220,1.0,0.030±0.031,0.111±0.000,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-IP-H,0.955±0.119,0.147±0.114,0.9,0.036±0.055,0.123±0.076,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-CX-H,0.951±0.137,0.172±0.140,0.9,0.044±0.071,0.130±0.096,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-IP-E,0.984±0.031,0.154±0.113,0.967,0.022±0.032,0.094±0.034,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-CX-E,0.984±0.031,0.153±0.112,0.967,0.022±0.032,0.094±0.034,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-IP-A,0.994±0.017,0.205±0.153,1.0,0.025±0.032,0.105±0.019,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-CX-A,0.994±0.017,0.205±0.153,1.0,0.025±0.032,0.105±0.019,CLEF2017,0.95,0.9
CLEF2018_0.95-0.9-CX-A,0.988±0.029,0.221±0.189,0.967,0.018±0.021,0.099±0.028,CLEF2018,0.95,0.9
CLEF2018_0.95-0.9-IP-A,0.988±0.029,0.221±0.189,0.967,0.018±0.021,0.099±0.028,CLEF2018,0.95,0.9


### TREC-TR

In [None]:

dataset_name = 'TR'


qrels = "data/qrels/TREC_TR_Test_qrels.txt" # use the same qrel list as their rankings


qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

run = "data/rankings/tr_test_autotar_ranking.txt"

doc_rank_dic, rank_rel_dic = load_run_data(run)


# SET EXPERIMENTAL PARAMETERS 
des_recalls = [0.9] # desired recalls to experiment over

des_probs = [0.95] # desired confidences to experiment over




df = run_experiments_GridSearch()

df_all = df_all.append(df) # append all results togather
df_all = df_all.sort_values(by=['dataset_name'], ascending=True)
display(df_all)

# save all
df_all.to_csv(path+'results/df_all_rates.csv')
df_all.to_csv(path+'results/df_all_rates_win_encoding.csv', encoding='Windows-1252') 

Number of topics: 34
TR
Confidence level:  0.95
recall: 0.9
Running- TR_0.95-0.9-IP-P
Running- TR_0.95-0.9-CX-P
Running- TR_0.95-0.9-IP-H
Running- TR_0.95-0.9-CX-H
Running- TR_0.95-0.9-IP-E
Running- TR_0.95-0.9-CX-E
Running- TR_0.95-0.9-IP-A
Running- TR_0.95-0.9-CX-A


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset_name,des_prob,des_recall
CLEF2017_0.95-0.9-IP-P,1.000±0.000,0.281±0.226,1.0,0.030±0.031,0.111±0.000,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-CX-P,1.000±0.000,0.278±0.220,1.0,0.030±0.031,0.111±0.000,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-IP-H,0.955±0.119,0.147±0.114,0.9,0.036±0.055,0.123±0.076,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-CX-H,0.951±0.137,0.172±0.140,0.9,0.044±0.071,0.130±0.096,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-IP-E,0.984±0.031,0.154±0.113,0.967,0.022±0.032,0.094±0.034,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-CX-E,0.984±0.031,0.153±0.112,0.967,0.022±0.032,0.094±0.034,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-IP-A,0.994±0.017,0.205±0.153,1.0,0.025±0.032,0.105±0.019,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-CX-A,0.994±0.017,0.205±0.153,1.0,0.025±0.032,0.105±0.019,CLEF2017,0.95,0.9
CLEF2018_0.95-0.9-CX-P,1.000±0.001,0.292±0.211,1.0,0.024±0.026,0.111±0.001,CLEF2018,0.95,0.9
CLEF2018_0.95-0.9-IP-H,0.941±0.125,0.141±0.145,0.833,0.030±0.063,0.115±0.088,CLEF2018,0.95,0.9


### TREC-Legal

In [None]:

dataset_name = 'Legal'


qrels = "data/qrels/TREC_Legal_qrels.txt" # use the same qrel list as their rankings


qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

run = "data/rankings/legal_test_autotar_ranking.txt"

doc_rank_dic, rank_rel_dic = load_run_data(run)


# SET EXPERIMENTAL PARAMETERS 
des_recalls = [0.9] # desired recalls to experiment over

des_probs = [0.95] # desired confidences to experiment over



df = run_experiments_GridSearch()

df_all = df_all.append(df) # append all results togather
df_all = df_all.sort_values(by=['dataset_name'], ascending=True)
display(df_all)

# save all
df_all.to_csv(path+'results/df_all_rates.csv')
df_all.to_csv(path+'results/df_all_rates_win_encoding.csv', encoding='Windows-1252') 

Number of topics: 4
Legal
Confidence level:  0.95
recall: 0.9
Running- Legal_0.95-0.9-IP-P
Running- Legal_0.95-0.9-CX-P
Running- Legal_0.95-0.9-IP-H
Running- Legal_0.95-0.9-CX-H
Running- Legal_0.95-0.9-IP-E
Running- Legal_0.95-0.9-CX-E
Running- Legal_0.95-0.9-IP-A
Running- Legal_0.95-0.9-CX-A


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset_name,des_prob,des_recall
CLEF2017_0.95-0.9-IP-P,1.000±0.000,0.281±0.226,1.0,0.030±0.031,0.111±0.000,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-CX-P,1.000±0.000,0.278±0.220,1.0,0.030±0.031,0.111±0.000,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-IP-H,0.955±0.119,0.147±0.114,0.9,0.036±0.055,0.123±0.076,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-CX-H,0.951±0.137,0.172±0.140,0.9,0.044±0.071,0.130±0.096,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-IP-E,0.984±0.031,0.154±0.113,0.967,0.022±0.032,0.094±0.034,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-CX-E,0.984±0.031,0.153±0.112,0.967,0.022±0.032,0.094±0.034,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-IP-A,0.994±0.017,0.205±0.153,1.0,0.025±0.032,0.105±0.019,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-CX-A,0.994±0.017,0.205±0.153,1.0,0.025±0.032,0.105±0.019,CLEF2017,0.95,0.9
CLEF2018_0.95-0.9-IP-E,0.978±0.034,0.169±0.175,0.933,0.015±0.020,0.091±0.027,CLEF2018,0.95,0.9
CLEF2018_0.95-0.9-CX-A,0.988±0.029,0.221±0.189,0.967,0.018±0.021,0.099±0.028,CLEF2018,0.95,0.9


In [None]:
df_all.to_latex()

'\\begin{tabular}{lllllllrr}\n\\toprule\n{} &       recall &         cost & reliability &      loss\\_er &      rel\\_err & dataset\\_name &  des\\_prob &  des\\_recall \\\\\n\\midrule\nCLEF2017\\_0.95-0.9-IP-P &  1.000±0.000 &  0.281±0.226 &       1.000 &  0.030±0.031 &  0.111±0.000 &     CLEF2017 &      0.95 &         0.9 \\\\\nCLEF2017\\_0.95-0.9-CX-P &  1.000±0.000 &  0.278±0.220 &       1.000 &  0.030±0.031 &  0.111±0.000 &     CLEF2017 &      0.95 &         0.9 \\\\\nCLEF2017\\_0.95-0.9-IP-H &  0.955±0.119 &  0.147±0.114 &       0.900 &  0.036±0.055 &  0.123±0.076 &     CLEF2017 &      0.95 &         0.9 \\\\\nCLEF2017\\_0.95-0.9-CX-H &  0.951±0.137 &  0.172±0.140 &       0.900 &  0.044±0.071 &  0.130±0.096 &     CLEF2017 &      0.95 &         0.9 \\\\\nCLEF2017\\_0.95-0.9-IP-E &  0.984±0.031 &  0.154±0.113 &       0.967 &  0.022±0.032 &  0.094±0.034 &     CLEF2017 &      0.95 &         0.9 \\\\\nCLEF2017\\_0.95-0.9-CX-E &  0.984±0.031 &  0.153±0.112 &       0.967 &  0.022±0.032 

In [None]:
df_all

Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset_name,des_prob,des_recall
CLEF2017_0.95-0.9-IP-P,1.000±0.000,0.281±0.226,1.0,0.030±0.031,0.111±0.000,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-CX-P,1.000±0.000,0.278±0.220,1.0,0.030±0.031,0.111±0.000,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-IP-H,0.955±0.119,0.147±0.114,0.9,0.036±0.055,0.123±0.076,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-CX-H,0.951±0.137,0.172±0.140,0.9,0.044±0.071,0.130±0.096,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-IP-E,0.984±0.031,0.154±0.113,0.967,0.022±0.032,0.094±0.034,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-CX-E,0.984±0.031,0.153±0.112,0.967,0.022±0.032,0.094±0.034,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-IP-A,0.994±0.017,0.205±0.153,1.0,0.025±0.032,0.105±0.019,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-CX-A,0.994±0.017,0.205±0.153,1.0,0.025±0.032,0.105±0.019,CLEF2017,0.95,0.9
CLEF2018_0.95-0.9-IP-E,0.978±0.034,0.169±0.175,0.933,0.015±0.020,0.091±0.027,CLEF2018,0.95,0.9
CLEF2018_0.95-0.9-CX-A,0.988±0.029,0.221±0.189,0.967,0.018±0.021,0.099±0.028,CLEF2018,0.95,0.9


In [None]:
df_all.sort_index(ascending=True)

Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset_name,des_prob,des_recall
CLEF2017_0.95-0.9-CX-A,0.994±0.017,0.205±0.153,1.0,0.025±0.032,0.105±0.019,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-CX-E,0.984±0.031,0.153±0.112,0.967,0.022±0.032,0.094±0.034,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-CX-H,0.951±0.137,0.172±0.140,0.9,0.044±0.071,0.130±0.096,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-CX-P,1.000±0.000,0.278±0.220,1.0,0.030±0.031,0.111±0.000,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-IP-A,0.994±0.017,0.205±0.153,1.0,0.025±0.032,0.105±0.019,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-IP-E,0.984±0.031,0.154±0.113,0.967,0.022±0.032,0.094±0.034,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-IP-H,0.955±0.119,0.147±0.114,0.9,0.036±0.055,0.123±0.076,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-IP-P,1.000±0.000,0.281±0.226,1.0,0.030±0.031,0.111±0.000,CLEF2017,0.95,0.9
CLEF2018_0.95-0.9-CX-A,0.988±0.029,0.221±0.189,0.967,0.018±0.021,0.099±0.028,CLEF2018,0.95,0.9
CLEF2018_0.95-0.9-CX-E,0.977±0.034,0.167±0.176,0.933,0.015±0.020,0.089±0.028,CLEF2018,0.95,0.9


In [None]:
# print sorted

#get all files
all_rates_csv = glob.glob(path + 'results/0.95-0.9'+ "*Summary_mean_std_macro.csv")

#read into df
df_list = (pd.read_csv(file, index_col=0) for file in all_rates_csv)

#concat all dfs
df_all_sorted   = pd.concat(df_list)

df_all_sorted


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset_name,des_prob,des_recall
CLEF2017_0.95-0.9-IP-P,1.000±0.000,0.281±0.226,1.0,0.030±0.031,0.111±0.000,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-CX-P,1.000±0.000,0.278±0.220,1.0,0.030±0.031,0.111±0.000,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-IP-H,0.955±0.119,0.147±0.114,0.9,0.036±0.055,0.123±0.076,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-CX-H,0.951±0.137,0.172±0.140,0.9,0.044±0.071,0.130±0.096,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-IP-E,0.984±0.031,0.154±0.113,0.967,0.022±0.032,0.094±0.034,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-CX-E,0.984±0.031,0.153±0.112,0.967,0.022±0.032,0.094±0.034,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-IP-A,0.994±0.017,0.205±0.153,1.0,0.025±0.032,0.105±0.019,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-CX-A,0.994±0.017,0.205±0.153,1.0,0.025±0.032,0.105±0.019,CLEF2017,0.95,0.9
CLEF2018_0.95-0.9-IP-P,1.000±0.001,0.293±0.213,1.0,0.024±0.026,0.111±0.001,CLEF2018,0.95,0.9
CLEF2018_0.95-0.9-CX-P,1.000±0.001,0.292±0.211,1.0,0.024±0.026,0.111±0.001,CLEF2018,0.95,0.9


In [None]:
df_all_sorted.to_latex()

'\\begin{tabular}{lllrlllrr}\n\\toprule\n{} &       recall &         cost &  reliability &      loss\\_er &      rel\\_err & dataset\\_name &  des\\_prob &  des\\_recall \\\\\n\\midrule\nCLEF2017\\_0.95-0.9-IP-P &  1.000±0.000 &  0.281±0.226 &        1.000 &  0.030±0.031 &  0.111±0.000 &     CLEF2017 &      0.95 &         0.9 \\\\\nCLEF2017\\_0.95-0.9-CX-P &  1.000±0.000 &  0.278±0.220 &        1.000 &  0.030±0.031 &  0.111±0.000 &     CLEF2017 &      0.95 &         0.9 \\\\\nCLEF2017\\_0.95-0.9-IP-H &  0.955±0.119 &  0.147±0.114 &        0.900 &  0.036±0.055 &  0.123±0.076 &     CLEF2017 &      0.95 &         0.9 \\\\\nCLEF2017\\_0.95-0.9-CX-H &  0.951±0.137 &  0.172±0.140 &        0.900 &  0.044±0.071 &  0.130±0.096 &     CLEF2017 &      0.95 &         0.9 \\\\\nCLEF2017\\_0.95-0.9-IP-E &  0.984±0.031 &  0.154±0.113 &        0.967 &  0.022±0.032 &  0.094±0.034 &     CLEF2017 &      0.95 &         0.9 \\\\\nCLEF2017\\_0.95-0.9-CX-E &  0.984±0.031 &  0.153±0.112 &        0.967 &  0.022

In [None]:
df_all_sorted = df_all_sorted[['dataset_name','recall', 'cost', 'reliability', 'loss_er','rel_err']]
df_all_sorted

Unnamed: 0,dataset_name,recall,cost,reliability,loss_er,rel_err
CLEF2017_0.95-0.9-IP-P,CLEF2017,1.000±0.000,0.281±0.226,1.0,0.030±0.031,0.111±0.000
CLEF2017_0.95-0.9-CX-P,CLEF2017,1.000±0.000,0.278±0.220,1.0,0.030±0.031,0.111±0.000
CLEF2017_0.95-0.9-IP-H,CLEF2017,0.955±0.119,0.147±0.114,0.9,0.036±0.055,0.123±0.076
CLEF2017_0.95-0.9-CX-H,CLEF2017,0.951±0.137,0.172±0.140,0.9,0.044±0.071,0.130±0.096
CLEF2017_0.95-0.9-IP-E,CLEF2017,0.984±0.031,0.154±0.113,0.967,0.022±0.032,0.094±0.034
CLEF2017_0.95-0.9-CX-E,CLEF2017,0.984±0.031,0.153±0.112,0.967,0.022±0.032,0.094±0.034
CLEF2017_0.95-0.9-IP-A,CLEF2017,0.994±0.017,0.205±0.153,1.0,0.025±0.032,0.105±0.019
CLEF2017_0.95-0.9-CX-A,CLEF2017,0.994±0.017,0.205±0.153,1.0,0.025±0.032,0.105±0.019
CLEF2018_0.95-0.9-IP-P,CLEF2018,1.000±0.001,0.293±0.213,1.0,0.024±0.026,0.111±0.001
CLEF2018_0.95-0.9-CX-P,CLEF2018,1.000±0.001,0.292±0.211,1.0,0.024±0.026,0.111±0.001


In [None]:
df_all_sorted.to_latex()

'\\begin{tabular}{llllrll}\n\\toprule\n{} & dataset\\_name &       recall &         cost &  reliability &      loss\\_er &      rel\\_err \\\\\n\\midrule\nCLEF2017\\_0.95-0.9-IP-P &     CLEF2017 &  1.000±0.000 &  0.281±0.226 &        1.000 &  0.030±0.031 &  0.111±0.000 \\\\\nCLEF2017\\_0.95-0.9-CX-P &     CLEF2017 &  1.000±0.000 &  0.278±0.220 &        1.000 &  0.030±0.031 &  0.111±0.000 \\\\\nCLEF2017\\_0.95-0.9-IP-H &     CLEF2017 &  0.955±0.119 &  0.147±0.114 &        0.900 &  0.036±0.055 &  0.123±0.076 \\\\\nCLEF2017\\_0.95-0.9-CX-H &     CLEF2017 &  0.951±0.137 &  0.172±0.140 &        0.900 &  0.044±0.071 &  0.130±0.096 \\\\\nCLEF2017\\_0.95-0.9-IP-E &     CLEF2017 &  0.984±0.031 &  0.154±0.113 &        0.967 &  0.022±0.032 &  0.094±0.034 \\\\\nCLEF2017\\_0.95-0.9-CX-E &     CLEF2017 &  0.984±0.031 &  0.153±0.112 &        0.967 &  0.022±0.032 &  0.094±0.034 \\\\\nCLEF2017\\_0.95-0.9-IP-A &     CLEF2017 &  0.994±0.017 &  0.205±0.153 &        1.000 &  0.025±0.032 &  0.105±0.019 \\\\

# IP-H results (CI 0.95-Target 1.0, 0.9, 0.8)

In [None]:
# set model parameters space
from sklearn.model_selection import ParameterGrid

hyperparameters_space = {'point_process':['IP'], 
                        'model':['H'], 
                        'RMSE':[0.1],
                        'min_rel_in_sample':['dynamic20-sample'],
                        'alpha':[0.025],
                        'beta':[0.025]}

grid = ParameterGrid(hyperparameters_space)

In [None]:
df_all = pd.DataFrame() # all runs in one df


### CLEF2017

In [None]:

dataset_name = 'CLEF2017'


qrels = "data/qrels/CLEF2017_qrels.txt" # use the same qrel list as their rankings


qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

run = "data/rankings/clef2017_autotar_ranking.txt"

doc_rank_dic, rank_rel_dic = load_run_data(run)



# SET EXPERIMENTAL PARAMETERS 
des_recalls = [1.0, 0.9, 0.8] # desired recalls to experiment over

des_probs = [0.95] # desired confidences to experiment over



df = run_experiments_GridSearch()

df_all = df_all.append(df) # append all results togather
df_all = df_all.sort_values(by=['dataset_name'], ascending=True)
display(df_all)

# save all
df_all.to_csv(path+'results/df_all_rates.csv')
df_all.to_csv(path+'results/df_all_rates_win_encoding.csv', encoding='Windows-1252') 


Number of topics: 42
CLEF2017
Confidence level:  0.95
recall: 1.0
Running- CLEF2017_0.95-1.0-IP-H
recall: 0.9
Running- CLEF2017_0.95-0.9-IP-H
recall: 0.8
Running- CLEF2017_0.95-0.8-IP-H


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset_name,des_prob,des_recall
CLEF2017_0.95-1.0-IP-H,0.970±0.095,0.213±0.187,0.9,0.037±0.043,0.030±0.095,CLEF2017,0.95,1.0
CLEF2017_0.95-0.9-IP-H,0.955±0.119,0.147±0.114,0.9,0.036±0.055,0.123±0.076,CLEF2017,0.95,0.9
CLEF2017_0.95-0.8-IP-H,0.949±0.120,0.137±0.113,0.9,0.037±0.055,0.231±0.056,CLEF2017,0.95,0.8


### CLEF2018

In [None]:

dataset_name = 'CLEF2018'


qrels = "data/qrels/CLEF2018_qrels.txt" # use the same qrel list as their rankings


qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

run = "data/rankings/clef2018_autotar_ranking.txt"

doc_rank_dic, rank_rel_dic = load_run_data(run)


# SET EXPERIMENTAL PARAMETERS 
des_recalls = [1.0, 0.9, 0.8] # desired recalls to experiment over

des_probs = [0.95] # desired confidences to experiment over



df = run_experiments_GridSearch()

df_all = df_all.append(df) # append all results togather
df_all = df_all.sort_values(by=['dataset_name'], ascending=True)
display(df_all)

# save all
df_all.to_csv(path+'results/df_all_rates.csv')
df_all.to_csv(path+'results/df_all_rates_win_encoding.csv', encoding='Windows-1252') 


Number of topics: 30
CLEF2018
Confidence level:  0.95
recall: 1.0
Running- CLEF2018_0.95-1.0-IP-H
recall: 0.9
Running- CLEF2018_0.95-0.9-IP-H
recall: 0.8
Running- CLEF2018_0.95-0.8-IP-H


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset_name,des_prob,des_recall
CLEF2017_0.95-1.0-IP-H,0.970±0.095,0.213±0.187,0.9,0.037±0.043,0.030±0.095,CLEF2017,0.95,1.0
CLEF2017_0.95-0.9-IP-H,0.955±0.119,0.147±0.114,0.9,0.036±0.055,0.123±0.076,CLEF2017,0.95,0.9
CLEF2017_0.95-0.8-IP-H,0.949±0.120,0.137±0.113,0.9,0.037±0.055,0.231±0.056,CLEF2017,0.95,0.8
CLEF2018_0.95-1.0-IP-H,0.956±0.127,0.212±0.168,0.8,0.035±0.062,0.044±0.127,CLEF2018,0.95,1.0
CLEF2018_0.95-0.9-IP-H,0.941±0.125,0.141±0.145,0.833,0.030±0.063,0.115±0.088,CLEF2018,0.95,0.9
CLEF2018_0.95-0.8-IP-H,0.936±0.124,0.134±0.146,0.933,0.031±0.063,0.215±0.079,CLEF2018,0.95,0.8


###CLEF2019

In [None]:

dataset_name = 'CLEF2019'


qrels = "data/qrels/CLEF2019_qrels.txt" # use the same qrel list as their rankings


qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

run = "data/rankings/clef2019_autotar_ranking.txt"

doc_rank_dic, rank_rel_dic = load_run_data(run)


# SET EXPERIMENTAL PARAMETERS 
des_recalls = [1.0, 0.9, 0.8] # desired recalls to experiment over

des_probs = [0.95] # desired confidences to experiment over




df = run_experiments_GridSearch()

df_all = df_all.append(df) # append all results togather
df_all = df_all.sort_values(by=['dataset_name'], ascending=True)
display(df_all)

# save all
df_all.to_csv(path+'results/df_all_rates.csv')
df_all.to_csv(path+'results/df_all_rates_win_encoding.csv', encoding='Windows-1252') 

Number of topics: 31
CLEF2019
Confidence level:  0.95
recall: 1.0
Running- CLEF2019_0.95-1.0-IP-H
recall: 0.9
Running- CLEF2019_0.95-0.9-IP-H
recall: 0.8
Running- CLEF2019_0.95-0.8-IP-H


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset_name,des_prob,des_recall
CLEF2017_0.95-1.0-IP-H,0.970±0.095,0.213±0.187,0.9,0.037±0.043,0.030±0.095,CLEF2017,0.95,1.0
CLEF2017_0.95-0.9-IP-H,0.955±0.119,0.147±0.114,0.9,0.036±0.055,0.123±0.076,CLEF2017,0.95,0.9
CLEF2017_0.95-0.8-IP-H,0.949±0.120,0.137±0.113,0.9,0.037±0.055,0.231±0.056,CLEF2017,0.95,0.8
CLEF2018_0.95-1.0-IP-H,0.956±0.127,0.212±0.168,0.8,0.035±0.062,0.044±0.127,CLEF2018,0.95,1.0
CLEF2018_0.95-0.9-IP-H,0.941±0.125,0.141±0.145,0.833,0.030±0.063,0.115±0.088,CLEF2018,0.95,0.9
CLEF2018_0.95-0.8-IP-H,0.936±0.124,0.134±0.146,0.933,0.031±0.063,0.215±0.079,CLEF2018,0.95,0.8
CLEF2019_0.95-1.0-IP-H,0.988±0.061,0.266±0.201,0.935,0.049±0.062,0.012±0.061,CLEF2019,0.95,1.0
CLEF2019_0.95-0.9-IP-H,0.984±0.061,0.214±0.177,0.968,0.043±0.063,0.110±0.032,CLEF2019,0.95,0.9
CLEF2019_0.95-0.8-IP-H,0.982±0.062,0.209±0.178,0.968,0.043±0.063,0.238±0.026,CLEF2019,0.95,0.8


### TREC-TR

In [None]:

dataset_name = 'TR'


qrels = "data/qrels/TREC_TR_Test_qrels.txt" # use the same qrel list as their rankings


qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

run = "data/rankings/tr_test_autotar_ranking.txt"

doc_rank_dic, rank_rel_dic = load_run_data(run)


# SET EXPERIMENTAL PARAMETERS 
des_recalls = [1.0, 0.9, 0.8] # desired recalls to experiment over

des_probs = [0.95] # desired confidences to experiment over




df = run_experiments_GridSearch()

df_all = df_all.append(df) # append all results togather
df_all = df_all.sort_values(by=['dataset_name'], ascending=True)
display(df_all)

# save all
df_all.to_csv(path+'results/df_all_rates.csv')
df_all.to_csv(path+'results/df_all_rates_win_encoding.csv', encoding='Windows-1252') 

Number of topics: 34
TR
Confidence level:  0.95
recall: 1.0
Running- TR_0.95-1.0-IP-H
recall: 0.9
Running- TR_0.95-0.9-IP-H
recall: 0.8
Running- TR_0.95-0.8-IP-H


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset_name,des_prob,des_recall
CLEF2017_0.95-1.0-IP-H,0.970±0.095,0.213±0.187,0.9,0.037±0.043,0.030±0.095,CLEF2017,0.95,1.0
CLEF2017_0.95-0.9-IP-H,0.955±0.119,0.147±0.114,0.9,0.036±0.055,0.123±0.076,CLEF2017,0.95,0.9
CLEF2017_0.95-0.8-IP-H,0.949±0.120,0.137±0.113,0.9,0.037±0.055,0.231±0.056,CLEF2017,0.95,0.8
CLEF2018_0.95-1.0-IP-H,0.956±0.127,0.212±0.168,0.8,0.035±0.062,0.044±0.127,CLEF2018,0.95,1.0
CLEF2018_0.95-0.9-IP-H,0.941±0.125,0.141±0.145,0.833,0.030±0.063,0.115±0.088,CLEF2018,0.95,0.9
CLEF2018_0.95-0.8-IP-H,0.936±0.124,0.134±0.146,0.933,0.031±0.063,0.215±0.079,CLEF2018,0.95,0.8
CLEF2019_0.95-1.0-IP-H,0.988±0.061,0.266±0.201,0.935,0.049±0.062,0.012±0.061,CLEF2019,0.95,1.0
CLEF2019_0.95-0.9-IP-H,0.984±0.061,0.214±0.177,0.968,0.043±0.063,0.110±0.032,CLEF2019,0.95,0.9
CLEF2019_0.95-0.8-IP-H,0.982±0.062,0.209±0.178,0.968,0.043±0.063,0.238±0.026,CLEF2019,0.95,0.8
TR_0.95-1.0-IP-H,1.000±0.001,0.046±0.064,0.765,0.000±0.000,0.000±0.001,TR,0.95,1.0


### TREC-Legal

In [None]:

dataset_name = 'Legal'


qrels = "data/qrels/TREC_Legal_qrels.txt" # use the same qrel list as their rankings


qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

run = "data/rankings/legal_test_autotar_ranking.txt"

doc_rank_dic, rank_rel_dic = load_run_data(run)


# SET EXPERIMENTAL PARAMETERS 
des_recalls = [1.0, 0.9, 0.8] # desired recalls to experiment over

des_probs = [0.95] # desired confidences to experiment over




df = run_experiments_GridSearch()

df_all = df_all.append(df) # append all results togather
df_all = df_all.sort_values(by=['dataset_name'], ascending=True)
display(df_all)

# save all
df_all.to_csv(path+'results/df_all_rates.csv')
df_all.to_csv(path+'results/df_all_rates_win_encoding.csv', encoding='Windows-1252') 

Number of topics: 4
Legal
Confidence level:  0.95
recall: 1.0
Running- Legal_0.95-1.0-IP-H
recall: 0.9
Running- Legal_0.95-0.9-IP-H
recall: 0.8
Running- Legal_0.95-0.8-IP-H


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset_name,des_prob,des_recall
CLEF2017_0.95-1.0-IP-H,0.970±0.095,0.213±0.187,0.9,0.037±0.043,0.030±0.095,CLEF2017,0.95,1.0
CLEF2017_0.95-0.9-IP-H,0.955±0.119,0.147±0.114,0.9,0.036±0.055,0.123±0.076,CLEF2017,0.95,0.9
CLEF2017_0.95-0.8-IP-H,0.949±0.120,0.137±0.113,0.9,0.037±0.055,0.231±0.056,CLEF2017,0.95,0.8
CLEF2018_0.95-1.0-IP-H,0.956±0.127,0.212±0.168,0.8,0.035±0.062,0.044±0.127,CLEF2018,0.95,1.0
CLEF2018_0.95-0.9-IP-H,0.941±0.125,0.141±0.145,0.833,0.030±0.063,0.115±0.088,CLEF2018,0.95,0.9
CLEF2018_0.95-0.8-IP-H,0.936±0.124,0.134±0.146,0.933,0.031±0.063,0.215±0.079,CLEF2018,0.95,0.8
CLEF2019_0.95-1.0-IP-H,0.988±0.061,0.266±0.201,0.935,0.049±0.062,0.012±0.061,CLEF2019,0.95,1.0
CLEF2019_0.95-0.9-IP-H,0.984±0.061,0.214±0.177,0.968,0.043±0.063,0.110±0.032,CLEF2019,0.95,0.9
CLEF2019_0.95-0.8-IP-H,0.982±0.062,0.209±0.178,0.968,0.043±0.063,0.238±0.026,CLEF2019,0.95,0.8
Legal_0.95-1.0-IP-H,0.870±0.182,0.225±0.283,0.0,0.034±0.047,0.130±0.182,Legal,0.95,1.0


In [None]:
df_all.to_latex()

'\\begin{tabular}{lllllllrr}\n\\toprule\n{} &       recall &         cost & reliability &      loss\\_er &      rel\\_err & dataset\\_name &  des\\_prob &  des\\_recall \\\\\n\\midrule\nCLEF2017\\_0.95-1.0-IP-H &  0.970±0.095 &  0.213±0.187 &       0.900 &  0.037±0.043 &  0.030±0.095 &     CLEF2017 &      0.95 &         1.0 \\\\\nCLEF2017\\_0.95-0.9-IP-H &  0.955±0.119 &  0.147±0.114 &       0.900 &  0.036±0.055 &  0.123±0.076 &     CLEF2017 &      0.95 &         0.9 \\\\\nCLEF2017\\_0.95-0.8-IP-H &  0.949±0.120 &  0.137±0.113 &       0.900 &  0.037±0.055 &  0.231±0.056 &     CLEF2017 &      0.95 &         0.8 \\\\\nCLEF2018\\_0.95-1.0-IP-H &  0.956±0.127 &  0.212±0.168 &       0.800 &  0.035±0.062 &  0.044±0.127 &     CLEF2018 &      0.95 &         1.0 \\\\\nCLEF2018\\_0.95-0.9-IP-H &  0.941±0.125 &  0.141±0.145 &       0.833 &  0.030±0.063 &  0.115±0.088 &     CLEF2018 &      0.95 &         0.9 \\\\\nCLEF2018\\_0.95-0.8-IP-H &  0.936±0.124 &  0.134±0.146 &       0.933 &  0.031±0.063 

# CLEF2017 IP results (CI 0.95-Target 1.0, 0.9, 0.8) -- for per topic figure

In [None]:
# set model parameters space
from sklearn.model_selection import ParameterGrid

hyperparameters_space = {'point_process':['IP'], 
                        'model':['P', 'H', 'E', 'A'], 
                        'RMSE':[0.1],
                        'min_rel_in_sample':['dynamic20-sample'],
                        'alpha':[0.025],
                        'beta':[0.025]}

grid = ParameterGrid(hyperparameters_space)

In [None]:
df_all = pd.DataFrame() # all runs in one df


### CLEF2017

In [None]:

dataset_name = 'CLEF2017'


qrels = "data/qrels/CLEF2017_qrels.txt" # use the same qrel list as their rankings


qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

run = "data/rankings/clef2017_autotar_ranking.txt"

doc_rank_dic, rank_rel_dic = load_run_data(run)



# SET EXPERIMENTAL PARAMETERS 
des_recalls = [1.0, 0.9, 0.8] # desired recalls to experiment over

des_probs = [0.95] # desired confidences to experiment over



df = run_experiments_GridSearch()

df_all = df_all.append(df) # append all results togather
df_all = df_all.sort_values(by=['dataset_name'], ascending=True)
display(df_all)

# save all
df_all.to_csv(path+'results/df_all_rates.csv')
df_all.to_csv(path+'results/df_all_rates_win_encoding.csv', encoding='Windows-1252') 


Number of topics: 42
CLEF2017
Confidence level:  0.95
recall: 1.0
Running- CLEF2017_0.95-1.0-IP-P
Running- CLEF2017_0.95-1.0-IP-H
Running- CLEF2017_0.95-1.0-IP-E
Running- CLEF2017_0.95-1.0-IP-A
recall: 0.9
Running- CLEF2017_0.95-0.9-IP-P
Running- CLEF2017_0.95-0.9-IP-H
Running- CLEF2017_0.95-0.9-IP-E
Running- CLEF2017_0.95-0.9-IP-A
recall: 0.8
Running- CLEF2017_0.95-0.8-IP-P
Running- CLEF2017_0.95-0.8-IP-H
Running- CLEF2017_0.95-0.8-IP-E
Running- CLEF2017_0.95-0.8-IP-A


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset_name,des_prob,des_recall
CLEF2017_0.95-1.0-IP-P,1.000±0.000,0.287±0.240,0.967,0.030±0.031,0.000±0.000,CLEF2017,0.95,1.0
CLEF2017_0.95-1.0-IP-H,0.970±0.095,0.213±0.187,0.9,0.037±0.043,0.030±0.095,CLEF2017,0.95,1.0
CLEF2017_0.95-1.0-IP-E,0.999±0.005,0.246±0.204,0.933,0.026±0.031,0.001±0.005,CLEF2017,0.95,1.0
CLEF2017_0.95-1.0-IP-A,0.994±0.017,0.205±0.153,0.767,0.025±0.032,0.006±0.017,CLEF2017,0.95,1.0
CLEF2017_0.95-0.9-IP-P,1.000±0.000,0.281±0.226,1.0,0.030±0.031,0.111±0.000,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-IP-H,0.955±0.119,0.147±0.114,0.9,0.036±0.055,0.123±0.076,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-IP-E,0.984±0.031,0.154±0.113,0.967,0.022±0.032,0.094±0.034,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-IP-A,0.994±0.017,0.205±0.153,1.0,0.025±0.032,0.105±0.019,CLEF2017,0.95,0.9
CLEF2017_0.95-0.8-IP-P,1.000±0.000,0.265±0.194,1.0,0.029±0.031,0.250±0.000,CLEF2017,0.95,0.8
CLEF2017_0.95-0.8-IP-H,0.949±0.120,0.137±0.113,0.9,0.037±0.055,0.231±0.056,CLEF2017,0.95,0.8


# IP/CX all rates results (CI 0.95- Target 1.0,0.9,0.8) - for T-Test


In [None]:
# set model parameters space
from sklearn.model_selection import ParameterGrid

hyperparameters_space = {'point_process':['IP', 'CX'], 
                        'model':['P', 'H', 'E', 'A'], 
                        'RMSE':[0.1],
                        'min_rel_in_sample':['dynamic20-sample'],
                        'alpha':[0.025],
                        'beta':[0.025]}

grid = ParameterGrid(hyperparameters_space)


# SET EXPERIMENTAL PARAMETERS 
des_recalls = [1.0, 0.9, 0.8] # desired recalls to experiment over
des_probs = [0.95] # desired confidences to experiment over

detailed_results_flag = 0

In [None]:
# set T-Test directories

EVALDIR = os.path.join(DIR, 'TTest/tar_eval_out/')    # Directory for evaluation output files
# Check whether dircetory exists and create it if not
if not os.path.exists(EVALDIR):
    os.makedirs(EVALDIR)

path = '/content/drive/MyDrive/ReemSharedPhD/SharedCode/TOIS_paper_code/TTest/results/figs/'

if not os.path.exists(path):
    os.makedirs(path)

path = '/content/drive/MyDrive/ReemSharedPhD/SharedCode/TOIS_paper_code/TTest/'


In [None]:
df_all = pd.DataFrame() # all runs in one df


### CLEF2017

In [None]:

# Create an empty global dataframe to append results of each topic
topics_results_df = pd.DataFrame()
dataset_name = 'CLEF2017'


qrels = "data/qrels/CLEF2017_qrels.txt" # use the same qrel list as their rankings


qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

run = "data/rankings/clef2017_autotar_ranking.txt"

doc_rank_dic, rank_rel_dic = load_run_data(run)


# Create an empty global dataframe to append results
results_df = pd.DataFrame()
actual_predicted_df =  pd.DataFrame()
df2 =  pd.DataFrame()


df = run_experiments_GridSearch()

df_all = df_all.append(df) # append all results togather
#df_all = df_all.sort_values(by=['dataset_name'], ascending=True)
display(df_all)

# save all
df_all.to_csv(path+'results/df_all_rates_forTTest.csv')
df_all.to_csv(path+'results/df_all_rates_forTTest_win_encoding.csv', encoding='Windows-1252') 


Number of topics: 42
CLEF2017
Confidence level:  0.95
recall: 1.0
Running- CLEF2017_0.95-1.0-IP-P
Running- CLEF2017_0.95-1.0-CX-P
Running- CLEF2017_0.95-1.0-IP-H
Running- CLEF2017_0.95-1.0-CX-H
Running- CLEF2017_0.95-1.0-IP-E
Running- CLEF2017_0.95-1.0-CX-E
Running- CLEF2017_0.95-1.0-IP-A
Running- CLEF2017_0.95-1.0-CX-A
recall: 0.9
Running- CLEF2017_0.95-0.9-IP-P
Running- CLEF2017_0.95-0.9-CX-P
Running- CLEF2017_0.95-0.9-IP-H
Running- CLEF2017_0.95-0.9-CX-H
Running- CLEF2017_0.95-0.9-IP-E
Running- CLEF2017_0.95-0.9-CX-E
Running- CLEF2017_0.95-0.9-IP-A
Running- CLEF2017_0.95-0.9-CX-A
recall: 0.8
Running- CLEF2017_0.95-0.8-IP-P
Running- CLEF2017_0.95-0.8-CX-P
Running- CLEF2017_0.95-0.8-IP-H
Running- CLEF2017_0.95-0.8-CX-H
Running- CLEF2017_0.95-0.8-IP-E
Running- CLEF2017_0.95-0.8-CX-E
Running- CLEF2017_0.95-0.8-IP-A
Running- CLEF2017_0.95-0.8-CX-A


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset_name,des_prob,des_recall
CLEF2017_0.95-1.0-IP-P,1.0,0.287,0.967,0.03,0.0,CLEF2017,0.95,1.0
CLEF2017_0.95-1.0-CX-P,1.0,0.287,0.967,0.03,0.0,CLEF2017,0.95,1.0
CLEF2017_0.95-1.0-IP-H,0.97,0.213,0.9,0.037,0.03,CLEF2017,0.95,1.0
CLEF2017_0.95-1.0-CX-H,0.955,0.188,0.8,0.045,0.045,CLEF2017,0.95,1.0
CLEF2017_0.95-1.0-IP-E,0.999,0.246,0.933,0.026,0.001,CLEF2017,0.95,1.0
CLEF2017_0.95-1.0-CX-E,0.996,0.2,0.733,0.024,0.004,CLEF2017,0.95,1.0
CLEF2017_0.95-1.0-IP-A,0.994,0.205,0.767,0.025,0.006,CLEF2017,0.95,1.0
CLEF2017_0.95-1.0-CX-A,0.994,0.205,0.767,0.025,0.006,CLEF2017,0.95,1.0
CLEF2017_0.95-0.9-IP-P,1.0,0.281,1.0,0.03,0.111,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-CX-P,1.0,0.278,1.0,0.03,0.111,CLEF2017,0.95,0.9


In [None]:
df_all['Model'] = df_all.index
df_all['process_type'] = df_all['Model'].apply(lambda x: 'IP' if any(i in x for i in 'IP') else 'CX')
df_all = df_all.drop(['Model'], axis=1)
df_all

Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset_name,des_prob,des_recall,process_type
CLEF2017_0.95-1.0-IP-P,1.0,0.287,0.967,0.03,0.0,CLEF2017,0.95,1.0,IP
CLEF2017_0.95-1.0-CX-P,1.0,0.287,0.967,0.03,0.0,CLEF2017,0.95,1.0,IP
CLEF2017_0.95-1.0-IP-H,0.97,0.213,0.9,0.037,0.03,CLEF2017,0.95,1.0,IP
CLEF2017_0.95-1.0-CX-H,0.955,0.188,0.8,0.045,0.045,CLEF2017,0.95,1.0,CX
CLEF2017_0.95-1.0-IP-E,0.999,0.246,0.933,0.026,0.001,CLEF2017,0.95,1.0,IP
CLEF2017_0.95-1.0-CX-E,0.996,0.2,0.733,0.024,0.004,CLEF2017,0.95,1.0,CX
CLEF2017_0.95-1.0-IP-A,0.994,0.205,0.767,0.025,0.006,CLEF2017,0.95,1.0,IP
CLEF2017_0.95-1.0-CX-A,0.994,0.205,0.767,0.025,0.006,CLEF2017,0.95,1.0,CX
CLEF2017_0.95-0.9-IP-P,1.0,0.281,1.0,0.03,0.111,CLEF2017,0.95,0.9,IP
CLEF2017_0.95-0.9-CX-P,1.0,0.278,1.0,0.03,0.111,CLEF2017,0.95,0.9,IP


In [None]:

df_all = df_all.drop(['process_type'], axis=1)
df_all

Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset_name,des_prob,des_recall
CLEF2017_0.95-1.0-IP-P,1.0,0.287,0.967,0.03,0.0,CLEF2017,0.95,1.0
CLEF2017_0.95-1.0-CX-P,1.0,0.287,0.967,0.03,0.0,CLEF2017,0.95,1.0
CLEF2017_0.95-1.0-IP-H,0.97,0.213,0.9,0.037,0.03,CLEF2017,0.95,1.0
CLEF2017_0.95-1.0-CX-H,0.955,0.188,0.8,0.045,0.045,CLEF2017,0.95,1.0
CLEF2017_0.95-1.0-IP-E,0.999,0.246,0.933,0.026,0.001,CLEF2017,0.95,1.0
CLEF2017_0.95-1.0-CX-E,0.996,0.2,0.733,0.024,0.004,CLEF2017,0.95,1.0
CLEF2017_0.95-1.0-IP-A,0.994,0.205,0.767,0.025,0.006,CLEF2017,0.95,1.0
CLEF2017_0.95-1.0-CX-A,0.994,0.205,0.767,0.025,0.006,CLEF2017,0.95,1.0
CLEF2017_0.95-0.9-IP-P,1.0,0.281,1.0,0.03,0.111,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-CX-P,1.0,0.278,1.0,0.03,0.111,CLEF2017,0.95,0.9


### CLEF2018

In [None]:

dataset_name = 'CLEF2018'


qrels = "data/qrels/CLEF2018_qrels.txt" # use the same qrel list as their rankings


qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

run = "data/rankings/clef2018_autotar_ranking.txt"

doc_rank_dic, rank_rel_dic = load_run_data(run)



df = run_experiments_GridSearch()

df_all = df_all.append(df) # append all results togather
#df_all = df_all.sort_values(by=['dataset_name'], ascending=True)
display(df_all)

# save all
df_all.to_csv(path+'results/df_all_rates_forTTest.csv')
df_all.to_csv(path+'results/df_all_rates_forTTest_win_encoding.csv', encoding='Windows-1252') 


Number of topics: 30
CLEF2018
Confidence level:  0.95
recall: 1.0
Running- CLEF2018_0.95-1.0-IP-P
Running- CLEF2018_0.95-1.0-CX-P
Running- CLEF2018_0.95-1.0-IP-H
Running- CLEF2018_0.95-1.0-CX-H
Running- CLEF2018_0.95-1.0-IP-E
Running- CLEF2018_0.95-1.0-CX-E
Running- CLEF2018_0.95-1.0-IP-A
Running- CLEF2018_0.95-1.0-CX-A
recall: 0.9
Running- CLEF2018_0.95-0.9-IP-P
Running- CLEF2018_0.95-0.9-CX-P
Running- CLEF2018_0.95-0.9-IP-H
Running- CLEF2018_0.95-0.9-CX-H
Running- CLEF2018_0.95-0.9-IP-E
Running- CLEF2018_0.95-0.9-CX-E
Running- CLEF2018_0.95-0.9-IP-A
Running- CLEF2018_0.95-0.9-CX-A
recall: 0.8
Running- CLEF2018_0.95-0.8-IP-P
Running- CLEF2018_0.95-0.8-CX-P
Running- CLEF2018_0.95-0.8-IP-H
Running- CLEF2018_0.95-0.8-CX-H
Running- CLEF2018_0.95-0.8-IP-E
Running- CLEF2018_0.95-0.8-CX-E
Running- CLEF2018_0.95-0.8-IP-A
Running- CLEF2018_0.95-0.8-CX-A


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset_name,des_prob,des_recall
CLEF2017_0.95-1.0-IP-P,1.0,0.287,0.967,0.03,0.0,CLEF2017,0.95,1.0
CLEF2017_0.95-1.0-CX-P,1.0,0.287,0.967,0.03,0.0,CLEF2017,0.95,1.0
CLEF2017_0.95-1.0-IP-H,0.97,0.213,0.9,0.037,0.03,CLEF2017,0.95,1.0
CLEF2017_0.95-1.0-CX-H,0.955,0.188,0.8,0.045,0.045,CLEF2017,0.95,1.0
CLEF2017_0.95-1.0-IP-E,0.999,0.246,0.933,0.026,0.001,CLEF2017,0.95,1.0
CLEF2017_0.95-1.0-CX-E,0.996,0.2,0.733,0.024,0.004,CLEF2017,0.95,1.0
CLEF2017_0.95-1.0-IP-A,0.994,0.205,0.767,0.025,0.006,CLEF2017,0.95,1.0
CLEF2017_0.95-1.0-CX-A,0.994,0.205,0.767,0.025,0.006,CLEF2017,0.95,1.0
CLEF2017_0.95-0.9-IP-P,1.0,0.281,1.0,0.03,0.111,CLEF2017,0.95,0.9
CLEF2017_0.95-0.9-CX-P,1.0,0.278,1.0,0.03,0.111,CLEF2017,0.95,0.9


###CLEF2019

In [None]:

dataset_name = 'CLEF2019'


qrels = "data/qrels/CLEF2019_qrels.txt" # use the same qrel list as their rankings


qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

run = "data/rankings/clef2019_autotar_ranking.txt"

doc_rank_dic, rank_rel_dic = load_run_data(run)




df = run_experiments_GridSearch()

df_all = df_all.append(df) # append all results togather
#df_all = df_all.sort_values(by=['dataset_name'], ascending=True)
display(df_all)

# save all
df_all.to_csv(path+'results/df_all_rates_forTTest.csv')
df_all.to_csv(path+'results/df_all_rates_forTTest_win_encoding.csv', encoding='Windows-1252') 

Number of topics: 31
CLEF2019
Confidence level:  0.95
recall: 1.0
Running- CLEF2019_0.95-1.0-IP-P
Running- CLEF2019_0.95-1.0-CX-P
Running- CLEF2019_0.95-1.0-IP-H
Running- CLEF2019_0.95-1.0-CX-H
Running- CLEF2019_0.95-1.0-IP-E
Running- CLEF2019_0.95-1.0-CX-E
Running- CLEF2019_0.95-1.0-IP-A
Running- CLEF2019_0.95-1.0-CX-A
recall: 0.9
Running- CLEF2019_0.95-0.9-IP-P
Running- CLEF2019_0.95-0.9-CX-P
Running- CLEF2019_0.95-0.9-IP-H
Running- CLEF2019_0.95-0.9-CX-H
Running- CLEF2019_0.95-0.9-IP-E
Running- CLEF2019_0.95-0.9-CX-E
Running- CLEF2019_0.95-0.9-IP-A
Running- CLEF2019_0.95-0.9-CX-A
recall: 0.8
Running- CLEF2019_0.95-0.8-IP-P
Running- CLEF2019_0.95-0.8-CX-P
Running- CLEF2019_0.95-0.8-IP-H
Running- CLEF2019_0.95-0.8-CX-H
Running- CLEF2019_0.95-0.8-IP-E
Running- CLEF2019_0.95-0.8-CX-E
Running- CLEF2019_0.95-0.8-IP-A
Running- CLEF2019_0.95-0.8-CX-A


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset_name,des_prob,des_recall
CLEF2017_0.95-1.0-IP-P,1.000,0.287,0.967,0.030,0.000,CLEF2017,0.95,1.0
CLEF2017_0.95-1.0-CX-P,1.000,0.287,0.967,0.030,0.000,CLEF2017,0.95,1.0
CLEF2017_0.95-1.0-IP-H,0.970,0.213,0.900,0.037,0.030,CLEF2017,0.95,1.0
CLEF2017_0.95-1.0-CX-H,0.955,0.188,0.800,0.045,0.045,CLEF2017,0.95,1.0
CLEF2017_0.95-1.0-IP-E,0.999,0.246,0.933,0.026,0.001,CLEF2017,0.95,1.0
...,...,...,...,...,...,...,...,...
CLEF2019_0.95-0.8-CX-H,0.984,0.227,0.968,0.046,0.241,CLEF2019,0.95,0.8
CLEF2019_0.95-0.8-IP-E,0.982,0.209,0.968,0.043,0.238,CLEF2019,0.95,0.8
CLEF2019_0.95-0.8-CX-E,0.982,0.209,0.968,0.043,0.238,CLEF2019,0.95,0.8
CLEF2019_0.95-0.8-IP-A,0.993,0.247,1.000,0.044,0.242,CLEF2019,0.95,0.8


### TREC-TR

In [None]:

dataset_name = 'TR' 


qrels = "data/qrels/TREC_TR_Test_qrels.txt" # use the same qrel list as their rankings


qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

run = "data/rankings/tr_test_autotar_ranking.txt"

doc_rank_dic, rank_rel_dic = load_run_data(run)




df = run_experiments_GridSearch()

df_all = df_all.append(df) # append all results togather
#df_all = df_all.sort_values(by=['dataset_name'], ascending=True)
display(df_all)

# save all
df_all.to_csv(path+'results/df_all_rates_forTTest.csv')
df_all.to_csv(path+'results/df_all_rates_forTTest_win_encoding.csv', encoding='Windows-1252') 

Number of topics: 34
TR
Confidence level:  0.95
recall: 1.0
Running- TR_0.95-1.0-IP-P
Running- TR_0.95-1.0-CX-P
Running- TR_0.95-1.0-IP-H
Running- TR_0.95-1.0-CX-H
Running- TR_0.95-1.0-IP-E
Running- TR_0.95-1.0-CX-E
Running- TR_0.95-1.0-IP-A
Running- TR_0.95-1.0-CX-A
recall: 0.9
Running- TR_0.95-0.9-IP-P
Running- TR_0.95-0.9-CX-P
Running- TR_0.95-0.9-IP-H
Running- TR_0.95-0.9-CX-H
Running- TR_0.95-0.9-IP-E
Running- TR_0.95-0.9-CX-E
Running- TR_0.95-0.9-IP-A
Running- TR_0.95-0.9-CX-A
recall: 0.8
Running- TR_0.95-0.8-IP-P
Running- TR_0.95-0.8-CX-P
Running- TR_0.95-0.8-IP-H
Running- TR_0.95-0.8-CX-H
Running- TR_0.95-0.8-IP-E
Running- TR_0.95-0.8-CX-E
Running- TR_0.95-0.8-IP-A
Running- TR_0.95-0.8-CX-A


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset_name,des_prob,des_recall
CLEF2017_0.95-1.0-IP-P,1.000,0.287,0.967,0.030,0.000,CLEF2017,0.95,1.0
CLEF2017_0.95-1.0-CX-P,1.000,0.287,0.967,0.030,0.000,CLEF2017,0.95,1.0
CLEF2017_0.95-1.0-IP-H,0.970,0.213,0.900,0.037,0.030,CLEF2017,0.95,1.0
CLEF2017_0.95-1.0-CX-H,0.955,0.188,0.800,0.045,0.045,CLEF2017,0.95,1.0
CLEF2017_0.95-1.0-IP-E,0.999,0.246,0.933,0.026,0.001,CLEF2017,0.95,1.0
...,...,...,...,...,...,...,...,...
TR_0.95-0.8-CX-H,1.000,0.043,1.000,0.000,0.250,TR,0.95,0.8
TR_0.95-0.8-IP-E,0.999,0.030,1.000,0.000,0.249,TR,0.95,0.8
TR_0.95-0.8-CX-E,0.999,0.030,1.000,0.000,0.249,TR,0.95,0.8
TR_0.95-0.8-IP-A,1.000,0.041,1.000,0.000,0.250,TR,0.95,0.8


### TREC-Legal

In [None]:

dataset_name = 'Legal'


qrels = "data/qrels/TREC_Legal_qrels.txt" # use the same qrel list as their rankings


qrel_fname, query_rel_dic = load_rel_data(qrels)
print("Number of topics:", len(query_rel_dic))

run = "data/rankings/legal_test_autotar_ranking.txt"

doc_rank_dic, rank_rel_dic = load_run_data(run)



df = run_experiments_GridSearch()

df_all = df_all.append(df) # append all results togather
#df_all = df_all.sort_values(by=['dataset_name'], ascending=True)
display(df_all)

# save all
df_all.to_csv(path+'results/df_all_rates_forTTest.csv')
df_all.to_csv(path+'results/df_all_rates_forTTest_win_encoding.csv', encoding='Windows-1252') 

Number of topics: 4
Legal
Confidence level:  0.95
recall: 1.0
Running- Legal_0.95-1.0-IP-P
Running- Legal_0.95-1.0-CX-P
Running- Legal_0.95-1.0-IP-H
Running- Legal_0.95-1.0-CX-H
Running- Legal_0.95-1.0-IP-E
Running- Legal_0.95-1.0-CX-E
Running- Legal_0.95-1.0-IP-A
Running- Legal_0.95-1.0-CX-A
recall: 0.9
Running- Legal_0.95-0.9-IP-P
Running- Legal_0.95-0.9-CX-P
Running- Legal_0.95-0.9-IP-H
Running- Legal_0.95-0.9-CX-H
Running- Legal_0.95-0.9-IP-E
Running- Legal_0.95-0.9-CX-E
Running- Legal_0.95-0.9-IP-A
Running- Legal_0.95-0.9-CX-A
recall: 0.8
Running- Legal_0.95-0.8-IP-P
Running- Legal_0.95-0.8-CX-P
Running- Legal_0.95-0.8-IP-H
Running- Legal_0.95-0.8-CX-H
Running- Legal_0.95-0.8-IP-E
Running- Legal_0.95-0.8-CX-E
Running- Legal_0.95-0.8-IP-A
Running- Legal_0.95-0.8-CX-A


Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset_name,des_prob,des_recall
CLEF2017_0.95-1.0-IP-P,1.000,0.287,0.967,0.030,0.000,CLEF2017,0.95,1.0
CLEF2017_0.95-1.0-CX-P,1.000,0.287,0.967,0.030,0.000,CLEF2017,0.95,1.0
CLEF2017_0.95-1.0-IP-H,0.970,0.213,0.900,0.037,0.030,CLEF2017,0.95,1.0
CLEF2017_0.95-1.0-CX-H,0.955,0.188,0.800,0.045,0.045,CLEF2017,0.95,1.0
CLEF2017_0.95-1.0-IP-E,0.999,0.246,0.933,0.026,0.001,CLEF2017,0.95,1.0
...,...,...,...,...,...,...,...,...
Legal_0.95-0.8-CX-H,0.793,0.025,0.500,0.045,0.065,Legal,0.95,0.8
Legal_0.95-0.8-IP-E,0.793,0.025,0.500,0.045,0.065,Legal,0.95,0.8
Legal_0.95-0.8-CX-E,0.793,0.025,0.500,0.045,0.065,Legal,0.95,0.8
Legal_0.95-0.8-IP-A,0.793,0.025,0.500,0.045,0.065,Legal,0.95,0.8


In [None]:
df_all['Model'] = df_all.index
df_all['process_type'] = df_all['Model'].apply(lambda x: 'IP' if any(i in x for i in 'IP') else 'CX')
df_all = df_all.drop(['Model'], axis=1)
df_all

Unnamed: 0,recall,cost,reliability,loss_er,rel_err,dataset_name,des_prob,des_recall,process_type
CLEF2017_0.95-1.0-IP-P,1.000,0.287,0.967,0.030,0.000,CLEF2017,0.95,1.0,IP
CLEF2017_0.95-1.0-CX-P,1.000,0.287,0.967,0.030,0.000,CLEF2017,0.95,1.0,IP
CLEF2017_0.95-1.0-IP-H,0.970,0.213,0.900,0.037,0.030,CLEF2017,0.95,1.0,IP
CLEF2017_0.95-1.0-CX-H,0.955,0.188,0.800,0.045,0.045,CLEF2017,0.95,1.0,CX
CLEF2017_0.95-1.0-IP-E,0.999,0.246,0.933,0.026,0.001,CLEF2017,0.95,1.0,IP
...,...,...,...,...,...,...,...,...,...
Legal_0.95-0.8-CX-H,0.793,0.025,0.500,0.045,0.065,Legal,0.95,0.8,CX
Legal_0.95-0.8-IP-E,0.793,0.025,0.500,0.045,0.065,Legal,0.95,0.8,IP
Legal_0.95-0.8-CX-E,0.793,0.025,0.500,0.045,0.065,Legal,0.95,0.8,CX
Legal_0.95-0.8-IP-A,0.793,0.025,0.500,0.045,0.065,Legal,0.95,0.8,IP


In [None]:
# print sorted

#get all files
all_rates_csv = glob.glob(path + 'results/0.95-0.9'+ "*Summary_mean_std_macro.csv")

#read into df
df_list = (pd.read_csv(file, index_col=0) for file in all_rates_csv)

#concat all dfs
df_all_sorted   = pd.concat(df_list)

display(df_all_sorted)


In [None]:
df_all_sorted = df_all_sorted[['dataset_name','recall', 'cost', 'reliability', 'loss_er','rel_err']]
df_all_sorted

In [None]:
df_all_sorted.to_latex()