In [1]:

"""
Jupyter file to perform the non-optimised re-ranking task for ECIR 2023 paper 
titled: Automatic and Analytical Field Weighting for Structured Document Retrieval
"""

'\nJupyter file to perform the non-optimised re-ranking task for ECIR 2023 paper \ntitled: Automatic and Analytical Field Weighting for Structured Document Retrieval\n'

In [2]:
# import relevant libraries
import numpy as np
import os
import json
import csv
import pandas as pd
import matplotlib.pyplot as plt


from sklearn.metrics import ndcg_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_score

from collections import defaultdict


In [3]:
def open_json(path):
    """
    function to open .json files
    """
    with open(path, 'r') as in_:
        f = json.load(in_)
    return f

In [4]:
def calc_ndcg(query_id, ranking, q_qrels,k=100):
    """
    calculate ndcg@k for a given value of k
    """
    
    true_relevance = []
    prediction = []
    qrel_doc_ids = [x[2] for x in q_qrels]
    prediction_doc_ids = list(ranking.keys())
    for qrel in q_qrels:
        doc_id = qrel[2]
        true_relevance.append(float(qrel[3]))
        try:
            prediction.append(ranking[doc_id])
        except KeyError:
            prediction.append(0)
        
    for doc_id in list(set(prediction_doc_ids) - set(qrel_doc_ids)):
        prediction.append(ranking[doc_id])
        true_relevance.append(0)
    try:
        acc = ndcg_score(np.array([true_relevance]), np.array([prediction]),k=k)
        acc = round(acc,5)
    except:
        print(true_relevance)
        print(prediction)
    return acc

In [5]:
def calc_ap(query_id, q_ranking, q_qrels):
    """
    calculate average precision
    """
    true_relevance = []
    prediction = []
    qrel_doc_ids = [x[2] for x in q_qrels]
    prediction_doc_ids = list(q_ranking.keys())
    no_info_count = 0
    fail = False
    for qrel in q_qrels:
        doc_id = qrel[2]
        true_relevance.append(float(qrel[3]))
        # if qrel doc not found, add as 0 to prediction
        try:
            prediction.append(q_ranking[doc_id])
        except KeyError:
            prediction.append(0)
    len_rel = len(prediction)
    for doc_id in list(set(prediction_doc_ids) - set(qrel_doc_ids)):
        prediction.append(q_ranking[doc_id])
        true_relevance.append(0)


    aps = []
    true_relevance = [0 if x < 1 else 1 for x in true_relevance]

    trues = np.array(true_relevance)
    preds = np.array(prediction)
    ap = average_precision_score(trues, preds)
    return ap

In [6]:
def precision_at_k_score(y_true, y_pred_proba, k=10, pos_label=1):
    """
    calculate precision@k for a given value of k
    """
    topk = [
        y_true_ == pos_label 
        for y_true_, y_pred_proba_ 
        in sorted(
            zip(y_true, y_pred_proba), 
            key=lambda y: y[1], 
            reverse=True
        )[:k]
    ]
    return sum(topk) / len(topk)

In [7]:
def save_q_based(index_name, acc, lst, model):
    """
    function that dumps accuracies
    """
    if not catch_all:
        q_acc_save_path = os.path.join(q_acc_dir,'acc-nonsuper-'+index_name+'_{}_'+ '{}_'+'base.csv')
    else:
        q_acc_save_path = os.path.join(q_acc_dir,'acc-nonsuper-'+index_name+'_{}_'+ '{}_'+'catch_all.csv')
    with open(q_acc_save_path.format(model, acc),'w') as out:
        writer = csv.writer(out)
        writer.writerows(zip(all_query_data.keys(),lst))

In [8]:
# name of index
index_name = 'trec-web'

In [9]:
# if True a catchall field is used
catch_all = False

In [10]:
accuracy_dict = {'ndcg10':{},'ndcg100':{}, 'map':{}}

In [11]:
# directory where query based accuracies are saved
q_acc_dir = 'q_based_accs'

In [12]:
# data directory
data_dir = 'data/{}/'.format(index_name)

In [13]:
# information regarding datasets necessary for calculations
datasetInfo = {
    'dbpedia':{
        "empty_fields": {
            "similar_entities": 1869980,
            "label": 940,
            "attributes": 0,
            "categories": 41688,
            "related_entities": 1832508,
            "all": 0
    },
        "total_doc_count":4641889
        },
    'trec-web':{
         "empty_fields": {
             "title": 3062,
             "body": 361,
             "all": 0
     },
         "total_doc_count":212591
    },
    'homedepot': {

         "empty_fields": {
             "product_uid": 0,
             "product_name": 0,
             "product_description": 0,
             "product_attributes": 16263,
             "all": 0
             
     },
         "total_doc_count": 54668,
        }
}

In [14]:
# compile a field-based dictionaries
field_dict = {
    'trec-web':['title', 'body'],
    'homedepot':['product_name', 'product_description','product_attributes'],
    'dbpedia': ["label" , "attributes" , "categories" , "related_entities", "similar_entities"]
 }
avg_fl_dict = {
    field: None for field in field_dict[index_name]
}
fields = field_dict[index_name]

In [15]:
# load query based qrels
qrel_dict = defaultdict(list)
qrel_file_path = 'qrels-{}.all'.format(index_name)
with open(qrel_file_path, 'r') as in_:
    reader = csv.reader(in_, delimiter='\t')
    qrel_file = list(reader)
for qrel in qrel_file:
    qrel[2] = qrel[2].lower()
    qrel_dict[qrel[0].strip()].append(qrel)


### put all the relevant info into arrays

In [16]:
# This cell loops over the queries and compiles a query_data dictionary for each
# In this dictionary all the needed metrics data is stored for calculations later on
all_query_data = {}
for q_file_name in os.listdir(data_dir):
    if 'pickle' in q_file_name:
        continue
    q_data = open_json(os.path.join(data_dir,q_file_name))
    q_id = q_data['query_id'].strip()
    field_idfs = q_data['q_term_idfs']
    field_dfs = q_data['q_term_dfs']
#     field_Ns = q_data['fieldNs']
    field_Ns = {
        field: datasetInfo[index_name]['total_doc_count'] - datasetInfo[index_name]['empty_fields'][field]
        for field in fields}
    terms = q_data['query'].split(' ')
    results = q_data['results']
    field_tfs = {field: [] for field in fields}
    field_lengths = {field: [] for field in fields}
    field_scores = {field: [] for field in fields}

    doc_ids = []
    bm25f_score_lst = []
    concat_dls = []
    for document, data in results.items():
        doc_ids.append(document)
        bm25f_score_lst.append(q_data['bm25f_score'])
        for field in fields:

            doc_field_data = data[field]

            tf_vect = [doc_field_data['term_tfs'][term]  for term in terms]

            fl = doc_field_data['fl']
            score = doc_field_data['score']
            field_tfs[field].append(tf_vect)
            field_lengths[field].append(fl)
            field_scores[field].append(score)
            if doc_field_data['avgfl']:
                avg_fl_dict[field] = doc_field_data['avgfl']
    

    field_tfs = np.dstack(list(field_tfs.values()))
    
    field_scores_= np.array(list(field_scores.values())).T

    # doc lenghths
    field_lengths = np.array(list(field_lengths.values())).T
    field_lengths = field_lengths.reshape((len(results),1,len(fields)))
    q_data['tf_arr'] = field_tfs
    q_data['fl_arr'] = field_lengths
    q_data['avgfl'] = np.array(list(avg_fl_dict.values())).reshape(1,1,len(fields))
    q_data['bm25f_scores'] = dict(zip(q_data['bm25f_doc_ids'],q_data['bm25f_score']))

    # idfs
    idfs = np.dstack(
        [
            [field_idfs[field][t] if t in field_idfs[field].keys() else 0.0 for t in terms]
            for field in fields
        ])
    
    # global idfs
    global_idfs = np.array([
        q_data['global_idfs'][term] if term in q_data['global_idfs'].keys() else 0 for term in terms
    ]).reshape(1,len(terms))
    
    # global dfs 
    global_dfs = np.array([
        q_data['global_dfs'][term] if term in q_data['global_dfs'].keys() else 0 for term in terms
    ]).reshape(1,len(terms))
    
    # dfs
    dfs =  np.dstack(
        [
            [field_dfs[field][t] if t in field_dfs[field].keys() else 0.0 for t in terms]
            for field in fields
        ])
    
    # term occcs
    term_occurrences = (field_tfs > 0).astype(int)
    
    # field occurrences
    field_occurrences = term_occurrences.sum(axis=2).reshape(len(results), len(terms), 1)

    # field Ns
    Nfs = np.array(list(field_Ns.values())).reshape(1,1,len(fields))
    
    # P of t given F
#     print(field_Ns)
    p_t_f = term_occurrences * (dfs / Nfs)
    p_t_f[p_t_f == 0] = 1.0
    
    
    # P of t given d
    p_t_d = field_occurrences / len(fields)
    p_t_d[p_t_d == 0] = 1
    

    q_data['bm25_scores_arr'] = field_scores_
    q_data['idf_arr'] = idfs
    q_data['df_arr'] = dfs
    q_data['global_idf_arr'] = global_idfs
    q_data['global_df_arr'] = global_dfs
    q_data['df_arr'] = dfs
    q_data['t_occs'] = term_occurrences
    q_data['f_occs'] = field_occurrences
    q_data['Nfs'] = Nfs
    q_data['p_t_f'] = p_t_f
    q_data['p_t_d'] = p_t_d
    q_data['doc_ids'] = doc_ids
    q_data['avgfl'] = np.array(list(avg_fl_dict.values())).reshape(1,1,len(fields))
    q_data['bm25f_scores'] = dict(zip(q_data['bm25f_doc_ids'],q_data['bm25f_score']))
    all_query_data[q_id] = q_data

In [17]:
# if catchall field is used, add this field to all the query_data dictionaries in all_query_data
if catch_all:
    fields = fields + ['all']
    for query_id, data in all_query_data.items():
        # add meta data things

        data['fl_arr'] = np.dstack([data['fl_arr'],data['fl_arr'].sum(axis=2)])
        data['tf_arr'] = np.dstack([data['tf_arr'],data['tf_arr'].sum(axis=2)])
        data['idf_arr'] = np.dstack([data['idf_arr'], data['global_idf_arr']])
        data['df_arr'] = np.dstack([data['df_arr'], data['global_df_arr']])
        data['avgfl'] = np.dstack([data['avgfl'],data['avgfl'].sum(axis=2)])

        # new dimenstion to tf_occs
        data['t_occs'] = np.dstack([data['t_occs'], data['t_occs'].max(axis=2)])

        # add field occ due to new field
        data['f_occs'][data['f_occs'] > 0] += 1
        
        ptf_ca = data['t_occs'][:,:,-1] * (data['global_df_arr'] / datasetInfo[index_name]['total_doc_count'])
        ptf_ca[ptf_ca == 0] = 1.0

        data['p_t_f'] = np.dstack([data['p_t_f'], ptf_ca])
        # add field for ptd

In [18]:
def bm25_score(query_data,b=0.8,k_1=1.6,catch_all=False):
    """
    function to calculate bm25 scores for a given query based on the results
    """
    terms = query_data['query'].split(' ')
    tf_array = query_data['tf_arr']
    dls = query_data['fl_arr']
    avgfl = query_data['avgfl']
    idf_arr = query_data['idf_arr']
    if catch_all:
        tf_array = np.dstack([tf_array,tf_array.sum(axis=2)])
        dls = np.dstack([dls,dls.sum(axis=2)])
        idf_arr = np.dstack([idf_arr, query_data['global_idf_arr']])
        avgfl = np.dstack([avgfl,avgfl.sum(axis=2)])
        
    len_norm_tf = tf_array / ((b*dls / avgfl + (1-b)))
    
    bm25_TF = len_norm_tf / (k_1 + len_norm_tf)
    bm25_score_per_term = bm25_TF * idf_arr
    bm25_score = bm25_score_per_term.sum(axis=1)
    query_data['bm25_scores'] = bm25_score
#     return bm25_score

## FSA-BM25

In [19]:
# calculate bm25 score for each documents in the results for each query
for query_id, query_data in all_query_data.items(): 
    bm25_score(query_data)

In [20]:
def linear_aggregate(field_scores,weights=[1.0 for x in fields],catch_all=False):
    """
    function that linearily aggregates the BM25 scores across fields
    """
    weights=[1.0 for x in range(field_scores.shape[1])]
    weighted_scores = field_scores * np.array(weights)
    return weighted_scores.sum(axis=1)

In [21]:
# This cell implements the FSA-BM25 model
ndcgs10 = []
ndcgs100 = []
aps = []
# q_ids_ = []
for query_id, query_data in all_query_data.items():
    query_id = query_id.strip()
#     q_ids_.append(query_id)
    q_qrels = qrel_dict[query_id]
    field_scores = query_data['bm25_scores']
    if field_scores.size != 0:
        aggregated_scores = linear_aggregate(field_scores)
        q_ranking = dict(zip(query_data['doc_ids'], aggregated_scores.tolist()))
        query_data['ranking'] = q_ranking

        q_ndcg10 = calc_ndcg(query_id, q_ranking, q_qrels,10)
        ndcgs10.append(q_ndcg10)

        q_ndcg100 = calc_ndcg(query_id, q_ranking, q_qrels,100)
        ndcgs100.append(q_ndcg100)

        q_ap = calc_ap(query_id, q_ranking, q_qrels)
        aps.append(q_ap)
    else:
        print('miss')
        aps.append(0)
        ndcgs10.append(0)
        ndcgs100.append(0)

    
ndcg_ds10 = np.round(np.mean(ndcgs10),4)
ndcg_ds100 = np.round(np.mean(ndcgs100),4)
map_ds = np.round(np.mean(aps),4)

save_q_based(index_name, 'map', aps,'bm25lin')
save_q_based(index_name, 'ndcg', ndcgs100,'bm25lin')

    
accuracy_dict['map']['bm25_lin'] = map_ds
accuracy_dict['ndcg10']['bm25_lin'] = ndcg_ds10
print('ndcg10: ', ndcg_ds10)
print('map: ', map_ds)
    

ndcg10:  0.6212
map:  0.2099


## BM25F

In [22]:
def bm25f_score(query_data,b=0.8,k_1=1.6, weights = np.array([1.0 for x in fields]).reshape(1,1,len(fields))):
    tf_array = query_data['tf_arr']
    dls = query_data['fl_arr']
    avgfl = query_data['avgfl']
    idf_arr = query_data['global_idf_arr']
    len_norm_tf = tf_array / (((1-b) + b*dls / avgfl))
    len_norm_tf_weighted = len_norm_tf * weights
    len_norm_tf_concat = len_norm_tf_weighted.sum(axis=2)
    bm25_TF = (len_norm_tf_concat / (k_1 + len_norm_tf_concat))
    bm25_score_per_term = (bm25_TF * idf_arr)
    bm25_score = bm25_score_per_term.sum(axis=1)
    return bm25_score


In [23]:
# This cell implements the BM25F model
aps = []
ndcgs10 = []
ndcgs100 = []
for query_id, query_data in all_query_data.items():
    query_id = query_id.strip()
    q_qrels = qrel_dict[query_id]
    aggregated_scores = bm25f_score(query_data)
    
    q_ranking = dict(zip(query_data['doc_ids'], aggregated_scores.tolist()))
    query_data['ranking'] = q_ranking
    
    q_ndcg10 = calc_ndcg(query_id, q_ranking, q_qrels,10)
    ndcgs10.append(q_ndcg10)
    
    q_ndcg100 = calc_ndcg(query_id, q_ranking, q_qrels,100)
    ndcgs100.append(q_ndcg100)
    
    
    q_ap = calc_ap(query_id, q_ranking, q_qrels)
    aps.append(q_ap)
#     print(query_id, q_ndcg)
ndcg_ds10 = np.round(np.mean(ndcgs10),4)
ndcg_ds100 = np.round(np.mean(ndcgs100),4)
map_ds = np.round(np.mean(aps),4)

save_q_based(index_name, 'map', aps,'bm25f')
save_q_based(index_name, 'ndcg', ndcgs100,'bm25f')

accuracy_dict['map']['bm25f'] = map_ds
accuracy_dict['ndcg10']['bm25f'] = ndcg_ds10
print('ndcg10: ', ndcg_ds10)
print('map: ', map_ds)
    

ndcg10:  0.4422
map:  0.2165


## BM25FSimple

In [24]:
def bm25f_score_simple(query_data,b=0.8,k_1=1.6, weights = np.array([1.0 for x in fields]).reshape(1,1,len(fields))):
    tf_array = query_data['tf_arr']
    dls = query_data['fl_arr'].sum(axis=1).sum(axis=1).reshape(tf_array.shape[0],1)

    avgfl = query_data['global_avgdl']

    idf_arr = query_data['global_idf_arr']
    weighted_tf_array = tf_array * weights
    weighted_tf_array_concat = weighted_tf_array.sum(axis=2)
    bm25_TF = weighted_tf_array_concat / (k_1*((1-b)+b*(dls / avgfl)) + weighted_tf_array_concat)
    bm25_score_per_term = (bm25_TF * idf_arr)
    bm25_score = bm25_score_per_term.sum(axis=1)
    return bm25_score


In [25]:
# This cell implements the BM25FSimple model
aps = []
ndcgs10 = []
ndcgs100 = []
for query_id, query_data in all_query_data.items():
    query_id = query_id.strip()
    q_qrels = qrel_dict[query_id]
    if query_data['tf_arr'].size != 0 and field_scores.size != 0:
        aggregated_scores = bm25f_score_simple(query_data)

        q_ranking = dict(zip(query_data['doc_ids'], aggregated_scores.tolist()))
        query_data['ranking'] = q_ranking

        q_ndcg10 = calc_ndcg(query_id, q_ranking, q_qrels,10)
        ndcgs10.append(q_ndcg10)

        q_ndcg100 = calc_ndcg(query_id, q_ranking, q_qrels,100)
        ndcgs100.append(q_ndcg100)

        q_ap = calc_ap(query_id, q_ranking, q_qrels)
        aps.append(q_ap)
    else:
        aps.append(0)
        ndcgs10.append(0)
        ndcgs100.append(0)

ndcg_ds10 = np.round(np.mean(ndcgs10),4)
ndcg_ds100 = np.round(np.mean(ndcgs100),4)
map_ds = np.round(np.mean(aps),4)
accuracy_dict['map']['bm25fSimple'] = map_ds
accuracy_dict['ndcg10']['bm25fSimple'] = ndcg_ds10

save_q_based(index_name, 'map', aps,'bm25fSimple')
save_q_based(index_name, 'ndcg', ndcgs100,'bm25fSimple')
print('ndcg10: ', ndcg_ds10)
print('map: ', map_ds)
    

ndcg10:  0.4422
map:  0.2168



## ICFW-G

In [26]:
def lambda_G(query_idfs, query_dfs, empty_fields, N_total, nr_fields):
    """
    Function to calculate the lambda values for ICFW-G
    """
    dfs = query_dfs
    idfs = query_idfs
    empty_fields = empty_fields
    N_total = N_total
    N = np.array(N_total)
    

    nr_fields = nr_fields

    df_arr = np.array(list(query_data['global_dfs'].values()))
    idf_arr = np.array(list(query_data['global_idfs'].values()))
    nr_terms = df_arr.shape[0]
    if nr_terms == 1:
        shape = (nr_fields.shape[0],1 )
        lambda_= np.zeros(shape)
        return lambda_, 1
    
    max_dfs = df_arr.max()
    min_dfs = df_arr.min()



    non_zero_dfs = df_arr[df_arr > 0]
    if not non_zero_dfs[non_zero_dfs != non_zero_dfs.max()].any():
        min_df = np.nan
    else:
        min_df = np.mean(non_zero_dfs)

    idf_ratio = idf_arr.max() / idf_arr.min()
    max_idf = np.log((N + 0.5) / (min_dfs + 0.5))
    min_idf = np.log((N + 0.5) / (max_dfs + 0.5))
    idf_ratio = np.log((N + 0.5) / (min_dfs + 0.5)) / np.log((N + 0.5) / (max_dfs + 0.5))
    
    
    Z = idf_ratio
#     Z = 
    numerator = np.log((max_dfs*N**Z) / ((min_dfs**Z)*N))
    denominator = np.log((len(fields)**(2*Z)*nr_fields**(Z+1)) / (nr_fields **(2*Z)))
    lambda_ = numerator / denominator

    lambda_[np.isnan(lambda_)] = 0
    lambda_[np.isinf(lambda_)] = 0
    return lambda_, Z


In [27]:
# This cell implements the ICFW-G-BM25 model

ndcgs10 = []
ndcgs100 = []
aps = []
lambdas_ = []
q_accs = []
count = 0
for query_id, query_data in all_query_data.items():
#     print(count, '/',len(all_query_data))
    query_id = query_id.strip()
    q_qrels = qrel_dict[query_id]
    field_scores = query_data['bm25_scores'] 
    
    query_idfs = query_data['global_idfs']
    query_dfs = query_data['global_dfs']
    if field_scores.size != 0:

        empty_fields = datasetInfo[index_name]['empty_fields']
        N_total = datasetInfo[index_name]['total_doc_count']
        dls_squueze = query_data['fl_arr'].reshape(field_scores.shape[0],len(fields))
        nr_fields = (dls_squueze > 0).sum(axis=1)[np.newaxis].T
        lambda_, Z = lambda_G(query_idfs, query_dfs, empty_fields, N_total, nr_fields)


        p_t_d = query_data['p_t_d']
        p_t_f = query_data['p_t_f']
        inf_f_d = -np.log(p_t_d.prod(axis=1))
        inf_f_F = -np.log(p_t_f.prod(axis=1))

        field_weights = inf_f_F + inf_f_d * lambda_
        weighted_arr = field_weights * field_scores
        aggregated_scores = weighted_arr.sum(axis=1)

        q_ranking = dict(zip(query_data['doc_ids'], aggregated_scores.tolist()))
        query_data['ranking'] = q_ranking

        q_ndcg10 = calc_ndcg(query_id, q_ranking, q_qrels,10)
        ndcgs10.append(q_ndcg10)

        q_ndcg100 = calc_ndcg(query_id, q_ranking, q_qrels,100)
        ndcgs100.append(q_ndcg100)

        q_ap = calc_ap(query_id, q_ranking, q_qrels)
        aps.append(q_ap)

        lambdas_.append(lambda_)
        q_accs.append(q_ap)
    else:

        print('miss')
        aps.append(0)
        ndcgs10.append(0)
        ndcgs100.append(0)
    

ndcg_ds10 = np.round(np.mean(ndcgs10),4)
ndcg_ds100 = np.round(np.mean(ndcgs100),4)
map_ds = np.round(np.mean(aps),4)
accuracy_dict['map']['icfw-G'] = map_ds
accuracy_dict['ndcg10']['icfw-G'] = ndcg_ds10
accuracy_dict['ndcg100']['icfw-G'] = ndcg_ds100
save_q_based(index_name, 'map', aps,'icfw-G')
print('ndcg10: ', ndcg_ds10)
print('map: ', map_ds)
    

ndcg10:  0.5712
map:  0.2917



## ICFW-GA

In [28]:
def lambda_GA(query_idfs, query_dfs, empty_fields, N_total, nr_fields):
    """
    Function to calculate the lambda values for ICFW-GA
    """
    dfs = query_dfs
    idfs = query_idfs
    empty_fields = empty_fields
    N_total = N_total
    N = np.array(N_total)
    

    nr_fields = nr_fields

    df_arr = np.array(list(query_data['global_dfs'].values()))
    idf_arr = np.array(list(query_data['global_idfs'].values()))
    nr_terms = df_arr.shape[0]
    if nr_terms == 1:
        shape = (nr_fields.shape[0],1 )
        lambda_= np.zeros(shape)
        return lambda_, 1
    
    max_dfs = df_arr.max()
    min_dfs = df_arr[df_arr!=max_dfs].mean()



    non_zero_dfs = df_arr[df_arr > 0]
    if not non_zero_dfs[non_zero_dfs != non_zero_dfs.max()].any():
        min_df = np.nan
    else:
        min_df = np.mean(non_zero_dfs)


    idf_min = idf_arr.min()
    idf_max = idf_arr[idf_arr!=idf_min].mean()
    
    idf_ratio = idf_max / idf_min
    idf_ratio = np.log((N + 0.5) / (min_dfs + 0.5)) / np.log((N + 0.5) / (max_dfs + 0.5))
    Z = idf_ratio
    numerator = np.log((max_dfs*N**Z) / ((min_dfs**Z)*N))
    denominator = np.log((len(fields)**(2*Z)*nr_fields**(Z+1)) / (nr_fields **(2*Z)))
    lambda_ = numerator / denominator

    lambda_[np.isnan(lambda_)] = 0
    lambda_[np.isinf(lambda_)] = 0
    return lambda_, Z


In [29]:
# This cell implements the ICFW-GA-BM25 model
ndcgs10 = []
ndcgs100 = []
aps = []
lambdas_ = []
q_accs = []
for query_id, query_data in all_query_data.items():
    query_id = query_id.strip()
    q_qrels = qrel_dict[query_id]
    field_scores = query_data['bm25_scores'] 

    
    query_idfs = query_data['global_idfs']
    query_dfs = query_data['global_dfs']
    if field_scores.size != 0:

        empty_fields = datasetInfo[index_name]['empty_fields']
        N_total = datasetInfo[index_name]['total_doc_count']
        dls_squueze = query_data['fl_arr'].reshape(field_scores.shape[0],len(fields))
        nr_fields = (dls_squueze > 0).sum(axis=1)[np.newaxis].T
        lambda_, Z = lambda_GA(query_idfs, query_dfs, empty_fields, N_total, nr_fields)


        p_t_d = query_data['p_t_d']
        p_t_f = query_data['p_t_f']
        inf_f_d = -np.log(p_t_d.prod(axis=1))
        inf_f_F = -np.log(p_t_f.prod(axis=1))

        field_weights = inf_f_F + inf_f_d * lambda_
        weighted_arr = field_weights * field_scores
        aggregated_scores = weighted_arr.sum(axis=1)

        q_ranking = dict(zip(query_data['doc_ids'], aggregated_scores.tolist()))
        query_data['ranking'] = q_ranking

        q_ndcg10 = calc_ndcg(query_id, q_ranking, q_qrels,10)
        ndcgs10.append(q_ndcg10)

        q_ndcg100 = calc_ndcg(query_id, q_ranking, q_qrels,100)
        ndcgs100.append(q_ndcg100)

        q_ap = calc_ap(query_id, q_ranking, q_qrels)
        aps.append(q_ap)

        lambdas_.append(lambda_)
        q_accs.append(q_ap)
    else:

        print('miss')
        aps.append(0)
        ndcgs10.append(0)
        ndcgs100.append(0)
    

ndcg_ds10 = np.round(np.mean(ndcgs10),4)
ndcg_ds100 = np.round(np.mean(ndcgs100),4)
map_ds = np.round(np.mean(aps),4)
accuracy_dict['map']['icfw-GA'] = map_ds
accuracy_dict['ndcg10']['icfw-GA'] = ndcg_ds10
accuracy_dict['ndcg100']['icfw-GA'] = ndcg_ds100
save_q_based(index_name, 'map', aps,'icfw-GA')
print('ndcg10: ', ndcg_ds10)
print('map: ', map_ds)
    

ndcg10:  0.5779
map:  0.2914


## ICFW-LA

In [30]:
def lambda_LA(query_idfs, query_dfs, empty_fields, N_total, nr_fields):
    """
    Function to calculate the lambda values for ICFW-LA
    """
    k_1 = 1.6
    dfs = query_dfs
    idfs = query_idfs
    empty_fields = empty_fields
    N_total = N_total
    N = [N_total - empty_fields[field] for field in fields]
    N = np.array(N)
    
    nr_fields = nr_fields

    df_arr = query_dfs
    idf_arr = query_idfs
    
    nr_terms = df_arr.shape[1]
    if nr_terms == 1:
        shape = (nr_fields.shape[0],1 )
        lambda_= np.zeros(shape)
        return lambda_, None

    max_dfs = df_arr.mean(axis=1).max()
    min_dfs = df_arr.mean(axis=1).mean()
    min_dfs = df_arr[df_arr!=max_dfs].mean()

    idf_ratio = np.log((N + 0.5) / (min_dfs + 0.5)) / np.log((N + 0.5) / (max_dfs + 0.5))
    idf_ratio = query_idfs.mean(axis=1).max()  / query_idfs[query_idfs!=query_idfs.min()].mean()
    Z = idf_ratio

    numerator = np.log((max_dfs*N**Z) / ((min_dfs**Z)*N))
    denominator = np.log((2**(2*Z)*nr_fields**(Z+1)) / (nr_fields **(2*Z)))
    lambda_ = numerator / denominator

    lambda_[np.isnan(lambda_)] = 0
    lambda_[np.isinf(lambda_)] = 0
    lambda_[lambda_ < 0] = 0
    return lambda_, Z

In [31]:

# This cell implements the ICFW-LA-BM25 model
ndcgs10 = []
ndcgs100 = []
aps = []
lambdas_ = []

for query_id, query_data in all_query_data.items():
    query_id = query_id.strip()
    q_qrels = qrel_dict[query_id]
    field_scores = query_data['bm25_scores'] 

    
    query_idfs = query_data['idf_arr']
    query_dfs = query_data['df_arr']

    if field_scores.size != 0:
        empty_fields = datasetInfo[index_name]['empty_fields']
        N_total = datasetInfo[index_name]['total_doc_count']
        dls_squueze = query_data['fl_arr'].reshape(field_scores.shape[0],len(fields))
        nr_fields = (dls_squueze > 0).sum(axis=1)[np.newaxis].T
        lambda_, Z = lambda_LA(query_idfs, query_dfs, empty_fields, N_total, nr_fields)

        p_t_d = query_data['p_t_d']
        p_t_f = query_data['p_t_f']
        inf_f_d = -np.log(p_t_d.prod(axis=1))
        inf_f_F = -np.log(p_t_f.prod(axis=1))

        field_weights = inf_f_F + inf_f_d * lambda_
        weighted_arr = field_weights * field_scores
        aggregated_scores = weighted_arr.sum(axis=1)
        q_ranking = dict(zip(query_data['doc_ids'], aggregated_scores.tolist()))
        query_data['ranking'] = q_ranking

        q_ndcg10 = calc_ndcg(query_id, q_ranking, q_qrels,10)
        ndcgs10.append(q_ndcg10)

        q_ndcg100 = calc_ndcg(query_id, q_ranking, q_qrels,100)
        ndcgs100.append(q_ndcg100)

        q_ap = calc_ap(query_id, q_ranking, q_qrels)
        aps.append(q_ap)
        lambdas_.append(lambda_)
    else:

        print('miss')
        aps.append(0)
        ndcgs100.append(0)

ndcg_ds100 = np.round(np.mean(ndcgs100),4)
map_ds = np.round(np.mean(aps),4)
accuracy_dict['map']['icfw-LA'] = map_ds
accuracy_dict['ndcg10']['icfw-LA'] = ndcg_ds100
save_q_based(index_name, 'map', aps,'icfw-LA')
save_q_based(index_name, 'ndcg', ndcgs100,'icfw-LA')
print('ndcg10: ', ndcg_ds10)
print('map: ', map_ds)
    

ndcg10:  0.5779
map:  0.293


In [32]:
accuracy_dict['ndcg@100'] = accuracy_dict['ndcg10']
accuracy_dict_ = {k:v for k,v in accuracy_dict.items() if k not in ['ndcg100','ndcg10']}
acc_df = pd.DataFrame(accuracy_dict_)

acc_df

Unnamed: 0,map,ndcg@100
bm25_lin,0.2099,0.6212
bm25f,0.2165,0.4422
bm25fSimple,0.2168,0.4422
icfw-G,0.2917,0.5712
icfw-GA,0.2914,0.5779
icfw-LA,0.293,0.5275
