In [1]:
import pyterrier as pt
import json
import numpy as np
from scipy.spatial.distance import jensenshannon
import ast
import pandas as pd
import ir_datasets
if not pt.started():
    pt.init(tqdm="notebook")

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8



# 1. Define evaluation metrics

In [2]:
def dcg(scores, k):
    return np.sum([(rel / np.log2(idx + 2)) for idx, rel in enumerate(scores[:k], start=1)])

def calculate_ndcg(ranked_list, qrel_dict, qid, n):
    """
    Calculate the nDCG of a result set
    result_df: DataFrame. The dataframe containing the results
    """
    result_df = ranked_list.copy()
    result_df['docno'] = result_df['docno'].astype(str)
    # pair = (qid, result_df['docid'].values[0])
    
    result_df['gt_relevance'] = result_df.apply(lambda row: qrel_dict.get((qid, str(row['docno'])), 0), axis=1)
    scores = result_df['gt_relevance'].values[0:n]
    ideal_scores = sorted(scores, reverse=True)
    DCG = dcg(scores, n)
    IDCG = dcg(ideal_scores, n)
    if IDCG >0:
        return DCG/IDCG
    else:
        return 0
    
def disc(k):
    """
    The discount function used in the nDCG/AWRF
    """
    return 1 / np.log2(max(k, 2))

def calculate_awrf(ranked_list, expected_distribution, attribute, n=10):
    """
    Calculate the AWRF of a result set
    result_df: DataFrame. The dataframe containing the results
    expected_distribution: Dict. The distribution of the dataset. e.g.
            {'Man': 1494634,
            'Woman': 353495,
            'Non-binary': 781,
            'Unknown': 4605703,
            'null': 0}
    attribute: Str. The attribute to consider. e.g. 'gender'
    n: Int.  refer to the number of documents to consider
    """
    result_df = ranked_list.copy()
    current_distribution = {key: 0 for key in expected_distribution}
    for index, row in result_df.iterrows():
        if index<n:
            if row[attribute]:
                current_distribution[row[attribute]]+= disc(index+1)
            else:
                current_distribution['null']+= disc(index+1)
    
    desired_distribution = np.array(list(expected_distribution.values()))
    current_distribution = np.array(list(current_distribution.values()))

    target_distr = desired_distribution/np.sum(desired_distribution)
    current_distr = current_distribution/np.sum(current_distribution)
    awrf = 1- jensenshannon(target_distr, current_distr)**2
    return awrf

# 2. Obtain dataset distribution of gender and qrel for AWRF and NDCG calculation

In [3]:
dataset = pt.get_dataset('irds:trec-fair/2022/train')
dataset_distribution = {'Man':0, 'Woman':0, 'Non-binary':0, 'Unknown':0, 'null':0}
for it in dataset.get_corpus_iter():
    title = it['title']
    if it['gender_category']:
        gender = it['gender_category']
        dataset_distribution[gender]+=1


dataset = ir_datasets.load("trec-fair/2022/train")
qrel_dict = {(qrel.query_id, qrel.doc_id): qrel.relevance for qrel in dataset.qrels_iter()}

trec-fair/2022/train documents:   0%|          | 0/6475537 [00:00<?, ?it/s]

# 3. Obtain biography related queries

In [18]:
biography_related_topics = ['396', '397', '403', '770', '1371']
with open("train_topics_meta.jsonl", "r") as f:
    topics = [json.loads(line) for line in f]
# "id":396,"title":"Biography/WikiProject Actors and Filmmakers"
# "id":397,"title":"Biography/WikiProject Musicians"
# "id":403,"title":"Biography/science and academia work group"
# "id":770,"title":"Crime and Criminal Biography"
# "id":1371,"title":"Japan/Biography task force"
topic_id = topics[0]['id']
all_topic_queries = topics[0]['keywords']
test_query_dict = {}
for key, val in topic_id.items():
    if str(val) in biography_related_topics:
        topic_queries = all_topic_queries[key]
        topic_query_list = ast.literal_eval(topic_queries)
        test_query_dict[str(val)] = topic_query_list


# 4. Build the retrieval pipeline

In [20]:
index_ref = pt.IndexRef.of('D:/Q7/InformationRetrieval/group_project/test_dataset/indices_full/trec-fair_2022_multi')
pipeline = pt.BatchRetrieve(index_ref, wmodel='BM25', metadata=['gender', 'gender_category', 'title', 'docno'])

19:40:14.417 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 7.3 GiB of memory would be required.


# 5. Retrieve and evaluation

In [21]:
eval_reulst = []
from tqdm import tqdm
for topic_id, topic_query_list in tqdm(test_query_dict.items()):
    for query in topic_query_list:
        result = pipeline.search(query)
        nDCG20 = calculate_ndcg(result, qrel_dict, qid=topic_id, n=20)
        awrf20 = calculate_awrf(result, dataset_distribution, 'gender_category', n=20)
        eval_reulst.append([topic_id, query, nDCG20, awrf20])

eval_df = pd.DataFrame(eval_reulst, columns=["topic_id", "query", "nDCG@20", "AWRF@20"])

100%|██████████| 5/5 [00:34<00:00,  6.82s/it]


In [23]:
eval_df

Unnamed: 0,topic_id,query,nDCG@20,AWRF@20
0,396,actor,0.480360,0.980590
1,396,cast,0.000000,0.928159
2,396,screenwriter,0.641466,0.745758
3,396,filmmaker,0.467492,0.961185
4,396,film,0.000000,0.888697
...,...,...,...,...
279,1371,kawashima,0.686181,0.886308
280,1371,kazufumi,0.792481,0.959422
281,1371,utada,0.000000,0.888697
282,1371,yuriko,0.533695,0.851755
