In [1]:
from scipy.spatial.distance import jensenshannon
import numpy as np
import pyterrier as pt

if not pt.started():
    pt.init(tqdm="notebook")

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8



# 1. Obtain the dataset distribution

In [2]:
dataset_distribution = {'Man':0, 'Woman':0, 'Non-binary':0, 'Unknown':0, 'null':0}

In [3]:
dataset = pt.get_dataset('irds:trec-fair/2022/train')
for it in dataset.get_corpus_iter():
    title = it['title']
    if it['gender_category']:
        gender = it['gender_category']
        dataset_distribution[gender]+=1

trec-fair/2022/train documents:   0%|          | 0/6475537 [00:00<?, ?it/s]

In [48]:
print(dataset_distribution)

{'Man': 1494634, 'Woman': 353495, 'Non-binary': 781, 'Unknown': 4605703, 'null': 0}


# 2. Create functions to calculate AWRF and nDCG

In [8]:
def disc(k):
    """
    The discount function used in the nDCG/AWRF
    """
    return 1 / np.log2(max(k, 2))

In [267]:
def calculate_awrf(ranked_list, expected_distribution, attribute, n=10):
    """
    Calculate the AWRF of a result set
    result_df: DataFrame. The dataframe containing the results
    expected_distribution: Dict. The distribution of the dataset. e.g.
            {'Man': 1494634,
            'Woman': 353495,
            'Non-binary': 781,
            'Unknown': 4605703,
            'null': 0}
    attribute: Str. The attribute to consider. e.g. 'gender'
    n: Int.  refer to the number of documents to consider
    """
    result_df = ranked_list.copy()
    current_distribution = {key: 0 for key in expected_distribution}
    for index, row in result_df.iterrows():
        if index<n:
            if row[attribute]:
                current_distribution[row[attribute]]+= disc(index+1)
            else:
                current_distribution['null']+= disc(index+1)
    
    desired_distribution = np.array(list(expected_distribution.values()))
    current_distribution = np.array(list(current_distribution.values()))

    target_distr = desired_distribution/np.sum(desired_distribution)
    current_distr = current_distribution/np.sum(current_distribution)
    awrf = 1- jensenshannon(target_distr, current_distr)**2
    return awrf
        


# 3. Test AWRF with raw BM25

In [263]:
index_ref = pt.IndexRef.of('D:/Q7/InformationRetrieval/group_project/test_dataset/indices_full/trec-fair_2022_multi')
pipeline = pt.BatchRetrieve(index_ref, wmodel='BM25', metadata=['gender', 'gender_category', 'title'])
result = pipeline.search('agricultural')
result

01:29:34.011 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 7.3 GiB of memory would be required.


Unnamed: 0,qid,docid,gender,gender_category,title,rank,score,query
0,1,2951586,[],Unknown,Agricultural law,0,11.351495,agricultural
1,1,603749,[],Unknown,Agricultural communication,1,11.336769,agricultural
2,1,1766318,[],Unknown,Census of agriculture,2,11.334330,agricultural
3,1,5579710,[],Unknown,Agricultural literacy,3,11.330249,agricultural
4,1,6401476,[],Unknown,Ministry of Agriculture and Forestry,4,11.326292,agricultural
...,...,...,...,...,...,...,...,...
995,1,6214276,[],Unknown,Shanxi Agricultural University,995,10.707407,agricultural
996,1,2649856,[],Unknown,Federal Agricultural Mortgage Corporation,996,10.706961,agricultural
997,1,186645,"[""male""]",Man,He Kang,997,10.706825,agricultural
998,1,1235218,[],Unknown,Agricultural Wheel,998,10.706410,agricultural


In [264]:
awrf10 = calculate_awrf(result, dataset_distribution, 'gender_category', n=10)
awrf20 = calculate_awrf(result, dataset_distribution, 'gender_category', n=20)
awrf30 = calculate_awrf(result, dataset_distribution, 'gender_category', n=30)

desired_distribution = {'Man': 1,
            'Woman': 1,
            'Non-binary': 1,
            'Unknown': 0.1,
            'null': 0.1}

awrf10_ = calculate_awrf(result, desired_distribution, 'gender_category', n=10)
awrf20_ = calculate_awrf(result, desired_distribution, 'gender_category', n=20)
awrf30_ = calculate_awrf(result, desired_distribution, 'gender_category', n=30)

print("AWRF@10 with dataset distribution: ", awrf10)
print("AWRF@20 with dataset distribution: ", awrf20)
print("AWRF@30 with dataset distribution: ", awrf30)
print("AWRF@10 with desired distribution: ", awrf10_)
print("AWRF@20 with desired distribution: ", awrf20_)
print("AWRF@30 with desired distribution: ", awrf30_)

AWRF@10 with dataset distribution:  0.8886972609010921
AWRF@20 with dataset distribution:  0.8886972609010921
AWRF@30 with dataset distribution:  0.8886972609010921
AWRF@10 with desired distribution:  0.3768715794213452
AWRF@20 with desired distribution:  0.3768715794213452
AWRF@30 with desired distribution:  0.3768715794213452
