In [7]:
import pyterrier as pt
import json
import numpy as np
from scipy.spatial.distance import jensenshannon
import matplotlib.pyplot as plt
import ast
import pandas as pd
import ir_datasets
# Call pt.init only when pt.started() reports False, so re-running this cell
# does not initialise PyTerrier a second time.
# NOTE(review): tqdm="notebook" presumably selects notebook-style progress
# bars inside PyTerrier -- confirm against the PyTerrier documentation.
if not pt.started():
    pt.init(tqdm="notebook")

# 1. Define evaluation metrics

In [18]:
def dcg(scores, k):
    """
    Discounted cumulative gain of the first k relevance scores.

    scores: Sequence of numbers. Relevance gains in ranked order (best rank first).
    k: Int. Rank cut-off; only the first k scores contribute.

    Returns sum(rel_i / log2(i + 1)) over the 1-based ranks i = 1..k, so the
    top-ranked item is undiscounted (log2(2) == 1). Returns 0.0 for no scores.
    """
    gains = np.asarray(scores[:k], dtype=float)
    # 1-based rank i is discounted by log2(i + 1): 1, log2(3), 2, ...
    # (the previous code started at log2(3), discounting even the first rank).
    discounts = np.log2(np.arange(2, len(gains) + 2))
    return np.sum(gains / discounts)

def calculate_ndcg(ranked_list, n):
    """
    Calculate the nDCG@n of a ranked result list.

    ranked_list: List of (rank, doc) pairs in ranked order, where doc is a
            mapping that holds the relevance score under the 'qrel' key.
    n: Int. Rank cut-off.

    Returns DCG@n of the list as given divided by DCG@n of the same documents
    re-ordered by decreasing relevance, or 0 when the ideal DCG is not positive.
    """
    all_scores = [entry[1]['qrel'] for entry in ranked_list]

    # The ideal ordering is taken over every document in the list, not just the
    # first n: otherwise a ranking whose top n are merely sorted among
    # themselves would score 1 even when better documents sit below rank n.
    ideal_scores = sorted(all_scores, reverse=True)

    # dcg() applies the cut-off itself, so the full lists can be passed through.
    DCG = dcg(all_scores, n)
    IDCG = dcg(ideal_scores, n)
    if IDCG > 0:
        return DCG / IDCG
    return 0
    
def disc(k):
    """
    Logarithmic rank discount: 1 / log2(rank), with the rank clamped to >= 2.

    k: Number. 1-based rank. Because of the clamp, ranks 1 and 2 both
       receive a weight of 1.
    """
    clamped_rank = max(k, 2)
    weight = 1 / np.log2(clamped_rank)
    return weight

def calculate_awrf(ranked_list, expected_distribution, attribute, n=10):
    """
    Calculate the AWRF (attention-weighted rank fairness) of a ranked list.

    ranked_list: List of (index, row) pairs, where index is the 0-based rank
            and row is a mapping that contains `attribute`.
    expected_distribution: Dict. The distribution of the dataset. e.g.
            {'Man': 1494634,
            'Woman': 353495,
            'Non-binary': 781,
            'Unknown': 4605703,
            'null': 0}
    attribute: Str. The attribute to consider. e.g. 'gender'
    n: Int. Number of top-ranked documents to consider (those with index < n).

    Returns 1 - JS(target, exposure)**2, where `exposure` is the
    disc()-weighted share of each group among the considered documents and
    `target` is the normalised expected distribution. Returns NaN when either
    distribution has zero total mass (e.g. empty list or n <= 0).

    NOTE(review): scipy's jensenshannon uses the natural logarithm unless
    `base` is given, so the result lies in [1 - ln 2, 1]; pass base=2 there
    if a [0, 1] range is intended -- confirm against the metric definition.
    """
    # Copy so that groups added below never mutate the caller's dict.
    target_weights = dict(expected_distribution)
    exposure = {group: 0.0 for group in target_weights}

    for index, row in ranked_list:
        if index < n:
            # Empty / missing attribute values are pooled under 'null'.
            group = row[attribute] if row[attribute] else 'null'
            if group not in exposure:
                # A group the expected distribution does not list has target
                # mass 0 (mirrors the 'null': 0 entry in the example above)
                # instead of raising KeyError.
                target_weights[group] = 0
                exposure[group] = 0.0
            exposure[group] += disc(index + 1)

    # Both dicts share the same key order, so the vectors are aligned.
    desired_distribution = np.array(list(target_weights.values()), dtype=float)
    current_distribution = np.array(list(exposure.values()), dtype=float)

    desired_total = desired_distribution.sum()
    current_total = current_distribution.sum()
    if desired_total == 0 or current_total == 0:
        # Nothing to normalise; report "undefined" without a 0/0 warning.
        return float('nan')

    target_distr = desired_distribution / desired_total
    current_distr = current_distribution / current_total
    # jensenshannon returns the JS *distance*; squaring gives the divergence.
    awrf = 1 - jensenshannon(target_distr, current_distr) ** 2
    return awrf

# 2. Obtain the dataset's gender distribution for the AWRF and nDCG calculations

In [9]:
import os
import json

# Directory that holds the index artefacts (distribution.json, llm_ranking.json).
# The default is the absolute mount point; the previous cwd-relative chain of
# '..' segments only reached it when the working directory was at most eight
# levels deep. Set the INDEX_ROOT environment variable to use another location.
index_root = os.environ.get(
    'INDEX_ROOT',
    os.path.join(os.sep, 'media', 'steve', 'PortableSSD', 'index'),
)

distribution_file = os.path.join(index_root, 'distribution.json')

# JSON is UTF-8 by specification; do not rely on the platform default encoding.
with open(distribution_file, 'r', encoding='utf-8') as fp:
    dataset_distribution = json.load(fp)

# Display the loaded group -> count mapping as the cell output.
dataset_distribution

{'Man': 1495328, 'Woman': 353642, 'Non-binary': 781}

# 3. Retrieve and evaluate

In [10]:
# Read the LLM rankings stored next to the index; the JSON maps a ranking id
# to a list of [index, {attributes}] entries (see the cell output).
llm_ranking_file = os.path.join(index_root, 'llm_ranking.json')
with open(llm_ranking_file) as ranking_fp:
    llm_ranking = json.loads(ranking_fp.read())

llm_ranking

{'4': [[0, {'gender_category': 'Man', 'qrel': 0.45942733}],
  [1, {'gender_category': 'Man', 'qrel': 0.34803268}],
  [2, {'gender_category': 'Woman', 'qrel': 0.43804768}],
  [3, {'gender_category': 'Woman', 'qrel': 0.41044617}],
  [4, {'gender_category': 'Woman', 'qrel': 0.57402503}],
  [5, {'gender_category': 'Woman', 'qrel': 0.22064358}],
  [6, {'gender_category': 'Man', 'qrel': 0.44185033}],
  [7, {'gender_category': 'Woman', 'qrel': 0.34168866}],
  [8, {'gender_category': 'Man', 'qrel': 0.2258933}],
  [9, {'gender_category': 'Man', 'qrel': 0.4571543}],
  [10, {'gender_category': 'Woman', 'qrel': 0.24468344}],
  [11, {'gender_category': 'Man', 'qrel': 0.53643125}],
  [12, {'gender_category': 'Woman', 'qrel': 0.27914345}],
  [13, {'gender_category': 'Man', 'qrel': 0.2753966}],
  [14, {'gender_category': 'Man', 'qrel': 0.5608159}],
  [15, {'gender_category': 'Man', 'qrel': 0.47655153}],
  [16, {'gender_category': 'Man', 'qrel': 0.38892874}],
  [17, {'gender_category': 'Man', 'qrel': 0

In [19]:
from tqdm import tqdm

# Rank cut-offs evaluated for every ranking; the order must match the
# column list passed to the DataFrame below.
cutoffs = (5, 10, 20, 50)

eval_reulst = []
for ranking_id, ranking in tqdm(llm_ranking.items()):
    ndcg_at = []
    awrf_at = []
    for cutoff in cutoffs:
        ndcg_at.append(calculate_ndcg(ranking, n=cutoff))
        awrf_at.append(
            calculate_awrf(ranking, dataset_distribution, 'gender_category', n=cutoff)
        )

    # One row per ranking: all nDCG values first, then all AWRF values.
    eval_reulst.append(ndcg_at + awrf_at)

eval_df = pd.DataFrame(
    eval_reulst,
    columns=["nDCG5", "nDCG10", "nDCG20", "nDCG50", "awrf5", "awrf10", "awrf20", "awrf50"],
)

  0%|          | 0/5 [00:00<?, ?it/s]


TypeError: list indices must be integers or slices, not str

In [94]:
# Persist the per-ranking metrics without the DataFrame's row index.
eval_df.to_csv(path_or_buf="LLMresults.csv", index=False)