In [1]:
import os
import pyterrier as pt
if not pt.started():
    pt.init()
import ir_datasets as irds
import pandas as pd
from ir_measures import *
from ir_measures import evaluator

PyTerrier 0.10.0 has loaded Terrier 5.9 (built by craigm on 2024-05-02 17:40) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [2]:
DATASET = "msmarco-passage/trec-dl-2019/judged"
RUN_DIR = '/home/andrew/Documents/Code/Annotation/runs/trec-dl-2019'

In [3]:
dataset = irds.load(DATASET)
original_qrels = pd.DataFrame(dataset.qrels_iter())

## QREL CHECKS

In [4]:
values = original_qrels.relevance.value_counts()

In [5]:
len(original_qrels) # total number of judgements

9260

In [6]:
values[0] / (values[0] + values[1] + values[2] + values[3]) * 100 # percentage of non-relevant judgements

55.70194384449244

In [7]:
(values[0] + values[1]) / (values[0] + values[1] + values[2] + values[3]) * 100 # percentage of non-relevant and partially relevant judgements

72.9913606911447

## Distribution Stats

In [8]:
NUM_ANNOTATORS = 8

In [9]:
filtered_qrels = original_qrels[original_qrels.relevance != 0]
nonrel_qrels = original_qrels[original_qrels.relevance == 0]

In [10]:
# query_ids value counts nonrel_qrels
nonrel_qrels.query_id.value_counts()

query_id
168216     293
183378     222
1133167    207
1124210    191
855410     179
19335      174
264014     171
156493     167
1121709    166
104861     165
1037798    141
1110199    140
1117099    138
962179     136
1114819    129
1115776    128
1129237    119
1106007    118
1063750    116
146187     115
207786     113
1103812    110
405717     109
130510     105
1113437    103
1121402    100
915593     100
1114646     99
443396      94
490595      93
527433      85
359349      83
833860      82
1112341     81
182539      79
489204      79
87181       75
573724      72
131843      68
451602      66
87452       58
148538      58
47923       31
Name: count, dtype: int64

In [11]:
rel_query_pools = filtered_qrels.groupby('query_id').doc_id.apply(list).to_dict()
nonrel_query_pools = nonrel_qrels.groupby('query_id').doc_id.apply(list).to_dict()

In [12]:
print([len(x) for x in nonrel_query_pools.values()])

[141, 165, 116, 110, 118, 140, 81, 103, 99, 129, 128, 138, 100, 166, 191, 119, 207, 105, 68, 115, 58, 167, 293, 79, 222, 174, 113, 171, 83, 109, 94, 66, 31, 79, 93, 85, 72, 82, 179, 75, 58, 100, 136]


In [13]:
# count non rels by query
nonrel_counts = nonrel_qrels.groupby('query_id').size()

In [14]:
nonrel_counts.max()

293

In [15]:
len(filtered_qrels)

4102

In [16]:
# Initialize the document count for each annotator
queries = {qid : len(filtered_qrels[filtered_qrels['query_id'] == qid]) for qid in filtered_qrels['query_id'].unique()}
queries = {k: v for k, v in sorted(queries.items(), key=lambda item: item[1])}
annotators = [i for i in range(NUM_ANNOTATORS)]
annotator_load = {a: 0 for a in annotators}
assignments = {a: [] for a in annotators}
query_to_annotators = {q: [] for q, _ in queries.items()}

# Step 1: Assign queries to the two annotators with the least load
for query, num_docs in queries.items():
    sorted_annotators = sorted(annotator_load, key=annotator_load.get)
    annotator_1 = sorted_annotators[0]
    annotator_2 = sorted_annotators[1]
    
    assignments[annotator_1].append(query)
    assignments[annotator_2].append(query)
    
    query_to_annotators[query].extend([annotator_1, annotator_2])
    
    annotator_load[annotator_1] += num_docs
    annotator_load[annotator_2] += num_docs

# Step 2: Perform swaps to balance the load while ensuring each query is seen by two annotators
def swap_queries(assignments, annotator_load, query_to_annotators, delta=100):
    improved = True
    
    while improved:
        improved = False
        # Get the annotators with the maximum and minimum load
        max_annotator = max(annotator_load, key=annotator_load.get)
        min_annotator = min(annotator_load, key=annotator_load.get)
        
        max_load = annotator_load[max_annotator]
        min_load = annotator_load[min_annotator]
        
        if max_load - min_load <= delta:
            break
        
        # Try to find a query to swap
        for query in assignments[max_annotator]:
            query_docs = next(d for q, d in queries.items() if q == query)
            
            # Check if the query can be reassigned to the min_annotator
            if (min_annotator not in query_to_annotators[query] and
                len(query_to_annotators[query]) == 2):
                
                # Swap query from max_annotator to min_annotator
                assignments[max_annotator].remove(query)
                assignments[min_annotator].append(query)
                
                query_to_annotators[query].remove(max_annotator)
                query_to_annotators[query].append(min_annotator)
                
                annotator_load[max_annotator] -= query_docs
                annotator_load[min_annotator] += query_docs
                
                improved = True
                break
    
    return assignments, annotator_load, query_to_annotators

# Execute the balancing swap with the constraint
assignments, annotator_load, query_to_annotators = swap_queries(assignments, annotator_load, query_to_annotators, 25)


In [17]:
annotator_load

{0: 1015, 1: 1015, 2: 1028, 3: 1028, 4: 1035, 5: 1035, 6: 1024, 7: 1024}

In [18]:
assignments

# check that the number of docs in assignments is equal to the number in annotator load

for annotator, assigned in assignments.items():
    print(assigned)
    docs = sum([queries[query] for query in assigned])
    assert docs == annotator_load[annotator]

['855410', '146187', '130510', '1110199', '490595', '573724', '87452', '489204', '156493', '451602', '1133167', '1114646']
['855410', '146187', '130510', '1110199', '490595', '573724', '87452', '489204', '156493', '451602', '1133167', '1114646']
['1129237', '1121402', '359349', '527433', '87181', '148538', '1124210', '264014', '168216']
['1129237', '1121402', '359349', '527433', '87181', '148538', '1124210', '264014', '168216']
['833860', '915593', '47923', '104861', '183378', '1114819', '19335', '962179']
['833860', '915593', '47923', '104861', '183378', '1114819', '19335', '962179']
['405717', '182539', '131843', '1113437', '443396', '1117099', '1112341', '1063750', '1037798', '1115776', '1103812', '1121709', '207786', '1106007']
['405717', '182539', '131843', '1113437', '443396', '1117099', '1112341', '1063750', '1037798', '1115776', '1103812', '1121709', '207786', '1106007']


In [19]:
sum([len(assigned) for assigned in query_to_annotators.values()]) / len(query_to_annotators)

2.0

In [20]:
import random
from collections import defaultdict

def distribute_extra_annotations(assignments, query_pools, extra_annotations_per_annotator=100):
    num_per_query = {query: len(docs) for query, docs in query_pools.items()}
    # Initialize the structure for extra annotations
    extra_assignments = {annotator: defaultdict(list) for annotator in assignments}
    query_extra_annotations = {}  # Track extra annotations assigned to each query
    
    # Calculate the number of queries per annotator and distribute extra annotations
    for annotator, queries in assignments.items():
        num_queries = len(queries)
        if num_queries == 0:
            continue
        
        # Determine how many extra annotations per query
        annotations_per_query = extra_annotations_per_annotator // num_queries
        remaining_annotations = extra_annotations_per_annotator % num_queries

        # if any query in group has too few docs to sampel annotations from, distribute to other queries 


        # Distribute annotations across the queries
        for query in queries:
            available_annotations = query_pools[query]

            if len(available_annotations) < annotations_per_query:
                remaining_annotations += annotations_per_query - len(available_annotations)
                current_annotations = len(available_annotations)
            else: current_annotations = annotations_per_query
            
            if query not in query_extra_annotations:
                # Randomly select annotations for this query
                selected_annotations = random.sample(available_annotations, current_annotations)
                query_extra_annotations[query] = selected_annotations
                            
            # Assign annotations to this annotator
            if query not in extra_assignments[annotator]:
                extra_assignments[annotator][query] = []
                
            # Add the annotations from query_extra_annotations
            extra_assignments[annotator][query].extend(query_extra_annotations[query])
        
        # get largest num docs per query in current annotator and add remainder 
        max_docs = max([num_per_query[query] for query in queries])
        query = max(num_per_query, key=num_per_query.get)
        available_annotations = query_pools[query]
        selected_annotations = random.sample(available_annotations, remaining_annotations)
        extra_assignments[annotator][query].extend(selected_annotations)
        
    
    
    return extra_assignments

In [21]:
extra_assignments = distribute_extra_annotations(assignments, nonrel_query_pools)

In [22]:
extra_assignments[0]

defaultdict(list,
            {'855410': ['638673',
              '7238783',
              '4810232',
              '2240114',
              '5021348',
              '3594261',
              '2269747',
              '6806702'],
             '146187': ['3615724',
              '3615725',
              '5952424',
              '8792967',
              '3615731',
              '5217815',
              '1369241',
              '6066658'],
             '130510': ['7203252',
              '7717230',
              '1494937',
              '1481524',
              '978177',
              '779340',
              '1494933',
              '2079375'],
             '1110199': ['4057232',
              '2657988',
              '863984',
              '1852785',
              '5857927',
              '6392463',
              '2080928',
              '7221362'],
             '490595': ['3329498',
              '6599070',
              '48492',
              '3573382',
              '1160842',
        

In [23]:
for annotator, extra in extra_assignments.items():
    print(sum([len(docs) for docs in extra.values()]))

100
100
100
100
100
100
100
100


In [24]:
final = {
    'annotator' : [],
    'query_id' : [],
    'doc_id' : [],
}

for annotator, queries in assignments.items():
    for query in queries:
        for doc in rel_query_pools[query]:
            final['annotator'].append(annotator)
            final['query_id'].append(query)
            final['doc_id'].append(doc)
    
    for query in extra_assignments[annotator]:
        if len(extra_assignments[annotator][query]) == 0:
            print('No extra annotations for query', query)
        for doc in extra_assignments[annotator][query]:
            final['annotator'].append(annotator)
            final['query_id'].append(query)
            final['doc_id'].append(doc)

In [25]:
len(final['annotator'])

9004

In [26]:
final_frame = pd.DataFrame(final)

In [27]:
final_frame.annotator.value_counts()

annotator
5    1135
4    1135
3    1128
2    1128
6    1124
7    1124
0    1115
1    1115
Name: count, dtype: int64

In [28]:
final_frame.to_json('annotated_qrels.jsonl', orient='records', lines=True)

# Assign and dump to files

In [29]:
import ir_datasets as irds

In [30]:
dl19 = irds.load("msmarco-passage/trec-dl-2019/judged")

In [31]:
docs = pd.DataFrame(dl19.docs_iter()).set_index('doc_id').text.to_dict()
queries = pd.DataFrame(dl19.queries_iter()).set_index('query_id').text.to_dict()

In [32]:
final_frame['text'] = final_frame.doc_id.map(docs)
final_frame['query'] = final_frame.query_id.map(queries)

In [33]:
final_frame

Unnamed: 0,annotator,query_id,doc_id,text,query
0,0,855410,8651770,Theraderm is a manufacturer of clinical-grade ...,what is theraderm used for
1,0,855410,8651771,The main ingredient in this Theraderm cream is...,what is theraderm used for
2,0,855410,8651772,"Nowadays there are many skin care brands, like...",what is theraderm used for
3,0,855410,8651775,Theraderm Skin Renewal System. Theraderm Skin ...,what is theraderm used for
4,0,146187,1230566,There are 280 calories in a 1 burger serving o...,difference between a mcdouble and a double che...
...,...,...,...,...,...
8999,7,1106007,8204460,"At the same time, the junction of the visceral...",define visceral?
9000,7,1106007,483330,There's nothing necessarily wrong with being i...,define visceral?
9001,7,1106007,1646610,There are three types of muscle tissue: Viscer...,define visceral?
9002,7,168216,1011043,Bacterial Pneumonia Causes. 1 Most pneumonia ...,does legionella pneumophila cause pneumonia


In [34]:
assignees = ['andrew-parry', 'eugene-yang', 'ferdinand-schlatt', 'froebe', 'guglielmo-faggioli', 'harry-scells', 'saber-zerhoudi', 'sean-macavaney']

In [35]:
for id, group in final_frame.groupby('annotator'):
    assign = assignees[id]
    out = group.copy().drop(columns='annotator')
    out['comments'] = [[] for _ in range(len(out))]
    out['label'] = [[] for _ in range(len(out))]
    out.to_json(f'{assign}.jsonl', orient='records', lines=True)
    