This notebook creates a relevance label dataset with binary labels (either relevant or irrelevant). The threshold that determines whether a label is relevant or irrelevant is 2: if an assessor's label is lower than 2 it counts as irrelevant, otherwise it counts as relevant.

## Import Libraries

In [1]:
import pandas as pd
import json
import numpy as np
from collections import Counter

## Load Data

In [2]:
# Read the raw assessment data; its top-level keys are used as the query ids.
with open('data.json') as infile:
    data = json.load(infile)
query_ids = list(data)

## Create Dataframes per Query

In [3]:
def create_labels(nr_assessors):
    """Build the column names for a per-query dataframe.

    The first three columns are fixed ("query_id", "passageid", "msmarco").
    They are followed by one 'userN_id' / 'userN_label' pair per assessor,
    with N counting up from 1 to nr_assessors.
    """
    columns = ["query_id", "passageid", "msmarco"]
    for assessor_nr in range(1, nr_assessors + 1):
        columns.extend(['user%s_id' % assessor_nr, 'user%s_label' % assessor_nr])
    return columns

In [4]:
dataframe_data = {}
label_data = {}
assessor_data = {}
for query_id in query_ids:
    query_data = data[query_id]
    # One row per passage: query id, passage id, then the values stored for that passage.
    query_data_lists = [
        [query_id, passage_id] + passage_values
        for passage_id, passage_values in query_data.items()
    ]
    # The assessor count is derived from the FIRST passage only: its stored
    # list minus one entry, halved (create_labels lays out one 'msmarco' column
    # followed by an id/label pair per assessor). Stays 0 for an empty query.
    nr_assessors = 0
    for passage_values in query_data.values():
        nr_assessors = int((len(passage_values) - 1) / 2)
        break
    dataframe_data[query_id] = query_data_lists
    label_data[query_id] = create_labels(nr_assessors)
    assessor_data[query_id] = nr_assessors

In [5]:
# One dataframe per query, built from the rows and column names prepared above.
dataframes = {
    query_id: pd.DataFrame(dataframe_data[query_id], columns=label_data[query_id])
    for query_id in query_ids
}

## Compute Binary MSMARCO Label

In [6]:
for query_id in query_ids:
    df = dataframes[query_id]
    # Any MSMARCO value other than 'irrelevant' is treated as relevant (1).
    df['msmarco_binary'] = df['msmarco'].map(lambda value: int(value != 'irrelevant'))

## Check for missing data

Sometimes an assessor forgot to provide input, which means that the dataset includes "no_input". This is missing data, and often we can fix it by considering the input of the remaining assessors. If the remaining assessors agree, I can still provide the agreed relevance label. If they do not all agree, a majority vote among them decides; if there is no majority, I simply take the relevance label that MSMARCO provides.

In [7]:
# Collect (and print) every query with at least 3 assessors whose dataframe
# contains a "no_input" cell anywhere.
queries_with_missing_data = []
for query_id in query_ids:
    if assessor_data[query_id] < 3:
        continue
    if 'no_input' in dataframes[query_id].values:
        print(query_id)
        queries_with_missing_data.append(query_id)
counter = len(queries_with_missing_data)
print("nr of queries with missing data: %s"%(counter))

825147
1097449
758519
689885
426442
904389
202306
1077356
1096257
92542
993153
nr of queries with missing data: 11


In [8]:
# For every query with missing data, print how many "no_input" entries each
# assessor's label column contains.
for query_id in queries_with_missing_data:
    print(query_id)
    df = dataframes[query_id]
    # Iterate over this query's own assessor count instead of a hard-coded
    # maximum of 7, so no assessor column is silently skipped.
    for i in range(assessor_data[query_id]):
        column = 'user%s_label'%(i+1)
        print(column)
        print(int((df[column] == "no_input").sum()))
    print("\n")

825147
user1_label
0
user2_label
0
user3_label
1
user4_label
0
user5_label
0


1097449
user1_label
0
user2_label
0
user3_label
0
user4_label
1
user5_label
0
user6_label
0
user7_label
0


758519
user1_label
0
user2_label
1
user3_label
0


689885
user1_label
1
user2_label
0
user3_label
0


426442
user1_label
0
user2_label
3
user3_label
0


904389
user1_label
0
user2_label
0
user3_label
1


202306
user1_label
0
user2_label
1
user3_label
0


1077356
user1_label
1
user2_label
0
user3_label
0


1096257
user1_label
0
user2_label
1
user3_label
0


92542
user1_label
0
user2_label
0
user3_label
1


993153
user1_label
0
user2_label
19
user3_label
1
user4_label
0
user5_label
0
user6_label
0
user7_label
0




Most queries have just one 'no_input' entry. The exceptions are query 426442, which has 3 entries, and query 993153, which has 20 entries (19 of them from a single assessor). So we need to remove query 993153 from the data and find a way to deal with the remaining queries.

In [9]:
# Drop query 993153 (too much missing input). The list is rebuilt instead of
# calling .remove(), so re-running this cell is a no-op rather than a ValueError.
query_ids = [query_id for query_id in query_ids if query_id != '993153']

## Compute Binary Agreement Label

In [10]:
# Cut-off used to binarize an assessor's label:
#   label <  binary_threshold -> irrelevant (0)
#   label >= binary_threshold -> relevant (1)
binary_threshold = 2

In [11]:
def getListUserLabelColumns(nr_assessors):
    """Return the label column names 'user1_label' ... 'userN_label' for N = nr_assessors."""
    return ["user%s_label" % assessor_nr for assessor_nr in range(1, nr_assessors + 1)]

In [12]:
def makeBinary(labels, threshold=None):
    """Convert assessor labels to binary relevance labels.

    Parameters
    ----------
    labels : iterable
        Assessor labels. Each entry is either the string "no_input" or a value
        that int() can convert.
    threshold : int, optional
        Labels strictly below this value become 0 (irrelevant); all others
        become 1 (relevant). Defaults to the notebook-level `binary_threshold`,
        so existing calls `makeBinary(labels)` behave as before.

    Returns
    -------
    list
        One entry per input label: 0, 1, or the unchanged "no_input" marker.
    """
    if threshold is None:
        # Resolved at call time so the notebook-level setting stays the default.
        threshold = binary_threshold
    return [
        label if label == "no_input" else int(int(label) >= threshold)
        for label in labels
    ]

In [13]:
def getAgreementLabel(msmarco_label,labels):
    """Derive a single relevance label from the assessors' binary labels.

    Parameters
    ----------
    msmarco_label
        Label to fall back on when the assessors give no majority.
    labels : iterable
        Binary labels, possibly containing "no_input" markers for missing
        assessments.

    Returns
    -------
    The label shared by all assessors that gave input, otherwise the result of
    performMajorityVote (which falls back to msmarco_label).
    """
    # Drop EVERY missing assessment. list.remove() would drop only the first
    # one, so with several missing entries "no_input" could win the vote and be
    # returned as the label. A new list also leaves the caller's list untouched.
    valid_labels = [label for label in labels if label != 'no_input']
    if usersDoAgree(valid_labels):
        return valid_labels[0]
    # No unanimous agreement (or no valid labels at all): decide by majority.
    return performMajorityVote(msmarco_label, valid_labels)

In [14]:
def usersDoAgree(labels):
    """Return True when `labels` contains exactly one distinct value, else False.

    An empty collection has no distinct values and therefore yields False.
    """
    return len(set(labels)) == 1

In [15]:
def performMajorityVote(msmarco_label,labels):
    """Return the label chosen by a strict majority of `labels`.

    A label wins only when it received more than half of all votes. If no
    label does (including a tie or an empty list), `msmarco_label` is returned.
    """
    nr_votes = len(labels)
    for candidate, votes in Counter(labels).items():
        # Strictly more than half; at most one candidate can satisfy this.
        if 2 * votes > nr_votes:
            return candidate
    return msmarco_label

In [16]:
# Add an 'agreement_label' column to every remaining query's dataframe.
for query_id in query_ids:
    df = dataframes[query_id]
    label_columns = getListUserLabelColumns(assessor_data[query_id])
    # Row by row: binarize the assessors' labels, then reduce them to a single
    # label, passing the row's binary MSMARCO label as the fallback.
    agreement_labels = [
        getAgreementLabel(row['msmarco_binary'], makeBinary(row[label_columns].values))
        for _, row in df.iterrows()
    ]
    df['agreement_label'] = agreement_labels
    dataframes[query_id] = df

## Create Thesis Dataset

This section creates the dataset that is used for my thesis.

We need to check whether the agreement labels also mark the msmarco relevant passage as relevant. If not, we remove the query from the dataset: in that case the subjects did not label the msmarco relevant passage as relevant, and if I then manually labelled this passage as relevant I would be overriding the assessments of the assessors. This is why it is better to leave these cases out of the dataset.

In [17]:
# Queries (with at least 3 assessors) in which some passage is relevant
# according to MSMARCO but received agreement label 0.
queries_2_remove = []
for query_id in query_ids:
    if assessor_data[query_id] < 3:
        continue
    df = dataframes[query_id]
    msmarco_relevant_rejected = (df['msmarco_binary'] == 1) & (df['agreement_label'] == 0)
    if msmarco_relevant_rejected.any():
        queries_2_remove.append(query_id)

In [18]:
# Display the queries whose MSMARCO-relevant passage got agreement label 0.
queries_2_remove

['540906', '993987', '427323']

There are 3 queries for which the subjects did not label the msmarco relevant passage as relevant. So these queries will be left out of the dataset.

Next we want to get rid of all queries that (after majority voting) have only the msmarco passage as relevant. We want to get rid of those queries as they do not provide any changes to the original dataset.

In [19]:
# Sum of the agreement labels per query, for queries with at least 3 assessors.
query_agreement_data = {
    query_id: dataframes[query_id]['agreement_label'].sum()
    for query_id in query_ids
    if assessor_data[query_id] >= 3
}

In [20]:
# Also schedule for removal every query with fewer than 2 relevant passages,
# skipping ids that are already listed.
for query_id, nr_relevant_passages in query_agreement_data.items():
    if nr_relevant_passages < 2 and query_id not in queries_2_remove:
        queries_2_remove.append(query_id)

In [21]:
# Display the removal list, now also holding queries with fewer than 2 relevant passages.
queries_2_remove

['540906', '993987', '427323', '335710']

In [22]:
# Filter out the queries scheduled for removal. The list is rebuilt instead of
# calling .remove() per id, so re-running this cell does not raise ValueError.
query_ids = [query_id for query_id in query_ids if query_id not in queries_2_remove]

Now we want to create the actual dataset

In [23]:
# Collect one (query id, passage id) pair for every passage with agreement
# label 1, over all remaining queries with at least 3 assessors.
relevant_queries = []
relevant_passages = []
counter = 0
for query_id in query_ids:
    if assessor_data[query_id] < 3:
        continue
    counter += 1
    df = dataframes[query_id]
    relevance_df = df[df['agreement_label'] == 1]
    relevant_queries.extend(relevance_df['query_id'].values.tolist())
    relevant_passages.extend(relevance_df['passageid'].values.tolist())

# Four output columns: query id, constant 0, passage id, constant 1.
output_df = pd.DataFrame()
output_df['query_id'] = relevant_queries
output_df['label1'] = 0
output_df['passage_id'] = relevant_passages
output_df['label2'] = 1

In [24]:
# Write the dataset as a headerless, index-free TSV. The file name is derived
# from binary_threshold so that changing the threshold cannot silently
# overwrite a file labelled with a different threshold.
output_df.to_csv('thesis_dataset_binary_threshold%s.tsv' % binary_threshold, sep='\t', index=False, header=False)

In [25]:
# Number of remaining queries with at least 3 assessors that were processed for the dataset.
counter

46