This notebook helps to analyse all assessed queries in order to check if they meet the following three criteria:
1. 3 or more assessors assessed the query
2. the ms marco relevant passage is assessed relevant as well
3. the ms marco relevant passage is not the only relevant assessed passage



## Import Libraries

In [1]:
import pandas as pd
import json
import numpy as np
from collections import Counter

## Load Data

In [3]:
# filenames
firebase_data = 'data/data.json'
query_subset_filename = 'data/queries.dev.small.tsv'
qrels_filename = 'data/qrels.dev.small.tsv'
assessor_info = 'data/assessor_info.tsv'

In [4]:
with open(firebase_data, 'r') as infile:
    data = json.load(infile)
query_ids = list(data.keys())

In [5]:
query_subset = pd.read_csv(query_subset_filename,delimiter='\t',encoding='utf-8', header=None)
query_subset.columns = ['query_id', 'query']

In [6]:
qrels_df = pd.read_csv(qrels_filename,delimiter='\t',encoding='utf-8',header=None)
qrels_df.columns = ['query_id','label1','passage_id','label2']

In [7]:
assessor_df = pd.read_csv(assessor_info,delimiter='\t',encoding='utf-8',header=None)
assessor_df.columns = ["user_id","email","consent","contact","english","progress","assessed_queries"]

## Create DataFrames per Query

In [8]:
def create_labels(nr_assessors):
    labels = ["query_id","passageid","msmarco"]
    for i in range(nr_assessors):
        labels = labels + ['user%s_id'%(i+1),'user%s_label'%(i+1)]
    return labels

In [9]:
dataframe_data = {}
label_data = {}
assessor_data = {}
for query_id in query_ids:
    query_data_lists = []
    query_data = data[query_id]
    nr_assessors = 0
    for i, passage_id in enumerate(query_data.keys()):
        dataFrameRow2be = [query_id, passage_id] + query_data[passage_id]
        query_data_lists.append(dataFrameRow2be)
        if i == 0:
            nr_assessors = int((len(query_data[passage_id])-1)/2)
    dataframe_data[query_id] = query_data_lists
    label_data[query_id] = create_labels(nr_assessors)
    assessor_data[query_id] = nr_assessors

In [10]:
dataframes = {}
for query_id in query_ids:
    df = pd.DataFrame(dataframe_data[query_id],columns=label_data[query_id])
    dataframes[query_id] = df

## Check Missing Data

In [11]:
queries_with_missing_data = []
counter = 0
for query_id in query_ids:
    df = dataframes[query_id]
    if 'no_input' in df.values:
        counter += 1
        print(query_id)
        queries_with_missing_data.append(query_id)
print("nr of queries with missing data: %s"%(counter))

993153
973362
92542
637459
426442
785176
1101714
758519
904389
202306
1083852
736125
689885
971653
1097449
38946
1096257
825147
257885
988142
1045826
1077356
nr of queries with missing data: 22


In [12]:
for query_id in queries_with_missing_data:
    print(query_id)
    df = dataframes[query_id]
    nr_assessors = assessor_data[query_id]
    user_id_columns = ["user%s_label"%(i+1) for i in range(nr_assessors)]
    for user_id_column in user_id_columns:
        print(user_id_column)
        print(len(df[df[user_id_column].isin(["no_input"])]))
    print("\n")

993153
user1_label
0
user2_label
19
user3_label
1
user4_label
0
user5_label
0
user6_label
0
user7_label
0


973362
user1_label
0
user2_label
1


92542
user1_label
0
user2_label
0
user3_label
1


637459
user1_label
1


426442
user1_label
0
user2_label
3
user3_label
0


785176
user1_label
2


1101714
user1_label
1


758519
user1_label
0
user2_label
1
user3_label
0


904389
user1_label
0
user2_label
0
user3_label
1


202306
user1_label
0
user2_label
1
user3_label
0


1083852
user1_label
2


736125
user1_label
2


689885
user1_label
1
user2_label
0
user3_label
0


971653
user1_label
1


1097449
user1_label
0
user2_label
0
user3_label
0
user4_label
1
user5_label
0
user6_label
0
user7_label
0


38946
user1_label
1


1096257
user1_label
0
user2_label
1
user3_label
0


825147
user1_label
0
user2_label
0
user3_label
1
user4_label
0
user5_label
0


257885
user1_label
1


988142
user1_label
9


1045826
user1_label
1


1077356
user1_label
1
user2_label
0
user3_label
0




Most of the times it is just one data entry. Except for query 993153 for which user 2 only entered one label. We could either remove this query entirely from the dataset or we could just leave out the input from user 2 as there are 6 other assessors for this query. I will do the latter.

In [13]:
del dataframes['993153']['user2_label']
del dataframes['993153']['user2_id']

In [14]:
assessor_data['993153'] = 6

In [15]:
nr_assessors = assessor_data['993153']
user_column_names = []
for i in range(nr_assessors):
    user_column_names = user_column_names + ["user%s_id"%(i+1), "user%s_label"%(i+1)]
dataframes['993153'].columns = ['query_id','passageid','msmarco'] + user_column_names

## Check Criteria

First we check how many queries do not have at least 3 assessors.

In [16]:
print(len(query_ids))

125


In [17]:
less_then_3 = []
for query_id in query_ids:
    nr_assessors = assessor_data[query_id]
    if not nr_assessors >= 3:
        less_then_3.append(query_id)

In [21]:
print(len(less_then_3))

74


Second, we continue with the remaining queries to see for how many of these the assessors agree with the MS MARCO relevant passage. We take a binary threshold of 3 because this causes the more irrelevant passages than taking a threshold of 2. Which would result in more disagreement with the original relevant passage. Moreover, for the graded case we look for the cases where the MS MARCO relevant passage is assessed with a grade 1 (totally irrelevant). We also consider the cases where the MS MARCO relevant passage is not located in the top 20.

### binary threshold = 3

In [22]:
def getListUserLabelColumns(nr_assessors):
    label_columns = []
    for i in range(nr_assessors):
        label_columns.append("user%s_label"%(i+1))
    return label_columns

def makeBinary(labels):
    binary_labels = []
    for label in labels:
        if not label == "no_input":
            if int(label) < binary_threshold:
                binary_labels.append(0)
            else:
                binary_labels.append(1)
        else:
            binary_labels.append(label)
    return binary_labels

def getAgreementLabel(msmarco_label,labels):
    if "no_input" in labels:
        labels.remove('no_input')
        if usersDoAgree(labels):
            return labels[0]
        else:
            return performMajorityVote(msmarco_label,labels)
    else:
        if usersDoAgree(labels):
            return labels[0]
        else:
            return performMajorityVote(msmarco_label,labels)
        
def usersDoAgree(labels):
    if len(set(labels)) == 1:
        return True
    else:
        return False
    
def performMajorityVote(msmarco_label,labels):
    count_votes = Counter(labels)
    majority_label = msmarco_label
    if (len(labels) % 2) == 0:
        vote_threshold = int(len(labels)/2)
        for label,votes in count_votes.items():
            if votes > vote_threshold:
                majority_label = label
    else:
        vote_threshold = int(np.ceil(len(labels)/2))
        for label, votes in count_votes.items():
            if votes >= vote_threshold:
                majority_label = label
    return majority_label

In [26]:
# < 3 is irrelevant >= 3 is relevant
binary_threshold = 3
counter_no_relevant_in_top_20 = 0
counter_disagreement = 0

no_top20_ms_marco = []
no_agreement = []

for query_id in query_ids:
    df = dataframes[query_id]
    agreement_labels = []
    nr_assessors = assessor_data[query_id]
    if nr_assessors >= 3:
        relevant_row = df.loc[df['msmarco'] == 'relevant']
        if not len(relevant_row) == 0:
            relevant_row['msmarco_binary'] = relevant_row['msmarco'].apply(lambda x: 0 if (x == 'irrelevant') else 1)
            user_labels = relevant_row[getListUserLabelColumns(nr_assessors)].values[0]
            binary_labels = makeBinary(user_labels)
            agreement_label = getAgreementLabel(relevant_row['msmarco_binary'],binary_labels)
            if agreement_label == 0:
                print(query_id)
                counter_disagreement += 1
                no_agreement.append(query_id)
        else:
            counter_no_relevant_in_top_20 += 1
            no_top20_ms_marco.append(query_id)

427323
321951
838453
993987
866251
540906


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


In [25]:
print(counter_no_relevant_in_top_20)

3


In [27]:
print(counter_disagreement)

6


### graded 

In [28]:
def str2int(labels):
    integer_labels = []
    for label in labels:
        if not label == "no_input":
            integer_labels.append(int(label))
    return integer_labels

def getAgreementLabel(labels):
    return int(np.ceil(np.median(labels)))

def toNum(label):
    if label == "irrelevant":
        return 1
    else:
        return 5

In [29]:
counter_no_relevant_in_top_20_graded = 0
counter_disagreement_graded = 0

no_agreement_graded = []
no_top20_ms_marco_graded = []

for query_id in query_ids:
    df = dataframes[query_id]
    nr_assessors = assessor_data[query_id]
    if nr_assessors >= 3:
        relevant_row = df.loc[df['msmarco'] == 'relevant']
        if not len(relevant_row) == 0:
            assessor_labels = relevant_row.iloc[:,4::2].values[0]
            integer_labels = str2int(assessor_labels)
            agreement_label = getAgreementLabel(integer_labels)
            relevant_row['agreement_label'] = agreement_label
            relevant_row['msmarco'] = toNum(relevant_row['msmarco'].values)
            [msmarco_label, agreement_label] = relevant_row[['msmarco','agreement_label']].values[0]
            if agreement_label == 1:
                print(query_id)
                counter_disagreement_graded += 1
                no_agreement_graded.append(query_id)
        else:
            counter_no_relevant_in_top_20_graded += 1
            no_top20_ms_marco_graded.append(query_id)

427323
993987
540906


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [30]:
print(counter_no_relevant_in_top_20_graded)

3


In [31]:
print(counter_disagreement_graded)

3


In [32]:
no_agreement_total = []
for query_id in no_agreement:
    if not query_id in no_agreement_total:
        no_agreement_total.append(query_id)
        
for query_id in no_agreement_graded:
    if not query_id in no_agreement_total:
        no_agreement_total.append(query_id)
        
for query_id in no_top20_ms_marco:
    if not query_id in no_agreement_total:
        no_agreement_total.append(query_id)
        
for query_id in no_top20_ms_marco_graded:
    if not query_id in no_agreement_total:
        no_agreement_total.append(query_id)

In [33]:
print(len(no_agreement_total))

9


Finally, we continue with the remaining queries to see if it is not the case that the MS MARCO relevant passage is the only relevant one. Again we do this for both the threshold=3 and graded.

### Binary Treshold=3

In [35]:
for query_id in query_ids:
    df = dataframes[query_id]
    df['msmarco_binary'] = df['msmarco'].apply(lambda x: 0 if (x == 'irrelevant') else 1)

In [37]:
def makeBinary(labels):
    binary_labels = []
    for label in labels:
        if not label == "no_input":
            if int(label) < binary_threshold:
                binary_labels.append(0)
            else:
                binary_labels.append(1)
        else:
            binary_labels.append(label)
    return binary_labels

def getListUserLabelColumns(nr_assessors):
    label_columns = []
    for i in range(nr_assessors):
        label_columns.append("user%s_label"%(i+1))
    return label_columns

def getAgreementLabel(msmarco_label,labels):
    if "no_input" in labels:
        labels.remove('no_input')
        if usersDoAgree(labels):
            return labels[0]
        else:
            return performMajorityVote(msmarco_label,labels)
    else:
        if usersDoAgree(labels):
            return labels[0]
        else:
            return performMajorityVote(msmarco_label,labels)
        
def usersDoAgree(labels):
    if len(set(labels)) == 1:
        return True
    else:
        return False
    
def performMajorityVote(msmarco_label,labels):
    count_votes = Counter(labels)
    majority_label = msmarco_label
    if (len(labels) % 2) == 0:
        vote_threshold = int(len(labels)/2)
        for label,votes in count_votes.items():
            if votes > vote_threshold:
                majority_label = label
    else:
        vote_threshold = int(np.ceil(len(labels)/2))
        for label, votes in count_votes.items():
            if votes >= vote_threshold:
                majority_label = label
    return majority_label

In [39]:
# < 3 is irrelevant >= 3 is relevant
binary_threshold = 3

count_cases_where_only_ms_relevant = 0
ms_only_relevant_binary = []


for query_id in query_ids:
    if not query_id in less_then_3:
        if not query_id in no_agreement_total:
            df = dataframes[query_id]
            agreement_labels = []
            nr_assessors = assessor_data[query_id]
            if nr_assessors >= 3:
                for index, row in df.iterrows():
                    user_labels = row[getListUserLabelColumns(nr_assessors)].values
                    binary_labels = makeBinary(user_labels)
                    agreement_label = getAgreementLabel(row['msmarco_binary'],binary_labels)
                    agreement_labels.append(agreement_label)
                df['agreement_label'] = agreement_labels
                if (len(df[(df['msmarco_binary'] == 1) & (df['agreement_label'] == 1)]) == 1):
                    if(sum(df[df['msmarco_binary'] == 0]['agreement_label'].values.tolist()) == 0):
                        count_cases_where_only_ms_relevant += 1
                        print(query_id)
                        ms_only_relevant_binary.append(query_id)

489257
335710


### Graded

In [40]:
def getListUserLabelColumns(nr_assessors):
    label_columns = []
    for i in range(nr_assessors):
        label_columns.append("user%s_label"%(i+1))
    return label_columns

def str2int(labels):
    integer_labels = []
    for label in labels:
        if not label == "no_input":
            integer_labels.append(int(label))
    return integer_labels

def getAgreementLabel(labels):
    return int(np.ceil(np.median(labels)))

In [41]:
count_cases_where_only_ms_relevant = 0
ms_only_relevant_graded = []

for query_id in query_ids:
    if not query_id in less_then_3:
        if not query_id in no_agreement_total:
            df = dataframes[query_id]
            agreement_labels = []
            nr_assessors = assessor_data[query_id]
            if nr_assessors >= 3:
                for index, row in df.iterrows():
                    user_labels = row[getListUserLabelColumns(nr_assessors)].values
                    integer_labels = str2int(user_labels)
                    agreement_label = getAgreementLabel(integer_labels)
                    agreement_labels.append(agreement_label)
                df['agreement_label'] = agreement_labels
                unique_labels = set(df[df['msmarco'] == 'irrelevant']['agreement_label'].values.tolist())
                if len(unique_labels) == 1 and unique_labels[0] == 1:
                    count_cases_where_only_ms_relevant += 1
                    ms_only_relevant_graded.append(query_id)

In [42]:
ms_only_relevant_total = []

for query_id in ms_only_relevant_binary:
    if not query_id in ms_only_relevant_total:
        ms_only_relevant_total.append(query_id)
        
for query_id in ms_only_relevant_graded:
    if not query_id in ms_only_relevant_total:
        ms_only_relevant_total.append(query_id)

In [43]:
ms_only_relevant_total

['489257', '335710']

## Compute Experiment Queries

In [44]:
experiment_queries = []
for query_id in query_ids:
    if not query_id in less_then_3:
        if not query_id in no_agreement_total:
            if not query_id in ms_only_relevant_total:
                experiment_queries.append(query_id)

In [45]:
print(len(experiment_queries))

40


In [46]:
with open("output/experiment_queries.txt","w") as outfile:
    for query_id in experiment_queries:
        line = "%s\n"%(query_id)
        outfile.write(line)