## Import Libraries

In [1]:
import pandas as pd
import json
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt

## Load Data

In [2]:
# Input filenames used by the loading cells below.
firebase_data = 'data.json'  # JSON file parsed into `data`; its top-level keys become `query_ids`
query_subset_filename = 'queries.dev.small.tsv'  # TSV loaded with columns: query_id, query
qrels_filename = 'qrels.dev.small.tsv'  # TSV loaded with columns: query_id, label1, passage_id, label2
bm25_filename = 'run_development_top100.tsv'  # TSV loaded with columns: query_id, passage_id, bm25_rank
assessor_info = 'assessor_info.tsv'  # TSV loaded with columns: user_id, email, consent, contact, english, progress, assessed_queries

In [3]:
# Parse the JSON export. The encoding is given explicitly so the result does not
# depend on the platform's default locale encoding (the TSV loaders in this
# notebook already read their files as UTF-8).
with open(firebase_data, 'r', encoding='utf-8') as infile:
    data = json.load(infile)
# The top-level keys of the export are the query ids (kept as strings).
query_ids = list(data.keys())

In [4]:
# Header-less TSV: read it raw, then name the two columns.
query_subset = pd.read_csv(
    query_subset_filename,
    sep='\t',
    encoding='utf-8',
    header=None,
)
query_subset.columns = [
    'query_id',
    'query',
]

In [5]:
# Header-less TSV: read it raw, then name the three columns.
bm25_df = pd.read_csv(
    bm25_filename,
    sep='\t',
    encoding='utf-8',
    header=None,
)
bm25_df.columns = [
    'query_id',
    'passage_id',
    'bm25_rank',
]

In [None]:
# Header-less TSV: read it raw, then name the four columns.
qrels_df = pd.read_csv(
    qrels_filename,
    sep='\t',
    encoding='utf-8',
    header=None,
)
qrels_df.columns = [
    'query_id',
    'label1',
    'passage_id',
    'label2',
]

In [6]:
# Header-less TSV: read it raw, then name the seven columns.
assessor_df = pd.read_csv(
    assessor_info,
    sep='\t',
    encoding='utf-8',
    header=None,
)
assessor_df.columns = [
    "user_id",
    "email",
    "consent",
    "contact",
    "english",
    "progress",
    "assessed_queries",
]

## Create Dataframes per Query

In [7]:
def create_labels(nr_assessors):
    """Build the column names of a per-query dataframe.

    Args:
        nr_assessors: number of assessors that get a column pair.
    Returns:
        The three fixed names ("query_id", "passageid", "msmarco") followed by
        "user<N>_id", "user<N>_label" for N = 1 .. nr_assessors.
    """
    user_columns = [
        template % (assessor_nr,)
        for assessor_nr in range(1, nr_assessors + 1)
        for template in ('user%s_id', 'user%s_label')
    ]
    return ["query_id", "passageid", "msmarco"] + user_columns

In [8]:
# Per query id: the raw dataframe rows, the matching column labels and the
# number of assessors.
dataframe_data = {}
label_data = {}
assessor_data = {}
for query_id in query_ids:
    query_data = data[query_id]
    # Each row is [query_id, passage_id] followed by the values stored for that passage.
    dataframe_data[query_id] = [
        [query_id, passage_id] + passage_values
        for passage_id, passage_values in query_data.items()
    ]
    # The assessor count is derived from the first passage only: one leading
    # value plus two values per assessor. A query without passages gets 0.
    first_values = next(iter(query_data.values()), None)
    nr_assessors = 0 if first_values is None else int((len(first_values) - 1) / 2)
    label_data[query_id] = create_labels(nr_assessors)
    assessor_data[query_id] = nr_assessors

In [9]:
# One DataFrame per query, built from the collected rows and column labels.
dataframes = {
    query_id: pd.DataFrame(dataframe_data[query_id], columns=label_data[query_id])
    for query_id in query_ids
}

## Gather Experiment Queries

In [21]:
# Query ids that are excluded from the experiment by hand.
assessors_do_not_agree = ["838453", "866251", "321951", "427323", "993987", "540906"]
msmarco_only_relevant = ["335710", "489257"]
# Only queries judged by at least three assessors are considered.
at_least_3 = [query_id for query_id in query_ids if assessor_data[query_id] >= 3]
# Drop the manually excluded queries, keeping the order of `query_ids`.
experiment_query_ids = [
    query_id
    for query_id in at_least_3
    if query_id not in assessors_do_not_agree
    and query_id not in msmarco_only_relevant
]

## Define Agreement Rating Function

In [72]:
def fleiss_kappa(ratings, n, k):
    '''
    Computes the Fleiss' kappa measure for assessing the reliability of 
    agreement between a fixed number n of raters when assigning categorical
    ratings to a number of items.
    
    Args:
        ratings: a list of (item, category)-ratings; every item is expected
            to appear n times (once per rater)
        n: number of raters
        k: number of categories (kept for interface compatibility; the
            categories are taken from the ratings themselves, so this value
            is not used in the computation)
    Returns:
        the Fleiss' kappa score. When every rating falls into one single
        category the chance agreement is 1 and the usual formula would
        divide by zero; in that case 1.0 (complete agreement) is returned.
    
    See also:
        http://en.wikipedia.org/wiki/Fleiss'_kappa
    '''
    items = set()
    categories = set()
    # n_ij[(item, category)] = number of raters that put `item` in `category`
    n_ij = {}
    
    for item, category in ratings:
        items.add(item)
        categories.add(category)
        n_ij[(item, category)] = n_ij.get((item, category), 0) + 1
    
    N = len(items)
    
    # p_j: share of all n*N assignments that went to category j
    p_j = {}
    for c in categories:
        p_j[c] = sum(n_ij.get((i, c), 0) for i in items) / (1.0*n*N)
    
    # P_i: extent to which the raters agree on item i
    P_i = {}
    for i in items:
        P_i[i] = (sum(n_ij.get((i, c), 0)**2 for c in categories) - n) / (n*(n-1.0))
    
    P_bar = sum(P_i.values()) / (1.0*N)
    P_e_bar = sum(p_j[c]**2 for c in categories)
    
    # Degenerate case: only one category was ever used, so chance agreement is
    # already 1 and kappa's denominator vanishes.
    if 1 - P_e_bar == 0:
        return 1.0
    
    return (P_bar - P_e_bar) / (1 - P_e_bar)

## Deal with missing data

In [53]:
def replace_missing_data(ratings):
    """Return the value used to fill in a missing rating.

    Args:
        ratings: the numeric ratings that are available for the item.
    Returns:
        The median of `ratings`, rounded up to the next integer.
    Raises:
        ValueError: if `ratings` is empty, since there is nothing to derive a
            replacement from. (Previously this surfaced as a NumPy warning
            followed by "cannot convert float NaN to integer".)
    """
    if len(ratings) == 0:
        raise ValueError("cannot replace missing data: no observed ratings available")
    return int(np.ceil(np.median(ratings)))

# Compute Agreement Grades

## Gather Ratings

The function expects as input a list of (item, category) ratings, where item is the id of the query-passage pair and category is the grade (1-5). So let's go over all dataframes and collect these ratings.

In [93]:
# Collect, per query, one (item, grade) tuple for every assessor rating.
ratings_dict = {}
for query_id in experiment_query_ids:
    query_df = dataframes[query_id]
    nr_assessors = assessor_data[query_id]
    # Every second column starting at position 4 holds an assessor's label.
    user_label_df = query_df.iloc[:, 4:(nr_assessors * 2) + 4:2]
    for index, row in user_label_df.iterrows():
        passage_id = query_df.iloc[index, :]['passageid']
        item = "%s_%s" % (query_id, passage_id)
        query_ratings = row.values.tolist()
        integer_ratings = [int(rating) for rating in query_ratings if not (rating == "no_input")]
        # Fill in one replacement per missing rating; each replacement is
        # computed from the ratings gathered so far (earlier fill-ins included).
        for missing_nr in range(query_ratings.count("no_input")):
            integer_ratings.append(replace_missing_data(integer_ratings))
        if integer_ratings:
            ratings_dict.setdefault(query_id, []).extend(
                (item, rating) for rating in integer_ratings
            )

In [94]:
# Per query: print the id, the number of assessors and the kappa over the 5 grades.
for query_id, ratings in ratings_dict.items():
    print(query_id)
    nr_assessors = assessor_data[query_id]
    print(nr_assessors)
    kappa = fleiss_kappa(ratings, nr_assessors, 5)
    # Score, then the same blank-line separator as a separate print("\n") would give.
    print(kappa, "\n", sep="\n")

149161
3
-0.12350597609561761


1049791
3
0.6821192052980136


825147
5
0.18774966711051932


409143
3
0.01639344262295082


1056446
3
0.48069241011984026


1003875
3
-0.012753188297074237


202306
3
0.0023752969121139107


1007473
3
0.03659447348767735


687375
3
0.0800299177262526


904389
3
0.2612137203166227


242107
3
0.2407809110629067


178468
3
0.13778529163144557


689885
3
0.16978309648466702


1091688
3
0.2349726775956285


1095876
3
0.3785310734463277


785721
4
0.219455366244357


1086248
3
-0.10400000000000008


9083
3
0.3445878848063557


993153
7
0.21869870658525414


212195
3
-0.07999999999999996


758519
3
0.15789473684210514


414155
5
0.15954415954415954


440362
3
0.09742120343839533


831784
5
0.3668381480015827


30860
3
0.37313432835820903


75266
3
0.13553895410885822


1040461
3
0.28836174944403253


1097449
7
0.33360794544372735


92542
3
-0.11940298507462681


1034595
3
0.22680412371134062


1077356
3
0.11958914159941313


208822
3
0.13196894848270985


2084

# Compute Agreement Threshold=2

In [76]:
# Cut-off used by makeBinary to collapse the grades into binary relevance:
# grades < 2 count as irrelevant (0), grades >= 2 count as relevant (1).
binary_threshold = 2

In [77]:
def makeBinary(labels, threshold=None):
    """Collapse graded labels into binary relevance labels.

    Args:
        labels: iterable of grades (anything `int()` accepts); the marker
            string "no_input" may appear and is passed through unchanged.
        threshold: grades below this value become 0, all others 1. When
            omitted, the module-level `binary_threshold` is used, which keeps
            existing `makeBinary(labels)` calls working as before.
    Returns:
        A new list with 0/1 for every grade and "no_input" kept as is.
    """
    if threshold is None:
        threshold = binary_threshold
    binary_labels = []
    for label in labels:
        if label == "no_input":
            binary_labels.append(label)
        else:
            binary_labels.append(1 if int(label) >= threshold else 0)
    return binary_labels

In [89]:
# Collect, per query, one (item, binary label) tuple for every assessor rating.
ratings_dict = {}
for query_id in experiment_query_ids:
    query_df = dataframes[query_id]
    nr_assessors = assessor_data[query_id]
    # Every second column starting at position 4 holds an assessor's label.
    user_label_df = query_df.iloc[:, 4:(nr_assessors * 2) + 4:2]
    for index, row in user_label_df.iterrows():
        passage_id = query_df.iloc[index, :]['passageid']
        item = "%s_%s" % (query_id, passage_id)
        query_ratings = row.values.tolist()
        integer_ratings = [int(rating) for rating in query_ratings if not (rating == "no_input")]
        # Fill in one replacement per missing rating; each replacement is
        # computed from the ratings gathered so far (earlier fill-ins included).
        for missing_nr in range(query_ratings.count("no_input")):
            integer_ratings.append(replace_missing_data(integer_ratings))
        # Grades are binarised only after the missing ones were filled in.
        binary_ratings = makeBinary(integer_ratings)
        if binary_ratings:
            ratings_dict.setdefault(query_id, []).extend(
                (item, rating) for rating in binary_ratings
            )

In [92]:
# Per query: print the id, the number of assessors and the kappa of the
# binarised ratings.
for query_id, ratings in ratings_dict.items():
    print(query_id)
    nr_assessors = assessor_data[query_id]
    print(nr_assessors)
    # The ratings are binary at this point, so there are 2 categories (not 5).
    print(fleiss_kappa(ratings, nr_assessors, 2))
    print("\n")

149161
3
-0.23428571428571474


1049791
3
1.0


825147
5
0.15110356536502556


409143
3
0.19137466307277654


1056446
3
0.822222222222222


1003875
3
-0.004784688995215103


202306
3
-0.028571428571428702


1007473
3
-0.05263157894736725


687375
3
-0.08019639934533497


904389
3
0.3265993265993266


242107
3
0.8557692307692313


178468
3
0.46428571428571463


689885
3
0.09722222222222215


1091688
3
0.259259259259259


1095876
3
0.6527777777777778


785721
4
0.20423280423280413


1086248
3
-0.01694915254236875


9083
3
0.712918660287081


993153
7
0.3308664816527906


212195
3
-0.028571428571428702


758519
3
0.34782608695652184


414155
5
0.1680194805194805


440362
3
0.19137466307277762


831784
5
0.42555147058823495


30860
3
0.42583732057416257


75266
3
0.3250000000000001


1040461
3
0.3654524089306701


1097449
7
0.39655172413793094


92542
3
-0.15384615384615344


1034595
3
0.22077922077922085


1077356
3
0.15275994865211837


208822
3
-0.17839607201309338


208494
3
0.50900163

# Compute Agreement Threshold=3

In [95]:
# Cut-off used by makeBinary to collapse the grades into binary relevance:
# grades < 3 count as irrelevant (0), grades >= 3 count as relevant (1).
binary_threshold = 3

In [96]:
def makeBinary(labels, threshold=None):
    """Collapse graded labels into binary relevance labels.

    Args:
        labels: iterable of grades (anything `int()` accepts); the marker
            string "no_input" may appear and is passed through unchanged.
        threshold: grades below this value become 0, all others 1. When
            omitted, the module-level `binary_threshold` is used, which keeps
            existing `makeBinary(labels)` calls working as before.
    Returns:
        A new list with 0/1 for every grade and "no_input" kept as is.
    """
    if threshold is None:
        threshold = binary_threshold
    binary_labels = []
    for label in labels:
        if label == "no_input":
            binary_labels.append(label)
        else:
            binary_labels.append(1 if int(label) >= threshold else 0)
    return binary_labels

In [97]:
# Collect, per query, one (item, binary label) tuple for every assessor rating.
ratings_dict = {}
for query_id in experiment_query_ids:
    query_df = dataframes[query_id]
    nr_assessors = assessor_data[query_id]
    # Every second column starting at position 4 holds an assessor's label.
    user_label_df = query_df.iloc[:, 4:(nr_assessors * 2) + 4:2]
    for index, row in user_label_df.iterrows():
        passage_id = query_df.iloc[index, :]['passageid']
        item = "%s_%s" % (query_id, passage_id)
        query_ratings = row.values.tolist()
        integer_ratings = [int(rating) for rating in query_ratings if not (rating == "no_input")]
        # Fill in one replacement per missing rating; each replacement is
        # computed from the ratings gathered so far (earlier fill-ins included).
        for missing_nr in range(query_ratings.count("no_input")):
            integer_ratings.append(replace_missing_data(integer_ratings))
        # Grades are binarised only after the missing ones were filled in.
        binary_ratings = makeBinary(integer_ratings)
        if binary_ratings:
            ratings_dict.setdefault(query_id, []).extend(
                (item, rating) for rating in binary_ratings
            )

In [98]:
# Per query: print the id, the number of assessors and the kappa of the
# binarised ratings.
for query_id, ratings in ratings_dict.items():
    print(query_id)
    nr_assessors = assessor_data[query_id]
    print(nr_assessors)
    # The ratings are binary at this point, so there are 2 categories (not 5).
    print(fleiss_kappa(ratings, nr_assessors, 2))
    print("\n")

149161
3
-0.1805555555555557


1049791
3
0.9018003273322419


825147
5
0.5


409143
3
0.194139194139194


1056446
3
0.7054009819967267


1003875
3
0.25925925925925947


202306
3
0.17920656634746937


1007473
3
0.2207792207792212


687375
3
0.0757381258023112


904389
3
0.6273291925465843


242107
3
0.659090909090909


178468
3
0.44444444444444464


689885
3
0.19191919191919168


1091688
3
0.4434137291280149


1095876
3
0.7054009819967267


785721
4
0.5022222222222223


1086248
3
0.1272727272727289


9083
3
0.6148908857509626


993153
7
0.44936086529006886


212195
3
0.12698412698412725


758519
3
0.35406698564593314


414155
5
0.8785818358426419


440362
3
0.6273291925465843


831784
5
0.8191681735985534


30860
3
0.9018003273322426


75266
3
0.44099378881987583


1040461
3
0.7179788484136312


1097449
7
0.6733333333333337


92542
3
-0.13333333333333353


1034595
3
0.4230769230769235


1077356
3
0.2673992673992675


208822
3
0.4258373205741628


208494
3
0.7115384615384616


1006199
3
