# Worker clustering for job 758438

Tong Shu Li<br>
Created on Thursday 2015-08-06<br>
Last updated: 2015-08-06

In this notebook we try to calculate Cohen's kappa for each pair of workers in job #758438. We then use the pairwise kappa values to try to cluster workers by agreement. Our hope is to identify clusters of workers who are likely to be cheaters.

In [1]:
from __future__ import division
from collections import defaultdict
from itertools import combinations_with_replacement
import numpy as np
import pandas as pd

---

In [2]:
# Read the results CSV for job 758438 into a dataframe. The path is relative,
# so it is resolved against the notebook's current working directory.
raw_data = pd.read_csv(
    "data/crowdflower/results/job_758438_full_with_untrusted.csv",
    sep=",",
)

---

In [3]:
def kappa_matrix(worker_id_col, resp_col, poss_resp, raw_data, unit_id_col="uniq_id"):
    """
    Calculate Cohen's kappa for each unique pair of workers.

    Given a dataframe representing the choices of multiple workers
    on a categorization task with mutually exclusive categories,
    this function computes, for every pair of workers,

        kappa = (p_o - p_e) / (1 - p_e)

    over the work units that BOTH workers judged, where p_o is the
    observed fraction of shared units on which the two workers gave
    the same response and p_e is the agreement expected by chance,
    i.e. the sum over responses of the product of the two workers'
    response frequencies on the shared units.

    worker_id_col = column with unique worker ids
    resp_col = column with the worker responses
    poss_resp = set of all possible responses in the resp_col
    raw_data = dataframe with one row per judgement (worker, work unit,
        response)
    unit_id_col = column identifying the work unit that was judged
        (defaults to "uniq_id")

    Returns a square, symmetric dataframe whose index and columns are
    the worker ids (in the sorted order produced by groupby). The
    diagonal is 1. An entry is NaN when the two workers share no work
    units, or when kappa is undefined because the expected agreement
    p_e equals 1 (both workers gave one and the same response on every
    shared unit).

    Rows whose worker id is missing are ignored, because groupby drops
    them.
    """
    # Split the data by worker exactly once. Boolean masks are used instead
    # of string-built DataFrame.query() expressions so that worker ids,
    # responses and column names of any type/content are handled correctly.
    workers = []          # worker ids, in groupby (sorted) order
    work_by_worker = {}   # worker id -> that worker's rows
    units_by_worker = {}  # worker id -> set of unit ids the worker judged
    resp_by_worker = {}   # worker id -> {response -> set of unit ids}
    for worker_id, user_work in raw_data.groupby(worker_id_col):
        workers.append(worker_id)
        work_by_worker[worker_id] = user_work
        units_by_worker[worker_id] = set(user_work[unit_id_col])

        chosen_units = {}
        for resp_choice in poss_resp:
            chose = user_work[resp_col] == resp_choice
            chosen_units[resp_choice] = set(user_work.loc[chose, unit_id_col])
        resp_by_worker[worker_id] = chosen_units

    # Start from an all-NaN matrix; pairs that cannot be scored stay NaN.
    n_workers = len(workers)
    kappa = np.full((n_workers, n_workers), np.nan)

    indexed_workers = list(enumerate(workers))
    for (i, worker_A), (j, worker_B) in combinations_with_replacement(indexed_workers, 2):
        if i == j:
            # a worker agrees perfectly with themselves
            kappa[i, i] = 1.0
            continue

        # work units judged by both workers (including test questions)
        in_common = units_by_worker[worker_A] & units_by_worker[worker_B]
        if not in_common:
            # no way to calculate agreement
            continue

        # number of work units both A and B worked on
        M = len(in_common)

        # number of shared work units on which both workers chose the same response
        agree = sum(
            len(resp_by_worker[worker_A][resp_choice] & resp_by_worker[worker_B][resp_choice])
            for resp_choice in poss_resp
        )
        assert agree <= M, "more agreements than shared work units"
        # float() keeps this a true division even without `from __future__ import division`
        p_naught = agree / float(M)

        # each worker's response frequencies, restricted to the shared units
        common_ids = list(in_common)
        work_A = work_by_worker[worker_A]
        work_B = work_by_worker[worker_B]
        A_distribution = work_A.loc[
            work_A[unit_id_col].isin(common_ids), resp_col
        ].value_counts(normalize=True)
        B_distribution = work_B.loc[
            work_B[unit_id_col].isin(common_ids), resp_col
        ].value_counts(normalize=True)

        # A response used by only one of the two workers contributes 0 to the
        # chance agreement; fill_value=0 makes that explicit instead of relying
        # on NaN products being skipped by the pandas sum.
        p_e = A_distribution.mul(B_distribution, fill_value=0).sum()

        if p_e == 1:
            # Both workers gave one identical response on every shared unit:
            # the denominator is 0 and kappa is undefined, so leave NaN.
            continue

        val = (p_naught - p_e) / (1 - p_e)
        kappa[i, j] = val
        kappa[j, i] = val

    return pd.DataFrame(kappa, index=workers, columns=workers)

In [None]:
# Response categories handed to kappa_matrix as poss_resp.
responses = set(["yes_direct", "yes_indirect", "no_relation", "ner_mistake"])

# Pairwise Cohen's kappa between workers, keyed by the "_worker_id" column
# and scored on the "verify_relationship" responses.
ans = kappa_matrix(
    worker_id_col="_worker_id",
    resp_col="verify_relationship",
    poss_resp=responses,
    raw_data=raw_data,
)

In [17]:
# Display the pairwise kappa matrix returned by kappa_matrix
# (rows and columns are worker ids).
ans

Unnamed: 0,33081299,33081469,33301062,33387828
33081299,1.0,0.932345,0.938906,0.621318
33081469,0.932345,1.0,0.917081,0.42515
33301062,0.938906,0.917081,1.0,0.433298
33387828,0.621318,0.42515,0.433298,1.0


---