#Worker analysis of job #758438

Tong Shu Li<br>
Created on Saturday 2015-08-01<br>
Last updated 2015-08-03

We take a look at the individual workers of job 758438.

In [1]:
from collections import defaultdict
import datetime as dt
import matplotlib as plt
import pandas as pd
import random

In [2]:
random.seed("2015-08-03:14:03")

In [3]:
NUMPY_RAND_SEED = 993402

In [4]:
matplotlib inline

---

In [5]:
raw_data = pd.read_csv("data/crowdflower/results/job_758438_full_with_untrusted.csv", sep = ",")

---

###Building a profile about each worker:

started at time is before created at time

In [6]:
def convert_time(text):
    """
    Given the time in CrowdFlower's output format,
    convert it to a Python datetime object.
    CrowdFlower time stamps are in UTC.
    
    CrowdFlower timestamps have the following format:
    
    month/day/year hour:minute:second
    
    Example:
    
    8/2/2015 00:26:24
    """
    return dt.datetime.strptime(text, "%m/%d/%Y %H:%M:%S")

def time_elapsed(start, stop):
    """
    Given two Python datetime objects, determines the number
    of seconds between those two times.
    """
    return int((stop - start).total_seconds())

def determine_time_taken(dataframe):
    """
    Given a dataframe with the responses from one worker,
    determines the time taken for each individual work unit.
    """
    def convert_wrapper(row):
        return (convert_time(row["_started_at"]), convert_time(row["_created_at"]))
    
    def time_taken_wrapper(row):
        return time_elapsed(row[0], row[1])
    
    if dataframe.empty:
        return pd.Series()

    # convert to datetime objects
    time_stamps = dataframe[["_started_at", "_created_at"]].apply(convert_wrapper, axis = 1)
    
    # determine time elapsed in seconds
    time_taken = time_stamps.map(time_taken_wrapper)
    return time_taken

In [7]:
def time_stats(time_series):
    """
    Given the time taken for each individual work unit,
    determines some stats.
    """
    return (time_series.min(), time_series.median(), time_series.max())

###Build worker profiles:

In [8]:
def build_worker_profiles(raw_data):
    """
    Build a profile for every worker that tried the task.
    """
    res = defaultdict(list)
    
    metadata = ["_trust", "_ip", "_country"]
    ans_choices = ["yes_direct", "yes_indirect", "no_relation", "ner_error"]
    
    for worker_id, group in raw_data.groupby("_worker_id"):
        test_resp = group.query("_golden")
        work_resp = group.query("~_golden")
        
        res["worker_id"].append(worker_id)
        res["test_ques_seen"].append(len(test_resp["uniq_id"].unique()))
        res["work_units_seen"].append(len(work_resp["uniq_id"].unique()))
        
        for metadata_col in metadata:
            res[metadata_col.lstrip("_")].append(test_resp[metadata_col].iloc[0])
            
        for work_type, resp_data in zip(["test", "work"], [test_resp, work_resp]):
            time_series = determine_time_taken(resp_data)
            stats = time_stats(time_series)
            for i, name in enumerate(["min", "median", "max"]):
                res["{0}_{1}_time".format(work_type, name)].append(stats[i])
                
            # look at the response distributions
            for ans_choice in ans_choices:
                temp = resp_data.query("verify_relationship == '{0}'".format(ans_choice))
                res["{0}_{1}".format(work_type, ans_choice)].append(len(temp["uniq_id"].unique()))
        
    return pd.DataFrame(res)

In [9]:
res = build_worker_profiles(raw_data)

In [10]:
res

Unnamed: 0,country,ip,test_max_time,test_median_time,test_min_time,test_ner_error,test_no_relation,test_ques_seen,test_yes_direct,test_yes_indirect,trust,work_max_time,work_median_time,work_min_time,work_ner_error,work_no_relation,work_units_seen,work_yes_direct,work_yes_indirect,worker_id
0,USA,23.119.108.175,439,439.0,439,0,2,6,2,1,0.5000,,,,0,0,0,0,0,3694955
1,PHL,49.144.240.111,680,680.0,680,0,2,6,3,0,0.6667,,,,0,0,0,0,0,6332651
2,MEX,189.202.57.112,1256,1256.0,1256,0,4,6,2,0,0.6667,,,,0,0,0,0,0,6347829
3,CAN,70.50.92.191,1025,1025.0,1025,0,2,6,3,1,0.5000,,,,0,0,0,0,0,6450461
4,USA,67.247.11.75,515,515.0,515,0,1,6,2,3,0.1667,,,,0,0,0,0,0,7051387
5,USA,104.33.65.133,273,273.0,273,0,1,6,0,1,0.1667,,,,0,0,0,0,0,7211526
6,PHL,124.6.181.50,1106,1106.0,1106,0,4,6,0,1,0.6667,,,,0,0,0,0,0,8375045
7,BEL,81.242.111.156,602,602.0,602,0,5,6,0,1,0.6667,,,,0,0,0,0,0,8841252
8,IND,117.215.177.9,1397,720.0,247,0,7,14,6,1,0.7857,784,564.5,247,0,30,40,6,4,9832767
9,MYS,175.136.23.48,328,328.0,328,0,0,6,6,0,0.5000,,,,0,0,0,0,0,11026819


In [12]:
def find_bad_workers(raw_data):
    """
    Find workers who have weird response distributions.
    """
    bad_workers = defaultdict(list)

    for worker_id, group in raw_data.groupby("_worker_id"):
        work_resp = group.query("~_golden")
        work_distribution = work_resp["verify_relationship"].value_counts()
        if len(work_distribution) == 1:
            bad_workers["worker_id"].append(worker_id)
            bad_workers["top_response"].append(work_distribution.index.values[0])
            bad_workers["num_responses"].append(work_distribution.iloc[0])
            
            assert len(work_resp["_ip"].unique()) == 1
            assert len(work_resp["_country"].unique()) == 1
            bad_workers["country"].append(work_resp["_country"].iloc[0])
            bad_workers["ip"].append(work_resp["_ip"].iloc[0])
            
            bad_workers["trust"].append(work_resp["_trust"].iloc[0])
            
    return pd.DataFrame(bad_workers)

In [20]:
def sample_worker_responses(job_id, worker_id, raw_data, sample_size = 10):
    """
    Randomly sample a worker's responses for review by hand.
    """
    test_resp = raw_data.query("_golden and _worker_id == {0}".format(worker_id))
    work_resp = raw_data.query("~_golden and _worker_id == {0}".format(worker_id))
    
    test_sample = test_resp.sample(sample_size, random_state = NUMPY_RAND_SEED)
    work_sample = work_resp.sample(sample_size, random_state = NUMPY_RAND_SEED)
    
    print "Worker id:", worker_id
    
    print "Test question sample:"
    for unit_id in test_sample["_unit_id"].unique():
        print "https://crowdflower.com/jobs/{0}/units/{1}".format(job_id, unit_id)

    print "Work question sample:"
    for unit_id in work_sample["_unit_id"].unique():
        print "https://crowdflower.com/jobs/{0}/units/{1}".format(job_id, unit_id)
        
    print "-----------------------------------------------------------------"

In [21]:
bad_workers = find_bad_workers(raw_data)

In [22]:
bad_workers

Unnamed: 0,country,ip,num_responses,top_response,trust,worker_id
0,VNM,14.0.30.220,165,no_relation,0.6923,31501233
1,VNM,115.76.94.63,230,no_relation,0.8654,31720388
2,VNM,115.76.94.63,230,no_relation,0.9808,31720815
3,VNM,210.211.118.133,60,no_relation,1.0,32025293
4,IND,49.15.80.177,25,ner_mistake,1.0,33081102
5,IND,106.76.60.227,230,ner_mistake,0.9615,33081299
6,IND,27.97.189.169,230,ner_mistake,0.9808,33081469
7,IND,49.15.95.167,121,ner_mistake,0.9355,33081531
8,VNM,125.212.192.29,198,no_relation,0.9565,33085305
9,VNM,115.76.94.63,17,no_relation,1.0,33085428


In [23]:
sample_worker_responses(758438, 33081299, raw_data)

Worker id: 33081299
Test question sample:
https://crowdflower.com/jobs/758438/units/765528961
https://crowdflower.com/jobs/758438/units/765528954
https://crowdflower.com/jobs/758438/units/765528951
https://crowdflower.com/jobs/758438/units/765528963
https://crowdflower.com/jobs/758438/units/765528948
https://crowdflower.com/jobs/758438/units/765528976
https://crowdflower.com/jobs/758438/units/765528938
https://crowdflower.com/jobs/758438/units/765528946
https://crowdflower.com/jobs/758438/units/765528934
https://crowdflower.com/jobs/758438/units/765528974
Work question sample:
https://crowdflower.com/jobs/758438/units/765528299
https://crowdflower.com/jobs/758438/units/765528403
https://crowdflower.com/jobs/758438/units/765528634
https://crowdflower.com/jobs/758438/units/765528345
https://crowdflower.com/jobs/758438/units/765528895
https://crowdflower.com/jobs/758438/units/765528707
https://crowdflower.com/jobs/758438/units/765528548
https://crowdflower.com/jobs/758438/units/765528716


In [47]:
for worker_id in bad_workers:
    s = "https://crowdflower.com/jobs/758438/contributors/{0}".format(worker_id)
    print s

https://crowdflower.com/jobs/758438/contributors/31720388
https://crowdflower.com/jobs/758438/contributors/33301062
https://crowdflower.com/jobs/758438/contributors/32025293
https://crowdflower.com/jobs/758438/contributors/33081102
https://crowdflower.com/jobs/758438/contributors/31720815
https://crowdflower.com/jobs/758438/contributors/31501233
https://crowdflower.com/jobs/758438/contributors/33301138
https://crowdflower.com/jobs/758438/contributors/33081299
https://crowdflower.com/jobs/758438/contributors/33085428
https://crowdflower.com/jobs/758438/contributors/33238902
https://crowdflower.com/jobs/758438/contributors/33085305
https://crowdflower.com/jobs/758438/contributors/33081531
https://crowdflower.com/jobs/758438/contributors/33081469


In [50]:
work_units = raw_data.query("~_golden and _worker_id == {0}".format(33081102))

In [51]:
len(work_units)

25

In [52]:
import random

In [56]:
def worker_units_sample(worker_id, raw_data):
    """
    Randomly sample the work units the worker did.
    """
    job_id = 758438
    work_units = raw_data.query("~_golden and _worker_id == {0}".format(worker_id))

    sample = random.sample(list(work_units["_unit_id"].unique()), 10)
    for unit_id in sample:
        print "https://crowdflower.com/jobs/758438/units/{0}".format(unit_id)

In [57]:
worker_units_sample(33085428, raw_data)

https://crowdflower.com/jobs/758438/units/765528827
https://crowdflower.com/jobs/758438/units/765528569
https://crowdflower.com/jobs/758438/units/765528678
https://crowdflower.com/jobs/758438/units/765528387
https://crowdflower.com/jobs/758438/units/765528570
https://crowdflower.com/jobs/758438/units/765528519
https://crowdflower.com/jobs/758438/units/765527890
https://crowdflower.com/jobs/758438/units/765528235
https://crowdflower.com/jobs/758438/units/765528029
https://crowdflower.com/jobs/758438/units/765527942
