Regard human rating as a continuous variable.
Normalize the rating score of each mturker

In [15]:
from __future__ import absolute_import, division

import os
import urllib, cStringIO

import pymongo as pm

import numpy as np
import scipy.stats as stats
import pandas as pd
import json
import re

from PIL import Image
import base64
import sys

import matplotlib
from matplotlib import pylab, mlab, pyplot
%matplotlib inline
from IPython.core.pylabtools import figsize, getfigs
plt = pyplot
import seaborn as sns
sns.set_context('talk')
sns.set_style('darkgrid')

from IPython.display import clear_output

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

### Setup

In [16]:
# Connection settings for the MongoDB collection that stores the tracing ratings.
# auth.txt holds the password for the sketchloop user in its first cell.
auth = pd.read_csv('auth.txt', header = None)
pswd = auth.values[0][0]
user = 'sketchloop'
host = 'rxdhawkins.me' ## cocolab ip address (not referenced in this cell; the client connects to 127.0.0.1)

# have to fix this to be able to analyze from local
import pymongo as pm
# Credentials embedded in a MongoDB URI must be percent-escaped, otherwise
# characters such as '@', ':' or '/' in the password corrupt the URI.
# str() guards against read_csv having parsed an all-digit password as a number.
mongo_uri = 'mongodb://' + urllib.quote_plus(user) + ':' + urllib.quote_plus(str(pswd)) + '@127.0.0.1'
conn = pm.MongoClient(mongo_uri)
db = conn['kiddraw']
coll = db['tracing_eval']


### Preprocessing
#### Basic Information about mturk ratings

In [17]:
# Report, per iteration, how many ratings carry a workerId.
# NOTE: data_t stays bound to the cursor of the last iteration after the loop.
iteration_names = ['pilot0']
for t in iteration_names:
    has_worker = {'$and':[{'iterationName':t},{'workerId':{'$exists':True}}]}
    # A second Cursor.sort() call replaces the first one instead of refining it,
    # so the compound ordering (workerId first, then startTrialTime) has to be
    # requested in a single call.
    data_t = coll.find(has_worker).sort([('workerId', pm.ASCENDING), ('startTrialTime', pm.ASCENDING)])
    print('Iteration {} has {} ratings in total'.format(t, data_t.count()))
    

Iteration pilot0 has 2479 ratings


#### Evaluate raters by the following criteria and exclude raters who gave random responses:
    1. At least 80% of the catch-trial answers are identical and every catch-trial answer is larger than 2 (on a 0-4 scale)
    2. There are no more than 20 consecutive trials that have the same rating

Then normalize the ratings of each mturker to mean=0 and std=1

In [18]:
def _has_long_run(values, max_run):
    """Return True if some value occurs more than `max_run` times in a row.

    values  -- iterable of ratings, in the order they were given
    max_run -- longest run of identical consecutive values that is tolerated
    """
    unset = object()
    previous = unset
    run_length = 0
    for value in values:
        if previous is not unset and value == previous:
            run_length += 1
        else:
            previous = value
            run_length = 1
        if run_length > max_run:
            return True
    return False


# Collect the workers of every analysed iteration. Querying here, instead of
# reusing the cursor left over from the previous cell's loop, covers all entries
# of iteration_names rather than only the last one.
all_sessions = coll.find({'iterationName': {'$in': iteration_names},
                          'workerId': {'$exists': True}}).distinct('workerId')
if '' in all_sessions:
    all_sessions.remove('')
else:
    print(" '' is already removed")

excluded_workerId = ['', ] # '' is 'no workerId'
valid_ratings = []          # one DataFrame of non-catch trials per accepted rater

# workers excluded by each individual criterion
exc_incatch = []
exc_2catch = []
exc_20 = []

tolerance = 0.8          # minimum share of catch answers that must be identical
threshold = 2            # every catch answer has to be strictly larger than this
max_same_in_a_row = 20   # longest tolerated run of identical consecutive ratings

# NOTE(review): the per-worker queries below filter on workerId/category only,
# not on iterationName -- confirm that workers never span several iterations.
for s in all_sessions:
    strials = coll.find({'workerId':s})

    # if no ratings, exclude the current rater
    if strials.count() == 0:
        excluded_workerId.append(s)
        continue

    # only the first stored trial is inspected to decide the worker's iteration
    if strials[0]['iterationName'] not in iteration_names:
        excluded_workerId.append(s)
        continue

    # if answers of catch trials are inconsistent, exclude the rater
    catch = coll.find({'workerId':s, 'category':'catch'})
    if catch.count() == 0: # no catch trials = no ratings at all. The first trial is the catch trial
        excluded_workerId.append(s)
        continue

    catch_answer = np.array([int(trial['button_pressed']) for trial in catch])

    # every catch answer has to be larger than the threshold
    if not np.all(catch_answer > threshold):
        excluded_workerId.append(s)
        exc_2catch.append(s)
        continue

    # at least tolerance * 100% of the catch answers have to be the same number
    counts = np.max(np.bincount(catch_answer))
    if counts < len(catch_answer) * tolerance:
        excluded_workerId.append(s)
        exc_incatch.append(s)
        continue

    # if more than 20 continuous trials are all the same, exclude the rater
    trials = coll.find({'workerId':s, 'category':{'$not':{'$eq':'catch'}}}).sort('trial_index')
    if trials.count() == 0:
        excluded_workerId.append(s)
        continue

    current_rating = pd.DataFrame(list(trials))
    nc_answer = current_rating['button_pressed']

    if _has_long_run(nc_answer, max_same_in_a_row):
        excluded_workerId.append(s)
        exc_20.append(s)
        continue

    valid_ratings.append(current_rating)

print("Excluded workerIds are {}".format(excluded_workerId))
print("{} participants has inconsistent catch trial answers".format(len(exc_incatch)))
print("{} participants gave a comparatively low rating (0, 1, 2) to one or more catch trials".format(len(exc_2catch)))
print("{} participants has the same ratings for more than 20 continuous trials".format(len(exc_20)))

Excluded workerIds are ['', u'A1KA64FR47O9FS', u'A6GIJTNLDOEWD', u'A3JLE2LJ5I17E2', u'A3SD02HCW68EUL', u'A1JS6T809ORXEF', u'AM8OWAW9TUVLN', u'A2EZNZ6X58RTNR']
2 participants has inconsistent catch trial answers
5 participants gave a comparatively low rating (0, 1, 2) to one or more catch trials
0 participants has the same ratings for more than 20 continuous trials


In [19]:
# Stack the per-rater frames that survived every exclusion check and persist
# them as a single csv file (the ratings are stored as collected).
all_norm = pd.concat(valid_ratings)
all_norm.to_csv(path_or_buf='valid_rating2.csv')

In [20]:
# Summarise how many raters and tracings remain after the exclusion step.
excluded_lookup = set(excluded_workerId)  # constant-time membership test
filtered = [s for s in all_sessions if s not in excluded_lookup]
print('We have {} participants in total. After the filtering process, there are {} participants remaining'.format(len(all_sessions), len(filtered)))

# non-catch ratings given by the raters that were not excluded
num_ratings = coll.find({'workerId':{'$nin': excluded_workerId}, 'category':{'$not':{'$eq':'catch'}}})
# run the distinct query once and reuse its result for both messages
unique_tracings = num_ratings.distinct('session_id')
print("We have {} ratings from {} valid raters on {} children's tracings.".format(num_ratings.count(), len(filtered), len(unique_tracings)))

print(" {} mturkers has rated {} tracings".format(len(filtered), len(unique_tracings)))

We have 26 participants in total. After the filtering process, there are 19 participants remaining
We have 7631 ratings from 19 valid raters on 1086 children's tracings.
 19 mturkers has rated 1086 tracings


In [24]:
# Display every workerId found before exclusion (the list built in the filtering cell).
# NOTE(review): the rendered output exposes raw worker identifiers -- consider
# clearing it before the notebook is shared.
all_sessions

[u'A1TLNLB9D87H6',
 u'A3LL096CAY5WHB',
 u'A197FECF831H95',
 u'ALSPLE0B3FHKO',
 u'A1KA64FR47O9FS',
 u'A2ML0070M8FDK1',
 u'A6GIJTNLDOEWD',
 u'A3LT7W355XOAKF',
 u'A13BZCNJ0WR1T7',
 u'A1VVWVCJQ56X7J',
 u'A3JLE2LJ5I17E2',
 u'A3SD02HCW68EUL',
 u'A3F51C49T9A34D',
 u'A2CF2BD4Q0ZDJN',
 u'A1EQ1LHEEIQ3UA',
 u'A1L3937MY09J3I',
 u'A1JS6T809ORXEF',
 u'AYQH26R4KXW2K',
 u'A34FOMRRI5KYCY',
 u'AN6L87Z4LP6R5',
 u'A23TGEQ4CG90PA',
 u'AM8OWAW9TUVLN',
 u'A2EZNZ6X58RTNR',
 u'A2482SLAY120J2',
 u'A1QICXSNBZLODT',
 u'A2RQMEPIHW5BOS']

### Combine Human Rating Results with Image Registration Model Results
For all tracings in the museumstation, the csv file named by `model_result` contains the model outputs. The model output consists of a shape error and a spatial error for each tracing image.

In [21]:
# read relevant files
model_result = 'museumstation_tracing_ncc.csv'
rating_result = 'valid_rating2.csv'
data = pd.read_csv(model_result)
all_norm = pd.read_csv(rating_result)

# Discard ratings from the 'testing2' iteration and ratings whose session_id
# contains 'CDM_photodraw'. na=False treats a missing session_id as "no match",
# so a NaN in that column cannot break the boolean filter.
from_testing = all_norm['iterationName'] == 'testing2'
from_photodraw = all_norm['session_id'].str.contains('CDM_photodraw', na=False)
all_norm = all_norm[~(from_testing | from_photodraw)]

In [12]:
# Process the model result
# NOTE: this cell adds columns to `data` and overwrites the csv it was read from.

# 1. normalize rotation, translation, and scaling (z-score per column)
for raw_col, norm_col in [('rotate', 'norm_r'), ('translate', 'norm_t'), ('scale', 'norm_s')]:
    data[norm_col] = (data[raw_col] - data[raw_col].mean())/data[raw_col].std()

# 2. summarize the three variables as a weighted sum
w_r, w_t, w_s = 1, 1, 1
data['spatial'] = data['norm_r'] * w_r + data['norm_t'] * w_t + data['norm_s'] * w_s

# 3. remove index columns left over from earlier csv round trips.
# Each column is checked on its own: dropping both in one call fails as soon as
# either one is absent and then leaves the other one in place.
leftover_cols = [col for col in ("Unnamed: 0", "Unnamed: 0.1") if col in data.columns]
if leftover_cols:
    data = data.drop(leftover_cols, axis=1)
else:
    print("Already dropped")

# to_csv also writes the row index, which comes back as an "Unnamed: 0" column
# the next time the file is read.
data.to_csv(model_result)

Already dropped


In [13]:
# Mark rows that have no normalized human rating with the sentinel -10.0 in
# both rating columns.
unrated_mask = data['human_norm_rating'].isnull()
norating = data[unrated_mask].index
for rating_col in ('human_norm_rating', 'human_rating'):
    data.loc[norating, rating_col] = -10.0

In [14]:
# Build one row per individual human rating: every model row that has ratings is
# repeated once per rating, rows without ratings are kept once with the
# sentinel -10 in 'button_pressed'.
# NOTE: this cell consumes `all_norm` (matched ratings are removed) and shrinks
# `data` to its unrated rows, so it is not safe to run twice in a row.
data['button_pressed'] = -10
new_data = []
drop_index = []

for di, d in data.iterrows():
    same_tracing = (all_norm['session_id'] == d['session_id']) & (all_norm['category'] == d['category'])
    human_rating = all_norm[same_tracing]
    if human_rating.shape[0] == 0:
        # no rating for this tracing: the row stays in `data` unchanged
        continue

    # duplicate the same row once per rating
    dnew = pd.DataFrame(np.tile(d.values, (human_rating.shape[0],1)), columns=data.columns)
    # fill in all ratings at once instead of one .loc assignment per row
    dnew['button_pressed'] = list(human_rating['button_pressed'].astype(int))

    new_data.append(dnew)
    drop_index.append(di)
    # each rating is used for one model row only
    all_norm = all_norm.drop(human_rating.index)

print(len(drop_index))
data = data.drop(drop_index)
new_data.append(data)
new_data = pd.concat(new_data)

# explicit column check instead of a bare except that would hide any error
if 'Unnamed: 0' in new_data.columns:
    new_data = new_data.drop('Unnamed: 0', axis=1)
else:
    print('already dropped')

# renumber the rows without materialising the old index as a column
new_data = new_data.reset_index(drop=True)

new_data.to_csv('tracing_ordinal_data.csv')

1425


In [22]:
# Number of distinct values found in the three key columns of `data`.
# NOTE(review): ravel() pools the values of all selected columns into one flat
# array, so this counts distinct values across columns rather than distinct
# (session_id, category, has_ref) rows -- confirm that this is intended.
pooled_keys = data[['session_id', 'category', 'has_ref']].values.ravel()
len(pd.unique(pooled_keys))

4945

In [23]:
# Number of distinct values found in the two key columns of `new_data`.
# NOTE(review): ravel() pools the values of both columns into one flat array,
# so this counts distinct values across columns rather than distinct
# (session_id, category) rows -- confirm that this is intended.
pooled_keys_expanded = new_data[['session_id','category']].values.ravel()
len(pd.unique(pooled_keys_expanded))

4943

In [11]:
# Preview `data` with one row per (session_id, category), keeping the last
# occurrence. drop_duplicates returns a new frame, so it has to be chained into
# the displayed expression; calling it on its own line has no effect.
# errors='ignore' keeps the preview working when 'Unnamed: 0' is already gone.
# `data` itself is left unchanged.
data.drop_duplicates(subset=['session_id', 'category'], keep="last").drop('Unnamed: 0', axis=1, errors='ignore')

Unnamed: 0,age,human_norm_rating,human_rating,norm_r,norm_s,norm_t,post_tran,pre_tran,rotate,scale,session_id,spatial,category,translate,has_ref,button_pressed
0,7,1.118034,4.0,-0.687757,-0.805323,-0.712715,-0.299774,-0.292040,0.000179,0.026652,CDM_photodraw_e11532111112405,-2.205796,square,0.023063,True,-10
1,7,0.812601,2.0,-0.309450,-0.226693,-0.386172,-0.609308,-0.355371,0.042778,0.087860,CDM_photodraw_e11532111112405,-0.922315,shape,0.050151,True,-10
2,7,0.231869,2.0,0.011249,0.010505,-0.271782,-0.746630,-0.063507,0.078891,0.112951,CDM_photodraw_e11532111112405,-0.250028,circle,0.059639,False,-10
3,4,0.188011,2.0,-0.551297,-0.598157,-0.374823,-0.603070,-0.256867,0.015545,0.048566,CDM_photodraw_e11532113637303,-1.524277,shape,0.051092,True,-10
4,4,0.384774,2.0,-0.133452,-0.382738,0.958791,-0.610923,-0.063604,0.062596,0.071354,CDM_photodraw_e11532113637303,0.442601,circle,0.161719,False,-10
5,5,-10.000000,-10.0,1.015032,0.897142,-0.064063,-0.609516,-0.061506,0.191922,0.206741,CDM_photodraw_e11532114278628,1.848111,square,0.076870,True,-10
6,5,-1.899047,0.0,-0.676630,0.406432,0.561601,-0.105994,0.020568,0.001432,0.154833,CDM_photodraw_e11532114278628,0.291404,shape,0.128771,True,-10
7,5,-1.173223,0.0,-0.193418,-1.007072,-0.382827,-0.363011,-0.112628,0.055844,0.005311,CDM_photodraw_e11532114278628,-1.583317,circle,0.050428,False,-10
8,7,-0.489993,1.0,-0.624027,-0.332933,-0.759503,-0.530510,-0.154724,0.007355,0.076622,CDM_photodraw_e11532119809715,-1.716463,square,0.019182,True,-10
9,7,0.348192,2.0,-0.623930,-0.612525,-0.086708,-0.633326,-0.418969,0.007366,0.047047,CDM_photodraw_e11532119809715,-1.323163,shape,0.074992,True,-10
