In [635]:
import os
import os.path as path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Input files are resolved against the directory the notebook is run from.
PROJ_DIR = path.abspath('.')
results_csv = path.join(PROJ_DIR, 'mturk_results_v3.csv')
golden_csv = path.join(PROJ_DIR, 'mf_df_golden_resps.csv')

# Every column is read as text (presumably so identifier codes are not
# coerced to numbers -- TODO confirm).
results = pd.read_csv(results_csv, dtype=str)
golden = pd.read_csv(golden_csv, dtype=str)

# Remember the columns of the raw export; helper columns added during grading
# are stripped again before the graded file is written.
orig_results_columns = results.columns
results.columns

Index(['HITId', 'HITTypeId', 'Title', 'Description', 'Keywords', 'Reward',
       'CreationTime', 'MaxAssignments', 'RequesterAnnotation',
       'AssignmentDurationInSeconds', 'AutoApprovalDelayInSeconds',
       'Expiration', 'NumberOfSimilarHITs', 'LifetimeInSeconds',
       'AssignmentId', 'WorkerId', 'AssignmentStatus', 'AcceptTime',
       'SubmitTime', 'AutoApprovalTime', 'ApprovalTime', 'RejectionTime',
       'RequesterFeedback', 'WorkTimeInSeconds', 'LifetimeApprovalRate',
       'Last30DaysApprovalRate', 'Last7DaysApprovalRate', 'Input.Index',
       'Input.HS_CITY', 'Input.HS_COUNTRY', 'Input.HS_NAME',
       'Input.HS_POSTAL_CD', 'Input.HS_STATE', 'Input.HS_CEEB',
       'Input.HS_NCES', 'Input.HS_MATCH_NAME', 'Input.HS_STATE_FULL',
       'Answer.HS_NCES', 'Answer.HS_NO_NCES.on', 'Approve', 'Reject'],
      dtype='object')

First, combine the answers into a single column for easy processing. "Not in database" submissions are designated with the string "NA", and no-answer submissions are designated with `np.nan`.

In [636]:
# Build one 'Answer' column per frame:
#   * the submitted NCES code, when one was given,
#   * the string 'NA' when the "not in database" box was ticked,
#   * NaN when the worker submitted neither.
# The override is written with a single .loc call. The previous chained form
# (frame['Answer'][mask] = 'NA') assigns into a temporary Series, which pandas
# warns about and which does not update the frame under Copy-on-Write.
for frame in (results, golden):
    frame['Answer'] = frame['Answer.HS_NCES']
    not_in_database = frame['Answer.HS_NO_NCES.on'] == 'true'
    frame.loc[not_in_database, 'Answer'] = 'NA'

Calculate how often each worker "dissented" (i.e., submitted an answer different from the most common answer for the same school) and how often they "lone dissented" (i.e., were the only worker to submit that answer for the school).

**Note:** I was originally intending to use these as rejection criteria, but ended up not doing so. I'm keeping this in for future reference, and because the code might be handy for determining the "correct" answers.  

In [637]:
resps = results[['WorkerId', 'Answer', 'Input.HS_NAME']].rename(columns={'Input.HS_NAME': 'HS_NAME'})

# Tally, once per school, how many submissions gave each answer and which
# answer was the most common one. value_counts() ignores missing answers.
# Schools with no answered submission are skipped; they are never looked up
# below because unanswered rows are skipped as well.
school_answer_counts = {}
school_modal_answer = {}
for school, school_answers in resps.groupby('HS_NAME')['Answer']:
    counts = school_answers.value_counts()
    if counts.empty:
        continue
    school_answer_counts[school] = counts
    school_modal_answer[school] = counts.idxmax()

worker_dissent_rates = []
for worker in resps.WorkerId.unique():
    answers = resps.loc[resps.WorkerId == worker]

    dissents = 0
    lone_dissents = 0
    # Unanswered submissions stay in the denominator, so they count as
    # "no dissent" in both rates.
    total_resps = answers.shape[0]

    for name, answer in zip(answers.HS_NAME, answers.Answer):
        if pd.isnull(answer):
            continue

        # Dissent: the answer differs from the most common answer given for
        # THIS school by all workers. The earlier version compared against the
        # worker's own most frequent answer across all schools.
        if answer != school_modal_answer[name]:
            dissents += 1

        # Lone dissent: nobody else submitted this answer for this school.
        if school_answer_counts[name][answer] == 1:
            lone_dissents += 1

    row = {'WorkerId': worker,
           'lone_dissent_rate': (lone_dissents / total_resps),
           'dissent_rate': (dissents / total_resps),
           'total_resps': total_resps
          }
    worker_dissent_rates.append(row)

worker_dissent_rates = pd.DataFrame(worker_dissent_rates)

Calculate how each worker performed on the "golden set" of known answers

In [638]:
# Golden-set answers, keyed by school name like `resps`.
known = golden[['WorkerId', 'Answer', 'Input.HS_NAME']].rename(columns={'Input.HS_NAME': 'HS_NAME'})

worker_golden_rates = []
for worker in resps.WorkerId.unique():
    worker_resps = resps.loc[resps.WorkerId == worker]
    # Inner join: only the schools this worker answered that also appear in
    # the golden set. Answer_x is the worker's answer, Answer_y the golden one.
    answers_known = worker_resps.merge(known, on='HS_NAME')
    n_known = answers_known.shape[0]

    # Share of golden schools answered correctly; undefined when the worker
    # saw no golden schools at all.
    if n_known > 0:
        n_correct = np.sum(answers_known.Answer_y == answers_known.Answer_x)
        rate = n_correct / n_known
    else:
        rate = np.nan

    worker_golden_rates.append(
        {'WorkerId': worker, 'golden_rate': rate, 'total_golden': n_known}
    )

worker_golden_rates = pd.DataFrame(worker_golden_rates)

Combine the dissent rates and the golden-set performance into one DataFrame.

In [639]:
# One row per worker: golden-set accuracy next to the dissent statistics.
worker_quality = pd.merge(worker_golden_rates, worker_dissent_rates, on='WorkerId')

* Reject all workers who answered 3 or more golden answers and got less than 66% of them correct
* **Disregard this criterion:** Reject all workers who were the "lone dissenter" more than 75% of the time

In [640]:
# Workers who answered at least 3 golden schools and got fewer than 66% right.
low_golden = (worker_quality.total_golden > 2) & (worker_quality.golden_rate < .66)
# Computed for reference only; it is not part of the rejection rule.
high_lone_dissent = worker_quality.lone_dissent_rate > .75

# The rejection rule is the golden-set criterion alone.
low_quality_workers = set(worker_quality.loc[low_golden, 'WorkerId'])
print(f'There were {len(low_quality_workers)} workers whose performance was "low-quality"')

# One flag per response: was it submitted by a low-quality worker?
low_quality_resps = [worker_id in low_quality_workers for worker_id in resps.WorkerId]
print(f'There were {sum(low_quality_resps)} responses coming from those workers')


There were 5 workers whose performance was "low-quality"
There were 1868 responses coming from those workers


In [641]:
# A submission is rejected when it comes from a low-quality worker or carries
# no answer at all; every other submission is approved.
# `resps` was built from `results` row for row, so its answers are paired with
# `results` by position (hence .to_numpy()).
from_low_quality_worker = results['WorkerId'].isin(low_quality_workers)
no_answer = resps['Answer'].isna().to_numpy()
reject_mask = from_low_quality_worker | no_answer

# Single .loc writes replace the former per-row results['Reject'].iloc[i] = 'x',
# a chained assignment that pandas warns about and that does not update the
# frame under Copy-on-Write.
results.loc[reject_mask, 'Reject'] = 'x'
results.loc[~reject_mask, 'Approve'] = 'x'

n_reject = np.sum(results.Reject=='x')
print(f'Reject {n_reject} answers of {results.shape[0]} total')

Reject 1877 answers of 5200 total


Drop columns that were added in during grading and save results

In [642]:
# Strip the helper columns added while grading so the file has exactly the
# columns of the original export.
grading_only_columns = [col for col in results.columns if col not in orig_results_columns]
results.drop(columns=grading_only_columns, inplace=True)

# Missing values are written as a single space rather than an empty field.
results.replace(np.nan, ' ', inplace=True)
results.to_csv('mturk_results_v3_graded.csv', index=False)

Check final results

In [643]:
def mode_answer(x):
    """Return the most frequent non-missing value in the Series `x`."""
    frequencies = x.value_counts()
    return frequencies.idxmax()

# Consensus answer per school, using approved submissions only.
approved_resps = resps.loc[results.Approve == 'x']
nces = approved_resps.pivot_table(index='HS_NAME', values='Answer', aggfunc=mode_answer)

# Split the schools into "not in the database" and "matched to a code".
is_not_in_database = nces.Answer == 'NA'
n_na = np.sum(is_not_in_database)
n_nces = nces.shape[0] - n_na

print(f'{n_na} schools were designated as "not in the database" and {n_nces} were matched to a NCES code')

966 schools were designated as "not in the database" and 286 were matched to a NCES code
