In [1]:
import copy
import json
import os
import random
import uuid
from collections import OrderedDict
from datetime import datetime
from pprint import pprint

import numpy as np
import pandas as pd
import pymongo

import mturk

In [2]:
# Load the candidate pairs and the two pre-built collections of HIT document sets.
with open('./data/pairs_en.json','r') as f:
    pairs_all = json.load(f)
with open('./data/DocumentPairSetsForHITS','r') as f:
    pairSets = json.load(f)
with open('./data/DocumentRankingSetsForHITS.json','r') as f:
    rankingSets = json.load(f)

# Index pairs_all by its (id1, id2) key so the coverage checks below are
# dictionary lookups instead of a full scan of pairs_all for every document.
# A key maps to every position that carries it, in pairs_all order.
pair_positions = {}
for position, pair in enumerate(pairs_all):
    pair_positions.setdefault((pair['id1'], pair['id2']), []).append(position)

''' Check if every pair in pairs_all appears at least 5 times in the pairSets '''
counter = [0]*len(pairs_all)
for s in pairSets:
    for ss in s['documents']:
        key = (ss['document_1']['id'], ss['document_2']['id'])
        positions = pair_positions.get(key)
        if positions:
            # Only the first pairs_all entry with this key is credited.
            counter[positions[0]] += 1
under_covered = [i for i, c in enumerate(counter) if c < 5]
assert not under_covered, \
    '{} pairs appear fewer than 5 times in pairSets'.format(len(under_covered))

''' Check if every pair in pairs_all appears at least 25 times in the rankingSets '''
counter = [0]*len(pairs_all)
for s in rankingSets:
    for ss in s['documents']:
        id1 = ss['main_document']['id']
        # De-duplicate the candidate ids: a pair is credited once per ranking
        # entry, however many times its id2 is listed in that entry.
        for id2 in set(sss['id'] for sss in ss['documents']):
            for position in pair_positions.get((id1, id2), ()):
                counter[position] += 1
under_covered = [i for i, c in enumerate(counter) if c < 25]
assert not under_covered, \
    '{} pairs appear fewer than 25 times in rankingSets'.format(len(under_covered))

In [3]:
""" Connect to MTurk and to the Mongodb database. Set the boolean below to TRUE to use the marketplace and to FALSE to use the sandbox (testing the HITs)"""
create_hits_in_production = True

db_client = pymongo.MongoClient("mongodb+srv://<username>:<password>@cluster0-hjstc.mongodb.net/test?retryWrites=true&w=majority")
db = db_client['tbfy']

hit_result_collection = db.hit_results if create_hits_in_production else db.hit_results_sandbox

mt = mturk.MTurk()
mt.launch_client(create_hits_in_production)

839.92


In [10]:
""" Ban the spammers! """
with open('./config/banlist.json','r') as f:
    banlist = json.load(f)
for w in banlist:
    response = mt.client.create_worker_block(
        WorkerId=w,
        Reason='You are copy and pasting text'
    )
    assert(response['ResponseMetadata']['HTTPStatusCode'] == 200)

In [4]:
""" Create the tasks by populating the HTML templates using the config file """

task_content = json.loads(open('./config/task_content_ranking.json').read())
#task_content = json.loads(open('./config/task_content_pairs_likert.json').read())
#task_content = json.loads(open('./config/task_content_pairs_magnitude.json').read())

TaskAttributes = task_content['task_attributes']

if task_content['task_type'] == 'pairs':  
    html_layout = open('./DocSimPairsTemplate.html', 'r').read()
    
    if task_content['input_scale'] == 'likert':
        context_inputs = open('./contextSliderTemplate.html', 'r').read()
    elif task_content['input_scale'] == 'magnitude':
        context_inputs = open('./contextMagnitudeTemplate.html', 'r').read()
        
    html_layout = html_layout.\
        replace('${instructions_project}', task_content['instructions_project']).\
        replace('${instructions_intro}', task_content['instructions_intro']).\
        replace('${instructions_similarities}', task_content['instructions_similarities']).\
        replace('${instructions_rules}', task_content['instructions_rules']).\
        replace('${time_thr}', task_content['time_thr']).\
        replace('${context_inputs}', context_inputs)
        
    documentSets = pairSets

    
elif task_content['task_type'] == 'ranking':
    html_layout = open('./DocSimRankingTemplate.html', 'r').read()
    
    html_layout = html_layout.\
        replace('${instructions_project}', task_content['instructions_project']).\
        replace('${instructions_intro}', task_content['instructions_intro']).\
        replace('${instructions_similarities}', task_content['instructions_similarities']).\
        replace('${instructions_rules}', task_content['instructions_rules']).\
        replace('${time_thr}', task_content['time_thr'])
            
    documentSets = rankingSets
# If you're only testing, just pick one hit and run it once, with no qualification barriers
if not create_hits_in_production:
    TaskAttributes.pop('QualificationRequirements')
    TaskAttributes['MaxAssignments'] = 1
    documentSets = random.sample(documentSets,1)
pprint(TaskAttributes,indent=1) #verify the properties before running the HITs

{'AssignmentDurationInSeconds': 3600,
 'Description': 'Rank documents according to how similar they are and tell us '
                'why',
 'Keywords': 'document, similarity, reading',
 'LifetimeInSeconds': 604800,
 'MaxAssignments': 1,
 'QualificationRequirements': [{'ActionsGuarded': 'DiscoverPreviewAndAccept',
                                'Comparator': 'GreaterThanOrEqualTo',
                                'IntegerValues': [99],
                                'QualificationTypeId': '000000000000000000L0'},
                               {'ActionsGuarded': 'DiscoverPreviewAndAccept',
                                'Comparator': 'GreaterThan',
                                'IntegerValues': [1000],
                                'QualificationTypeId': '00000000000000000040'},
                               {'ActionsGuarded': 'DiscoverPreviewAndAccept',
                                'Comparator': 'In',
                                'LocaleValues': [{'Country': 'US'},
    

In [5]:
""" See hor many HITs this will generate, already multiplied by the expected number of assignments.
Multiply the resulting number by the payment to see how much money this batch will consume. """
l=[]
target_assignments = TaskAttributes['MaxAssignments']
for documentSet in documentSets:
    TaskAttributes_hit = copy.deepcopy(TaskAttributes)
    TaskAttributes_hit['MaxAssignments'] = target_assignments -\
        sum([hit['hit']['NumberOfAssignmentsCompleted'] for hit in hit_result_collection.find({
            'documents_id':documentSet['_id'],
            'type': task_content['task_type'],
            'scale': task_content['input_scale']
        })])
    l.append(TaskAttributes_hit['MaxAssignments'])
print('Remaining HITs: {}'.format(sum(l)))

Remaining HITs: 218


In [9]:
""" Create the batch of HITs """
results = []
hit_type_id = ''
batch_id = str(uuid.uuid4())
target_assignments = TaskAttributes['MaxAssignments']
for documentSet in documentSets:
    TaskAttributes_hit = copy.deepcopy(TaskAttributes) # Adjust based on how many were already done in other batches
    TaskAttributes_hit['MaxAssignments'] = target_assignments -\
        sum([hit['hit']['NumberOfAssignmentsCompleted'] for hit in hit_result_collection.find({
            'documents_id':documentSet['_id'],
            'type': task_content['task_type'],
            'scale': task_content['input_scale']
        })])
    if TaskAttributes_hit['MaxAssignments'] > 0:
        response = mt.create_hit(
            html_layout.replace('${documents}', str(documentSet['documents'])),
            **TaskAttributes_hit
        )

        hit_type_id = response['HIT']['HITTypeId']
        result = {
            '_id': response['HIT']['HITId'],
            'batch_id': batch_id,
            'type': task_content['task_type'],
            'scale': task_content['input_scale'],
            'documents': documentSet['documents'],
            'documents_id':documentSet['_id'],
            'hit': response['HIT'],
            'timestamp': datetime.now()
        }
        results.append(result)
        hit_result_collection.insert_one(result)

In [None]:
# Sandbox only: print the preview link for the HIT type created by the batch cell,
# so the new HITs can be opened and tested.
if not create_hits_in_production:
    print('You can view the HITs here:')
    group_query = "?groupId={}".format(hit_type_id)
    print(mt.mturk_environment['preview'] + group_query)

In [None]:
''' Update all non-disposed hits in the database with correct results '''
# It is much better to run the 'update_db.py' script, which runs this routine in loop and keeps the DB updated

# Materialise the ids up front (the dispose cell also lists its query first):
# the loop makes slow API calls and writes to the same collection, so it
# should not depend on a cursor that stays open meanwhile. Only _id is used
# below, hence the projection.
pending_hits = list(hit_result_collection.find(
    {'hit.HITStatus': {'$not': {'$eq': 'Disposed'}}},
    {'_id': 1}
))
for hit in pending_hits:
    print('Updating',hit['_id'],end='\r')
    try:
        # Collect the answers BEFORE taking the HIT snapshot. Going by its name,
        # approve_and_get_hit_answers presumably approves submitted assignments
        # (TODO confirm in mturk.py); a snapshot taken first could then store a
        # stale NumberOfAssignmentsCompleted, which the batch cells rely on.
        answers = mt.approve_and_get_hit_answers(hit['_id'])
        hit_snapshot = mt.client.get_hit(HITId = hit['_id'])['HIT']
        hit_result_collection.update_one(
            {'_id': hit['_id']},
            {
                "$set": {
                    "hit": hit_snapshot,
                    'answers': answers
                }
            })
    except Exception as e:
        # Best effort: report which HIT failed and carry on with the next one.
        print(hit['_id'], e, end='\n\n')
print('Done'+(' '*100))

In [4]:
""" If you set 'force' to TRUE, it will abort mission and force an expiry in all HITs and then delete them.
If you only want to delete the completed ones (make them Disposed so the update routine won't loop through tons of HITs),
keep it as FALSE."""
force = False
''' Dispose all hits in the database '''
hit_result_collection_list = list(hit_result_collection.find({'hit.HITStatus': {'$not': {'$eq': 'Disposed'}}}))
for hit in hit_result_collection_list:
    try:
        mt.client.delete_hit(HITId = hit['_id'])
        print('Deleted',hit['_id'])
    except Exception as e:
        print(hit['_id'], e)
        if force:
            try:
                mt.client.update_expiration_for_hit(HITId = hit['_id'], ExpireAt=datetime(2017, 1, 1))
                mt.client.delete_hit(HITId = hit['_id'])
                print('Deleted',hit['_id'])
            except Exception as e:
                print(hit['_id'],e)
        continue

Deleted 3BA7SXOG2M2BQ3DXWFXSGA78P4GR8E
Deleted 32K26U12EQ0F05Q5M7DI3ZACL1QDVZ
Deleted 3IV1AEQ4EUPUSFUKOAOJEJLHFNMJ8E
Deleted 3HEADTGN3S491F9A48828ORG02HVRN
Deleted 3AQN9REUUISW4ZLARI02AYE63ISYDT
Deleted 30QQTY5GNNWZ6H0VQ40R7LYXLJEU7Y
Deleted 3MQKOF1EF50SFUQACHIEP1TQSASDW9
Deleted 32CXT5U15JF36V5E09SXLLUSW3KU8L
Deleted 3HA5ODM5LD4L2U9NJX87YYWCM40VSB
Deleted 3V8JSVE8Z121W5ZANDBMGGWR3UTYEM
Deleted 3D42WVSDIB7T2JZ6IOVSYYHRFV8YF4
Deleted 39N6W9XWSGZZGJV1T81AGJH3GA8YG3
Deleted 3YO4AH2FQGWTWDTUOWVDI5RLAK30Q0
Deleted 37NXA7GVTWIYYHEXZOB6MCIHUASLVN
Deleted 3RTFSSG7UBIBD6E2PYG28E1V1JULWX
Deleted 3OYHVNTV6WASORNIDCDRCKBU9HQOKL
Deleted 3D5G8J4N6DGMHUM28XDX83SYONEVT0
Deleted 3SR6AEG6X85DGFQCDEJS1JT4VALYH2
Deleted 3OJX0UFJ12J3K0OO51YBLHP3G92U9A
Deleted 3421H3BMADTU6KDUT2UXEFGSZT4J93
Deleted 3XBXDSS89BVQ26JSZ3COAXF2AL0LXQ
Deleted 38G0E1M86PH1H71R8237DZE071LVUR
Deleted 3VJ4PFXFK6JHPJYTX7CBHC0NMRKUAP
Deleted 3CKVGCS3QJH59T87G0KID56MS8O0SP
Deleted 3VQTAXTYO6XM8T3L3LC7CNKT495UBK
Deleted 38F60IALBJT6OTLM5

Deleted 3CMV9YRYQ6DP7ME3IZXNJO7NZNMJL1
Deleted 3Z33IC0JD3Y3XFDOAR8GH3RMUGC9VF
Deleted 3VW0145YM1O9D4WOIBY2J1MCZSCJM3
Deleted 34OWYT6U4ZTYB9TOBK9X2E1SMEGI9R
Deleted 3URJ6VVYVSZ7APYFSKQB53MYCV14OC
Deleted 3BO3NEOQN3TCGS3T3QNXY9CCSWYIA6
