In [None]:
# Import necessary functions from the module
from mturk_helpers import *

# Load environment variables and initialize MTurk client
load_dotenv()  # pulls the variables defined in .env into os.environ
aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')

# Region and requester endpoint used for every MTurk call in this notebook.
# NOTE(review): the host name has no "sandbox" part, so this presumably targets
# the live marketplace — confirm before running cells that create HITs.
region_name = 'us-east-1'
endpoint_url = 'https://mturk-requester.us-east-1.amazonaws.com'

# Collect the connection settings first so they can be inspected before connecting.
client_options = {
    'endpoint_url': endpoint_url,
    'region_name': region_name,
    'aws_access_key_id': aws_access_key_id,
    'aws_secret_access_key': aws_secret_access_key,
}
mturk = boto3.client('mturk', **client_options)

# Print account balance
account_balance = mturk.get_account_balance()
print(account_balance['AvailableBalance'])

In [None]:
# Preprocessing and XML HIT creation
# Both paths are relative to the notebook's working directory.
input_directory = '../dataset_txt_format/'
# The HIT deployment cell further down lists the .xml files found in this directory.
output_directory = './HITS_N'
# NOTE(review): presumably reads the text files under input_directory and writes
# HIT XML files into output_directory — confirm against mturk_helpers.
process_directory_in_chunks(input_directory, output_directory)

In [None]:
# Qualification creation
# Field name -> list of answer options for the personalia questionnaire.
# The Norwegian keys and values are runtime data shown to workers; keep them as-is.
feltalternativer = {
    "alder": ["18-24", "25-34", "35-44", "45-54", "55-64", "65+"],
    "kjønn": ["Mann", "Kvinne", "Ikke-binær", "Foretrekker å ikke si"],
    "yrke": ["Student", "Lærer", "Ingeniør", "Kunstner", "Annet"],
    "arbeidsstatus": ["Heltid", "Deltid", "Arbeidsledig", "Pensjonert", "Annet"],
    "interesser": ["Sport", "Teknologi", "Kunst", "Vitenskap", "Reise", "Annet"],
    "hobbyer": ["Lesing", "Spilling", "Matlaging", "Fjellturer", "Håndarbeid", "Annet"],
    "nyhetslesevaner": ["Daglig", "Ukentlig", "Månedlig", "Sjelden", "Aldri"],
    "foretrukne nyhetskategorier": ["Politikk", "Økonomi", "Underholdning", "Sport", "Teknologi", "Annet"],
    "Norsk språkferdigheter": ["Flytende", "Mellomnivå", "Grunnleggende", "Ingen"],
}

# No free-text fields are passed for this version of the form.
free_text_fields = []

# Build the test XML and its answer key from the same field definitions.
personalia_test_xml = create_question_xml(feltalternativer, free_text_fields)
answer_key_xml = create_answer_key_xml(feltalternativer)

# Keyword arguments for the qualification type, gathered in one place.
qualification_settings = {
    "name": "NorwAI Norwegian Turk Personalia Qualification Preapproval v4",
    "description": "Denne kvalifikasjonsteksten er for å kartlegge våre arbeidere og vurdere deres kunnskaper i norsk og interesser",
    "test": personalia_test_xml,
    "answer_key": answer_key_xml,
    "duration": 1800,  # seconds (30 minutes)
}
qualification_type_id_2 = create_qualification_type(mturk_client=mturk, **qualification_settings)

print(f"Created Qualification Type ID: {qualification_type_id_2}")

In [None]:
# Retrieve and list all qualifications made
qualifications = list_my_qualifications(mturk)
for qual in qualifications:
    # One line per qualification type: ID, name and description.
    summary = ", ".join([
        f"ID: {qual['QualificationTypeId']}",
        f"Name: {qual['Name']}",
        f"Description: {qual['Description']}",
    ])
    print(summary)

In [None]:
# HIT deployment
# Deploys a window of the generated XML files: at most `max_files_to_process`
# files, beginning at index `starting_point` of the sorted file list.
max_files_to_process = 148
starting_point = 0
all_xml_files = [f for f in sorted(os.listdir(output_directory)) if f.endswith('.xml')]

# The end of the slice is relative to starting_point. Using max_files_to_process
# directly as the end index would shrink the batch whenever starting_point > 0
# (and make it empty once starting_point >= max_files_to_process).
batch_end = starting_point + max_files_to_process
for xml_file in all_xml_files[starting_point:batch_end]:
    xml_file_path = os.path.join(output_directory, xml_file)
    try:
        hit_id = create_hit_with_xml_file(xml_file_path, mturk, qualification_type_id_2)
        print(f"Created HIT with ID: {hit_id}")
        # NOTE(review): the worker preview URL's groupId parameter normally takes the
        # HIT group/type ID rather than the HIT ID — confirm what
        # create_hit_with_xml_file returns before relying on this link.
        preview_link = f"https://worker.mturk.com/mturk/preview?groupId={hit_id}"
        print(f"Preview your HIT: {preview_link}")
    except Exception as e:
        # Best effort: report the failure and carry on with the remaining files.
        print(f"An error occurred while creating HIT for {xml_file}: {e}")

In [None]:
# Approve qualifications manually
# Uses the qualification type created in the "Qualification creation" cell, so that
# cell must have been run in this kernel session first.
# NOTE(review): the JSON path is presumably where the helper stores or reads the
# workers' qualification answers — confirm against mturk_helpers.
approve_qualifications(mturk, qualification_type_id_2, 'mturk_qualification_personalia_second_experiment.json')

In [None]:
# Retrieve HIT assignments to a dataframe
fetch_all_hits(mturk, datetime.datetime(2024, 5, 18, tzinfo=timezone.utc), 'mturk_cache/fetched_hits.json')
results_df = get_hit_results(mturk, 'mturk_cache/fetched_hits.json', 'mturk_cache/hit_data.json')

# Extract questions and match with XML files
# Build a lookup from each XML file's question tuple to that file's path.
# If two files yield the same tuple, the one listed last wins.
folders = [output_directory]
xml_questions = {}
for folder in folders:
    for filename in os.listdir(folder):
        if filename.endswith('.xml'):
            question = extract_questions_from_xml(os.path.join(folder, filename))
            xml_questions[tuple(question)] = folder + "/" + filename

# Parse each row's question XML once; the resulting tuple is also the lookup key.
results_df['QuestionText'] = results_df['Question'].apply(lambda x: tuple(extract_texts_from_xml(x)))

# Fail with a readable message (still a KeyError) when a row matches no XML file,
# instead of a traceback that only shows the whole question tuple.
unmatched_questions = [key for key in results_df['QuestionText'] if key not in xml_questions]
if unmatched_questions:
    raise KeyError(
        f"{len(unmatched_questions)} result row(s) have question text that matches "
        f"no XML file in {folders}"
    )
results_df['QuestionXML'] = [xml_questions[key] for key in results_df['QuestionText']]

# Expire all active HITs
#expire_all_active_hits(mturk)

In [None]:
# Save results to a CSV file
folder_name = "HIT_results"
os.makedirs(folder_name, exist_ok=True)

# The local timestamp in the file name keeps earlier exports from being overwritten.
now = datetime.datetime.now()
dt_string = now.strftime("%d-%m-%Y %H-%M-%S")
file_name = "contains_dataset_and_not_contains_hit_results_all_with_questions_" + dt_string + ".csv"

csv_path = os.path.join(folder_name, file_name)
results_df.to_csv(csv_path, index=False)
print("Results exported to " + file_name)

In [None]:
# Format the dataframe into readable txt files
# Each assignment that is not skipped gets two files named after its AssignmentId:
# a .json file with the structured record and a .txt file with the same content as text.
output_dir = './ParsedNotFilteredHITS'
os.makedirs(output_dir, exist_ok=True)

for index, row in results_df.iterrows():
    question_text = parse_question_xml(row['Question'])
    answer_text, passed = parse_answer_xml(row['Answer'])
    if passed < 0:
        # A negative `passed` value means no files are written for this assignment.
        print(f"Worker {row['WorkerId']} did not pass all questions for HIT {row['HITId']}")
        continue

    json_data = {
        "WorkerId": row["WorkerId"],
        "HITId": row["HITId"],
        "AssignmentId": row["AssignmentId"],
        "AssignmentStatus": row.get("AssignmentStatus", ""),
        "AcceptTime": row.get("AcceptTime", ""),
        "SubmitTime": row.get("SubmitTime", ""),
        "Duration": row.get("Duration", ""),
        "QuestionsAndAnswers": [],
    }

    # Text header: one "key: value" line per metadata field.
    text_parts = [
        f"{key}: {value}\n"
        for key, value in json_data.items()
        if key != "QuestionsAndAnswers"
    ]

    for q in question_text:
        # The last whitespace-separated token of the title selects the answers
        # whose question IDs start with "text<token>_".
        q_index = q['Title'].split()[-1]
        answer_prefix = f"text{q_index}_"
        related_answers = [a for a in answer_text if a['Question ID'].startswith(answer_prefix)]

        json_data["QuestionsAndAnswers"].append({
            "Title": q['Title'],
            "QuestionText": q['Text'],
            "Answers": related_answers,
        })

        text_parts.append(f"\nTitle: {q['Title']}\nQuestion Text: {q['Text']}\n")
        text_parts.extend(f"- {a['Question ID']}: {a['Answer']}\n" for a in related_answers)

    text_content = "".join(text_parts)

    # Structured record first, then the plain-text rendering.
    json_file_path = os.path.join(output_dir, f"{row['AssignmentId']}.json")
    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(json_data, json_file, indent=4)

    txt_file_path = os.path.join(output_dir, f"{row['AssignmentId']}.txt")
    with open(txt_file_path, 'w', encoding='utf-8') as txt_file:
        txt_file.write(text_content)

print("Files saved in directory:", output_dir)


In [None]:
# Approve or reject HIT assignments
from HITOrganizer import HITOrganizer
from BanFilter import BanFilter
from AssignmentFilter import AssignmentFilter

# All three helpers work from the per-assignment files in the parsed-HITs directory.
parsed_hits_dir = "ParsedNotFilteredHITS"
hit_organizer = HITOrganizer(parsed_hits_dir, "UserProfiles3")
assignment_filter = AssignmentFilter(parsed_hits_dir)
rejected_assignment_ids = assignment_filter.filter_assignments()
workers_to_ban = BanFilter(parsed_hits_dir).get_worker_ids()

# Approve every assignment that was not flagged for rejection and still has
# status 'Submitted'.
not_rejected = ~results_df['AssignmentId'].isin(rejected_assignment_ids)
still_submitted = results_df['AssignmentStatus'] == 'Submitted'
approved_hits = results_df[not_rejected & still_submitted]
approved_hits_list = approved_hits['AssignmentId'].tolist()

# Order matters for the audit trail: rejections, then approvals, then bans.
# NOTE(review): rejections are not restricted to 'Submitted' assignments the way
# approvals are — confirm reject_hit copes with already-reviewed assignments.
for rejected_id in rejected_assignment_ids:
    reject_hit(mturk, rejected_id, hit_organizer)

for approved_id in approved_hits_list:
    approve_hit(mturk, approved_id, hit_organizer)

for worker_id in workers_to_ban:
    ban_worker(mturk, worker_id)

In [None]:
# Redeploy missing assignments
questions_data = retrieve_and_count_questions(mturk, rejected_assignment_ids)
#questions_data = retrieve_and_count_questions_from_cache(mturk, rejected_assignment_ids, 'mturk_cache/fetched_hits.json')

# The retrieval cell stores tuples in results_df['QuestionText'], so comparing that
# column directly with str(question_tuple) never matches. Compare string forms on
# both sides instead; this also works if the column already holds strings (for
# example after a CSV round trip). The conversion is done once, outside the loop.
question_text_keys = results_df['QuestionText'].map(str)

# Attach the source XML file to each question that appears in the results.
for question_tuple in questions_data:
    question_string = str(question_tuple)
    matching_xml = results_df.loc[question_text_keys == question_string, 'QuestionXML']
    if not matching_xml.empty:
        questions_data[question_tuple]['xml_file'] = matching_xml.iloc[0]
    else:
        print(f"XML data not found for question tuple: {question_tuple}")

# Display one entry for inspection; None when no questions were retrieved
# (previously an empty result raised StopIteration here).
sample_question_data = next(iter(questions_data.values()), None)
sample_question_data

In [None]:
# Threshold used for the summary below. It is defined once so the printed message
# cannot drift from the comparison (the message previously said "3" while the
# code compared against 2).
# NOTE(review): create_consolidated_additional_hits applies its own logic —
# confirm it uses the same threshold.
required_assignments = 2

# Shortfall per question that has fewer than the required number of assignments.
shortfalls = [
    required_assignments - data['total_count']
    for data in questions_data.values()
    if data['total_count'] < required_assignments
]
counter = len(shortfalls)
additional_assignments = sum(shortfalls)

print(f"Number of questions with less than {required_assignments} assignments: {counter}")
print(f"Total additional assignments needed: {additional_assignments}")

create_consolidated_additional_hits(mturk, questions_data, qualification_type_id_2)
