# Initialize client

In [None]:
import boto3
from dotenv import load_dotenv
import os
import xmltodict
import logging

load_dotenv()  # Load environment variables (AWS credentials) from a local .env file
aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.environ.get(
    'AWS_SECRET_ACCESS_KEY')
region_name = 'us-east-1'

# Sandbox endpoint (no real money is spent). Uncomment this line and comment
# out the production endpoint below to test against the sandbox.
#endpoint_url = 'https://mturk-requester-sandbox.us-east-1.amazonaws.com'

# Production endpoint -- currently ACTIVE: HITs created through this client
# are paid with real funds.
endpoint_url = 'https://mturk-requester.us-east-1.amazonaws.com'

# Client used by every later cell in this notebook.
mturk = boto3.client(
    'mturk',
    endpoint_url=endpoint_url,
    region_name=region_name,
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

# Sanity check of the connection: prints the available balance.
# In the MTurk Developer Sandbox this is always $10,000.00; with the
# production endpoint selected above it is the real account balance.
print(mturk.get_account_balance()['AvailableBalance'])

# Preprocessing - xml - hits - qualifications 

## XML production helpers

In [None]:
import boto3
import os
import chardet  # You might need to install this package
import re  # Import regex module

# Compiled once at definition time; the pattern never changes between calls.
# The previous catch-all range U+24C2..U+1F251 also covered CJK, Hangul and
# many other scripts, so ordinary text was deleted. It is replaced here by the
# specific symbol blocks that actually contain emoji.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicator symbols)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics/ideographs
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002702-\U000027B0"  # dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows (e.g. star)
    "\U000024C2"             # circled latin capital letter M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Return *text* with emoji and pictographic symbols removed.

    Letters of any script (including Norwegian characters and CJK text),
    digits and ordinary punctuation are left untouched.

    Args:
        text: The string to clean.

    Returns:
        A copy of *text* without the matched emoji characters.
    """
    return _EMOJI_PATTERN.sub('', text)


def extract_body_from_text(file_path):
    """Return the emoji-free article body stored in a task text file.

    The body is the text between the first 'Body:' marker and the following
    'Category:' marker, with surrounding whitespace stripped.

    Args:
        file_path: Path of the file to read; it is decoded as UTF-8.

    Returns:
        The cleaned body string, or None (after printing a message) when the
        file is not valid UTF-8 or lacks one of the two markers.
    """
    # Read raw bytes so that a decoding failure can be handled explicitly.
    with open(file_path, 'rb') as handle:
        payload = handle.read()

    try:
        decoded = payload.decode('utf-8')
    except UnicodeDecodeError:
        print(f"Unicode Decode Error in file: {file_path}. Skipping file.")
        return None

    has_markers = 'Body:' in decoded and 'Category:' in decoded
    if not has_markers:
        print(f"Markers not found in file: {file_path}")
        return None

    after_body = decoded.split('Body:')[1]
    before_category = after_body.split('Category:')[0]
    return remove_emojis(before_category.strip())


def _pairs_from_section(section, is_correct):
    """Collect (question, answer, is_correct) tuples from one QA section."""
    lines = [line for line in section.split('\n')
             if line.startswith(('Question', 'Answer'))]
    pairs = []
    # Lines are paired positionally: even index = question, odd index = answer.
    # A trailing line without a partner is ignored.
    for question_line, answer_line in zip(lines[::2], lines[1::2]):
        # maxsplit=1 strips only the leading marker; without it the text was
        # cut off at any later occurrence of the marker word in the line.
        question = question_line.split('Question', 1)[1].strip()
        answer = answer_line.split('Answer', 1)[1].strip()
        pairs.append((question, answer, is_correct))
    return pairs


def extract_qa_pairs(content):
    """Parse the 'True QA:' and 'False QA:' sections of a task file.

    Args:
        content: Full text of a task file. It must contain a 'True QA:'
            marker; each 'False QA:' marker starts a section of incorrect
            pairs.

    Returns:
        A list of (question, answer, is_correct) tuples. Pairs from the
        'True QA:' section come first (is_correct=True), followed by the pairs
        of every 'False QA:' section (is_correct=False).

    Raises:
        IndexError: If 'True QA:' is missing, or if a line at a question
            (answer) position does not contain 'Question' ('Answer').
    """
    true_section = content.split('True QA:')[1].split('False QA:')[0].strip()
    qa_pairs = _pairs_from_section(true_section, True)

    for false_section in content.split('False QA:')[1:]:
        qa_pairs.extend(_pairs_from_section(false_section, False))

    return qa_pairs


def parse_question_xml(question_xml):
    """Extract the <Overview> entries of an MTurk QuestionForm XML string.

    Args:
        question_xml: QuestionForm XML as a string.

    Returns:
        A list of {'Title': ..., 'Text': ...} dicts, one per <Overview>
        element, in document order. 'Title' is 'No Title' when the element
        has no <Title> child.

    Raises:
        KeyError: If the form has no <Overview> element or an overview has no
            <Text> child.
    """
    question_data = xmltodict.parse(question_xml)
    overviews = question_data['QuestionForm']['Overview']

    # xmltodict returns a single mapping (not a list) when the element occurs
    # only once; normalise so the loop below always sees a list.
    if not isinstance(overviews, list):
        overviews = [overviews]

    return [
        {'Title': overview.get('Title', 'No Title'), 'Text': overview['Text']}
        for overview in overviews
    ]

def parse_answer_xml(answer_xml):
    """Parse an MTurk QuestionFormAnswers XML string.

    Args:
        answer_xml: QuestionFormAnswers XML as a string.

    Returns:
        A tuple (parsed_answers, passed):
        - parsed_answers: list of {'Question ID': ..., 'Answer': ...} dicts,
          where 'Answer' is the free text, the selection identifier(s), or
          'No Answer Text' when neither is present.
        - passed: 3 minus the number of selection answers whose identifier
          contains the substring "incorrect".
    """
    parsed_data = xmltodict.parse(answer_xml)
    parsed_answers = []
    passed = 3  # starts at 3; reduced by one per incorrect selection

    # An empty root element is parsed as None, so guard before looking inside.
    form = parsed_data.get('QuestionFormAnswers')
    answers = form.get('Answer', []) if isinstance(form, dict) else []
    # A single <Answer> is parsed as a mapping rather than a list.
    if not isinstance(answers, list):
        answers = [answers]

    for answer in answers:
        qid = answer['QuestionIdentifier']
        if 'FreeText' in answer:
            answer_text = answer['FreeText']
        elif 'SelectionIdentifier' in answer:
            answer_text = answer['SelectionIdentifier']
            # The value may be None (empty element) or a list (several
            # selections); only string identifiers can be marked incorrect.
            chosen = answer_text if isinstance(answer_text, list) else [answer_text]
            if any("incorrect" in item for item in chosen if isinstance(item, str)):
                passed -= 1
        else:
            answer_text = "No Answer Text"

        parsed_answers.append({'Question ID': qid, 'Answer': answer_text})

    return parsed_answers, passed

## HIT XML creation

In [None]:
import os
import xml.etree.ElementTree as ET

def _cdata_safe(text):
    """Split any ']]>' in *text* so it cannot close an enclosing CDATA section."""
    return text.replace(']]>', ']]]]><![CDATA[>')


def create_HIT_test_xml(files, questions_per_text=3):
    """Build the QuestionForm XML for one HIT from a list of task text files.

    For every usable file the form contains an overview with the article
    body, one multiple-choice question over its question-answer pairs, one
    free-text summary question and one free-text source question.

    Args:
        files: Paths of the task files to include. Files whose body cannot be
            extracted (not UTF-8, or missing markers) are skipped; their
            position number is still consumed.
        questions_per_text: Maximum number of question-answer pairs offered
            as selections per text.

    Returns:
        The complete QuestionForm XML as a single string.
    """
    xml_parts = ['<?xml version="1.0" encoding="UTF-8"?>']
    xml_parts.append('<QuestionForm xmlns="http://mechanicalturk.amazonaws.com/AWSMechanicalTurkDataSchemas/2005-10-01/QuestionForm.xsd">')
    xml_parts.append('<Overview><Text>Velkommen til oppgaven! Vennligst les teksten nedenfor nøye. Din oppgave er å velge det korrekte spørsmål-svar paret som tilsvarer teksten og gi en oppsummering på to til tre setninger av den delen i teksten du syntes var mest interessant. Oppsumeringen skal ikke være på jeg form, og ikke inneholde annen tekst enn bare oppsummeringen. I kilder feltet skal du kopiere inn den delen av nyhetsartikkelen du oppsumerte, har du eksempelvis oppsumert fra de første tre setningene i nyhetsartikkelen skal du kopiere inn disse. Her er det viktig at du ikke skriver inn setninger som ikke finnes i den originale teksten.</Text></Overview>')
    xml_parts.append('<Overview><Text>Teksten din vil bli evaluert basert på nøyaktigheten av svarene dine. For å unngå avvisning, er det viktig at du svarer korrekt på kontrollspørsmålene og velger et spørsmål-svar par som nøyaktig reflekterer informasjonen i teksten. Ukorrekte eller irrelevante svar kan føre til at bidraget ditt blir avvist.</Text></Overview>')
    xml_parts.append('<Overview><Text>Du kan kvalifisere deg for en bonus basert på kvaliteten og kvantiteten av arbeidet ditt. Høykvalitetsbidrag som viser en grundig forståelse av teksten og et presist valg av spørsmål-svar par, vil øke sjansene dine for å motta en bonus. Jo flere oppgaver du fullfører med høy kvalitet, desto større er sjansen for bonus.</Text></Overview>')

    for idx, file_path in enumerate(files, start=1):
        # Extract the body first: it handles undecodable files gracefully,
        # whereas opening the file as UTF-8 text up front would raise before
        # the skip below could take effect.
        body = extract_body_from_text(file_path)
        if not body:
            continue

        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        qa_pairs = extract_qa_pairs(content)
        xml_parts.append(f'<Overview><Title>Tekst {idx}</Title><Text><![CDATA[{_cdata_safe(body)}]]></Text></Overview>')

        # Multiple-choice question over the question-answer pairs.
        # NOTE(review): extract_qa_pairs lists the correct pairs first, so the
        # slice below tends to put the correct option at the top -- confirm
        # whether the options should be shuffled.
        question_element = f'<Question><QuestionIdentifier>text{idx}_questions</QuestionIdentifier><IsRequired>true</IsRequired>'
        question_element += '<QuestionContent><Text>Velg korrekt spørsmål-svar par som tilhører teksten:</Text></QuestionContent>'
        selection_parts = []
        for q_idx, (question, answer, is_correct) in enumerate(qa_pairs[:questions_per_text], start=1):
            # The identifier encodes correctness; parse_answer_xml looks for
            # the substring "incorrect" in it.
            selection_id = f'{"correct" if is_correct else "incorrect"}_{idx}_{q_idx}'
            selection_parts.append(
                f'<Selection><SelectionIdentifier>{selection_id}</SelectionIdentifier><Text><![CDATA[Spørsmål {_cdata_safe(question)} Svar {_cdata_safe(answer)}]]></Text></Selection>'
            )
        selections = ''.join(selection_parts)

        question_element += f'<AnswerSpecification><SelectionAnswer><StyleSuggestion>radiobutton</StyleSuggestion><Selections>{selections}</Selections></SelectionAnswer></AnswerSpecification></Question>'
        xml_parts.append(question_element)

        # Free-text summary question (non-empty answer required).
        summary_question = f'<Question><QuestionIdentifier>text{idx}_summary</QuestionIdentifier><IsRequired>true</IsRequired>'
        summary_question += '<QuestionContent><Text>Skriv en oppsummering på to til tre setninger av den delen i teksten du syntes var mest interessant. Oppsumeringen skal ikke være på jeg form:</Text></QuestionContent>'
        summary_question += '<AnswerSpecification><FreeTextAnswer><Constraints><Length minLength="1" /></Constraints></FreeTextAnswer></AnswerSpecification></Question>'
        xml_parts.append(summary_question)

        # Free-text source citation question (non-empty answer required).
        source_question = f'<Question><QuestionIdentifier>text{idx}_source</QuestionIdentifier><IsRequired>true</IsRequired>'
        source_question += '<QuestionContent><Text>Kilde - Kopier inn den delen av nyhetsartikkelen du oppsumerte fra:</Text></QuestionContent>'
        source_question += '<AnswerSpecification><FreeTextAnswer><Constraints><Length minLength="1" /></Constraints></FreeTextAnswer></AnswerSpecification></Question>'
        xml_parts.append(source_question)

    xml_parts.append('</QuestionForm>')
    return ''.join(xml_parts)

def process_directory_in_chunks(directory, output_dir, chunk_size=3, questions_per_text=3):
    """Write one HIT XML file per group of `chunk_size` task files.

    The '.txt' files of `directory` are taken in sorted order. When the last
    group is short, it is topped up with files from the start of the list.
    Output files are named 'hit_<n>.xml', numbered from 0.

    Args:
        directory: Folder containing the task '.txt' files.
        output_dir: Folder that receives the XML files (created if missing).
        chunk_size: Number of texts per HIT.
        questions_per_text: Forwarded to create_HIT_test_xml.
    """
    os.makedirs(output_dir, exist_ok=True)
    txt_paths = [
        os.path.join(directory, name)
        for name in sorted(os.listdir(directory))
        if name.endswith('.txt')
    ]

    for hit_number, start in enumerate(range(0, len(txt_paths), chunk_size)):
        batch = txt_paths[start:start + chunk_size]

        # Only the final batch can be short; pad it from the head of the list.
        shortfall = chunk_size - len(batch)
        if shortfall > 0:
            batch.extend(txt_paths[:shortfall])

        xml_text = create_HIT_test_xml(batch, questions_per_text=questions_per_text)

        target_path = os.path.join(output_dir, f'hit_{hit_number}.xml')
        with open(target_path, 'w', encoding='utf-8') as target:
            target.write(xml_text)
        print(f"Created HIT XML file: {target_path}")

# Example usage: runs immediately when the cell is executed and (re)writes
# the hit_<n>.xml files in the output directory.
input_directory = '../new_tasks 2/'  # folder with the task .txt files
output_directory = './HITS5'  # folder that receives the generated HIT XML
process_directory_in_chunks(input_directory, output_directory)

## Qualification creation

### Personalia qualification xml

In [None]:
import xml.etree.ElementTree as ET

def create_question_xml(field_dict, free_text_fields):
    """Build a QuestionForm XML string for the personalia qualification test.

    Args:
        field_dict: Mapping of field name -> iterable of answer options. The
            field name is used both as question identifier and inside the
            question text.
        free_text_fields: Collection of field names that get a one-line
            free-text answer instead of radio-button selections.

    Returns:
        The serialized QuestionForm XML as a str.
    """
    root = ET.Element(
        "QuestionForm",
        xmlns="http://mechanicalturk.amazonaws.com/AWSMechanicalTurkDataSchemas/2005-10-01/QuestionForm.xsd",
    )

    for field_name, choices in field_dict.items():
        q_node = ET.SubElement(root, "Question")
        ET.SubElement(q_node, "QuestionIdentifier").text = field_name
        ET.SubElement(q_node, "IsRequired").text = "true"
        content_node = ET.SubElement(q_node, "QuestionContent")
        ET.SubElement(content_node, "Text").text = f"Hva er din/ditt/dine {field_name}?"

        # Both answer kinds live under the same AnswerSpecification element.
        wants_free_text = field_name in free_text_fields
        spec_node = ET.SubElement(q_node, "AnswerSpecification")

        if wants_free_text:
            free_text_node = ET.SubElement(spec_node, "FreeTextAnswer")
            ET.SubElement(free_text_node, "NumberOfLinesSuggestion").text = "1"
            continue

        selection_answer_node = ET.SubElement(spec_node, "SelectionAnswer")
        ET.SubElement(selection_answer_node, "StyleSuggestion").text = "radiobutton"
        selections_node = ET.SubElement(selection_answer_node, "Selections")
        for choice in choices:
            # The option text doubles as its identifier.
            choice_node = ET.SubElement(selections_node, "Selection")
            ET.SubElement(choice_node, "SelectionIdentifier").text = choice
            ET.SubElement(choice_node, "Text").text = choice

    return ET.tostring(root, encoding='unicode')

def create_answer_key_xml(field_dict):
    """Build an AnswerKey XML string with one entry per field.

    Every question gets a single answer option whose selection identifier is
    "anyResponse" and whose score is 1.

    Args:
        field_dict: Iterable of field names (e.g. the personalia dict); only
            the keys are used.

    Returns:
        The serialized AnswerKey XML as a str.
    """
    root = ET.Element(
        "AnswerKey",
        xmlns="http://mechanicalturk.amazonaws.com/AWSMechanicalTurkDataSchemas/2005-10-01/AnswerKey.xsd",
    )

    def add_leaf(parent, tag, text):
        # Append a child element that only carries text.
        ET.SubElement(parent, tag).text = text

    for field_name in field_dict:
        q_node = ET.SubElement(root, "Question")
        add_leaf(q_node, "QuestionIdentifier", field_name)
        option_node = ET.SubElement(q_node, "AnswerOption")
        add_leaf(option_node, "SelectionIdentifier", "anyResponse")
        add_leaf(option_node, "AnswerScore", "1")

    return ET.tostring(root, encoding='unicode')


# Example usage
# Personalia fields for the qualification test: each key is a field name
# (used as question identifier and inside the question text) and each value
# is the list of options offered for it. All texts are Norwegian because they
# are shown to the workers.
feltalternativer = {
    "alder": ["18-24", "25-34", "35-44", "45-54", "55-64", "65+"],
    "kjønn": ["Mann", "Kvinne", "Ikke-binær", "Foretrekker å ikke si"],
    "yrke": ["Student", "Lærer", "Ingeniør", "Kunstner", "Annet"],
    "arbeidsstatus": ["Heltid", "Deltid", "Arbeidsledig", "Pensjonert", "Annet"],
    "interesser": ["Sport", "Teknologi", "Kunst", "Vitenskap", "Reise", "Annet"],
    "hobbyer": ["Lesing", "Spilling", "Matlaging", "Fjellturer", "Håndarbeid", "Annet"],
    "nyhetslesevaner": ["Daglig", "Ukentlig", "Månedlig", "Sjelden", "Aldri"],
    "foretrukne nyhetskategorier": ["Politikk", "Økonomi", "Underholdning", "Sport", "Teknologi", "Annet"],
    "Norsk språkferdigheter": ["Flytende", "Mellomnivå", "Grunnleggende", "Ingen"],
    # Add more fields and options as needed
}

def create_qualification_type(mturk_client, name, description, test, answer_key, duration):
    """Create an active, auto-scored MTurk qualification type.

    Note: a later cell in this notebook defines another function with the
    same name and no `answer_key` parameter; whichever cell runs last wins.

    Args:
        mturk_client: MTurk client used for the request.
        name: Name of the qualification type.
        description: Description shown to workers.
        test: QuestionForm XML of the qualification test.
        answer_key: AnswerKey XML used to score the test.
        duration: Time allowed for the test, in seconds.

    Returns:
        The QualificationTypeId of the created qualification type.
    """
    request = {
        'Name': name,
        'Description': description,
        'Test': test,
        'AnswerKey': answer_key,
        'TestDurationInSeconds': duration,
        'QualificationTypeStatus': 'Active',
        'AutoGranted': False,
    }
    created = mturk_client.create_qualification_type(**request)
    return created['QualificationType']['QualificationTypeId']


# No field is answered as free text: every personalia question is rendered
# as a radio-button selection.
free_text_fields = []
personalia_test_xml = create_question_xml(feltalternativer, free_text_fields)
# Print the generated test XML for inspection.
print(personalia_test_xml)
# Generate the matching answer key XML and print it for inspection.
answer_key_xml = create_answer_key_xml(feltalternativer)
print(answer_key_xml)

### Personalia qualification creation 

In [None]:
import boto3

def create_qualification_type(mturk_client, name, description, test, duration):
    """Create an active MTurk qualification type without an answer key.

    No answer key is supplied, so submitted tests are not scored
    automatically. Retakes are blocked for 10 years.

    Note: this replaces the earlier function of the same name in this
    notebook, which additionally took an `answer_key` argument.

    Args:
        mturk_client: MTurk client used for the request.
        name: Name of the qualification type.
        description: Description shown to workers.
        test: QuestionForm XML of the qualification test.
        duration: Time allowed for the test, in seconds.

    Returns:
        The QualificationTypeId of the created qualification type.
    """
    request = {
        'Name': name,
        'Description': description,
        'Test': test,
        'TestDurationInSeconds': duration,
        'QualificationTypeStatus': 'Active',
        # 10 years (in seconds) before a retake is possible
        'RetryDelayInSeconds': 315360000,
    }
    created = mturk_client.create_qualification_type(**request)
    return created['QualificationType']['QualificationTypeId']


# Create the qualification type from personalia_test_xml. No answer key is
# passed, so requests are not scored automatically and have to be handled
# manually. Running this cell creates a new qualification type on MTurk.

qualification_type_id_2 = create_qualification_type(
    mturk_client=mturk,
    name="NorwAI Norwegian Turk Personalia Qualification Preapproval v4",
    description="Denne kvalifikasjonsteksten er for å kartlegge våre arbeidere og vurdere deres kunnskaper i norsk og interesser",
    test=personalia_test_xml,
    duration=1800  # Duration in seconds (30 minutes)
)

print(f"Created Qualification Type ID: {qualification_type_id_2}")

# Deployment and processing

## Retrieve all qualifications made

In [None]:
def list_my_qualifications(mturk_client):
    """Return every requestable qualification type owned by the caller.

    Pages through `list_qualification_types` (100 results per request) until
    no NextToken is returned.

    Args:
        mturk_client: MTurk client used for the requests.

    Returns:
        A list with the 'QualificationTypes' entries of all pages.
    """
    collected = []
    request = {
        'MustBeRequestable': True,
        'MustBeOwnedByCaller': True,
        'MaxResults': 100,  # Adjust the number of results as needed
    }

    while True:
        page = mturk_client.list_qualification_types(**request)
        collected.extend(page['QualificationTypes'])

        token = page.get('NextToken')
        if not token:
            return collected
        # Continue from where the previous page stopped.
        request['NextToken'] = token


# Fetch all qualification types owned by this account and print one line
# (id, name, description) for each.
qualifications = list_my_qualifications(mturk)
for qual in qualifications:
    print(f"ID: {qual['QualificationTypeId']}, Name: {qual['Name']}, Description: {qual['Description']}")

## HIT deployment! running will deploy HITs

In [None]:
# Qualification type ID of the personalia qualification that workers must
# hold to accept the HITs. This overrides any value set by the creation cell
# and is read as a global by create_hit_with_xml_file.
qualification_type_id_2 = "3BEYYY5C1NI3YYW23CETY7BDKQQ17F"

In [None]:
def create_hit_with_xml_file(xml_file_path, mturk_client):
    """Create one HIT on MTurk from a QuestionForm XML file.

    Workers must hold the personalia qualification (global
    `qualification_type_id_2`) and have a value above 60 for the system
    qualification "000000000000000000L0" to accept the HIT.

    Args:
        xml_file_path: Path of the QuestionForm XML file to upload.
        mturk_client: MTurk client used for the request.

    Returns:
        The HITId of the created HIT.
    """
    # The HIT XML files are written as UTF-8 and contain Norwegian text, so
    # read them with the same encoding instead of the platform default.
    with open(xml_file_path, 'r', encoding='utf-8') as file:
        question_xml = file.read()

    # Create HIT
    response = mturk_client.create_hit(
        Title='NorwAI Norwegian/Norsk Annotation',
        Description='Les en nyhetsartikkel og gi et sammendrag og svar på spørsmål.',
        Keywords='nyheter, annotering, sammendrag, lesing',
        Reward='6.00',  # Adjust as necessary
        MaxAssignments=2,
        LifetimeInSeconds=1920000,
        AssignmentDurationInSeconds=1800,  # 30 minutes
        Question=question_xml,
        QualificationRequirements=[
            {
                # Personalia qualification: the worker must hold it.
                'QualificationTypeId': qualification_type_id_2,
                'Comparator': 'Exists',
                'ActionsGuarded': 'Accept'
            },
            {
                # MTurk system qualification; presumably the percentage of
                # approved assignments -- confirm against the MTurk docs.
                'QualificationTypeId': "000000000000000000L0",
                'Comparator': 'GreaterThan',
                'IntegerValues': [60],
                'ActionsGuarded': 'Accept'
            }
        ],
    )
    return response['HIT']['HITId']

# Deployment script: creates one HIT per XML file in the output directory.
# WARNING: with the production endpoint configured this spends real money.
output_directory = './HITS5'

max_files_to_process = 148  # Set the number of files you want to process
starting_point = 0  # Set the starting point for the files
# Get the list of all xml files in the output directory
# NOTE(review): sorted() orders names lexicographically, so 'hit_10.xml'
# comes before 'hit_2.xml'.
all_xml_files = [f for f in sorted(os.listdir(output_directory)) if f.endswith('.xml')]
# Process only a limited number of files based on max_files_to_process
# NOTE(review): max_files_to_process is used as the END index of the slice,
# so with starting_point > 0 fewer than max_files_to_process files are
# deployed -- confirm this is intended.
for xml_file in all_xml_files[starting_point:max_files_to_process]:
    xml_file_path = os.path.join(output_directory, xml_file)
    try:
        hit_id = create_hit_with_xml_file(xml_file_path, mturk)
        print(f"Created HIT with ID: {hit_id}")
        # Adjust the preview link based on the endpoint
        # NOTE(review): the link passes the HIT id as `groupId`; MTurk preview
        # links are normally built from the HIT group id -- confirm the link
        # works.
        if 'sandbox' in endpoint_url:
            preview_link = f"https://workersandbox.mturk.com/mturk/preview?groupId={hit_id}"
        else:
            preview_link = f"https://worker.mturk.com/mturk/preview?groupId={hit_id}"
        print(f"Preview your HIT: {preview_link}")
    
    # Best effort: a failure for one file is reported and the loop continues.
    except Exception as e:
        print(f"An error occurred while creating HIT for {xml_file}: {e}")


## Approve qualifications manually

### This is needed for the workers' personalia to be saved and for them to be able to actually accept and take a HIT assignment

In [None]:
# Check that "Norsk språkferdigheter" is "Flytende" (the age check on "alder" is currently disabled)
def check_qualification(answers):
    """Decide whether a worker's personalia answers qualify them.

    A worker is rejected only when the 'Norsk språkferdigheter' question was
    answered with anything other than 'Flytende'. An age-based filter on
    'alder' existed previously but is disabled.

    Args:
        answers: List of {'Question ID': ..., 'Answer': ...} dicts, as
            produced by parse_answer_xml.

    Returns:
        True if the worker qualifies, False otherwise.
    """
    return not any(
        entry['Question ID'] == 'Norsk språkferdigheter' and entry['Answer'] != 'Flytende'
        for entry in answers
    )

### Approves all pending workers for a qualification
#### Warning!!! After approval all answers will not be retrievable again

In [None]:
import json

# Qualification type whose pending requests are processed, and the file that
# receives a JSON copy of every approved request (including its answers).
qualification_type_id = '3BEYYY5C1NI3YYW23CETY7BDKQQ17F'
file_name = 'mturk_qualification_personalia_second_experiment.json'
try:
    # Retrieve the list of qualification requests
    # NOTE(review): only the first page (at most 100 requests) is fetched;
    # no NextToken is followed, so re-run the cell until nothing is pending.
    qualification_requests = mturk.list_qualification_requests(
        QualificationTypeId=qualification_type_id,
        MaxResults=100  # Adjust as needed, maximum is 100
    )

    # Iterate over the qualification requests
    for request in qualification_requests.get('QualificationRequests', []):
        request_id = request['QualificationRequestId']
        # parse_answer_xml returns (answers, passed); only the answers matter here.
        if not check_qualification(parse_answer_xml(request["Answer"])[0]):
            print(f"Qualification request {request_id} did not meet the criteria. Rejecting...")
            # Reject the qualification request
            # (rejected requests are NOT written to the log file)
            mturk.reject_qualification_request(
                QualificationRequestId=request_id,
                Reason='Your answers did not meet the criteria for this qualification.'
            )
            continue
        # Convert the request to a JSON string
        # (default=str stringifies values json cannot encode, e.g. datetimes)
        request_json = json.dumps(request, indent=4, default=str)

        # Append the JSON string to the file
        # This happens BEFORE approval, because the answers cannot be
        # retrieved again once the request has been approved.
        with open(file_name, 'a') as file:
            file.write(request_json + '\n')

        # Approve each qualification request
        mturk.accept_qualification_request(
            QualificationRequestId=request_id
        )
        print(f"Approved and logged request: {request_id}")
# Any error aborts the remaining requests of this run and is only printed.
except Exception as e:
    print(f"An error occurred: {e}")

# Printed even if an error interrupted the loop above.
print(f"All processed requests have been appended to {file_name}")

# Retrieval and postprocessing

## Retrieve all HITs assignments from Turk to dataframe

In [None]:
import os
import json
import datetime
from datetime import timezone
import concurrent.futures
import pandas as pd

# Cache directory and file setup
cache_directory = 'mturk_cache'
cache_file = 'hit_data.json'
# JSON file holding the list of HITs written by fetch_all_hits.
fetched_hits_file = os.path.join(cache_directory, 'fetched_hits.json')
# JSON file holding the per-HIT assignment cache (see load_cache/save_cache).
full_cache_path = os.path.join(cache_directory, cache_file)
# Create the cache directory up front so later writes cannot fail on it.
os.makedirs(cache_directory, exist_ok=True)

def serialize_datetime(obj):
    """Return the ISO-8601 string of a datetime, for use as a JSON `default`.

    Note: a later definition with the same name in this file (which also
    handles timedelta) replaces this one once it has been executed.

    Raises:
        TypeError: If *obj* is not a datetime.datetime.
    """
    if not isinstance(obj, datetime.datetime):
        raise TypeError("Type %s not serializable" % type(obj))
    return obj.isoformat()

def preprocess_hit(hit):
    """Replace top-level datetime values of a HIT dict with ISO strings.

    The dict is modified in place; nested values are not touched.

    Args:
        hit: HIT dictionary.

    Returns:
        The same dict object, for convenience.
    """
    iso_values = {
        field: stamp.isoformat()
        for field, stamp in hit.items()
        if isinstance(stamp, datetime.datetime)
    }
    hit.update(iso_values)
    return hit


def fetch_all_hits(mturk_client, cutoff_date=None):
    """Fetch all HITs created after a cutoff and save them to `fetched_hits_file`.

    Pages through `list_hits` until no NextToken is returned. Datetime fields
    of the kept HITs are converted to ISO strings before saving.

    Args:
        mturk_client: MTurk client used for the requests.
        cutoff_date: Timezone-aware datetime; only HITs created strictly
            after it are kept. Defaults to 2024-05-18 00:00 UTC.

    Returns:
        None. The result is written to `fetched_hits_file` as JSON.
    """
    if cutoff_date is None:
        cutoff_date = datetime.datetime(2024, 5, 18, tzinfo=timezone.utc)

    print("Fetching all HITs from MTurk...")
    all_hits = []
    next_token = None
    while True:
        response = mturk_client.list_hits(NextToken=next_token) if next_token else mturk_client.list_hits()
        for hit in response['HITs']:
            created = hit['CreationTime']
            # Only naive timestamps get UTC attached. Replacing the tzinfo of
            # an already-aware timestamp would relabel it without converting
            # and thereby shift the instant being compared.
            if created.tzinfo is None:
                created = created.replace(tzinfo=timezone.utc)
            if created > cutoff_date:
                all_hits.append(preprocess_hit(hit))
        next_token = response.get('NextToken')
        if not next_token:
            break
    save_json(all_hits, fetched_hits_file)
    print("All HITs fetched and saved.")


def serialize_datetime(obj):
    """JSON `default` hook for datetime and timedelta values.

    Returns:
        The ISO-8601 string for a datetime.datetime, or the total number of
        seconds (float) for a datetime.timedelta.

    Raises:
        TypeError: For any other type.
    """
    if isinstance(obj, datetime.timedelta):
        # Durations are stored as plain seconds.
        return obj.total_seconds()
    if isinstance(obj, datetime.datetime):
        return obj.isoformat()
    raise TypeError("Type %s not serializable" % type(obj))

def save_json(data, file_path):
    """Write *data* to *file_path* as indented JSON.

    datetime and timedelta values are converted by `serialize_datetime`.

    Args:
        data: JSON-serializable object (datetimes/timedeltas allowed).
        file_path: Destination path; an existing file is overwritten.

    Raises:
        TypeError: If *data* contains a value that cannot be serialized. The
            destination file is left untouched in that case.
    """
    # Serialize before opening the file: opening with 'w' truncates it, so a
    # failure halfway through would otherwise leave a corrupt, partial file.
    serialized = json.dumps(data, default=serialize_datetime, indent=4)
    with open(file_path, 'w') as file:
        file.write(serialized)


def load_json(file_path):
    """Load and return the JSON content of *file_path*.

    Returns:
        The parsed JSON value, or an empty dict when the file does not exist.
    """
    try:
        handle = open(file_path, 'r')
    except FileNotFoundError:
        # A missing file is treated as "nothing stored yet".
        return {}
    with handle:
        return json.load(handle)

def load_cache():
    """Return the assignment cache stored at `full_cache_path` ({} if absent)."""
    stored = load_json(full_cache_path)
    return stored

def save_cache(cache):
    """Persist the assignment cache *cache* to `full_cache_path` as JSON."""
    target = full_cache_path
    save_json(cache, target)

def convert_to_utc(dt_input):
    """Return *dt_input* as a timezone-aware datetime in UTC.

    Args:
        dt_input: An ISO-formatted string or a datetime.datetime.

    Returns:
        The corresponding datetime in UTC. Aware values are converted; naive
        values are assumed to already be in UTC and are only labelled.

    Raises:
        TypeError: If *dt_input* is neither a str nor a datetime.
    """
    if isinstance(dt_input, datetime.datetime):
        moment = dt_input
    elif isinstance(dt_input, str):
        moment = datetime.datetime.fromisoformat(dt_input)
    else:
        raise TypeError(f"Expected string or datetime for conversion, got {type(dt_input)}")

    if moment.tzinfo is None:
        # Naive value: treat it as UTC without shifting the clock time.
        return moment.replace(tzinfo=timezone.utc)
    return moment.astimezone(timezone.utc)


def check_hit_and_fetch_assignments(mturk_client, hit, cache, current_time):
    """Return the assignments of one HIT, using the cache when it is final.

    A cached entry is reused when the HIT had expired at `current_time` or
    had no available and no pending assignments left. Otherwise the
    assignments are fetched from MTurk and the cache entry is refreshed.

    Args:
        mturk_client: MTurk client used for the requests.
        hit: HIT dict with 'HITId', 'Expiration',
            'NumberOfAssignmentsAvailable' and 'NumberOfAssignmentsPending'.
        cache: Dict mapping HIT id -> cache entry; updated in place.
        current_time: Timezone-aware datetime used for the expiry check.

    Returns:
        {'hit_id': <HIT id>, 'data': <list of assignment dicts>}.
    """
    hit_id = hit['HITId']
    expiration = convert_to_utc(hit['Expiration'])  # Safely convert without assuming the type
    current_time = current_time.astimezone(timezone.utc)  # Ensure current_time is in UTC

    cache_hit = cache.get(hit_id)
    if cache_hit:
        cache_expiration = convert_to_utc(cache_hit['Expiration'])
        if cache_expiration < current_time or (cache_hit['NumberOfAssignmentsAvailable'] == 0 and cache_hit['NumberOfAssignmentsPending'] == 0):
            return {'hit_id': hit_id, 'data': cache_hit['Data']}

    assignments = mturk_client.list_assignments_for_hit(HITId=hit_id)['Assignments']

    # The question XML is the same for every assignment of the HIT: fetch it
    # once (and only when there is an assignment to attach it to) instead of
    # issuing one get_hit request per assignment.
    question_xml = mturk_client.get_hit(HITId=hit_id)["HIT"]["Question"] if assignments else None

    assignment_data = []
    for assn in assignments:
        accept_time = convert_to_utc(assn['AcceptTime'])
        submit_time = convert_to_utc(assn['SubmitTime'])
        assignment_data.append({
            'HITId': hit_id,
            'AssignmentId': assn['AssignmentId'],
            'AssignmentStatus': assn['AssignmentStatus'],
            'AcceptTime': accept_time.isoformat(),
            'SubmitTime': submit_time.isoformat(),
            # A timedelta here; once the cache has been saved and reloaded the
            # same field holds seconds as a number (see serialize_datetime).
            'Duration': submit_time - accept_time,
            'WorkerId': assn['WorkerId'],
            'Answer': assn['Answer'],
            'Question': question_xml,
        })

    cache[hit_id] = {
        'Expiration': expiration.isoformat(),
        'NumberOfAssignmentsAvailable': hit['NumberOfAssignmentsAvailable'],
        'NumberOfAssignmentsPending': hit['NumberOfAssignmentsPending'],
        'Data': assignment_data
    }
    return {'hit_id': hit_id, 'data': assignment_data}


In [None]:
fetch_all_hits(mturk)  # Refreshes fetched_hits.json from MTurk; comment out to reuse the existing file

In [None]:
import datetime
from datetime import timezone

def count_non_expired_assignments_in_cache():
    """Sum available and pending assignments over non-expired cached HITs.

    Returns:
        A tuple (total_available, total_pending) computed from the cache
        entries whose 'Expiration' lies after the current UTC time.
    """
    cached_hits = load_cache()  # HIT id -> cached details
    now_utc = datetime.datetime.now(timezone.utc)
    available = 0
    pending = 0

    for entry in cached_hits.values():
        # A trailing 'Z' is rewritten to '+00:00' before parsing.
        expires_at = datetime.datetime.fromisoformat(entry['Expiration'].replace('Z', '+00:00'))
        if expires_at <= now_utc:
            continue  # expired HITs do not count
        available += entry.get('NumberOfAssignmentsAvailable', 0)
        pending += entry.get('NumberOfAssignmentsPending', 0)

    return available, pending

# Usage: report how many assignments are still open according to the local
# cache (no request is sent to MTurk here).
total_available, total_pending = count_non_expired_assignments_in_cache()
print(f"Total Available Assignments for Non-Expired HITs: {total_available}")
print(f"Total Pending Assignments for Non-Expired HITs: {total_pending}")

In [None]:
def get_hit_results(mturk_client):
    """Collect the assignments of all fetched HITs into a DataFrame.

    The HITs are read from `fetched_hits_file`; each one is processed by
    check_hit_and_fetch_assignments on a pool of 40 threads, and the updated
    cache is saved afterwards.

    Args:
        mturk_client: MTurk client passed on to the worker function.

    Returns:
        A pandas DataFrame with one row per assignment (row order follows
        task completion and is therefore not fixed).
    """
    cache = load_cache()
    hits = load_json(fetched_hits_file)
    now_utc = datetime.datetime.now(timezone.utc)
    rows = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=40) as pool:
        pending = [
            pool.submit(check_hit_and_fetch_assignments, mturk_client, hit, cache, now_utc)
            for hit in hits
        ]
        for finished in concurrent.futures.as_completed(pending):
            outcome = finished.result()
            if outcome:
                rows.extend(outcome['data'])

    save_cache(cache)
    return pd.DataFrame(rows)

# Example usage: build the results DataFrame (sends requests to MTurk for
# HITs that are not final in the cache) and display it.
results_df = get_hit_results(mturk)
results_df

In [None]:
import os
import xml.etree.ElementTree as ET
import pandas as pd

def extract_questions_from_xml(xml_file):
    """Return the article texts contained in a HIT XML file.

    Args:
        xml_file: Path of a QuestionForm XML file.

    Returns:
        The list produced by extract_texts_from_xml for the file's content.
    """
    # The HIT XML files are written as UTF-8 and contain Norwegian text, so
    # read them with the same encoding instead of the platform default.
    with open(xml_file, 'r', encoding='utf-8') as file:
        question_xml = file.read()
    return extract_texts_from_xml(question_xml)

def extract_texts_from_xml(xml_q):
    """Return the texts of all overviews whose title contains "Tekst".

    Args:
        xml_q: QuestionForm XML as a string.

    Returns:
        A list with the 'Text' of every parsed overview whose 'Title'
        contains "Tekst", in document order.
    """
    return [
        overview["Text"]
        for overview in parse_question_xml(xml_q)
        if "Tekst" in overview["Title"]
    ]

# Paths to your folders
folders = ['./HITS5']
# Dict mapping the tuple of article texts of a HIT -> path of its XML file
xml_questions = {}
# Read each XML file and extract its article texts
for folder in folders:
    for filename in os.listdir(folder):
        if filename.endswith('.xml'):
            question = extract_questions_from_xml(os.path.join(folder, filename))
            # Tuples are hashable, so the texts can serve as the lookup key.
            xml_questions[tuple(question)] = folder + "/" + filename

# Let's add a column with the article texts and another with the XML file they came from.
# NOTE(review): each question XML is parsed twice (once per column), and the
# second line raises KeyError for any row whose texts match no file in `folders`.
results_df['QuestionText'] = results_df['Question'].apply(lambda x: tuple(extract_texts_from_xml(x)))
results_df['QuestionXML'] = results_df['Question'].apply(lambda x: xml_questions[tuple(extract_texts_from_xml(x))])

In [None]:
from datetime import datetime, timedelta
import pytz  # Or from zoneinfo import ZoneInfo if Python 3.9+

def expire_all_active_hits(mturk_client):
    """Set the expiration of every HIT returned by `list_hits` to yesterday.

    All pages of `list_hits` are processed; no filtering by HIT status is
    applied.

    Args:
        mturk_client: MTurk client used for the requests.
    """
    # A moment 24 hours in the past (UTC); it is used as the new expiry time.
    expire_at = datetime.now(pytz.UTC) - timedelta(days=1)

    page_args = {}
    while True:
        page = mturk_client.list_hits(**page_args)

        for hit in page['HITs']:
            mturk_client.update_expiration_for_hit(HITId=hit['HITId'], ExpireAt=expire_at)

        token = page.get('NextToken')
        if not token:
            break
        page_args = {'NextToken': token}

# Example usage (uncomment to expire every HIT)
#expire_all_active_hits(mturk)

## Check df content and save

In [None]:
# Get the current local date and time
# NOTE(review): this import rebinds the name `datetime` to the class; earlier
# cells use `datetime.datetime` (the module) and will fail if re-run in the
# same kernel after this cell.
from datetime import datetime
now = datetime.now()
# Format as day-month-year hour-minute-second (used in the file name)
dt_string = now.strftime("%d-%m-%Y %H-%M-%S")
# Create the results folder if it does not exist yet
folder_name = f"HIT_results"
os.makedirs(folder_name, exist_ok=True)

# Save the results to a CSV file
file_name = f"contains_dataset_and_not_contains_hit_results_all_with_questions_{dt_string}.csv"
results_df.to_csv(os.path.join(folder_name, file_name), index=False)
print(f"Results exported to {file_name}")

# Parsing results into readable formats and txt

## Get the most recent results

In [None]:
import pandas as pd 
import os
# Specify the folder name
folder_name = "HIT_results"

# Get the list of CSV files in the folder
csv_files = [f for f in os.listdir(folder_name) if f.endswith('.csv')]

# Sort the CSV files based on the modification time (oldest first)
csv_files.sort(key=lambda x: os.path.getmtime(os.path.join(folder_name, x)))

# Select the newest CSV file
# (raises IndexError when the folder contains no CSV file)
newest_file = csv_files[-1]
df = pd.read_csv(os.path.join(folder_name, newest_file))
df

## Format the dataframe into readable txt files

In [None]:
# Folder that receives one .json and one .txt file per assignment.
output_dir = './ParsedNotFilteredHITS3'

# Ensure the output directory exists
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

### Save all with filtering >=2 correct multiple choice

In [None]:
import json
# Generate one JSON file and one text file per assignment row of df
for index, row in df.iterrows():
    question_text = parse_question_xml(row['Question'])
    answer_text, passed = parse_answer_xml(row['Answer'])
    if passed < 0:
        # Negative result from parse_answer_xml: report it and write no files
        print(f"Worker {row['WorkerId']} did not pass all questions for HIT {row['HITId']}")
        print(passed)
        continue

    # Assignment metadata; the optional columns fall back to an empty string
    json_data = {
        "WorkerId": row["WorkerId"],
        "HITId": row["HITId"],
        "AssignmentId": row["AssignmentId"],
    }
    for optional_column in ("AssignmentStatus", "AcceptTime", "SubmitTime", "Duration"):
        json_data[optional_column] = row.get(optional_column, "")
    json_data["QuestionsAndAnswers"] = []

    # The text file starts with one "key: value" line per metadata field
    text_parts = [
        f"{key}: {value}\n"
        for key, value in json_data.items()
        if key != "QuestionsAndAnswers"
    ]

    # Pair each question with the answers whose ID starts with "text<N>_",
    # where <N> is the last word of the question title
    for q in question_text:
        q_index = q['Title'].split()[-1]
        answer_prefix = f"text{q_index}_"
        related_answers = [a for a in answer_text if a['Question ID'].startswith(answer_prefix)]
        json_data["QuestionsAndAnswers"].append({
            "Title": q['Title'],
            "QuestionText": q['Text'],
            "Answers": related_answers,
        })

        text_parts.append(f"\nTitle: {q['Title']}\nQuestion Text: {q['Text']}\n")
        text_parts.extend(f"- {a['Question ID']}: {a['Answer']}\n" for a in related_answers)

    text_content = "".join(text_parts)

    # Both output files are named after the assignment ID
    json_file_path = os.path.join(output_dir, f"{row['AssignmentId']}.json")
    txt_file_path = os.path.join(output_dir, f"{row['AssignmentId']}.txt")

    # Structured copy
    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(json_data, json_file, indent=4)

    # Human-readable copy
    with open(txt_file_path, 'w', encoding='utf-8') as txt_file:
        txt_file.write(text_content)

print("Files saved in directory:", output_dir)


# Approve or reject HIT assignments

### Filter based on correct multiple choice

In [None]:
# Importing the userStory script
from HITOrganizer import HITOrganizer
from BanFilter import BanFilter
import sys
from AssignmentFilter import AssignmentFilter

# Organizes HITs and user profiles; if this code fails, the cause is likely the paths
hit_organizer = HITOrganizer("ParsedNotFilteredHITS3", "UserProfiles3")

assignment_filter = AssignmentFilter('ParsedNotFilteredHITS3') # Filters out assignments that did not pass; if this code fails, the cause is likely the path
rejected_assignment_ids = assignment_filter.filter_assignments()

# Workers selected by BanFilter from the same parsed-HIT folder
workers_to_ban = BanFilter('ParsedNotFilteredHITS3').get_worker_ids()
# Show how many workers and assignments were flagged
print(len(workers_to_ban))
print(len(rejected_assignment_ids))

In [None]:
# Approved candidates: rows of df still in 'Submitted' status whose assignment
# is not in rejected_assignment_ids; their AssignmentIds are collected in a list
not_rejected = ~df['AssignmentId'].isin(rejected_assignment_ids)
still_submitted = df['AssignmentStatus'] == 'Submitted'
approved_hits = df[not_rejected & still_submitted]
approved_hits_list = approved_hits['AssignmentId'].tolist()
len(approved_hits_list)

In [None]:
# filter out workers that have been rejected already in df from rejected_assignment_ids, where df AssignmentStatus is not Rejected
#rejected_assignment_ids = [a for a in rejected_assignment_ids if a in df[df['AssignmentStatus'] == 'Submitted']['AssignmentId'].values]
#rejected_assignment_ids

In [None]:
# Can be used by inverting the filtering condition above, and then rejecting the assignments
def reject_hit(assignment_id):
    """Reject one assignment on MTurk, then hand it to the HIT organizer.

    Uses the notebook-level ``mturk`` client and ``hit_organizer``.
    ``hit_organizer.organize_file(assignment_id, approve=False)`` is only
    reached when the rejection call did not raise.

    Args:
        assignment_id: ID of the assignment to reject.

    An MTurk ``RequestError`` is printed rather than raised, so a loop over
    many assignments keeps going after a failure.
    """
    try:
        # The API response carries nothing that is used here, so it is not kept
        mturk.reject_assignment(
            AssignmentId=assignment_id,
            RequesterFeedback='Your work did not meet the required standards, as you had too few correct multiple choice answers or wrong sourcing. We encurage you to try again!',
            # You can customize the RequesterFeedback message as needed
        )
        hit_organizer.organize_file(assignment_id, approve=False)
        print(f"Rejected HIT: {assignment_id}")
    except mturk.exceptions.RequestError as e:
        # Name the assignment so a failure inside a batch can be traced
        print(f"Error rejecting assignment {assignment_id}: {e}")

# Reject every assignment in rejected_assignment_ids (the IDs returned by the assignment filter)
for assignment_id in rejected_assignment_ids:
    reject_hit(assignment_id)

In [None]:
def approve_hit(assignment_id):
    """Approve one assignment on MTurk, then hand it to the HIT organizer.

    Uses the notebook-level ``mturk`` client and ``hit_organizer``.
    ``hit_organizer.organize_file(assignment_id, approve=True)`` is only
    reached when the approval call did not raise.

    Args:
        assignment_id: ID of the assignment to approve.

    An MTurk ``RequestError`` is printed rather than raised, so a loop over
    many assignments keeps going after a failure.
    """
    try:
        # The API response carries nothing that is used here, so it is not kept.
        # OverrideRejection=False: the request is not allowed to override an
        # earlier rejection.
        mturk.approve_assignment(
            AssignmentId=assignment_id,
            RequesterFeedback='Good work, thank you!',
            OverrideRejection=False
        )

        hit_organizer.organize_file(assignment_id, approve=True)
        print(f"Approved HIT: {assignment_id}")
    except mturk.exceptions.RequestError as e:
        # Name the assignment so a failure inside a batch can be traced
        print(f"Error approving assignment {assignment_id}: {e}")

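# Approve every assignment in approved_hits_list (iterating the approved_hits
# DataFrame directly would yield its column names, not assignment IDs)
#for assignment_id in approved_hits_list:
#    approve_hit(assignment_id)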

In [None]:
def ban_worker(worker_id):
    """Create an MTurk worker block for ``worker_id``.

    Uses the notebook-level ``mturk`` client.

    Args:
        worker_id: ID of the worker to block.

    An MTurk ``RequestError`` is printed rather than raised, so a loop over
    many workers keeps going after a failure.
    """
    try:
        # The API response carries nothing that is used here, so it is not kept
        mturk.create_worker_block(
            WorkerId=worker_id,
            Reason='Repeatedly submitting low-quality work',
            # You can customize the Reason message as needed
        )
        print(f"Blocked Worker: {worker_id}")
    except mturk.exceptions.RequestError as e:
        # Name the worker so a failure inside a batch can be traced
        print(f"Error blocking worker {worker_id}: {e}")

# Block every worker in workers_to_ban (the IDs returned by BanFilter)
for worker in workers_to_ban:
   ban_worker(worker)

# Redeploy missing assignments

In [None]:
import boto3
from datetime import datetime, timezone

def retrieve_and_count_questions(mturk_client):
    """Count, per distinct question, the assignments that are done or still possible.

    Walks every page of ``list_hits`` and, for each HIT, every page of
    ``list_assignments_for_hit``. HITs are grouped by
    ``tuple(extract_texts_from_xml(hit['Question']))``.

    For each group ``total_count`` is the sum over its HITs of:
      * listed assignments whose status is in the counted statuses and whose
        ID is not in the notebook-level ``rejected_assignment_ids``,
      * ``NumberOfAssignmentsPending``,
      * ``NumberOfAssignmentsAvailable``, only while the HIT has not expired.

    Args:
        mturk_client: boto3 MTurk client (anything exposing ``list_hits`` and
            ``list_assignments_for_hit`` with the same response shape).

    Returns:
        dict mapping the parsed-question tuple to a dict with ``total_count``
        (int), ``hit_ids`` (set of HIT IDs) and ``question_xml`` (the
        ``Question`` of the first HIT seen for that question).
    """
    # Local import: other notebook cells rebind the global name ``datetime``
    # (class in one cell, module in another), so do not depend on cell order.
    from datetime import datetime, timezone

    # NOTE(review): 'Pending' does not appear among the assignment statuses in
    # the MTurk API docs (Submitted/Approved/Rejected); kept to preserve behavior.
    countable_statuses = ['Submitted', 'Approved', 'Pending']

    questions_data = {}  # Stores data for each question
    # Retrieve all HITs
    next_token = None
    while True:
        response = mturk_client.list_hits(NextToken=next_token) if next_token else mturk_client.list_hits()
        for hit in response['HITs']:
            hit_id = hit['HITId']
            question = hit['Question']  # Assuming direct access to question text
            is_expired = hit['Expiration'] < datetime.now(timezone.utc)
            parsed_question = tuple(extract_texts_from_xml(question))
            if parsed_question not in questions_data:
                questions_data[parsed_question] = {'total_count': 0, 'hit_ids': set(), 'question_xml': hit['Question']}
            entry = questions_data[parsed_question]

            # list_assignments_for_hit is paginated; follow NextToken so that
            # assignments beyond the first page are counted as well
            assignment_token = None
            while True:
                request = {'HITId': hit_id}
                if assignment_token:
                    request['NextToken'] = assignment_token
                assignments_response = mturk_client.list_assignments_for_hit(**request)
                for assignment in assignments_response['Assignments']:
                    if assignment['AssignmentStatus'] in countable_statuses and assignment['AssignmentId'] not in rejected_assignment_ids:
                        entry['total_count'] += 1
                assignment_token = assignments_response.get('NextToken')
                if not assignment_token:
                    break

            entry['total_count'] += hit['NumberOfAssignmentsPending']

            # Open slots can only still be filled while the HIT is live
            if not is_expired:
                entry['total_count'] += hit['NumberOfAssignmentsAvailable']

            entry['hit_ids'].add(hit_id)

        next_token = response.get('NextToken')
        if not next_token:
            break

    return questions_data

# Build the per-question counts by listing every HIT and its assignments through the mturk client
questions_data = retrieve_and_count_questions(mturk)

### Alternative quicker method, but not valid under all conditions

In [None]:
import os
import json
import datetime
from datetime import timezone
import concurrent.futures
import threading

# Location of the on-disk cache of previously fetched HITs
cache_directory = 'mturk_cache'
fetched_hits_file = os.path.join(cache_directory, 'fetched_hits.json')

def load_cached_hits():
    """Return the decoded JSON content of ``fetched_hits_file``."""
    with open(fetched_hits_file, 'r') as cache_handle:
        raw_json = cache_handle.read()
    return json.loads(raw_json)

def process_hit_and_count_questions(hit, mturk_client, questions_data, lock):
    """Add one cached HIT's assignment counts to the shared ``questions_data``.

    The HIT is grouped under ``tuple(extract_texts_from_xml(hit['Question']))``.
    Its contribution to ``total_count`` is: listed assignments with status
    'Submitted' or 'Approved' whose ID is not in the notebook-level
    ``rejected_assignment_ids``, plus ``NumberOfAssignmentsPending``, plus
    ``NumberOfAssignmentsAvailable`` while the HIT has not expired.

    Args:
        hit: dict for one HIT with 'HITId', 'Question', 'Expiration' (ISO-8601
            string), 'NumberOfAssignmentsPending' and
            'NumberOfAssignmentsAvailable'.
        mturk_client: client exposing ``list_assignments_for_hit``.
        questions_data: shared dict, mutated in place.
        lock: lock guarding every access to ``questions_data``.
    """
    # Local import: other notebook cells rebind the global name ``datetime``
    # (class in one cell, module in another), so do not depend on cell order.
    from datetime import datetime as _datetime, timezone as _timezone

    hit_id = hit['HITId']
    question = hit['Question']

    expiration = _datetime.fromisoformat(hit['Expiration'])
    if expiration.tzinfo is None:
        # Naive timestamps are interpreted as UTC
        expiration = expiration.replace(tzinfo=_timezone.utc)
    # A timestamp that already carries an offset is compared as-is; forcing
    # tzinfo=UTC onto it would shift the instant by that offset.
    is_expired = expiration < _datetime.now(_timezone.utc)
    parsed_question = tuple(extract_texts_from_xml(question))

    # Count locally, outside the lock, to keep the critical section short.
    # list_assignments_for_hit is paginated; follow NextToken so assignments
    # beyond the first page are counted as well.
    count = 0
    assignment_token = None
    while True:
        request = {'HITId': hit_id}
        if assignment_token:
            request['NextToken'] = assignment_token
        assignments_response = mturk_client.list_assignments_for_hit(**request)
        for assignment in assignments_response['Assignments']:
            if assignment['AssignmentStatus'] in ['Submitted', 'Approved'] and assignment['AssignmentId'] not in rejected_assignment_ids:
                count += 1
        assignment_token = assignments_response.get('NextToken')
        if not assignment_token:
            break

    with lock:
        if parsed_question not in questions_data:
            questions_data[parsed_question] = {'total_count': 0, 'hit_ids': set(), 'question_xml': question}
        entry = questions_data[parsed_question]

        # Add the count of valid assignments to total_count
        entry['total_count'] += count + hit['NumberOfAssignmentsPending']
        # Open slots can only still be filled while the HIT is live
        if not is_expired:
            entry['total_count'] += hit['NumberOfAssignmentsAvailable']
        entry['hit_ids'].add(hit_id)

def retrieve_and_count_questions_from_cache(mturk_client):
    """Build the per-question counts from the cached HIT list, in parallel.

    Loads the HITs with ``load_cached_hits()`` and runs
    ``process_hit_and_count_questions`` for each of them on a thread pool,
    all sharing one result dict guarded by a lock.

    Args:
        mturk_client: client passed through to every worker call.

    Returns:
        The ``questions_data`` dict filled in by the workers.

    Raises:
        Whatever exception a worker raised (the first one in submission
        order), after all workers have finished. A silently incomplete
        result would undercount assignments.
    """
    questions_data = {}
    all_hits = load_cached_hits()  # Load cached HITs from a JSON file
    lock = threading.Lock()  # Lock for thread-safe operations on questions_data

    # Use ThreadPoolExecutor to parallelize processing of hits
    with concurrent.futures.ThreadPoolExecutor(max_workers=50) as executor:
        futures = [executor.submit(process_hit_and_count_questions, hit, mturk_client, questions_data, lock) for hit in all_hits]
        concurrent.futures.wait(futures)

    # wait() only blocks; it never re-raises. result() surfaces any exception
    # that occurred inside a worker.
    for future in futures:
        future.result()

    return questions_data

# Example usage
# Assumes mturk is set up and rejected_assignment_ids is populated.
# NOTE: this rebinds questions_data, replacing the result of retrieve_and_count_questions.
questions_data = retrieve_and_count_questions_from_cache(mturk)
questions_data

### Redeploy missing assignments

In [None]:
import ast  # Import the ast module for safely evaluating strings as Python expressions
# Attach to every question entry the QuestionXML value of the df row whose
# QuestionText equals the string form of the question tuple
for question_tuple, question_entry in questions_data.items():
    question_string = str(question_tuple)
    matching_xml = df.loc[df['QuestionText'] == question_string, 'QuestionXML']
    if matching_xml.empty:
        # No df row carries this question text
        print(f"XML data not found for question tuple: {question_tuple}")
        continue
    # The same QuestionText always has the same QuestionXML, so the first match is enough
    question_entry['xml_file'] = matching_xml.iloc[0]

# Print one entry as a sanity check
sample_question_data = next(iter(questions_data.values()))
print(sample_question_data)

In [None]:
# Number of counted assignments every question should reach; the summary
# message below is derived from it so the two cannot drift apart
required_assignments = 2

counter = 0
additional_assignments = 0
for question, data in questions_data.items():
    if data['total_count'] < required_assignments:
        additional_assignments += required_assignments - data['total_count']
        counter += 1

print(f"Number of questions with less than {required_assignments} assignments: {counter}")
print(f"Total additional assignments needed: {additional_assignments}")

In [None]:
def create_consolidated_additional_hits(mturk_client, questions_data, qualification_type_id,
                                        required_assignments=2):
    """Create one new HIT per question that has too few counted assignments.

    For every entry whose ``total_count`` is below ``required_assignments``,
    the question XML is read from the file named by the entry's ``xml_file``
    and a HIT is created with ``MaxAssignments`` equal to the shortfall.
    Entries without an ``xml_file`` key, or whose file does not exist, are
    reported and skipped instead of aborting the run halfway through.

    Args:
        mturk_client: client exposing ``create_hit``.
        questions_data: dict of per-question dicts with ``total_count`` and,
            when known, ``xml_file``.
        qualification_type_id: qualification type workers must hold to accept.
        required_assignments: target number of assignments per question.

    Returns:
        Tuple ``(number of HITs created, total assignments requested)``.
    """
    counter = 0
    ass_counter = 0
    for question, data in questions_data.items():
        # Decide first whether anything is needed, so files are only read
        # for questions that actually get a new HIT
        additional_assignments = required_assignments - data['total_count']
        if additional_assignments <= 0:
            continue

        xml_path = data.get("xml_file")
        if xml_path is None:
            print(f"No XML file recorded for question tuple: {question}")
            continue
        try:
            # read xml_file from questions data to get the question_xml
            with open(xml_path, 'r') as file:
                question_xml = file.read()
        except FileNotFoundError:
            print("File not found.")
            continue

        counter += 1
        ass_counter += additional_assignments
        new_hit_id = mturk_client.create_hit(
            Title='NorwAI Norwegian/Norsk Annotation',
            Description='Les en nyhetsartikkel og gi et sammendrag og svar på spørsmål.',
            Keywords='nyheter, annotering, sammendrag, lesing',
            Reward='6.00',
            MaxAssignments=additional_assignments,
            LifetimeInSeconds=1920000,
            AssignmentDurationInSeconds=1800,
            Question=question_xml,
            QualificationRequirements=[{
                'QualificationTypeId': qualification_type_id,
                'Comparator': 'Exists',
                'ActionsGuarded': 'Accept'
            },
            # NOTE(review): presumably the MTurk system qualification for the
            # percentage of assignments approved - confirm against the docs
            {'QualificationTypeId': "000000000000000000L0",
                 'Comparator': 'GreaterThan',
                 'IntegerValues': [60],
                 'ActionsGuarded': 'Accept'}
            ],
        )['HIT']['HITId']
        print(f"Created new HIT with ID: {new_hit_id} for question '{data['xml_file']}' with {additional_assignments} additional assignments.")
    return counter, ass_counter

qualification_type_id_2 = "3BEYYY5C1NI3YYW23CETY7BDKQQ17F" # new personal-details qualification we filter on
# Creates HITs through the mturk client for every question that is short of assignments
create_consolidated_additional_hits(mturk, questions_data, qualification_type_id_2)