# Translate Ground Truths

In [None]:
import json
import random
import re
import os

def is_english(text):
    """Report whether ``text`` is predominantly English.

    The decision is a simple character tally: the text counts as English when
    it holds strictly more ASCII letters (a-z, A-Z) than characters outside
    the 7-bit ASCII range. Digits, whitespace and ASCII punctuation are not
    counted on either side, so an empty or digits-only string is not English.
    """
    # Number of ASCII letters found in the text.
    ascii_letter_total = sum(1 for _ in re.finditer(r'[a-zA-Z]', text))
    # Number of characters that fall outside the ASCII range.
    non_ascii_total = sum(1 for _ in re.finditer(r'[^\x00-\x7F]', text))

    return non_ascii_total < ascii_letter_total

def annotate_data():
    """Interactively prepare a question file for labeling.

    Prompts for the path of a JSON file holding a dict of
    ``question_id -> question record``, then:

    1. adds an empty ``"question_label"`` to every record that lacks one;
    2. appends an empty label slot to every two-element ``[answer, score]``
       entry found under ``record["clusters"][*]["answers"]``;
    3. for every record whose ``"original_answer"`` looks English (see
       ``is_english``), shows the question and lets the user type a
       replacement answer (pressing Enter keeps the current one);
    4. writes the result next to the input as ``<name>_modified<ext>``.

    The input file itself is never modified.

    Raises:
        OSError: if the input file cannot be read or the output cannot be written.
        json.JSONDecodeError: if the input file is not valid JSON.
    """
    input_file = input("Enter your file path: ").strip()

    # Derive the output name from the real extension. A plain
    # str.replace('.json', ...) leaves the path unchanged when the extension
    # differs (so the input would be overwritten) and also rewrites any
    # '.json' that happens to appear in a directory name.
    root, ext = os.path.splitext(input_file)
    output_file = f"{root}_modified{ext or '.json'}"

    with open(input_file, 'r', encoding='utf-8') as f:
        # The freshly loaded structure is edited in place; the file on disk is
        # the untouched original, so no extra copy is needed.
        modified_data = json.load(f)

    print(f"Loaded {len(modified_data)} questions from file.")

    # Add label placeholders and collect the questions with English answers.
    english_groundtruths = []
    for question_id, content in modified_data.items():
        # Only string answers can be language-checked; skip null/number values.
        original_answer = content.get("original_answer")
        if isinstance(original_answer, str) and is_english(original_answer):
            english_groundtruths.append(question_id)

        # Placeholders are added regardless of language.
        content.setdefault("question_label", "")

        for cluster in content.get("clusters", []):
            for answer_item in cluster.get("answers", []):
                # Only [answer, score] pairs get a label slot; entries that
                # already carry a third element are left alone.
                if isinstance(answer_item, list) and len(answer_item) == 2:
                    answer_item.append("")

    # Let the user translate English original answers, if any exist.
    if english_groundtruths:
        print(f"\nFound {len(english_groundtruths)} questions with English original answers.")

        for question_id in english_groundtruths:
            content = modified_data[question_id]
            print(f"\nQuestion ID: {question_id}")
            print(f"Question: {content.get('question', '')}")
            print(f"Current Original Answer: {content['original_answer']}")

            new_answer = input("Enter translated original answer (or press Enter to keep current): ")
            if new_answer:
                content["original_answer"] = new_answer
                print("Answer updated")
            else:
                print("Answer unchanged")
    else:
        print("No questions with English original answers found.")

    # NOTE: drawing a 100-question sample into a separate file was disabled
    # earlier. The leftover random.sample(..., 100) call was dropped because
    # its result was unused and it raised ValueError on files with fewer than
    # 100 questions.

    # Save modified data.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(modified_data, f, ensure_ascii=False, indent=4)
    print(f"Saved modified data to {output_file}")

In [None]:
# Run the interactive annotation pass; it prompts for the input JSON path.
annotate_data()

# Human Labeling for new file structure
- Human-labeled questions are not removed from the original file.
- To continue labeling after the runtime disconnects, upload both the original file and the labeled file; when prompted for the file path, enter the original file's path.

In [None]:
def huamn_labeling(min_non_hallucinated=6):
    """Interactively label every generated answer as hallucinated or not.

    Prompts for a sample JSON file (a list of question records, each with
    ``"question"``, ``"original_answer"`` and ``"answers"`` as a list of
    answer groups) and for an output path. When the output path is left
    empty, ``labeled_<sample file name>`` in the sample file's directory is
    used.

    If the output file already exists, labeling resumes from it: answers that
    already carry a ``"human_label"`` are skipped. Otherwise every answer text
    is wrapped in ``{"text", "RougeL_label", "human_label"}`` with empty labels.

    After each question, ``"computed_question_label"`` is set and the whole
    list is saved. If the user interrupts the session, the labels entered so
    far (including those of the unfinished question) are saved before the
    interrupt propagates.

    Args:
        min_non_hallucinated: Minimum number of answers labeled
            "Non-Hallucinated" for the question itself to be labeled
            "Non-Hallucinated". Defaults to 6.

    Raises:
        KeyboardInterrupt: re-raised after partial progress has been saved.
    """
    # Imported locally so this cell can be run on its own after a runtime restart.
    import json
    import os

    def _save_progress(records, path):
        """Write ``records`` to ``path`` through a temp file and an atomic rename."""
        # Writing straight to ``path`` would leave a truncated, unreadable file
        # if the session were interrupted in the middle of the dump.
        tmp_path = f"{path}.tmp"
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(records, f, ensure_ascii=False, indent=4)
        os.replace(tmp_path, path)

    # Get input and output file paths
    sample_file = input("Enter your sample file path: ").strip()
    output_file = input("Enter your output file path (or press Enter to use 'labeled_' prefix): ").strip()

    if not output_file:
        file_dir = os.path.dirname(sample_file)
        file_name = os.path.basename(sample_file)
        output_file = os.path.join(file_dir, f"labeled_{file_name}")

    print("For each answer, enter 'H' for Hallucinated or 'N' for Non-Hallucinated")

    if os.path.exists(output_file):
        # Resume: the labeled file already holds everything that is needed, so
        # the sample file is not read.
        with open(output_file, 'r', encoding='utf-8') as f:
            labeled_data = json.load(f)
        print(f"Resuming from existing labeled file: {output_file}")
    else:
        with open(sample_file, 'r', encoding='utf-8') as f:
            sample_data = json.load(f)

        # Initialize labeled_data with empty labels
        labeled_data = []
        for question in sample_data:
            labeled_question = question.copy()
            labeled_question["answers"] = [
                [
                    {
                        "text": answer_text,
                        "RougeL_label": "",
                        "human_label": ""  # Initially empty
                    }
                    for answer_text in answer_group
                ]
                for answer_group in question.get("answers", [])
            ]
            labeled_data.append(labeled_question)

    # Start labeling
    for i, question_data in enumerate(labeled_data):
        print(f"\n\nQuestion #{i + 1}")
        print(f"Question: {question_data['question']}")
        print(f"Original Answer: {question_data['original_answer']}")

        try:
            for group_idx, group in enumerate(question_data["answers"]):
                for ans_idx, answer_obj in enumerate(group):
                    if answer_obj.get("human_label"):  # Already labeled
                        continue

                    answer_text = answer_obj["text"]
                    print(f"\nGroup [{group_idx + 1}], Answer [{ans_idx + 1}]: {answer_text}", end=" ")

                    while True:
                        label = input("Label (H/N): ").strip().upper()
                        if label in ['H', 'N']:
                            break
                        print("Invalid input. Enter 'H' for Hallucinated or 'N' for Non-Hallucinated.")

                    answer_label = "Non-Hallucinated" if label == 'N' else "Hallucinated"
                    answer_obj["human_label"] = answer_label
                    print(f"=> {answer_label}")
        except KeyboardInterrupt:
            # Keep the labels already entered for this unfinished question;
            # the next run will skip them and continue with the rest.
            _save_progress(labeled_data, output_file)
            print(f"\nInterrupted. Partial progress saved to {output_file}")
            raise

        # Count non-hallucinated answers for question label
        all_labels = [
            answer_obj.get("human_label")
            for group in question_data["answers"]
            for answer_obj in group
        ]
        non_hallucinated_count = sum(1 for lbl in all_labels if lbl == "Non-Hallucinated")
        total_answers = sum(1 for lbl in all_labels if lbl)

        if non_hallucinated_count >= min_non_hallucinated:
            question_label = "Non-Hallucinated"
        else:
            question_label = "Hallucinated"

        question_data["computed_question_label"] = question_label
        print(f"\nQuestion Label: {question_label} (based on {non_hallucinated_count}/{total_answers} non-hallucinated answers)")

        # Save after each question
        _save_progress(labeled_data, output_file)
        print(f"Progress saved to {output_file}")

    print("\nAnswer labeling complete!")

In [5]:
# Start (or resume) the interactive labeling session; it prompts for the file paths.
huamn_labeling()

Enter your sample file path: /content/semantic_entropy_allam_arabicaqa_results.json
Enter your output file path (or press Enter to use 'labeled_' prefix): 
For each answer, enter 'H' for Hallucinated or 'N' for Non-Hallucinated
Resuming from existing labeled file: /content/labeled_semantic_entropy_allam_arabicaqa_results.json


Question #1
Question: ما هو مركز لعب دميترو زوزوليا في كرة القدم؟

Original Answer: الوسط

Question Label: Hallucinated (based on 4/10 non-hallucinated answers)
Progress saved to /content/labeled_semantic_entropy_allam_arabicaqa_results.json


Question #2
Question: مع أي نادي لعب دميترو زوزوليا في مسيرته الرياضية؟

Original Answer:  لعب مع نادي فولين لوتسك

Question Label: Hallucinated (based on 0/10 non-hallucinated answers)
Progress saved to /content/labeled_semantic_entropy_allam_arabicaqa_results.json


Question #3
Question: ما هي عاصمة غرب يوركشير في إنجلترا؟
Original Answer: ليدز

Question Label: Non-Hallucinated (based on 10/10 non-hallucinated answers)
P

KeyboardInterrupt: Interrupted by user