<a href="https://colab.research.google.com/github/Reemaalt/Detection-of-Hallucination-in-Arabic/blob/main/Human_labeling_and_cleaning_generations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Annotate Ground Truths

In [3]:
import json
import random
import re
import os

def is_english(text):
    """Return True when *text* has more ASCII letters than non-ASCII characters.

    Only the letters a-z / A-Z count towards the "English" side, and only
    characters outside the 0x00-0x7F range count towards the other side;
    ASCII digits, punctuation and whitespace are ignored by both counts.
    A tie (including the empty string) yields False.
    """
    ascii_letter_count = sum(1 for _ in re.finditer(r'[a-zA-Z]', text))
    non_ascii_count = sum(1 for _ in re.finditer(r'[^\x00-\x7F]', text))

    return non_ascii_count < ascii_letter_count

def annotate_data(input_file=None, sample_size=100):
    """Prepare a question file for human labeling and split off a random sample.

    The JSON file is loaded as a mapping of question id -> entry. Each entry
    gets an empty "question_label" if it has none, and every two-element item
    under entry["clusters"][i]["answers"] gets an empty string appended as a
    label placeholder. Entries whose "original_answer" is mostly English
    (per ``is_english``) are shown to the user, who may type a replacement.

    Finally ``sample_size`` randomly chosen entries are written to
    ``<stem>_sample.json`` and the remaining ones to ``<stem>_modified.json``,
    where ``<stem>`` is ``input_file`` without its extension.

    Args:
        input_file: Path of the JSON file to process. When None the path is
            requested interactively.
        sample_size: Number of questions to move into the sample file. It is
            capped at the number of questions available.
    """
    if input_file is None:
        input_file = input("Enter your file path: ")

    # Build the output names from the extension-less stem. A plain
    # str.replace('.json', ...) leaves the name unchanged when the path does
    # not contain '.json', which would overwrite the input file.
    stem, _ = os.path.splitext(input_file)
    output_file = f"{stem}_modified.json"
    sample_file = f"{stem}_sample.json"

    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print(f"Loaded {len(data)} questions from file.")

    # Shallow copy: the top-level mapping is independent (so entries can be
    # popped below), but the per-question entries are shared with `data`.
    modified_data = data.copy()

    # Add label placeholders and collect questions with English answers.
    english_groundtruths = []
    for question_id, content in modified_data.items():
        # Check if original answer is in English
        if "original_answer" in content and is_english(content["original_answer"]):
            english_groundtruths.append(question_id)

        # Add placeholders regardless of language
        if "question_label" not in content:
            content["question_label"] = ""

        for cluster in content.get("clusters", []):
            for answer_item in cluster.get("answers", []):
                # Only [answer, score] pairs get a placeholder; items that
                # already carry a third element are left untouched.
                if isinstance(answer_item, list) and len(answer_item) == 2:
                    answer_item.append("")

    # Handle English original answers if any exist
    if english_groundtruths:
        print(f"\nFound {len(english_groundtruths)} questions with English original answers.")

        for question_id in english_groundtruths:
            content = modified_data[question_id]
            print(f"\nQuestion ID: {question_id}")
            # .get() so one malformed entry cannot abort the session and
            # discard the translations typed so far.
            print(f"Question: {content.get('question', '')}")
            print(f"Current Original Answer: {content['original_answer']}")

            new_answer = input("Enter translated original answer (or press Enter to keep current): ")
            if new_answer:
                content["original_answer"] = new_answer
                print("Answer updated")
            else:
                print("Answer unchanged")
    else:
        print("No questions with English original answers found.")

    # random.sample raises ValueError when asked for more items than exist,
    # so cap the request at the number of available questions.
    sample_count = max(0, min(sample_size, len(modified_data)))
    sample_keys = random.sample(list(modified_data), sample_count)

    # Move the sampled questions out of the main data.
    sample_data = {k: modified_data.pop(k) for k in sample_keys}

    # Save sampled data
    with open(sample_file, 'w', encoding='utf-8') as f:
        json.dump(sample_data, f, ensure_ascii=False, indent=4)
    print(f"\nSaved {len(sample_data)} random questions to {sample_file}")

    # Save the remaining (non-sampled) data
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(modified_data, f, ensure_ascii=False, indent=4)
    print(f"Saved modified data to {output_file}")

In [4]:
# Run the interactive ground-truth annotation step; it prompts for the input
# file path and for a replacement of each English original answer.
annotate_data()

Enter your file path: entailment_clusters_Llama3.1-8b-xor_tydiqa.json
Loaded 708 questions from file.

Found 68 questions with English original answers.

Question ID: -2275140940114918987
Question: هل زار ابن بطوطة اليمن؟
Current Original Answer: yes
Enter translated original answer (or press Enter to keep current): نعم
Answer updated

Question ID: 2644346659247031062
Question: هل توجد قواعد عسكرية فرنسية في جيبوتي؟
Current Original Answer: yes
Enter translated original answer (or press Enter to keep current): نعم
Answer updated

Question ID: -525316578670747929
Question: هل يوجد لقاح للحدّ من مرض حمى الضنك؟
Current Original Answer: yes
Enter translated original answer (or press Enter to keep current): نعم
Answer updated

Question ID: -240961318888839500
Question: هل هناك تساوي بين الرجل والمرأة في الهند ؟
Current Original Answer: no
Enter translated original answer (or press Enter to keep current): لا
Answer updated

Question ID: -3514747139730663213
Question: هل يمكن وصف دواء نيترازي

# Annotate Answers & Questions

In [5]:
def huamn_labeling(sample_file=None, min_non_hallucinated=6):
    """Interactively label every answer in a sample file as hallucinated or not.

    The JSON file is loaded as a mapping of question id -> entry. For each
    entry the question and original answer are printed, then every item under
    entry["clusters"][i]["answers"] is shown and the user is asked for 'H'
    (Hallucinated) or 'N' (Non-Hallucinated). The full label text is stored
    as the third element of the answer item. The entry's "question_label" is
    "Non-Hallucinated" when at least ``min_non_hallucinated`` answers were
    labeled 'N', otherwise "Hallucinated".

    The whole file is rewritten after each question so an interrupted session
    keeps the labels entered so far.

    Args:
        sample_file: Path of the sample JSON file; it is updated in place.
            When None the path is requested interactively.
        min_non_hallucinated: Minimum count of non-hallucinated answers for a
            question to be labeled "Non-Hallucinated".
    """
    if sample_file is None:
        sample_file = input("Enter your sample file path: ")
    print("For each answer, enter 'H' for Hallucinated or 'N' for Non-Hallucinated")

    # Load the sample data
    with open(sample_file, 'r', encoding='utf-8') as f:
        sample_data = json.load(f)

    # Process each question
    for question_id, content in sample_data.items():
        print(f"\n\nQuestion ID: {question_id}")
        print(f"Question: {content['question']}")
        print(f"Original Answer: {content['original_answer']}")

        non_hallucinated_count = 0
        total_answers = 0

        # Process each answer in each cluster
        for cluster in content.get("clusters", []):
            print(f"\nCluster {cluster['cluster_number']}:")

            for i, answer_item in enumerate(cluster.get("answers", []), start=1):
                print(f"\n[{i}] {answer_item[0]}")

                # Keep asking until the user gives a valid label.
                while True:
                    label = input("Label (H/N): ").strip().upper()
                    if label in ('H', 'N'):
                        break
                    print("Invalid input. Enter 'H' for Hallucinated or 'N' for Non-Hallucinated.")

                # Store full label text
                answer_label = "Non-Hallucinated" if label == 'N' else "Hallucinated"
                if len(answer_item) == 2:
                    # [answer, score] without a placeholder: assigning index 2
                    # would raise IndexError, so append the label instead.
                    answer_item.append(answer_label)
                else:
                    answer_item[2] = answer_label

                if label == 'N':
                    non_hallucinated_count += 1
                total_answers += 1

        # Determine question label
        if non_hallucinated_count >= min_non_hallucinated:
            question_label = "Non-Hallucinated"
        else:
            question_label = "Hallucinated"

        content["question_label"] = question_label
        print(f"\nQuestion Label: {question_label} (based on {non_hallucinated_count}/{total_answers} non-hallucinated answers)")

        # Save after each question in case the process is interrupted. Write
        # to a temporary file and swap it in, so an interrupt in the middle
        # of the dump cannot leave a truncated sample file behind.
        tmp_file = sample_file + ".tmp"
        with open(tmp_file, 'w', encoding='utf-8') as f:
            json.dump(sample_data, f, ensure_ascii=False, indent=4)
        os.replace(tmp_file, sample_file)
        print(f"Progress saved to {sample_file}")

    print("\nAnswer labeling complete!")

In [6]:

# Run the interactive labeling step; it prompts for the sample file path and
# then for an H/N label on every answer, saving after each question.
huamn_labeling()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Label (H/N): h

Question Label: Hallucinated (based on 0/10 non-hallucinated answers)
Progress saved to entailment_clusters_Llama3.1-8b-xor_tydiqa_sample.json


Question ID: 7835952656544964295
Question: ما هي الكاروتينات ؟
Original Answer: صبغات عضوية تيتراتيربينويدية (tetraterpenoid) صفراء اللون ، توجد بشكل طبيعي في الخضروات والفاكهة

Cluster 0:

[1] الكاروتينات هي مركبات كيميائية من الفيتامينات التي توجد في الطعام مثل البطاطس والفلفل والخضروات الخضراء والفواكه الحمراء والبرتقال، وتتألف من مركبات كيميائية تؤدي دورًا مهمًا في صحة الإنسان
Label (H/N): n

Cluster 1:

[1] الكاروتينات هي مركبات عضوية متعددة الجزيئات مشتقة من الفيتامين أ
Label (H/N): n

[2] الكاروتينات هي فصائل من الفيتامينات، وتشمل فيتامين أ (البيتا كاروتين) وفيتامين أ (ألفا كاروتين) وفيتامين أ (بيتا كاروتين) وفيتامين أ (جاما كاروتين)
Label (H/N): n

Cluster 2:

[1] الكاروتينات هي مركبات ك
Label (H/N): h

Cluster 3:

[1] الكاروتينات هي مركبات عضوية من الفيتا