<a href="https://colab.research.google.com/github/Reemaalt/Detection-of-Hallucination-in-Arabic/blob/main/RougeL_Labeling_of_Answers_XOR_ISRI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##QA pairs generated using llama8b on xquad

In [1]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=7be474d11d8b08b36c50c1e7061474aa459f667ca37e26acfb92a95ef5f4ec8d
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


#Test RougeL

In [None]:
from huggingface_hub import login
login()

In [4]:
!pip install nltk



In [5]:
from transformers import AutoTokenizer
from rouge_score import rouge_scorer
from nltk.stem.isri import ISRIStemmer  # Import ISRI stemmer

# Load tokenizer
model_name = "core42/jais-13b"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Initialize ISRI Stemmer
stemmer = ISRIStemmer()

# Define a custom tokenizer class that applies both tokenization and stemming
class CustomTokenizer:
    def __init__(self, tokenizer, stemmer):
        self.tokenizer = tokenizer
        self.stemmer = stemmer

    def tokenize(self, text):
        words = text.split()  # Split into words first
        stemmed_words = [self.stemmer.stem(word) for word in words]  # Apply stemming
        print("Stemmed Words:", stemmed_words)  # Debugging line to check the stemmed words
        return self.tokenizer.tokenize(" ".join(stemmed_words))  # Tokenize the stemmed text

# Create an instance of our custom tokenizer
custom_tokenizer = CustomTokenizer(tokenizer, stemmer)

# Create the ROUGE scorer with our custom tokenizer
scorer = rouge_scorer.RougeScorer(['rougeL'], tokenizer=custom_tokenizer)

# Example strings
pred_str = 'السلام عليكم كيف حالك'
label_str = 'السلام عليكم صديقي كيف حالك'

# Compute ROUGE scores
scores = scorer.score(label_str, pred_str)

# Print the stemmed words to check if stemming occurs
for key, value in scores.items():
    print(f'{key}: {value}')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Stemmed Words: ['سلم', 'علي', 'صدق', 'كيف', 'حلك']
Stemmed Words: ['سلم', 'علي', 'كيف', 'حلك']
rougeL: Score(precision=1.0, recall=0.8333333333333334, fmeasure=0.9090909090909091)


#Use RougeL on data

In [7]:

import json
# Load data from JSON file
def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Save labeled data to a new JSON file
def save_labels(output_data, output_file_path):
    with open(output_file_path, 'w', encoding='utf-8') as file:
        json.dump(output_data, file, ensure_ascii=False, indent=4)

# Process hallucination detection efficiently (no inheritance of labels)
def check_hallucination(data, rouge_threshold=0.3, min_non_hallucinated=6):

    updated_data = {}

    for question_id, item in data.items():
        original_answer = item["original_answer"].strip()  # Ground truth
        non_hallucinated_count = 0  # Count for the number of non-hallucinated answers

        # Process each cluster
        for cluster in item["clusters"]:
            for answer_entry in cluster["answers"]:
                answer = answer_entry[0].strip()  # Extract answer text

                if answer:
                    # Compute ROUGE-L score for each individual answer
                    scores = scorer.score(original_answer, answer)
                    rouge_l_f1 = scores["rougeL"].fmeasure

                    # Determine hallucination label based on ROUGE score
                    cluster_label = "Non-Hallucinated" if rouge_l_f1 >= rouge_threshold else "Hallucinated"

                    # Append ROUGE details for the answer
                    answer_entry.append({
                        "rouge_l_f1": round(rouge_l_f1, 2),
                        "rouge_label": cluster_label
                    })

                    # Count the non-hallucinated answers
                    if cluster_label == "Non-Hallucinated":
                        non_hallucinated_count += 1

        # Determine question-level label based on the number of non-hallucinated answers
        question_label = "Non-Hallucinated" if non_hallucinated_count >= min_non_hallucinated else "Hallucinated"

        # Store updated data with the question label
        updated_data[question_id] = {
            **item,
            "computed_question_label": question_label
        }

    return updated_data

# File paths
input_file_path = "/content/human_sample_xor_tydiqa.json"
output_file_path = "labeled_data_XORsample_rougel_isri.json"

# Load data
data = load_data(input_file_path)

# Process and add labels
updated_data = check_hallucination(data)

# Save updated data
save_labels(updated_data, output_file_path)

print(f"Labeled data has been saved to {output_file_path}")

Stemmed Words: ['329']
Stemmed Words: ['329', 'قعد']
Stemmed Words: ['329']
Stemmed Words: ['328', 'قعد']
Stemmed Words: ['329']
Stemmed Words: ['328', 'قعد']
Stemmed Words: ['329']
Stemmed Words: ['328', 'قعد']
Stemmed Words: ['329']
Stemmed Words: ['325', 'قعد']
Stemmed Words: ['329']
Stemmed Words: ['328', 'قعد']
Stemmed Words: ['329']
Stemmed Words: ['328', 'قعد']
Stemmed Words: ['329']
Stemmed Words: ['328', 'قعد']
Stemmed Words: ['329']
Stemmed Words: ['1925']
Stemmed Words: ['عمر', 'قوط']
Stemmed Words: ['طرز', 'كلس']
Stemmed Words: ['عمر', 'قوط']
Stemmed Words: ['طر']
Stemmed Words: ['عمر', 'قوط']
Stemmed Words: ['سمى']
Stemmed Words: ['عمر', 'قوط']
Stemmed Words: ['طرز', 'بار']
Stemmed Words: ['عمر', 'قوط']
Stemmed Words: ['برك']
Stemmed Words: ['عمر', 'قوط']
Stemmed Words: ['طرز', 'عمر', 'روم', 'سمى', 'طرز', 'كلس']
Stemmed Words: ['عمر', 'قوط']
Stemmed Words: ['طرز', 'عمر', 'روم', 'سمى', 'طرز', 'كلس']
Stemmed Words: ['عمر', 'قوط']
Stemmed Words: ['سمى', 'طرز', 'عمر', 'روم', '

In [12]:
import json

# Load data from JSON file
def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Compare the computed labels with the human-provided labels and count non-hallucinated answers
def compare_labels(data):
    correct_count = 0
    incorrect_count = 0
    non_hallucinated_human = 0  # Count of non-hallucinated human labels
    non_hallucinated_computed = 0  # Count of non-hallucinated computed labels

    # Iterate through the data to compare computed and human labels
    for question_id, item in data.items():
        human_label = item["question_label"]  # Human-provided label
        computed_label = item["computed_question_label"]  # Computed label from ROUGE scores

        # Count non-hallucinated answers for the current question
        non_hallucinated_count = 0
        for cluster in item["clusters"]:
            for answer_entry in cluster["answers"]:
                # answer_entry[2] holds the label; ensure it's in the correct format (index-based)
                if len(answer_entry) > 2 and "rouge_label" in answer_entry[2] and answer_entry[2]["rouge_label"] == "Non-Hallucinated":
                    non_hallucinated_count += 1

        # Increment the appropriate counts based on human and computed labels
        if human_label == "Non-Hallucinated":
            non_hallucinated_human += 1
        if computed_label == "Non-Hallucinated":
            non_hallucinated_computed += 1

        # Compare the labels for correctness
        if human_label == computed_label:
            correct_count += 1
        else:
            incorrect_count += 1

    # Calculate accuracy
    total_count = correct_count + incorrect_count
    accuracy = correct_count / total_count if total_count > 0 else 0

    return correct_count, incorrect_count, accuracy, non_hallucinated_human, non_hallucinated_computed

# File path to your labeled data
input_file_path = "/content/labeled_data_XORsample_rougel_isri.json"

# Load data from the file
data = load_data(input_file_path)

# Compare the labels and get the non-hallucinated counts
correct, incorrect, accuracy, non_hallucinated_human, non_hallucinated_computed = compare_labels(data)

# Print results
print(f"Correct labels: {correct}")
print(f"Incorrect labels: {incorrect}")
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Non-Hallucinated Human Labels: {non_hallucinated_human}")
print(f"Non-Hallucinated Computed Labels: {non_hallucinated_computed}")


Correct labels: 81
Incorrect labels: 19
Accuracy: 81.00%
Non-Hallucinated Human Labels: 19
Non-Hallucinated Computed Labels: 4
