<a href="https://colab.research.google.com/github/Reemaalt/Detection-of-Hallucination-in-Arabic/blob/main/Test_LLM_Judge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `week1 test` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `week1 t

In [2]:
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm

In [3]:
# Helper functions
# Load the data
def load_data(file_path):
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return its content."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

# Setup the LLM judge
def setup_judge(model_name="meta-llama/Llama-3.2-3B-Instruct"):  # test model for now !!!
    """Load the judge model and its tokenizer and move the model to a device.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the causal language model.

    Returns:
        A ``(model, tokenizer, device)`` tuple, where ``device`` is the string
        ``"cuda"`` when CUDA is available and ``"cpu"`` otherwise.
    """
    print(f"Loading model {model_name}...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    judge_model = AutoModelForCausalLM.from_pretrained(model_name)

    # Prefer the GPU whenever torch can see one.
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    print(f"Using device: {device}")

    return judge_model.to(device), tokenizer, device

# Classify one generated answer with the LLM judge
def _extract_label(response):
    """Map the judge's raw reply onto one of the three result labels.

    The first occurrence of "hallucinated" (case-insensitive) decides the
    verdict. "Not Hallucinated" contains "Hallucinated" as a substring, so a
    plain substring test for the shorter label would classify every negative
    verdict as a hallucination; instead the text right before the match is
    inspected for a preceding "not".
    """
    normalized = response.casefold()
    position = normalized.find("hallucinated")
    if position == -1:
        return "Unknown"
    if normalized[:position].rstrip().endswith("not"):
        return "Not Hallucinated"
    return "Hallucinated"


def judge_hallucination(question, generated_answer, original_answer, model, tokenizer, device):
    """Ask the LLM judge whether ``generated_answer`` is a hallucination.

    Args:
        question: The question text inserted into the prompt.
        generated_answer: The answer to be judged.
        original_answer: The reference answer the judge compares against.
        model: Causal language model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``; called on the prompt and used
            to decode the generated tokens.
        device: Device the tokenized inputs are moved to.

    Returns:
        ``"Hallucinated"``, ``"Not Hallucinated"``, or ``"Unknown"`` when the
        reply contains neither label.
    """
    # Create prompt for the LLM
    prompt = f"""Question: {question}
Original Answer: {original_answer}
Generated Answer: {generated_answer}


قارن المحتوى العلمي للإجابة المقدمة مع الاجابه الاصلية . تجاهل أي اختلافات في الأسلوب، القواعد، أو علامات الترقيم.قم بتقييم الإجابة المقدمة اذا كانت هلوسة او غير هلوسة صحيحه
Answer with only "Hallucinated" or "Not Hallucinated":"""

    # Tokenize and prepare the prompt
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(device)
    prompt_length = inputs["input_ids"].shape[1]

    # Generate the response greedily. No temperature is passed: it has no
    # effect when do_sample=False and only triggers a warning.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=10,
            do_sample=False,
        )

    # A causal LM returns the prompt tokens followed by the new tokens. Slice
    # the prompt off by length rather than string-replacing the decoded prompt,
    # which fails when decoding does not reproduce the prompt text exactly
    # (e.g. after truncation).
    new_tokens = outputs[0][prompt_length:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    return _extract_label(response)


# Main function to validate labels
def validate_rouge_labels(data, model, tokenizer, device, sample_size=None, rouge_threshold=0.3):
    """Compare ROUGE-derived hallucination labels with the LLM judge's verdicts.

    ``data`` is read as a mapping from question id to a dict with the keys
    ``"question"``, ``"original_answer"`` and ``"clusters"``. Each cluster is a
    dict with ``"answers"`` and ``"cluster_number"``. An answer is either a
    list whose first item is the answer text and whose third item (when
    present) is a dict holding ``"rouge_l_f1"``, or any other value, which is
    used as the answer text directly and gets the ROUGE label ``"Unknown"``.

    Args:
        data: The labeled dataset described above.
        model: Judge model forwarded to ``judge_hallucination``.
        tokenizer: Tokenizer forwarded to ``judge_hallucination``.
        device: Device forwarded to ``judge_hallucination``.
        sample_size: When truthy and smaller than ``len(data)``, only the
            first ``sample_size`` questions are processed.
        rouge_threshold: ROUGE-L F1 scores strictly below this value are
            labeled ``"Hallucinated"``; the default keeps the previous
            hard-coded cut-off of 0.3.

    Returns:
        A list with one dict per judged answer containing the question,
        reference answer, cluster number, answer text, both labels and an
        ``"agreement"`` flag (``rouge_label == llm_judgment``).
    """
    results = []

    # Restrict the run to the first `sample_size` questions (test runs only).
    if sample_size and sample_size < len(data):
        print(f"Using a sample of {sample_size} questions")
        sampled_data = {key: data[key] for key in list(data)[:sample_size]}
    else:
        sampled_data = data

    # Process each question
    for question_id, question_data in tqdm(sampled_data.items(), desc="Validating labels"):
        question = question_data.get("question", "")
        reference_answer = question_data.get("original_answer")

        for cluster in question_data.get("clusters", []):
            cluster_num = cluster.get("cluster_number", 0)

            for answer in cluster.get("answers", []):
                if isinstance(answer, list):
                    if not answer:
                        # Empty entry: there is no answer text to judge, and
                        # indexing it would raise IndexError.
                        continue
                    answer_text = answer[0]
                    rouge_info = answer[2] if len(answer) > 2 else {}
                    # A missing score counts as 0, i.e. below the threshold.
                    rouge_score = rouge_info.get("rouge_l_f1", 0)
                    rouge_label = (
                        "Hallucinated" if rouge_score < rouge_threshold else "Not Hallucinated"
                    )
                else:
                    answer_text = answer
                    rouge_label = "Unknown"

                # LLM judgment
                llm_judgment = judge_hallucination(
                    question, answer_text, reference_answer, model, tokenizer, device
                )

                results.append({
                    "question_id": question_id,
                    "question": question,
                    "original_answer": reference_answer,
                    "cluster": cluster_num,
                    "answer": answer_text,
                    "rouge_label": rouge_label,
                    "llm_judgment": llm_judgment,
                    "agreement": rouge_label == llm_judgment
                })

    return results


In [None]:
# Full pipeline: check a small sample first, then optionally process the whole dataset
def main():
    """Run the judge on a 5-question sample, then optionally on all questions.

    The sample agreement between ROUGE labels and LLM verdicts is printed
    first. If the user answers ``y`` at the prompt, the full dataset is
    processed, the results are written to the output JSON file and the
    overall agreement is printed.
    """

    def agreement_stats(results):
        """Return ``(agreeing, total)`` over results with a known ROUGE label."""
        known = [r for r in results if r["rouge_label"] != "Unknown"]
        return sum(1 for r in known if r["agreement"]), len(known)

    # Input and output locations
    data_file = "labeled_data_XORfull_rougel_isri.json"  # Update this path
    output_file = "llm_validation_results.json"

    # Load data
    print(f"Loading data from {data_file}...")
    data = load_data(data_file)
    print(f"Loaded {len(data)} questions")

    # Setup LLM judge
    model, tokenizer, device = setup_judge()

    # Validate a small sample first (5) and report its agreement
    sample_results = validate_rouge_labels(data, model, tokenizer, device, sample_size=5)
    agreeing, judged = agreement_stats(sample_results)
    if judged > 0:
        percentage = (agreeing / judged) * 100
        print(f"Agreement on sample: {percentage:.2f}% ({agreeing}/{judged})")

    # Ask to continue with full dataset
    choice = input("\nContinue with full dataset? (y/n): ")
    if choice.lower() != 'y':
        print("Exiting without processing full dataset.")
        return

    print("Processing full dataset...")
    full_results = validate_rouge_labels(data, model, tokenizer, device)

    # Save results
    with open(output_file, 'w', encoding='utf-8') as out:
        json.dump(full_results, out, ensure_ascii=False, indent=2)

    print(f"Results saved to {output_file}")

    # Calculate overall agreement
    agreeing, judged = agreement_stats(full_results)
    if judged > 0:
        percentage = (agreeing / judged) * 100
        print(f"Overall agreement: {percentage:.2f}% ({agreeing}/{judged})")

if __name__ == "__main__":
    main()

In [4]:
# Test main (redefines main() from the previous cell): judge a 5-question sample and print per-answer details
def main():
    """Judge a 5-question sample and print every verdict with its context.

    For each result the question id, question text, reference answer, cluster
    number, answer, ROUGE label, LLM judgment and agreement flag are printed,
    followed by the agreement rate over results with a known ROUGE label.
    """
    data_file = "/content/labeled_data_XORfull_rougel_isri.json"
    print("start data...")
    print(f"Loading data from {data_file}...")
    data = load_data(data_file)
    print(f"Loaded {len(data)} questions")

    model, tokenizer, device = setup_judge()

    # Run validation on a small sample
    sample_results = validate_rouge_labels(data, model, tokenizer, device, sample_size=5)

    print("\n🔍 Detailed Validation Results on 5 Questions:\n")
    for entry in sample_results:
        question_id = entry.get("question_id", "")
        # Question text and reference answer are looked up in the source data.
        source_record = data.get(question_id, {})
        question_text = source_record.get("question", "")
        original_answer = source_record.get("original_answer", "")

        print(f"🟦 Question ID: {question_id}")
        print(f"Q: {question_text}")
        print(f"Original Answer: {original_answer}")

        print(f"\n  🔹 Cluster {entry['cluster']}")
        print(f"  Answer: {entry['answer']}")
        print(f"  ROUGE Label: {entry['rouge_label']}")
        print(f"  LLM Judgment: {entry['llm_judgment']}")
        print(f"  ✅ Agreement: {entry['agreement']}")
        print("\n" + "-"*60)

    # Optional summary over results that have a ROUGE label
    labeled = [r for r in sample_results if r["rouge_label"] != "Unknown"]
    total = len(labeled)
    agree = sum(1 for r in labeled if r["agreement"])
    if total > 0:
        print(f"\n🔁 Agreement on sample: {agree / total * 100:.2f}% ({agree}/{total})")

if __name__ == "__main__":
    main()

start data...
Loading data from /content/labeled_data_XORfull_rougel_isri.json...
Loaded 708 questions
Loading model meta-llama/Llama-3.2-3B-Instruct...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Using device: cuda
Using a sample of 5 questions


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Validating labels:  20%|██        | 1/5 [00:04<00:18,  4.65s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end


🔍 Detailed Validation Results on 5 Questions:

🟦 Question ID: 5434985093263686694
Q: ما هي أولى جامعات فنلندا؟
Original Answer: أكاديمية توركو

  🔹 Cluster 0
  Answer: أول جامعة في فنلندا هي جامعة هلسنكي (Helsingin yliopisto) التي تأسست في عام 1640
  ROUGE Label: Hallucinated
  LLM Judgment: Hallucinated
  ✅ Agreement: True

------------------------------------------------------------
🟦 Question ID: 5434985093263686694
Q: ما هي أولى جامعات فنلندا؟
Original Answer: أكاديمية توركو

  🔹 Cluster 0
  Answer: أول جامعة في فنلندا هي جامعة هلسنكي، تأسست عام 1640
  ROUGE Label: Hallucinated
  LLM Judgment: Hallucinated
  ✅ Agreement: True

------------------------------------------------------------
🟦 Question ID: 5434985093263686694
Q: ما هي أولى جامعات فنلندا؟
Original Answer: أكاديمية توركو

  🔹 Cluster 0
  Answer: أول جامعة في فنلندا هي جامعة هلسنكي التي تأسست في عام 1640
  ROUGE Label: Hallucinated
  LLM Judgment: Hallucinated
  ✅ Agreement: True

-----------------------------------------


