In [6]:
import json
import os

data_path = "../../data/halu_eval/evaluation/qa/qa_gpt-3.5-turbo_result.json"

with open(data_path, 'r') as handle:
    data = [json.loads(line) for line in handle]

data = [item for item in data if 'judgement' in list(item.keys()) and 'ground_truth' in list(item.keys())]
print(f"Lenght of data before processed: {len(data)}")

Lenght of data before processed: 9997


In [None]:
import random
from collections import Counter

answer_min_size = 50

data = [item for item in data if len(item['answer']) > answer_min_size]
print(f"Length of data with answer longer than {answer_min_size}: {len(data)}")
factual_qas = [item for item in data if item['judgement'] == 'No']
print(f"Length of factual qas: {len(factual_qas)}")
hallucinated_qas = [item for item in data if item['judgement'] == 'Yes']
print(f"Length of hallucinated qas: {len(hallucinated_qas)}")

Length of data with answer longer than 20: 5585
Length of factual qas: 3702
Length of hallucinated qas: 1883


In [8]:
random.seed(42)
sample_size = 300
hallucinated_ratio = 2/3
hallucinated_size = int(sample_size * hallucinated_ratio)
factual_size = sample_size - hallucinated_size
print(f"Hallucinated Size: {hallucinated_size}")
print(f"Factual Size: {factual_size}")

def sample_with_ground_truth(items, total_samples, ground_truth_ratio=0.5):
    try:
        gt_no = [item for item in items if item['ground_truth'] == 'No']
        gt_yes = [item for item in items if item['ground_truth'] == 'Yes']

        ground_truth_yes_size = int(total_samples * ground_truth_ratio)

        sampled_no = random.sample(gt_no, ground_truth_yes_size)
        sampled_yes = random.sample(gt_yes, sample_size - ground_truth_yes_size)
        return sampled_no + sampled_yes
    except:
        return random.sample(items, total_samples)

sampled_factual_qas = sample_with_ground_truth(factual_qas, hallucinated_size)
sampled_hallucinated_qas = sample_with_ground_truth(hallucinated_qas, factual_size)
combined_samples = sampled_factual_qas + sampled_hallucinated_qas
random.shuffle(combined_samples)
ground_truth_counts = Counter(item['ground_truth'] for item in combined_samples)
judgement_counts = Counter(item['judgement'] for item in combined_samples)

print(f"Samples with judgement 'No': {len(sampled_factual_qas)}")
print(f"Samples with judgement 'Yes': {len(sampled_hallucinated_qas)}")
print(f"Ground truth distribution: {ground_truth_counts}")
print(f"Judgement distribution: {judgement_counts}")

Hallucinated Size: 200
Factual Size: 100
Samples with judgement 'No': 300
Samples with judgement 'Yes': 300
Ground truth distribution: Counter({'Yes': 450, 'No': 150})
Judgement distribution: Counter({'No': 300, 'Yes': 300})


In [9]:
result = []

for idx, sample in enumerate(combined_samples):
    result.append(
        {
            "id": idx,
            "knowledge": sample["knowledge"],
            "question": sample["question"],
            "answer": sample["answer"],
            "ground_truth": sample["ground_truth"],
            "is_hallucinated": 0 if sample['judgement'] == 'Yes' else 1
        }
    )

In [None]:
output_path = "../../data/interrogate_llm_zeroshot/halu_eval_long_answer.json"

with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(result, f, indent=4)