#### Set Up

In [11]:
import os
from dotenv import load_dotenv

from gpt_model import OpenAIClient

# Load environment variables via python-dotenv, then build the client that the
# generation cells in this notebook call through `client.generate_response`.
load_dotenv()
openai_api_key = os.environ.get('OPENAI_API_KEY')
client = OpenAIClient(
    model='gpt-4.1-mini-2025-04-14',
    api_key=openai_api_key,
)

In [12]:
from utils import load_data_json

# Input dataset locations. `filepath_halu_eval` is defined but not loaded in this cell.
filepath_halu_eval = '../data/interrogate_llm_zeroshot/halu_eval.json'

# HaluEval long-answer samples.
filepath_halu_eval_long_answer = '../data/interrogate_llm_zeroshot/halu_eval_long_answer.json'
data_halu_eval_long_answer = load_data_json(filepath_halu_eval_long_answer)

# Books samples.
filepath_books = '../data/interrogate_llm_zeroshot/books.json'
data_books = load_data_json(filepath_books)

# Quick sanity check on how many samples were loaded from each file.
print(f"Length of HaluEval Long Answer data: {len(data_halu_eval_long_answer)}")
print(f"Length of Books data: {len(data_books)}")

Length of HaluEval Long Answer data: 300
Length of Books data: 498


In [13]:
# Locations of the three question-reconstruction prompt templates.
filepath_reconstruct_prompt = 'prompts/reconstruct.txt'
filepath_reconstruct_prompt_knowledge = 'prompts/reconstruct_with_knowledge.txt'
filepath_reconstruct_prompt_books = 'prompts/reconstruct_books.txt'

# Each template is read in full as UTF-8 text; placeholders are filled later
# with str.format in the generation cells.
with open(filepath_reconstruct_prompt, encoding='utf-8') as template_file:
    reconstruct_prompt_template = template_file.read()

with open(filepath_reconstruct_prompt_knowledge, encoding='utf-8') as template_file:
    reconstruct_with_knowledge_prompt_template = template_file.read()

with open(filepath_reconstruct_prompt_books, encoding='utf-8') as template_file:
    reconstruct_books_prompt_template = template_file.read()

In [14]:
# Previously saved results are loaded up front; the generation cells use the
# ids found here to skip samples that already have an entry.
# NOTE(review): behavior of load_data_json when a file is missing is not visible here — confirm.

# HaluEval long answer (answer-only prompt).
filepath_output_halu_eval_long_answer = '../data/output/interrogate_llm_zeroshot/halu_eval_long_answer.json'
output_halu_eval_long_answer = load_data_json(filepath_output_halu_eval_long_answer)

# HaluEval long answer (prompt with knowledge).
filepath_output_halu_eval_long_answer_knowledge = '../data/output/interrogate_llm_zeroshot/halu_eval_long_answer_with_knowledge.json'
output_halu_eval_long_answer_knowledge = load_data_json(filepath_output_halu_eval_long_answer_knowledge)

# Books.
filepath_output_books = '../data/output/interrogate_llm_zeroshot/books.json'
output_books = load_data_json(filepath_output_books)

#### HaluEval Long Answer

In [15]:
from tqdm import tqdm

# Resume support: ids already present in the loaded output are skipped, so
# re-running this cell does not regenerate questions for them. A set gives
# constant-time membership checks, and an empty output list yields an empty set
# (assumes sample ids are hashable, e.g. ints or strings).
processed_ids = {sample['id'] for sample in output_halu_eval_long_answer}

# Only the first 10 samples of the dataset are processed here.
for sample in tqdm(data_halu_eval_long_answer[:10], desc="Processing data"):
    if sample['id'] in processed_ids:
        continue

    # This variant fills the template with the answer only.
    reconstruct_prompt = reconstruct_prompt_template.format(
        answer=sample['answer']
    )
    # NOTE(review): `temeprature` looks like a misspelling of `temperature`. It is
    # kept unchanged because the keyword must match the signature of
    # OpenAIClient.generate_response, which is not visible here — confirm and
    # fix both sides together.
    response = client.generate_response(reconstruct_prompt, n=3, temeprature=1)
    output_halu_eval_long_answer.append(
        {
            "id": sample['id'],
            "knowledge": sample["knowledge"],
            "ground_truth": sample["ground_truth"],
            "question": sample["question"],
            "answer": sample["answer"],
            # Drop double quotes wrapping each generated question.
            "reconstruct_questions": [question.strip('"') for question in response],
            "is_hallucinated": sample['is_hallucinated']
        }
    )

Processing data: 100%|██████████| 10/10 [00:00<00:00, 169125.16it/s]


In [16]:
import json

# Serialize before opening the file: if any entry is not JSON-serializable the
# error is raised before 'w' mode truncates the file, so previously saved
# results (which the resume logic reads back) are not lost.
serialized_output = json.dumps(output_halu_eval_long_answer, indent=4)
with open(filepath_output_halu_eval_long_answer, 'w', encoding='utf-8') as f:
    f.write(serialized_output)

#### HaluEval Long Answer with Knowledge

In [17]:
from tqdm import tqdm

# Resume support: ids already present in the loaded output are skipped, so
# re-running this cell does not regenerate questions for them. A set gives
# constant-time membership checks, and an empty output list yields an empty set
# (assumes sample ids are hashable, e.g. ints or strings).
processed_ids = {sample['id'] for sample in output_halu_eval_long_answer_knowledge}

# Only the first 10 samples of the dataset are processed here.
for sample in tqdm(data_halu_eval_long_answer[:10], desc="Processing data"):
    if sample['id'] in processed_ids:
        continue

    # This variant fills the template with both the answer and its knowledge text.
    reconstruct_prompt = reconstruct_with_knowledge_prompt_template.format(
        answer=sample['answer'],
        knowledge=sample['knowledge'],
    )
    # NOTE(review): `temeprature` looks like a misspelling of `temperature`. It is
    # kept unchanged because the keyword must match the signature of
    # OpenAIClient.generate_response, which is not visible here — confirm and
    # fix both sides together.
    response = client.generate_response(reconstruct_prompt, n=3, temeprature=1)
    output_halu_eval_long_answer_knowledge.append(
        {
            "id": sample['id'],
            "knowledge": sample["knowledge"],
            "ground_truth": sample["ground_truth"],
            "question": sample["question"],
            "answer": sample["answer"],
            # Drop double quotes wrapping each generated question.
            "reconstruct_questions": [question.strip('"') for question in response],
            "is_hallucinated": sample['is_hallucinated']
        }
    )

Processing data: 100%|██████████| 10/10 [00:00<00:00, 94893.76it/s]


In [18]:
import json

# Serialize before opening the file: if any entry is not JSON-serializable the
# error is raised before 'w' mode truncates the file, so previously saved
# results (which the resume logic reads back) are not lost.
serialized_output = json.dumps(output_halu_eval_long_answer_knowledge, indent=4)
with open(filepath_output_halu_eval_long_answer_knowledge, 'w', encoding='utf-8') as f:
    f.write(serialized_output)

#### Books

In [19]:
from tqdm import tqdm

# Resume support: ids already present in the loaded output are skipped, so
# re-running this cell does not regenerate questions for them. A set gives
# constant-time membership checks, and an empty output list yields an empty set
# (assumes sample ids are hashable, e.g. ints or strings).
processed_ids = {sample['id'] for sample in output_books}

# Only the first 10 samples of the dataset are processed here.
for sample in tqdm(data_books[:10], desc="Processing data"):
    if sample['id'] in processed_ids:
        continue

    # The books template is filled with the answer only.
    reconstruct_prompt = reconstruct_books_prompt_template.format(
        answer=sample['answer'],
    )
    # NOTE(review): `temeprature` looks like a misspelling of `temperature`. It is
    # kept unchanged because the keyword must match the signature of
    # OpenAIClient.generate_response, which is not visible here — confirm and
    # fix both sides together.
    response = client.generate_response(reconstruct_prompt, n=3, temeprature=1)
    output_books.append(
        {
            "id": sample['id'],
            "question": sample["question"],
            "answer": sample["answer"],
            # Drop double quotes wrapping each generated question.
            "reconstruct_questions": [question.strip('"') for question in response],
            "is_hallucinated": sample['is_hallucinated']
        }
    )

Processing data: 100%|██████████| 10/10 [01:32<00:00,  9.26s/it]


In [20]:
import json

# Serialize before opening the file: if any entry is not JSON-serializable the
# error is raised before 'w' mode truncates the file, so previously saved
# results (which the resume logic reads back) are not lost.
serialized_output = json.dumps(output_books, indent=4)
with open(filepath_output_books, 'w', encoding='utf-8') as f:
    f.write(serialized_output)