#### Set Up

In [None]:
import os
import json
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

import torch

from selfcheck_prompt_api import SelfCheckPromptAPI
from selfcheck_prompt_local import SelfCheckPromptLocal

In [None]:
# The API key is read from the environment so that no secret is stored in the
# notebook; the value is None when OPENAI_API_KEY is not set.
openai_api_key = os.environ.get('OPENAI_API_KEY')

# Select the CUDA device when one is available, otherwise use the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [3]:
# Both checkers are constructed with the same prompt template file.
template_path = 'selfcheck_prompt_template.txt'

# API-backed checker: receives the key read from OPENAI_API_KEY and a
# date-pinned model identifier.
selfcheck_prompt_api = SelfCheckPromptAPI(
    api_key=openai_api_key,
    model='gpt-4o-mini-2024-07-18',
    prompt_template_path=template_path,
)

# Local checker identified by model name. The recorded output of this cell
# shows checkpoint shards being loaded during construction.
selfcheck_prompt_local = SelfCheckPromptLocal(
    prompt_template_path=template_path,
    model_name="Qwen/Qwen3-4B-Instruct-2507",
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


#### Dataset

In [None]:
# Load the benchmark samples from disk.
# The file is decoded explicitly as UTF-8 instead of relying on the platform's
# locale encoding, which can mis-decode non-ASCII text on some systems.
with open("data/dataset.json", "r", encoding="utf-8") as f:
    # json.load parses straight from the file handle.
    dataset = json.load(f)

# Quick sanity check: number of samples and the fields of the first one.
# Indexing with [0] assumes the dataset is a non-empty sequence.
print(f"The length of the dataset: {len(dataset)}")
print("The keys of each sample:", list(dataset[0].keys()))

The length of the dataset: 238


#### Benchmark

In [None]:
# Score samples with the API-backed checker, keyed by each sample's
# `wiki_bio_test_idx`, then persist the collected scores as JSON.
scores_api = {}

# NOTE: `dataset[:1]` limits this run to the first sample only.
for sample in tqdm(dataset[:1]):
    # Resolve the sample ID before calling the checker.
    sample_id = sample['wiki_bio_test_idx']
    scores_api[sample_id] = selfcheck_prompt_api.predict_hallucination(
        sentences=sample['gpt3_sentences'],
        sample_responses=sample['gpt3_text_samples'],
        verbose=True,
    )

# Results are written once, after every selected sample has been scored.
with open("data/scores_gpt4o_mini.json", "w") as out_file:
    json.dump(scores_api, out_file)

100%|██████████| 9/9 [02:20<00:00, 15.66s/it]
100%|██████████| 1/1 [02:20<00:00, 140.97s/it]


In [None]:
# Score samples with the local checker, keyed by each sample's
# `wiki_bio_test_idx`, then persist the collected scores as JSON.
scores_local = {}

# NOTE: `dataset[:1]` limits this run to the first sample only.
for sample in tqdm(dataset[:1]):
    # Resolve the sample ID before calling the checker.
    sample_id = sample['wiki_bio_test_idx']
    scores_local[sample_id] = selfcheck_prompt_local.predict_hallucination(
        sentences=sample['gpt3_sentences'],
        sample_responses=sample['gpt3_text_samples'],
        verbose=True,
    )

# Results are written once, after every selected sample has been scored.
with open("data/scores_qwen3_4b_instruct.json", "w") as out_file:
    json.dump(scores_local, out_file)

100%|██████████| 9/9 [02:12<00:00, 14.74s/it]
100%|██████████| 1/1 [02:12<00:00, 132.64s/it]
