#### Loading Data

In [1]:
import json
import os

# Benchmark locations, given relative to the notebook's working directory.
# Both folders are handed to read_data() below.
folder_path_halu_eval_2 = 'benchmark/halu_eval_2/annotation/human_annotation'
folder_path_halu_eval = 'benchmark/halu_eval/data'

def _iter_json_files(folder_path):
    """Yield ``(name, path)`` for every regular ``.json`` file in ``folder_path``.

    Entries are visited in sorted order so results are reproducible across
    platforms; ``name`` is the file name without its ``.json`` extension.
    """
    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        stem, extension = os.path.splitext(file_name)
        # Skip sub-directories (e.g. .ipynb_checkpoints) and stray non-JSON files.
        if extension != ".json" or not os.path.isfile(file_path):
            continue
        yield stem, file_path


def read_data(folder_path_halu_eval, folder_path_halu_eval_2):
    """
    Load the HaluEval and HaluEval 2.0 benchmark files from disk.

    Each ``.json`` file found in a folder becomes one dictionary entry keyed
    by the file name without its extension. Anything that is not a regular
    ``.json`` file is ignored. A short length summary is printed per file.

    Args:
        folder_path_halu_eval: Directory of HaluEval files in JSON Lines
            layout (one JSON document per line; blank lines are ignored).
        folder_path_halu_eval_2: Directory of HaluEval 2.0 files, each
            holding a single JSON document.

    Returns:
        A list ``[dataset_halu_eval, dataset_halu_eval_2]``. The first dict
        maps name -> list of parsed lines, the second maps name -> parsed
        document.

    Raises:
        OSError: If a folder cannot be listed or a file cannot be opened.
        json.JSONDecodeError: If a file contains malformed JSON.
    """
    print("Loading Data", end="\n\n")

    print("Loading HaluEval")
    dataset_halu_eval = {}

    for name, file_path in _iter_json_files(folder_path_halu_eval):
        # Explicit UTF-8: the data contains non-ASCII text and must not depend
        # on the platform's default encoding.
        with open(file_path, "r", encoding="utf-8") as f:
            # A trailing or blank line is not a record; json.loads would reject it.
            dataset_halu_eval[name] = [json.loads(line) for line in f if line.strip()]
        print(f"Length of {name}: {len(dataset_halu_eval[name])}" )

    print()
    print("Loading HaluEval 2.0")
    dataset_halu_eval_2 = {}

    for name, file_path in _iter_json_files(folder_path_halu_eval_2):
        with open(file_path, "r", encoding="utf-8") as f:
            dataset_halu_eval_2[name] = json.load(f)
        print(f"Length of {name}: {len(dataset_halu_eval_2[name])}")

    return [dataset_halu_eval, dataset_halu_eval_2]
        
# read_data() returns [HaluEval, HaluEval 2.0]; unpack the pair into two named dicts.
dataset_halueval, dataset_halueval2 = read_data(folder_path_halu_eval, folder_path_halu_eval_2)

Loading Data

Loading HaluEval
Length of dialogue_data: 10000
Length of general_data: 4507
Length of qa_data: 10000
Length of summarization_data: 10000

Loading HaluEval 2.0
Length of Bio-Medical: 200
Length of Education: 200
Length of Finance: 200
Length of Open-Domain: 200
Length of Science: 200


#### Set Up SelfCheckGPT

In [2]:
import warnings
warnings.filterwarnings("ignore")

from dotenv import load_dotenv
import sys
sys.path.append("self_check_gpt")
from self_check_gpt import modeling_selfcheck

# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()
api_key = os.getenv('UPSTAGE_API_KEY')
if not api_key:
    # os.getenv returns None for a missing variable; stop here with an
    # actionable message instead of handing an empty credential to the client.
    raise RuntimeError(
        "UPSTAGE_API_KEY is not set. Add it to your .env file or export it "
        "before running this cell."
    )

# Prompt-based SelfCheck scorer, configured with the OpenAI client type but
# pointed at the Upstage endpoint and the solar-pro model.
selfcheck_prompt = modeling_selfcheck.SelfCheckAPIPrompt(
    client_type="openai",
    base_url="https://api.upstage.ai/v1/solar",
    model="solar-pro",
    api_key=api_key,
)

Initiate OpenAI client... model = solar-pro


#### Testing

In [3]:
# Inspect one HaluEval 2.0 record (first entry of the Bio-Medical split) to see its fields.
dataset_halueval2['Bio-Medical'][0]

{'id': 10,
 'user_query': 'Name synonym of Acrokeratosis paraneoplastica.',
 'chatgpt_response': 'The possible synonyms for Acrokeratosis paraneoplastica are:\n- Bazex syndrome\n- Bazex-Dupré-Christol syndrome\n- Acrokeratosis neoplastica\n- Triad syndrome\n- Bazex syndrome of multiple basal cell carcinomas\n- Bazex syndrome of carcinomas of pharynx and esophagus\n- Fingerprint-acral-keratoderma - Specifically for the clinical presentation of palmoplantar keratoderma with dermatoscopic abnormal hand fingerprint patterns.',
 'chatgpt_fact': ['Bazex syndrome is a synonym for Acrokeratosis paraneoplastica.',
  'Bazex-Dupré-Christol syndrome is another name for Acrokeratosis paraneoplastica.',
  'Acrokeratosis neoplastica and Triad syndrome are also synonymous with Acrokeratosis paraneoplastica.',
  "'Bazex syndrome of multiple basal cell carcinomas' and 'Bazex syndrome of carcinomas of pharynx and esophagus' are alternative names for Acrokeratosis paraneoplastica.",
  "The term 'Fingerpri

In [4]:
import re
import pandas as pd
from tqdm import tqdm

# Smoke test on a small slice of the Bio-Medical split; widen the slice to
# score more responses (each one triggers several API calls).
samples = pd.Series(dataset_halueval2['Bio-Medical'])[:1]
scores_prompt = {}  # sample id -> scores returned by predict()

# Iterate over the values directly: `samples[i]` is a *label* lookup on an
# integer-indexed Series and raises KeyError for any slice not starting at 0.
for sample in tqdm(samples):
    prompt = sample['user_query']
    response = sample['chatgpt_response']
    # Split the response on periods and newlines, dropping empty fragments.
    sentences = [s.strip() for s in re.split(r'\.|\n', response) if s.strip()]
    sampled_passages = selfcheck_prompt.get_sample_passages(prompt)

    scores_prompt[sample['id']] = selfcheck_prompt.predict(
        sentences=sentences,
        sampled_passages=sampled_passages,
        verbose=True
    )
scores_prompt

100%|██████████| 8/8 [00:23<00:00,  2.94s/it]
100%|██████████| 1/1 [00:31<00:00, 31.20s/it]


{10: array([0. , 0. , 1. , 1. , 1. , 0.2, 1. , 1. ])}