#### Loading Datasets

In [6]:
# Project-local loader. Judging by the call below it takes both source paths
# and returns the two datasets as a pair — TODO confirm against utils.read_data.
from utils import read_data

# Root directory for generated artifacts; later cells build output paths under it.
OUTPUT_DIR = '../data'
# Location of the HaluEval 2.0 human-annotation folder passed to read_data.
folder_path_halueval = '../halu_eval_2/annotation/human_annotation'
# Location of the SelfCheckGPT JSON file passed to read_data.
folder_path_selfcheckgpt = '../self_check_gpt/data/dataset_v3.json'

# dataset_halu_eval is later accessed via .keys() and [category], i.e. it is
# used as a mapping from category name to a sequence of samples.
dataset_halu_eval, dataset_self_check_gpt = read_data(folder_path_halueval, folder_path_selfcheckgpt)

Load HaluEval 2.0
Length of Bio-Medical: 200.
Length of Education: 200.
Length of Finance: 200.
Length of Open-Domain: 200.
Length of Science: 200.

Loading SelfCheckGPT
The length of the dataset: 238.


In [7]:
import os
import json

# Folder holding one "<category>.json" file per already-processed category.
FOLDER_PATH_HALUEVAL_PROCESSED = "../data/halu_eval_2"

def read_processed_data(folder_path=None):
    """Load every processed category file found in a folder.

    Args:
        folder_path: Directory to scan. Defaults to
            FOLDER_PATH_HALUEVAL_PROCESSED (resolved at call time).

    Returns:
        dict mapping the file name without its ".json" extension to the
        parsed JSON content of that file. Empty when the folder does not
        exist, so a first run with nothing processed yet does not crash.

    Raises:
        json.JSONDecodeError: if a ".json" file contains invalid JSON.
    """
    if folder_path is None:
        folder_path = FOLDER_PATH_HALUEVAL_PROCESSED

    dataset_halu_eval_processed = {}

    # Nothing has been processed yet: report an empty result instead of
    # raising FileNotFoundError from os.listdir.
    if not os.path.isdir(folder_path):
        return dataset_halu_eval_processed

    # Sorted so the load order (and printed summary) is deterministic;
    # os.listdir returns entries in arbitrary order.
    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)

        # Skip sub-directories and stray non-JSON files (e.g. hidden OS files).
        if not file_name.endswith(".json") or not os.path.isfile(file_path):
            continue

        # Strip only the trailing extension, not every ".json" occurrence.
        category = os.path.splitext(file_name)[0]

        with open(file_path, 'r', encoding='utf-8') as f:
            dataset_halu_eval_processed[category] = json.load(f)
        print(f"Length of {category}: {len(dataset_halu_eval_processed[category])}.")
    return dataset_halu_eval_processed
    
# Load what earlier runs already produced; add_sampled_passages uses this to
# skip samples whose 'id' is already present.
dataset_halu_eval_processed = read_processed_data()

Length of Bio-Medical: 200.
Length of Education: 200.
Length of Finance: 200.
Length of Open-Domain: 200.
Length of Science: 200.


#### Helper Functions

In [3]:
import json
import logging
import os  # needed for os.getenv below; previously relied on another cell importing it
import re

from dotenv import load_dotenv
from tqdm import tqdm

# Called before the solar_pro import and before the client is built, as in the
# original cell order, so UPSTAGE_API_KEY is available to os.getenv below
# (presumably loaded from a local .env file — confirm).
load_dotenv()

from solar_pro import SolarPro

# Number of sample passages requested from the LLM per prompt.
SAMPLE_PASSAGES_SIZE = 5
SAMPLE_PASSAGES_RESPOND_TEMP = 0.7 # [0, 2], Default=0.7
# Sentence boundary: right after '.', '?', '!' or a newline (plus any following
# whitespace), unless the preceding text looks like "x.yz" (word char, dot,
# word char, any char) or a capital + lowercase + dot such as "Dr.".
SENTENCE_ENDINGS = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!|\n)\s*'
# Shared client used by get_sample_passages; the API key comes from the environment.
LLM = SolarPro(
    client_type="openai", 
    base_url="https://api.upstage.ai/v1/solar", 
    model="solar-pro", 
    api_key=os.getenv('UPSTAGE_API_KEY')
)

def get_sample_passages(
    prompt: str,
    respond_length: int,
    respond_word_counter: int,
    sample_passages_size = SAMPLE_PASSAGES_SIZE,
    respond_temperature = SAMPLE_PASSAGES_RESPOND_TEMP,
):
    """Query the LLM repeatedly with the same length-constrained prompt.

    Args:
        prompt: The user query to send.
        respond_length: Number of sentences the answer is asked to have.
        respond_word_counter: Approximate number of words the answer is asked to have.
        sample_passages_size: How many independent responses to collect.
        respond_temperature: Temperature forwarded to LLM.get_respond.

    Returns:
        list with one LLM.get_respond result per requested sample, in call order.
    """
    # The sentence/word hint is appended once and reused for every call.
    prompt_padded = f"{prompt}. Answer in {respond_length} sentences and around {respond_word_counter} words"
    return [
        LLM.get_respond(prompt=prompt_padded, temperature=respond_temperature)
        for _ in range(sample_passages_size)
    ]

def get_respond_length(respond: str) -> int:
    """Count the sentences in a response.

    The text is stripped and split on SENTENCE_ENDINGS. Because that pattern
    can match an empty string, re.split yields empty fragments (e.g. after the
    final '.', or after a newline that follows punctuation); those are
    discarded so they are not counted as sentences.

    Args:
        respond: The response text.

    Returns:
        Number of non-blank fragments; 0 for an empty or whitespace-only text.
    """
    fragments = re.split(SENTENCE_ENDINGS, respond.strip())
    # Only fragments with visible content are sentences.
    return sum(1 for fragment in fragments if fragment.strip())

def get_respond_word_couner(respond: str) -> int:
    """Return how many word tokens (runs of word characters) the text contains."""
    word_pattern = re.compile(r'\b\w+\b')
    total = 0
    for _ in word_pattern.finditer(respond):
        total += 1
    return total

Initiate OpenAI client... model = solar-pro


#### Adding Sample Passages to HaluEval 2.0

In [4]:
def add_sampled_passages():
    """Attach LLM sample passages to every HaluEval sample and save per category.

    For each category in dataset_halu_eval, samples whose 'id' is already in
    dataset_halu_eval_processed are skipped. Each remaining sample gets a
    'sample_passages' entry (the sample dict is modified in place) and is
    appended to that category's output list, which is written to
    "<OUTPUT_DIR>/halu_eval_2/<category>.json".

    The output file is written even when processing fails part-way through a
    category, so finished samples are kept and a re-run can resume after them.
    Any exception is still propagated to the caller.
    """
    log_path = "../data/logs/dataset_modifier.log"
    # The log folder may not exist on a fresh checkout.
    os.makedirs(os.path.dirname(log_path), exist_ok=True)
    logging.basicConfig(level=logging.INFO, filename=log_path, filemode='w')
    print("Start adding sample passages.")

    try:
        for category in dataset_halu_eval.keys():
            logging.info(f"Checking category: {category}")
            print(f"Working w/ category: {category}")

            # Set up parameters
            output_path = os.path.join(OUTPUT_DIR, f"halu_eval_2/{category}.json")
            os.makedirs(os.path.dirname(output_path), exist_ok=True)
            dataset = dataset_halu_eval[category]
            if category in dataset_halu_eval_processed:
                dataset_output = dataset_halu_eval_processed[category]
            else:
                dataset_output = []
            # Set gives constant-time "already processed?" checks.
            processed_ids = {d.get('id') for d in dataset_output}

            try:
                for i, sample in enumerate(tqdm(dataset)):
                    # Check if the sample has already been processed
                    if sample['id'] in processed_ids:
                        continue
                    # Process sample
                    response = sample['chatgpt_response']
                    sample['sample_passages'] = get_sample_passages(
                        sample['user_query'],
                        get_respond_length(response),
                        get_respond_word_couner(response))
                    dataset_output.append(sample)
                    logging.info(f"Process sample number: {i}, w/ sample id: {sample['id']}")
            finally:
                # Persist whatever was collected, even after a failure, so the
                # LLM calls already made for this category are not lost.
                with open(output_path, 'w', encoding='utf-8') as fout:
                    json.dump(dataset_output, fout, indent=2)
            print(f"Finish category: {category}")
    finally:
        # Flush and close the log handlers on both success and failure.
        logging.shutdown()

In [5]:
# Long-running: every unprocessed sample triggers several LLM requests; the
# recorded run below took from ~18 minutes to over 2 hours per category.
add_sampled_passages()

Start adding sample passages.
Working w/ category: Bio-Medical


100%|██████████| 200/200 [00:00<?, ?it/s]


Finish category: Bio-Medical
Working w/ category: Education


100%|██████████| 200/200 [51:28<00:00, 15.44s/it] 


Finish category: Education
Working w/ category: Finance


100%|██████████| 200/200 [1:20:23<00:00, 24.12s/it]


Finish category: Finance
Working w/ category: Open-Domain


100%|██████████| 200/200 [17:59<00:00,  5.40s/it]


Finish category: Open-Domain
Working w/ category: Science


100%|██████████| 200/200 [2:11:20<00:00, 39.40s/it]  

Finish category: Science



