#### Set Up

In [7]:
from dotenv import load_dotenv
import os
from gpt_model import OpenAIEmbeddingClient

# Load variables from a .env file (if one is found) into the process
# environment before the API key is looked up.
load_dotenv()

# Embedding client shared by the cells below. The key is read from the
# OPENAI_API_KEY environment variable; os.getenv yields None when it is unset.
client = OpenAIEmbeddingClient(
    model='text-embedding-3-large',
    api_key=os.getenv('OPENAI_API_KEY'),
)

In [8]:
from utils import load_data_json, get_cosine_similarity

# Each dataset's reconstructed-question file is loaded right after its path is
# declared, so every path/data pair reads as one unit. All path variables stay
# defined as notebook-level names.

# HaluEval, long answers.
filepath_output_halu_eval_long_answer = '../data/output/interrogate_llm_zeroshot/re_questions/halu_eval_long_answer.json'
output_halu_eval_long_answer = load_data_json(filepath_output_halu_eval_long_answer)

# HaluEval, long answers with knowledge.
filepath_output_halu_eval_long_answer_knowledge = '../data/output/interrogate_llm_zeroshot/re_questions/halu_eval_long_answer_knowledge.json'
output_halu_eval_long_answer_knowledge = load_data_json(filepath_output_halu_eval_long_answer_knowledge)

# Books.
filepath_output_books = '../data/output/interrogate_llm_zeroshot/re_questions/books.json'
output_books = load_data_json(filepath_output_books)

#### Cosine Similarity

In [9]:
from tqdm import tqdm

def process_output(data):
    """Score how close each sample's reconstructed questions are to its question.

    For every sample, the original question and each reconstructed question
    are embedded with the notebook-level ``client``. The embeddings are handed
    to ``get_cosine_similarity`` and the result is stored, converted to a
    plain ``float``, under the ``'cosine_similarity'`` key.

    Args:
        data: Sequence of dict samples. Each sample must provide
            ``'question'`` and ``'reconstruct_questions'`` (an iterable of
            reconstructed questions); both are passed to
            ``client.get_embedding``.

    Returns:
        list[dict]: One new dict per input sample, in input order, containing
        the sample's original items plus ``'cosine_similarity'``. The input
        samples themselves are left unmodified.

    Raises:
        KeyError: If a sample lacks ``'question'`` or
            ``'reconstruct_questions'``.
    """
    res = []

    # tqdm appends ": " after `desc` itself, so the label carries no trailing
    # colon (the old label rendered as "Processing data::").
    for sample in tqdm(data, desc='Processing data'):
        question_embedding = client.get_embedding(sample['question'])
        # One embedding request per reconstructed question.
        re_questions_embedding = [
            client.get_embedding(re_question)
            for re_question in sample['reconstruct_questions']
        ]

        # NOTE(review): get_cosine_similarity receives a single embedding and
        # a list of embeddings and must return one scalar; presumably it
        # aggregates over the list. Confirm against utils.
        cosine_similarity = get_cosine_similarity(question_embedding, re_questions_embedding)

        # Build a shallow copy rather than writing into the caller's dict, so
        # the loaded dataset stays pristine and re-running a cell starts from
        # the same input. float() keeps the value JSON-serialisable.
        res.append({**sample, 'cosine_similarity': float(cosine_similarity)})
    return res

In [4]:
import json

# Destination for the HaluEval long-answer samples enriched with 'cosine_similarity'.
filepath_cosine_halu_eval_long_answer = '../data/output/interrogate_llm_zeroshot/cosine_similarity/halu_eval_long_answer.json'

# Create the output directory before the slow embedding pass, so a missing
# folder cannot make open() fail after the work is done and lose the results.
os.makedirs(os.path.dirname(filepath_cosine_halu_eval_long_answer), exist_ok=True)

res_halu_eval_long_answer = process_output(output_halu_eval_long_answer)

with open(filepath_cosine_halu_eval_long_answer, 'w', encoding='utf-8') as f:
    json.dump(res_halu_eval_long_answer, f, indent=4)

Processing data:: 100%|██████████| 10/10 [00:33<00:00,  3.39s/it]


In [5]:
import json

# Destination for the HaluEval long-answer-with-knowledge samples enriched
# with 'cosine_similarity'.
filepath_cosine_halu_eval_long_answer_knowledge = '../data/output/interrogate_llm_zeroshot/cosine_similarity/halu_eval_long_answer_knowledge.json'

# Create the output directory before the slow embedding pass, so a missing
# folder cannot make open() fail after the work is done and lose the results.
os.makedirs(os.path.dirname(filepath_cosine_halu_eval_long_answer_knowledge), exist_ok=True)

res_halu_eval_long_answer_knowledge = process_output(output_halu_eval_long_answer_knowledge)

with open(filepath_cosine_halu_eval_long_answer_knowledge, 'w', encoding='utf-8') as f:
    json.dump(res_halu_eval_long_answer_knowledge, f, indent=4)

Processing data:: 100%|██████████| 10/10 [00:29<00:00,  2.91s/it]


In [10]:
import json

# Destination for the books samples enriched with 'cosine_similarity'.
filepath_cosine_books = '../data/output/interrogate_llm_zeroshot/cosine_similarity/books.json'

# Create the output directory before the slow embedding pass, so a missing
# folder cannot make open() fail after the work is done and lose the results.
os.makedirs(os.path.dirname(filepath_cosine_books), exist_ok=True)

res_books = process_output(output_books)

with open(filepath_cosine_books, 'w', encoding='utf-8') as f:
    json.dump(res_books, f, indent=4)

Processing data:: 100%|██████████| 20/20 [01:07<00:00,  3.40s/it]
