In [2]:
!pip install openai datasets rouge_score -q
!pip install evaluate -q
!pip install kiwipiepy -q
!pip install python-dotenv

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatib

In [None]:
import os
# Project-local config module providing the OpenAI API key and the model id
# that a later cell evaluates as `llm_finetune`.
# NOTE(review): `resource` is also the name of a Unix-only standard-library
# module; confirm the local `resource/` package is the one resolved here.
from resource.config import api_key, llm_finetune
# Export the key through the environment; the client-setup cell reads it back
# with os.getenv("OPENAI_API_KEY").
os.environ['OPENAI_API_KEY'] = api_key

In [26]:
from datasets import Dataset
import evaluate
from kiwipiepy import Kiwi
from openai import OpenAI
import os
from dotenv import load_dotenv

# Load variables from a .env file (if any) into the process environment.
load_dotenv()

# Load the BERTScore metric (only BERTScore is loaded here; no BLEU metric is created).
bertscore = evaluate.load("bertscore")

# Module-level Kiwi instance, shared by tokenize_korean below.
kiwi = Kiwi()

# Tokenization backed by the Kiwi morphological analyzer
def tokenize_korean(text):
    """Return the ``form`` attribute of every token Kiwi yields for ``text``.

    Uses the module-level ``kiwi`` instance.
    """
    forms = []
    for morpheme in kiwi.tokenize(text):
        forms.append(morpheme.form)
    return forms

# BERTScore computation
def compute_bertscore_metrics(predictions, references):
    """Average per-example BERTScore precision/recall/F1 and scale to percent.

    Args:
        predictions: Sequence of generated texts.
        references: Sequence aligned with ``predictions``; passed to the
            metric unchanged.

    Returns:
        dict with keys ``'bert_precision'``, ``'bert_recall'`` and
        ``'bert_f1'``, each the arithmetic mean of the per-example scores
        multiplied by 100. All three are ``0.0`` when ``predictions`` is empty.
    """
    # Nothing to score: return zeros instead of dividing by zero below.
    if len(predictions) == 0:
        return {'bert_precision': 0.0, 'bert_recall': 0.0, 'bert_f1': 0.0}

    results = bertscore.compute(predictions=predictions, references=references, lang="ko")

    def _mean_pct(scores):
        # Same operation order as before (sum / count * 100) so values are unchanged.
        return sum(scores) / len(scores) * 100

    return {
        'bert_precision': _mean_pct(results['precision']),
        'bert_recall': _mean_pct(results['recall']),
        'bert_f1': _mean_pct(results['f1']),
    }

# Model evaluation
def _is_hallucinated(generated_text, ground_truth):
    """Return True when no whitespace-separated word of the answer occurs in the reference text."""
    # A bare string must not go through " ".join(), which would insert a space
    # between every character and make every word lookup fail.
    if isinstance(ground_truth, str):
        reference_text = ground_truth
    else:
        reference_text = " ".join(ground_truth)
    # Substring test, not token equality. An empty answer has no words, so
    # any() is False and the answer is counted as hallucinated.
    return not any(word in reference_text for word in generated_text.split())


def evaluate_model_with_hallucination(model_name, dataset):
    """Generate an answer for every dataset item and score the answers.

    Args:
        model_name: Model identifier forwarded to ``generate_response2``.
        dataset: Iterable of mappings with an ``"input"`` prompt and a
            ``"ground_truth"`` reference (a list of strings or a single string).

    Returns:
        dict with:
          * ``"bertscore"``: the result of ``compute_bertscore_metrics``
            (zeros for an empty dataset).
          * ``"hallucination_score"``: percentage of answers for which no
            whitespace-separated word appears as a substring of the reference
            text; 0 for an empty dataset.

    Note:
        ``generate_response2`` returns ``""`` when the API call fails, and an
        empty answer is counted as hallucinated by the overlap heuristic.
    """
    predictions = []
    references = []
    hallucinated_count = 0

    for item in dataset:
        ground_truth = item["ground_truth"]

        # Call the model; treat a missing reply as an empty answer so the
        # string operations and the metric below always receive a str.
        generated_text = generate_response2(item["input"], model_name) or ""

        # Collect prediction/reference pairs for BERTScore.
        predictions.append(generated_text)
        references.append(ground_truth)

        # Hallucination heuristic.
        if _is_hallucinated(generated_text, ground_truth):
            hallucinated_count += 1

    # Empty dataset: skip the metric call, which cannot average zero scores.
    if not predictions:
        return {
            "bertscore": {'bert_precision': 0.0, 'bert_recall': 0.0, 'bert_f1': 0.0},
            "hallucination_score": 0,
        }

    # Metric computation.
    bertscore_results = compute_bertscore_metrics(predictions, references)

    # Share of answers flagged by the heuristic, as a percentage.
    hallucination_score = hallucinated_count / len(predictions) * 100

    return {
        "bertscore": bertscore_results,
        "hallucination_score": hallucination_score,
    }

# OpenAI API setup
# Pass the key from the OPENAI_API_KEY environment variable to the constructor
# instead of assigning it onto the client after construction.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def generate_response2(prompt, model_name):
    """Ask ``model_name`` to answer ``prompt`` under a fixed first-aid-expert system prompt.

    Args:
        prompt: User question, sent as the ``user`` message.
        model_name: Model identifier passed to the chat completions endpoint.

    Returns:
        The reply text of the first choice. Returns ``""`` when the call
        raises (the error is printed) or when the reply carries no text.
    """
    try:
        completion = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": "응급처치에 대한 전문가로서 내 물음에 대답해줘."},
                {"role": "user", "content": prompt}
            ]
        )
        # `content` may be None; normalize so callers can always treat the
        # result as a string (they call .split() on it).
        return completion.choices[0].message.content or ""
    except Exception as e:
        # Deliberately best-effort: report the failure and keep the evaluation
        # loop going with an empty answer.
        print(f"Error in OpenAI API call: {e}")
        return ""

# Example dataset: three first-aid questions (in Korean).
# "input" and "ground_truth" are parallel lists; each ground_truth entry is
# itself a list holding two reference answers for the question at the same index.
data = {
    "input": [
        "화상을 입었을 때 응급처치는 어떻게 하나요?",
        "코피가 날 때 가장 먼저 해야 할 응급처치는 무엇인가요?",
        "벌에 쏘였을 때는 어떻게 해야 하나요?",
    ],
    "ground_truth": [
        ["화상 부위를 흐르는 찬물로 식히세요.", "화상 부위에 찬물을 10분 이상 흘려줍니다."],
        ["머리를 앞으로 숙이고 코를 손으로 눌러야 합니다.", "코피가 나면 머리를 숙이고 코를 압박하세요."],
        ["벌침을 제거하고 차가운 수건으로 진정시키세요.", "벌침을 신용카드로 긁어 제거합니다."],
    ]
}

# Wrap the dict of columns in a Dataset; iterating it yields one
# {"input", "ground_truth"} record per question.
dataset = Dataset.from_dict(data)

# Run the evaluation
# NOTE(review): despite the variable name, this id does not look like a
# fine-tuned model id; presumably this cell is the baseline run. Confirm.
# `fine_tuned_model` and `results` are both reassigned by the next cell.
fine_tuned_model = "gpt-4o-mini-2024-07-18"
results = evaluate_model_with_hallucination(fine_tuned_model, dataset)

# Print the results
print("Evaluation Results:")
print(f"BERTScore: {results['bertscore']}")
print(f"Hallucination Score: {results['hallucination_score']:.2f}%")


Evaluation Results:
BERTScore: {'bert_precision': 55.71163694063822, 'bert_recall': 73.01715413729349, 'bert_f1': 63.05378476778666}
Hallucination Score: 0.00%


In [None]:
# Set the model name (the `llm_finetune` value imported from resource.config)
fine_tuned_model = llm_finetune

# Run the evaluation on the same `dataset` as the previous cell;
# this overwrites the earlier `results`.
results = evaluate_model_with_hallucination(fine_tuned_model, dataset)

# Print the results
print("Evaluation Results:")
print(f"BERTScore: {results['bertscore']}") # semantic-similarity evaluation
print(f"Hallucination Score: {results['hallucination_score']:.2f}%") # evaluation of whether the answer contains fabricated content

Evaluation Results:
BERTScore: {'bert_precision': 69.50087547302246, 'bert_recall': 79.32908336321512, 'bert_f1': 73.76715342203775}
Hallucination Score: 0.00%
