In [None]:
import os, sys
import pandas as pd
from deepeval.test_case import LLMTestCase
from deepeval.metrics import AnswerRelevancyMetric
from evaluate import GroqDeepEvalLLM

current_dir = os.path.dirname(os.path.abspath("__file__"))
parent_dir = os.path.abspath(os.path.join(current_dir, '..'))
sys.path.append(parent_dir)

from llm.rag import generate_answer
from db.qdrant import get_qdrant_client
from llm.embeddings import get_embeddings
from llm.model import get_llm_client

GEN_MODEL = os.getenv('GEN_MODEL', 'llama-3.1-8b-instant')

df = pd.read_csv("../data/whisky_qnas.csv")

llm = get_llm_client()
embedder = get_embeddings(embed_model=os.getenv("EMBED_MODEL"))
qdrant = get_qdrant_client()

# judge = GroqDeepEvalLLM(model="llama-3.3-70b-versatile")  # 가능하면 judge는 큰 모델 추천
judge = GroqDeepEvalLLM(model="llama-3.1-8b-instant")
answer_relevancy = AnswerRelevancyMetric(threshold=0.7, model=judge)

rows = []
for i, r in df.iterrows():

    if i >= 2:
        break

    q = r["question"]
    gold = r["answer"]

    ans, ctxs, ctx_texts = generate_answer(qdrant=qdrant, embedder=embedder, llm=llm, question=q, model=GEN_MODEL)

    # 1) CSV answer relevancy
    tc_gold = LLMTestCase(input=q, actual_output=gold)
    answer_relevancy.measure(tc_gold)
    gold_score = float(answer_relevancy.score)
    gold_reason = answer_relevancy.reason

    # 2) Generated answer relevancy
    tc_gen = LLMTestCase(input=q, actual_output=ans)
    answer_relevancy.measure(tc_gen)
    gen_score = float(answer_relevancy.score)
    gen_reason = answer_relevancy.reason

    data = {
        "case_id": i,
        "question": q,
        "tc_gold": tc_gold,
        "gold": gold,
        "csv_answer_relevancy": gold_score,
        "gen_answer_relevancy": gen_score,
        "gold_reason": gold_reason,
        "gen_reason": gen_reason,
    }

    print(data)

    rows.append(data)
    

out = pd.DataFrame(rows)
# print(out[["case_id", "csv_answer_relevancy", "gen_answer_relevancy", "delta(gen-gold)"]].head())
# print("\n=== Averages ===")
# print(out[["csv_answer_relevancy", "gen_answer_relevancy", "delta(gen-gold)"]].mean())

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
out

In [20]:
out

Unnamed: 0,case_id,question,tc_gold,gold,csv_answer_relevancy,gen_answer_relevancy,gold_reason,gen_reason
0,0,헤비한 육향이나 거칠고 진득한 스모크 계열을 선호해요. 어디 하나 빠지는 거 없이 ...,input='헤비한 육향이나 거칠고 진득한 스모크 계열을 선호해요. 어디 하나 빠지...,그렇다면 **Tobermory 1972 MI**에서 고기·가죽·담배처럼 거친 면과 ...,1.0,1.0,The score is 1.00 because the answer is highly...,The score is 1.00 because the answer is highly...
1,1,Glendronach 1972를 마실 때 팔레트에서 열대 과일이 폭발한다고 들었는데...,input='Glendronach 1972를 마실 때 팔레트에서 열대 과일이 폭발한...,"노즈에선 '말린 자두, 블랙커런트, 약간의 파파야·망고, 나무 바니시, 카라멜, 후...",1.0,0.8,The score is 1.00 because the answer directly ...,The score is 0.80 because the statement is a g...
