In [71]:
import os
import json
import pandas as pd

from rich import print as pprint
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from ragas import evaluate, EvaluationDataset
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.metrics import LLMContextRecall, ContextEntityRecall, NoiseSensitivity, ResponseRelevancy, Faithfulness, FactualCorrectness, SemanticSimilarity

In [72]:
# Columns read from the evaluation CSV:
#   user_input         - the question
#   reference          - the ground-truth answer
#   response           - the answer produced by the RAG application
#   retrieved_contexts - the documents retrieved by the RAG pipeline
#                        (stored as a JSON string in the CSV, decoded below)

csv_path = "C:/NTUST/Research/llm-evaluate/eval/eval_global_4omini_USI.csv"
file_name = os.path.splitext(os.path.basename(csv_path))[0]       # file name without directory or extension
# Strip only a leading "eval_". str.replace("eval_", "") would also delete any
# later occurrence of "eval_" inside the name and silently change the run label.
eval_prefix = "eval_"
method_name = file_name[len(eval_prefix):] if file_name.startswith(eval_prefix) else file_name

dataset_df = pd.read_csv(csv_path)
# Decode the JSON-encoded contexts back into Python objects before handing them to ragas.
dataset_df["retrieved_contexts"] = dataset_df["retrieved_contexts"].apply(json.loads)
# Keep only the four columns listed above; any other CSV columns are dropped.
dataset_df = dataset_df[['user_input', 'reference', 'response', 'retrieved_contexts']]
eval_dataset = EvaluationDataset.from_pandas(dataset_df)

In [73]:
# Bare expression: shows the dataset's repr as the cell output (a quick check of what was loaded).
eval_dataset

EvaluationDataset(features=['user_input', 'retrieved_contexts', 'response', 'reference'], len=20)

In [74]:
# Copy the Azure key into the generic OPENAI_API_KEY variable.
# NOTE(review): presumably some component looks up OPENAI_API_KEY directly — confirm it is still needed.
os.environ["OPENAI_API_KEY"] = os.environ["AZURE_OPENAI_API_KEY"]

# Chat model used as the evaluator LLM; all connection details come from the environment,
# so a missing variable fails here with a KeyError.
chat_settings = {
    "openai_api_version": os.environ["AZURE_OPENAI_API_VERSION"],
    "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"],
    "azure_deployment": os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"],
    "model": os.environ["AZURE_OPENAI_CHAT_MODEL_NAME"],
    "validate_base_url": False,
}
evaluator_llm = LangchainLLMWrapper(AzureChatOpenAI(**chat_settings))

# Embedding model for the embedding-based metrics
# (ResponseRelevancy and SemanticSimilarity in this notebook).
embedding_settings = {
    "openai_api_version": os.environ["AZURE_OPENAI_API_EMBED_VERSION"],
    "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"],
    "azure_deployment": os.environ["AZURE_OPENAI_EMBED_DEPLOYMENT_NAME"],
    "model": os.environ["AZURE_OPENAI_EMBED_MODEL_NAME"],
}
evaluator_embeddings = LangchainEmbeddingsWrapper(AzureOpenAIEmbeddings(**embedding_settings))

In [75]:
# Metrics whose class names mark them as context-recall measures; both use the evaluator LLM.
context_recall_metrics = [
    LLMContextRecall(llm=evaluator_llm),
    ContextEntityRecall(llm=evaluator_llm),
]
# Remaining metrics. NoiseSensitivity(llm=evaluator_llm) is currently disabled.
response_metrics = [
    ResponseRelevancy(llm=evaluator_llm, embeddings=evaluator_embeddings),
    Faithfulness(llm=evaluator_llm),
    FactualCorrectness(llm=evaluator_llm),
    SemanticSimilarity(embeddings=evaluator_embeddings),
]
# Concatenate in the same order the metrics were originally listed.
metrics = context_recall_metrics + response_metrics

# Run every metric over the dataset, then convert the result to a DataFrame for saving.
results = evaluate(dataset=eval_dataset, metrics=metrics)
df_result = results.to_pandas()

Evaluating: 100%|██████████| 120/120 [02:51<00:00,  1.43s/it]


In [78]:
# Write the evaluation results to ./result/result_<method_name>.csv.
#
# retrieved_contexts is JSON-encoded for the CSV, mirroring the json.loads applied
# when the evaluation CSV was read. The encoding is done on a copy (DataFrame.assign)
# instead of overwriting df_result's column: overwriting made this cell non-idempotent,
# because a second run would json.dumps the already-encoded strings again.
output_dir = "./result"
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing directories
df_result_csv = df_result.assign(
    retrieved_contexts=df_result["retrieved_contexts"].apply(json.dumps)
)
df_result_csv.to_csv(f"{output_dir}/result_{method_name}.csv", index=False)