# Result 평가 스크립트


In [14]:
import warnings

warnings.simplefilter(action="ignore", category=Warning)

In [15]:
GENERATED_SUMMARY_CSV = "evaluation/data/generated_summary.csv"
GENERATED_CATEGORY_CSV = "evaluation/data/generated_category.csv"
GENERATED_REPORT_CSV = "evaluation/data/generated_report.csv"

REFERNECE_CSV = "evaluation/data/reference.csv"

## import 및 환경 주입


In [16]:
import ast

import pandas as pd
from dotenv import load_dotenv

from agents.classification.classification_type import ClassificationType
from evaluation.classification.dataframe_manager import DataFrameManager
from evaluation.evaluation_summary import evaluate_summary
from evaluation.gpt_eval import calculate_g_eval
from evaluation.result_printer import print_evaluation_results
from utils.configuration import Config

load_dotenv()
Config.load()

## Reference Load


In [17]:
reference_df = pd.read_csv(REFERNECE_CSV)
print("len(reference_df):", len(reference_df))
reference_df.head()

len(reference_df): 48


Unnamed: 0,id,body,subject,category,action
0,194e90265a3c53fe,*지속가능원이 추천하는 2025-1 지속가능 교과목 25선!*\n\nESG? 지속가...,[지속가능원] 지속가능원이 추천하는 2025-1 지속가능 교과목 25선!,other,read only
1,194e343333e5ff27,\n\n\n\n\n\n\n\n\n\n\n\n\n,ICT명품인재양성사업단 뉴스레터 겨울호를 보내 드립니다.,academic,read only
2,194e34039181a3c0,"안녕하세요,\n\n\n2024년 업적평가를 위해 실적 업데이트 요청을 드립니다. 이...",2024년 업적평가를 위한 연구실적 업데이트 요청의 건 (~2월 7일),academic,action needed
3,194e334940f6bad0,안녕하세요\n친환경 디지털 정보과학 교육연구단 함이열입니다.\n\nBK21 대학원혁...,[BK] 2025년 2월 7일(금) BK21 대학원혁신사업 영어논문작성법 워크숍 안...,academic,action needed
4,194e325d62686180,"Hi there,\n\nWe charged $5.20 to your credit c...",Your OpenAI API account has been funded,other,read only


## Summary 평가

1. 요약전 원문(고정)
2. REFERENCE(고정)
3. 생성 요약문

Return
ROUGE, BERT SCORE, G-EVAL(with summary)


In [None]:
generated_summary_df = pd.read_csv(GENERATED_SUMMARY_CSV)
print("len(generated_summary_df):", len(generated_summary_df))
generated_summary_df.head()

In [None]:
source_texts = reference_df["body"].tolist()
report_texts = generated_summary_df["summary"].tolist()
reference_texts = reference_df["subject"].tolist()


summary_results = evaluate_summary(source_texts, report_texts, reference_texts)

In [None]:
print_evaluation_results(summary_results, eval_type="summary")

## 분류 평가

1. 라벨 Ground truth(고정)
2. 여러번 결과(N번) 가지고 있는 리스트


In [18]:
generated_category_df = pd.read_csv(GENERATED_CATEGORY_CSV)
print("len(generated_category_df):", len(generated_category_df))
generated_category_df.head()

len(generated_category_df): 48


Unnamed: 0,id,categories,actions
0,194eaa45baa51ec0,"['academic', 'academic', 'academic']","['read only', 'read only', 'read only']"
1,194eaa3c5169e920,"['academic', 'academic', 'academic']","['read only', 'read only', 'read only']"
2,194e90265a3c53fe,"['other', 'other', 'other']","['read only', 'read only', 'read only']"
3,194e343333e5ff27,"['other', 'other', 'other']","['read only', 'read only', 'read only']"
4,194e34039181a3c0,"['administration', 'administration', 'administ...","['action needed', 'action needed', 'action nee..."


In [None]:
category_df_manager = DataFrameManager(Config.config["classification"]["inference"], ClassificationType.CATEGORY)
for index, row in generated_category_df.iterrows():
    category_df_manager.update_eval_df(
        row["id"], ast.literal_eval(row["categories"]), reference_df.loc[index, "category"]
    )

category_df_manager.print_df()

✅ Confusion Matrix 저장 완료: evaluation/classification/figure/academic_confusion_matrix.png
✅ Confusion Matrix 저장 완료: evaluation/classification/figure/administration_confusion_matrix.png
✅ Confusion Matrix 저장 완료: evaluation/classification/figure/other_confusion_matrix.png

Correctness
🎯 전체 정확도: 0.7500
🎯 academic 정확도: 0.8125
🎯 administration 정확도: 0.8542
🎯 other 정확도: 0.8333

Consistency
📊 Ground Truth 별 요약된 평가 메트릭
     Ground Truth   Entropy  Diversity Index  Chi-Square p-value  Accuracy  \
0        academic  0.758937         0.071429                   1  0.714286   
1  administration  0.758937         0.071429                   1  0.714286   
2           other  0.639032         0.050000                   1  0.800000   

   Cramer's V  
0         0.0  
1         0.0  
2         0.0  

=== Overall Multiclass Confusion Matrix ===
Labels: ['academic', 'administration', 'other']
[[30  3  9]
 [ 9 30  3]
 [ 6  6 48]]



In [21]:
action_df_manager = DataFrameManager(Config.config["classification"]["inference"], ClassificationType.ACTION)
for index, row in generated_category_df.iterrows():
    action_df_manager.update_eval_df(row["id"], ast.literal_eval(row["actions"]), reference_df.loc[index, "action"])

action_df_manager.print_df()

✅ Confusion Matrix 저장 완료: evaluation/classification/figure/action needed_confusion_matrix.png
✅ Confusion Matrix 저장 완료: evaluation/classification/figure/read only_confusion_matrix.png

Correctness
🎯 전체 정확도: 0.7292
🎯 action needed 정확도: 0.7292
🎯 read only 정확도: 0.7292

Consistency
📊 Ground Truth 별 요약된 평가 메트릭
    Ground Truth   Entropy  Diversity Index  Chi-Square p-value  Accuracy  \
0  action needed  0.545595         0.039216                   1  0.764706   
1      read only  0.602440         0.021505                   1  0.709677   

   Cramer's V  
0         0.0  
1         0.0  

=== Overall Multiclass Confusion Matrix ===
Labels: ['action needed', 'read only']
[[39 12]
 [27 66]]



## 최종 report 평가

1. 요약전 원문(메일 요약문 concat)
2. 생성 요약문

Return
G-EVAL(with final)


In [None]:
generated_report_df = pd.read_csv(GENERATED_REPORT_CSV)
print("len(generated_report_df):", len(generated_report_df))
generated_report_df.head()

In [None]:
results = {}
results["g-eval"] = calculate_g_eval(
    source_texts=generated_report_df["source"].tolist(),
    generated_texts=generated_report_df["report"].tolist(),
    eval_type="report",
    model_name=Config.config["summary"]["g_eval"]["openai_model"],
)

In [None]:
print_evaluation_results(results, eval_type="report")

End.
