# Result 평가 스크립트


In [None]:
# CSV inputs read by the evaluation cells below (relative paths).
GENERATED_SUMMARY_CSV = "evaluation/data/generated_summary.csv"
GENERATED_CATEGORY_CSV = "evaluation/data/generated_category.csv"
GENERATED_REPORT_CSV = "evaluation/data/generated_report.csv"

# Reference table holding the original mails and their ground-truth labels.
REFERENCE_CSV = "evaluation/data/reference.csv"
# Misspelled legacy name, kept as an alias because existing cells still read it.
REFERNECE_CSV = REFERENCE_CSV

## import 및 환경 주입


In [None]:
import pandas as pd
from dotenv import load_dotenv

from agents.classification.classification_type import ClassificationType
from evaluation.classification.dataframe_manager import DataFrameManager
from evaluation.evaluation_summary import evaluate_summary
from evaluation.gpt_eval import calculate_g_eval
from evaluation.result_printer import print_evaluation_results
from utils.configuration import Config

# Read variables from a .env file into the process environment (python-dotenv).
load_dotenv()
# Initialise the project configuration; presumably this populates Config.config,
# which a later cell reads to pick the G-EVAL model — TODO confirm.
Config.load()

## Reference Load


In [None]:
# Load the reference table (columns shown below: id, body, subject, category, action).
# Mail ids are hex-like strings starting with "194e"; when the remaining characters are
# all digits the value is valid scientific notation, so pin the column to str to keep
# pandas from inferring a float column and breaking id-based matching.
reference_df = pd.read_csv(REFERNECE_CSV, dtype={"id": str})
print("len(reference_df):", len(reference_df))
reference_df.head()

len(reference_df): 48


Unnamed: 0,id,body,subject,category,action
0,194e90265a3c53fe,*지속가능원이 추천하는 2025-1 지속가능 교과목 25선!*\n\nESG? 지속가...,[지속가능원] 지속가능원이 추천하는 2025-1 지속가능 교과목 25선!,other,read only
1,194e343333e5ff27,\n\n\n\n\n\n\n\n\n\n\n\n\n,ICT명품인재양성사업단 뉴스레터 겨울호를 보내 드립니다.,academic,read only
2,194e34039181a3c0,"안녕하세요,\n\n\n2024년 업적평가를 위해 실적 업데이트 요청을 드립니다. 이...",2024년 업적평가를 위한 연구실적 업데이트 요청의 건 (~2월 7일),academic,action needed
3,194e334940f6bad0,안녕하세요\n친환경 디지털 정보과학 교육연구단 함이열입니다.\n\nBK21 대학원혁...,[BK] 2025년 2월 7일(금) BK21 대학원혁신사업 영어논문작성법 워크숍 안...,academic,action needed
4,194e325d62686180,"Hi there,\n\nWe charged $5.20 to your credit c...",Your OpenAI API account has been funded,other,read only


## Summary 평가

1. 요약 전 원문(고정)
2. REFERENCE(고정)
3. 생성 요약문

Return
ROUGE, BERT SCORE, G-EVAL(with summary)


In [None]:
# Load the generated summaries (columns shown below: id, summary).
# Keep the hex-like mail id as str: ids such as "194e<digits>" would otherwise be
# eligible for float inference, which would break matching against the reference ids.
generated_summary_df = pd.read_csv(GENERATED_SUMMARY_CSV, dtype={"id": str})
print("len(generated_summary_df):", len(generated_summary_df))
generated_summary_df.head()

len(generated_summary_df): 48


Unnamed: 0,id,summary
0,194e90265a3c53fe,"지속가능원이 추천하는 2025-1 학기 지속가능 교과목 25선 안내, 링크를 통해 ..."
1,194e343333e5ff27,ICT명품인재양성사업단 뉴스레터 겨울호를 보내 드립니다.
2,194e34039181a3c0,"2024년 업적평가를 위한 연구실적 업데이트 요청, 논문, 학회, 특허, 연구비 실..."
3,194e334940f6bad0,BK21 대학원혁신사업 2025년 1학기 대학원 맞춤형 연구역량 프로그램 - 영어논...
4,194e325d62686180,신용카드(끝자리 2043)로 5.20달러가 청구되어 OpenAI API 크레딧 잔액...


In [None]:
# Pair each generated summary with its reference row by mail id instead of by row
# position, so a reordered, filtered or partially generated CSV cannot silently score
# mismatched texts.
summary_eval_df = generated_summary_df[["id", "summary"]].merge(
    reference_df[["id", "body", "subject"]],
    on="id",
    how="left",  # keeps the row order of the generated summaries
    validate="one_to_one",  # raises MergeError if an id is duplicated in either frame
    indicator=True,
)
unmatched_ids = summary_eval_df.loc[summary_eval_df["_merge"] != "both", "id"].tolist()
if unmatched_ids:
    raise ValueError(f"No reference row for generated summary ids: {unmatched_ids}")

source_texts = summary_eval_df["body"].tolist()  # original mail bodies
report_texts = summary_eval_df["summary"].tolist()  # generated summaries (variable name kept as-is)
reference_texts = summary_eval_df["subject"].tolist()  # mail subjects are used as the reference texts

# Argument order is (source, generated, reference), as in the original call.
summary_results = evaluate_summary(source_texts, report_texts, reference_texts)



g-eval start with **gpt-4**

[G-EVAL] aspect=consistency, gpt_text=5
[G-EVAL] aspect=coherence, gpt_text=5
[G-EVAL] aspect=fluency, gpt_text=1
[G-EVAL] aspect=relevance, gpt_text=5
[G-EVAL] aspect=consistency, gpt_text=1
[G-EVAL] aspect=coherence, gpt_text=1
[G-EVAL] aspect=fluency, gpt_text=1
[G-EVAL] aspect=relevance, gpt_text=This example doesn't provide enough information to evaluate. The source text and the summary are missing.
[G-EVAL] aspect=consistency, gpt_text=5
[G-EVAL] aspect=coherence, gpt_text=3
[G-EVAL] aspect=fluency, gpt_text=1
[G-EVAL] aspect=relevance, gpt_text=3
[G-EVAL] aspect=consistency, gpt_text=2
[G-EVAL] aspect=coherence, gpt_text=5
[G-EVAL] aspect=fluency, gpt_text=1
[G-EVAL] aspect=relevance, gpt_text=4
[G-EVAL] aspect=consistency, gpt_text=3
[G-EVAL] aspect=coherence, gpt_text=2
[G-EVAL] aspect=fluency, gpt_text=3
[G-EVAL] aspect=relevance, gpt_text=2
[G-EVAL] aspect=consistency, gpt_text=3
[G-EVAL] aspect=coherence, gpt_text=2
[G-EVAL] aspect=fluency, gpt_

In [None]:
# Print the summary evaluation results; the captured output below lists ROUGE, BERT
# and G-EVAL scores for each sample.
print_evaluation_results(summary_results, eval_type="summary")


===== SUMMARY Evaluation Results =====

--- Summary Sample 1 ---
[ROUGE] R1=(P:1.0000,R:1.0000,F:1.0000), R2=(P:1.0000,R:1.0000,F:1.0000), RL=(P:1.0000,R:1.0000,F:1.0000)
[BERT] P:1.0000, R:1.0000, F:1.0000
[G-EVAL] consistency=5.0000, coherence=5.0000, fluency=1.0000, relevance=5.0000

--- Summary Sample 2 ---
[ROUGE] R1=(P:1.0000,R:1.0000,F:1.0000), R2=(P:0.0000,R:0.0000,F:0.0000), RL=(P:1.0000,R:1.0000,F:1.0000)
[BERT] P:1.0000, R:1.0000, F:1.0000
[G-EVAL] consistency=1.0000, coherence=1.0000, fluency=1.0000, relevance=0.0000

--- Summary Sample 3 ---
[ROUGE] R1=(P:1.0000,R:1.0000,F:1.0000), R2=(P:1.0000,R:1.0000,F:1.0000), RL=(P:1.0000,R:1.0000,F:1.0000)
[BERT] P:1.0000, R:1.0000, F:1.0000
[G-EVAL] consistency=5.0000, coherence=3.0000, fluency=1.0000, relevance=3.0000

--- Summary Sample 4 ---
[ROUGE] R1=(P:1.0000,R:1.0000,F:1.0000), R2=(P:1.0000,R:1.0000,F:1.0000), RL=(P:1.0000,R:1.0000,F:1.0000)
[BERT] P:1.0000, R:1.0000, F:1.0000
[G-EVAL] consistency=2.0000, coherence=5.0000, f

## 분류 평가

1. 라벨 Ground truth(고정)
2. 여러 번(N번) 실행한 결과를 담은 리스트


In [None]:
# Load the repeated classification outputs (columns shown below: id, category1..5, action1..5).
# Keep the hex-like mail id as str: ids such as "194e<digits>" would otherwise be
# eligible for float inference, which would break matching against the reference ids.
generated_category_df = pd.read_csv(GENERATED_CATEGORY_CSV, dtype={"id": str})
print("len(generated_category_df):", len(generated_category_df))
generated_category_df.head()

len(generated_category_df): 2


Unnamed: 0,id,category1,category2,category3,category4,category5,action1,action2,action3,action4,action5
0,194e90265a3c53fe,other,other,other,other,other,read only,read only,read only,read only,read only
1,194e34039181a3c0,academic,academic,academic,academic,academic,action needed,action needed,action needed,action needed,action needed


In [None]:
# Evaluate the repeated category predictions against the ground-truth category.
# The ground truth is looked up by mail id: the generated CSV may hold only a subset of
# the reference mails, so its row position does not identify the matching reference row.
num_category_runs = 5  # number of categoryN columns per mail
category_columns = [f"category{run}" for run in range(1, num_category_runs + 1)]
# verify_integrity makes duplicate reference ids fail here instead of yielding ambiguous lookups.
category_truth_by_id = reference_df.set_index("id", verify_integrity=True)["category"]

category_df_manager = DataFrameManager(num_category_runs, ClassificationType.CATEGORY)
for _, row in generated_category_df.iterrows():
    results = [row[column] for column in category_columns]
    # A missing id raises KeyError rather than being scored against the wrong mail.
    category_df_manager.update_eval_df(row["id"], results, category_truth_by_id.loc[row["id"]])

category_df_manager.print_df()



✅ Confusion Matrix 저장 완료: evaluation/classification/figure\academic_confusion_matrix.png
✅ Confusion Matrix 저장 완료: evaluation/classification/figure\other_confusion_matrix.png

Correctness
🎯 전체 정확도: 1.0000
🎯 academic 정확도: 1.0000
🎯 other 정확도: 1.0000

Consistency
📊 Ground Truth 별 요약된 평가 메트릭
  Ground Truth  Entropy  Diversity Index  Chi-Square p-value  Accuracy  \
0     academic      0.0              0.2                 1.0       1.0   
1        other      0.0              0.2                 1.0       1.0   

   Cramer's V  
0         0.0  
1         0.0  

=== Overall Multiclass Confusion Matrix ===
Labels: ['academic', 'other']
[[5 0]
 [0 5]]





In [None]:
# Evaluate the repeated action predictions against the ground-truth action.
# The ground truth is looked up by mail id: indexing the reference by the generated CSV's
# row position compared mail 194e34039181a3c0 with the label of a different mail, which is
# why the captured output showed only "read only" ground truths and 0.5 accuracy.
num_action_runs = 5  # number of actionN columns per mail
action_columns = [f"action{run}" for run in range(1, num_action_runs + 1)]
# verify_integrity makes duplicate reference ids fail here instead of yielding ambiguous lookups.
action_truth_by_id = reference_df.set_index("id", verify_integrity=True)["action"]

action_df_manager = DataFrameManager(num_action_runs, ClassificationType.ACTION)
for _, row in generated_category_df.iterrows():
    results = [row[column] for column in action_columns]
    # A missing id raises KeyError rather than being scored against the wrong mail.
    action_df_manager.update_eval_df(row["id"], results, action_truth_by_id.loc[row["id"]])

action_df_manager.print_df()

✅ Confusion Matrix 저장 완료: evaluation/classification/figure\read only_confusion_matrix.png

Correctness
🎯 전체 정확도: 0.5000
🎯 read only 정확도: 0.5000

Consistency
📊 Ground Truth 별 요약된 평가 메트릭
  Ground Truth   Entropy  Diversity Index  Chi-Square p-value  Accuracy  \
0    read only  0.693147              0.2                   1       0.5   

   Cramer's V  
0         0.0  

=== Overall Multiclass Confusion Matrix ===
Labels: ['action needed', 'read only']
[[0 0]
 [5 5]]





## 최종 report 평가

1. 요약 전 원문(메일 요약문 concat)
2. 생성 요약문

Return
G-EVAL(with final)


In [None]:
# Load the generated final report table (columns shown below: source, report).
generated_report_df = pd.read_csv(filepath_or_buffer=GENERATED_REPORT_CSV)
print("len(generated_report_df):", len(generated_report_df.index))
generated_report_df.head(n=5)

len(generated_report_df): 1


Unnamed: 0,source,report
0,1. 지속가능원이 추천하는 2025-1 학기 지속가능 교과목 25선 안내.\r\n2...,1. 지속가능원이 추천하는 2025-1 학기 지속가능 교과목 25선 안내.\r\n2...


In [None]:
# Run G-EVAL on the final report: each "source" text is scored against its "report".
results = {}
report_source_texts = generated_report_df["source"].tolist()
report_generated_texts = generated_report_df["report"].tolist()
# The model name comes from the summary/g_eval section of the loaded configuration.
g_eval_model_name = Config.config["summary"]["g_eval"]["openai_model"]
results["g-eval"] = calculate_g_eval(
    source_texts=report_source_texts,
    generated_texts=report_generated_texts,
    eval_type="report",
    model_name=g_eval_model_name,
)

g-eval start with **gpt-4**

[G-EVAL] aspect=consistency, gpt_text=4
[G-EVAL] aspect=coherence, gpt_text=5
[G-EVAL] aspect=fluency, gpt_text=1
[G-EVAL] aspect=relevance, gpt_text=4


In [None]:
# Print the report evaluation results; the captured output below lists per-sample
# G-EVAL scores followed by their averages.
print_evaluation_results(results, eval_type="report")


===== REPORT Evaluation Results =====

--- Report Sample 1 ---
[G-EVAL] consistency=4.0000, coherence=5.0000, fluency=1.0000, relevance=4.0000

===== Averages =====

[G-EVAL Avg]
  consistency=4.0000, coherence=5.0000, fluency=1.0000, relevance=4.0000


End.
