# Chapter 10: LangGraph Evaluation 실습

이 노트북은 LangGraph 애플리케이션의 평가와 성능 측정을 실습합니다.

## 환경 설정

In [None]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file, if one exists.
load_dotenv()

# Ask for the key only when neither the environment nor .env provided it.
# getpass is used instead of input() so the secret is not echoed into the
# notebook output (and therefore not saved with the notebook).
if not os.getenv("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API Key를 입력하세요: ")

## 1. Evaluation Dataset Creation

In [None]:
import json
import pandas as pd
from typing import List, Dict, Any, Optional
from dataclasses import dataclass, asdict
from datetime import datetime
import uuid

# 평가 데이터셋 클래스
@dataclass
class EvaluationCase:
    """One evaluation example: an input paired with the output expected for it."""

    # Unique identifier of the case.
    id: str
    # Payload describing what is fed to the system being evaluated.
    input: Dict[str, Any]
    # Reference values the produced output is compared against.
    expected_output: Dict[str, Any]
    # Free-form additional information about the case.
    metadata: Dict[str, Any]
    # Labels attached to the case.
    tags: List[str]

    def to_dict(self) -> Dict:
        """Return a plain-dict copy of the case, keyed by field name."""
        as_mapping = asdict(self)
        return as_mapping

class EvaluationDataset:
    """Named, versioned collection of evaluation cases.

    Cases are kept in insertion order in ``self.cases``. The dataset can be
    written to / read from a JSON file, converted to a pandas DataFrame and
    filtered by tag.
    """

    def __init__(self, name: str, description: str = ""):
        """Create an empty dataset stamped with the current local time."""
        self.name = name
        self.description = description
        self.cases: List["EvaluationCase"] = []
        self.created_at = datetime.now()
        self.version = "1.0.0"

    def add_case(self,
                 input_data: Dict,
                 expected_output: Dict,
                 metadata: Optional[Dict] = None,
                 tags: Optional[List[str]] = None):
        """Append a new case with a freshly generated UUID as its id.

        Args:
            input_data: Payload for the system under evaluation.
            expected_output: Reference values to score the output against.
            metadata: Optional extra information; defaults to an empty dict.
            tags: Optional labels; defaults to an empty list.
        """
        case = EvaluationCase(
            id=str(uuid.uuid4()),
            input=input_data,
            expected_output=expected_output,
            metadata=metadata or {},
            tags=tags or []
        )
        self.cases.append(case)

    def save_to_file(self, filepath: str):
        """Write the dataset to ``filepath`` as UTF-8 encoded JSON.

        Raises:
            TypeError: If a case holds a value ``json`` cannot serialize.
        """
        data = {
            "name": self.name,
            "description": self.description,
            "version": self.version,
            "created_at": self.created_at.isoformat(),
            "cases": [case.to_dict() for case in self.cases]
        }
        # Explicit UTF-8 plus ensure_ascii=False keeps non-ASCII text readable
        # in the file and independent of the platform's default encoding.
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        print(f"데이터셋이 {filepath}에 저장되었습니다.")

    @classmethod
    def load_from_file(cls, filepath: str):
        """Rebuild a dataset from a JSON file written by ``save_to_file``.

        ``description``, ``version`` and ``created_at`` are optional in the
        file; when ``created_at`` is absent the load time is kept instead.
        """
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)

        dataset = cls(data["name"], data.get("description", ""))
        dataset.version = data.get("version", "1.0.0")

        # Restore the original creation time rather than silently replacing it
        # with the moment the file happened to be loaded.
        created_at = data.get("created_at")
        if created_at:
            dataset.created_at = datetime.fromisoformat(created_at)

        dataset.cases.extend(
            EvaluationCase(**case_data) for case_data in data["cases"]
        )
        return dataset

    def to_dataframe(self) -> pd.DataFrame:
        """Return one DataFrame row per case, with one column per case field."""
        return pd.DataFrame([case.to_dict() for case in self.cases])

    def filter_by_tags(self, tags: List[str]) -> List["EvaluationCase"]:
        """Return the cases that carry at least one of the given tags."""
        return [
            case for case in self.cases
            if any(tag in case.tags for tag in tags)
        ]

# RAG 평가 데이터셋 생성
def create_rag_dataset() -> EvaluationDataset:
    """Build the sample question/keyword dataset for evaluating a RAG system."""
    rag_dataset = EvaluationDataset(
        name="RAG Evaluation Dataset",
        description="RAG 시스템 평가를 위한 질문-답변 데이터셋",
    )

    # (question, expected keywords, tags); every case expects "high" relevance.
    samples = [
        (
            "LangChain이 무엇인가요?",
            ["프레임워크", "LLM", "애플리케이션"],
            ["definition", "basic"],
        ),
        (
            "벡터 데이터베이스의 장점은?",
            ["유사도", "검색", "임베딩"],
            ["technical", "database"],
        ),
        (
            "RAG와 파인튜닝의 차이점은?",
            ["검색", "학습", "비용", "유연성"],
            ["comparison", "advanced"],
        ),
    ]

    for question, keywords, tags in samples:
        rag_dataset.add_case(
            input_data={"question": question},
            expected_output={"relevance": "high", "keywords": keywords},
            tags=tags,
        )

    return rag_dataset

# SQL 평가 데이터셋 생성
def create_sql_dataset() -> EvaluationDataset:
    """Build the sample dataset for evaluating SQL query generation."""
    sql_dataset = EvaluationDataset(
        name="SQL Generation Dataset",
        description="SQL 쿼리 생성 평가 데이터셋",
    )

    # (question, expected query properties, tags)
    samples = [
        (
            "모든 직원의 이름과 부서를 조회하세요",
            {
                "query_type": "SELECT",
                "tables": ["employees"],
                "columns": ["name", "department"],
            },
            ["basic", "select"],
        ),
        (
            "부서별 평균 급여를 계산하세요",
            {
                "query_type": "SELECT",
                "functions": ["AVG"],
                "group_by": True,
            },
            ["aggregation", "group_by"],
        ),
    ]

    for question, expected, tags in samples:
        sql_dataset.add_case(
            input_data={"question": question},
            expected_output=expected,
            tags=tags,
        )

    return sql_dataset

# 데이터셋 생성 및 저장
# Build both sample datasets, then print a short preview of each.
rag_dataset = create_rag_dataset()
sql_dataset = create_sql_dataset()


def _preview_dataset(title: str, dataset: EvaluationDataset) -> None:
    """Print the title, the case count and the id/tags columns of the first rows."""
    print(title)
    print(f"케이스 수: {len(dataset.cases)}")
    id_and_tags = dataset.to_dataframe()[["id", "tags"]]
    print(id_and_tags.head())


_preview_dataset("📊 RAG 데이터셋:", rag_dataset)
_preview_dataset("\n📊 SQL 데이터셋:", sql_dataset)

## 2. Evaluation Metrics

In [None]:
from typing import Callable
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score
from langchain_openai import ChatOpenAI

# 평가 메트릭 클래스
class EvaluationMetrics:
    """Stateless helpers that compute text and classification metrics."""

    @staticmethod
    def calculate_accuracy(predictions: List, ground_truth: List) -> float:
        """Return the fraction of positions where prediction equals ground truth.

        Returns 0.0 for two empty sequences instead of dividing by zero.

        Raises:
            ValueError: If the two sequences differ in length.
        """
        total = len(predictions)
        if total != len(ground_truth):
            raise ValueError("예측과 정답의 길이가 다릅니다.")
        if total == 0:
            return 0.0

        correct = sum(1 for p, g in zip(predictions, ground_truth) if p == g)
        return correct / total

    @staticmethod
    def calculate_bleu_score(reference: str, hypothesis: str) -> float:
        """Return the sentence-level BLEU score of whitespace-split tokens.

        Uses ``nltk``'s ``sentence_bleu`` with its default settings; ``nltk``
        is imported lazily so it is only required when this method is called.
        """
        from nltk.translate.bleu_score import sentence_bleu
        reference_tokens = reference.split()
        hypothesis_tokens = hypothesis.split()
        return sentence_bleu([reference_tokens], hypothesis_tokens)

    @staticmethod
    def calculate_rouge_score(reference: str, hypothesis: str) -> Dict[str, float]:
        """Return a simplified ROUGE-1 precision / recall / F1.

        Both texts are lower-cased and split on whitespace, and overlap is
        measured on the sets of distinct tokens, so repeated words count once.
        All three values are 0.0 when either text has no tokens.
        """
        ref_tokens = set(reference.lower().split())
        hyp_tokens = set(hypothesis.lower().split())

        if not hyp_tokens or not ref_tokens:
            return {"precision": 0.0, "recall": 0.0, "f1": 0.0}

        overlap = len(ref_tokens & hyp_tokens)
        precision = overlap / len(hyp_tokens)
        recall = overlap / len(ref_tokens)
        denominator = precision + recall
        f1 = 2 * precision * recall / denominator if denominator > 0 else 0.0

        return {
            "precision": precision,
            "recall": recall,
            "f1": f1
        }

    @staticmethod
    def calculate_semantic_similarity(text1: str, text2: str, embeddings) -> float:
        """Return the cosine similarity of the two texts' embeddings.

        Args:
            text1: First text.
            text2: Second text.
            embeddings: Object exposing ``embed_query(text)`` that returns a
                numeric vector.

        Returns:
            The cosine similarity, or 0.0 when either vector has zero norm
            (where the cosine is undefined and would otherwise be NaN).
        """
        emb1 = np.asarray(embeddings.embed_query(text1), dtype=float)
        emb2 = np.asarray(embeddings.embed_query(text2), dtype=float)

        norm_product = np.linalg.norm(emb1) * np.linalg.norm(emb2)
        if norm_product == 0:
            return 0.0
        return float(np.dot(emb1, emb2) / norm_product)

# LLM 기반 평가자
class LLMEvaluator:
    """LLM-as-judge evaluator that asks a chat model for JSON verdicts.

    Each ``evaluate_*`` method sends one prompt and returns the parsed JSON
    object. When the reply cannot be parsed into a JSON object, a fixed
    fallback dict is returned instead of raising.
    """

    def __init__(self, model: str = "gpt-4o-mini"):
        """Create the judge model with temperature 0."""
        self.llm = ChatOpenAI(model=model, temperature=0)

    def evaluate_relevance(self, question: str, answer: str) -> Dict[str, Any]:
        """Rate how relevant ``answer`` is to ``question``.

        Returns:
            The model's JSON object (requested keys: ``score``, ``reason``), or
            ``{"score": 0, "reason": "평가 실패"}`` when the reply is unusable.
        """
        prompt = f"""
        질문: {question}
        답변: {answer}
        
        위 답변이 질문에 얼마나 관련이 있는지 평가하세요.
        점수: 1-10 (10이 가장 관련성 높음)
        이유를 간단히 설명하세요.
        
        JSON 형식으로 응답: {{"score": <점수>, "reason": "<이유>"}}
        """

        return self._ask_for_json(prompt, {"score": 0, "reason": "평가 실패"})

    def evaluate_correctness(self, answer: str, expected: str) -> Dict[str, Any]:
        """Rate how closely ``answer`` matches ``expected`` in meaning.

        Returns:
            The model's JSON object (requested keys: ``score``,
            ``differences``), or ``{"score": 0, "differences": ["평가 실패"]}``
            when the reply is unusable.
        """
        prompt = f"""
        생성된 답변: {answer}
        예상 답변: {expected}
        
        생성된 답변이 예상 답변과 얼마나 일치하는지 평가하세요.
        의미적으로 같은 내용이면 높은 점수를 주세요.
        
        점수: 1-10
        JSON 형식: {{"score": <점수>, "differences": ["<차이점들>"]}}
        """

        return self._ask_for_json(prompt, {"score": 0, "differences": ["평가 실패"]})

    def evaluate_hallucination(self, answer: str, context: str) -> Dict[str, Any]:
        """Check whether ``answer`` contains information absent from ``context``.

        Returns:
            The model's JSON object (requested keys: ``has_hallucination``,
            ``hallucinated_parts``, ``confidence``). When the reply is unusable
            the fallback reports no hallucination with confidence 0, so callers
            should read ``confidence`` before trusting ``has_hallucination``.
        """
        prompt = f"""
        컨텍스트: {context}
        생성된 답변: {answer}
        
        답변에 컨텍스트에 없는 정보(환각)가 포함되어 있는지 확인하세요.
        
        JSON 형식: {{
            "has_hallucination": <true/false>,
            "hallucinated_parts": ["<환각 부분들>"],
            "confidence": <0-1>
        }}
        """

        return self._ask_for_json(prompt, {
            "has_hallucination": False,
            "hallucinated_parts": [],
            "confidence": 0
        })

    def _ask_for_json(self, prompt: str, fallback: Dict[str, Any]) -> Dict[str, Any]:
        """Send ``prompt`` to the model and return its JSON object, else ``fallback``."""
        response = self.llm.invoke(prompt)
        parsed = self._parse_json_object(getattr(response, "content", None))
        return fallback if parsed is None else parsed

    @staticmethod
    def _parse_json_object(text) -> Optional[Dict[str, Any]]:
        """Extract a JSON object from a model reply, or return None.

        The whole reply is tried first. If that fails, the span from the first
        ``{`` to the last ``}`` is tried, which covers replies wrapped in
        markdown code fences or surrounded by explanatory prose.
        """
        if not isinstance(text, str):
            return None

        candidates = [text.strip()]
        start, end = text.find("{"), text.rfind("}")
        if 0 <= start < end:
            candidates.append(text[start:end + 1])

        for candidate in candidates:
            try:
                parsed = json.loads(candidate)
            except ValueError:  # json.JSONDecodeError is a ValueError
                continue
            # Callers use the result as a dict; reject bare numbers, lists, etc.
            if isinstance(parsed, dict):
                return parsed
        return None

# 메트릭 테스트
# Instantiate the rule-based metrics and the LLM judge for a quick smoke test.
metrics = EvaluationMetrics()
llm_evaluator = LLMEvaluator()

# A reference sentence and a shorter paraphrase to score against it.
reference = "LangChain은 대규모 언어 모델을 활용한 애플리케이션 개발 프레임워크입니다."
hypothesis = "LangChain은 LLM 기반 앱 개발을 위한 프레임워크입니다."

# Token-overlap scores, printed one per line.
rouge_scores = metrics.calculate_rouge_score(reference, hypothesis)
print(
    "📊 ROUGE 점수:",
    f"Precision: {rouge_scores['precision']:.3f}",
    f"Recall: {rouge_scores['recall']:.3f}",
    f"F1: {rouge_scores['f1']:.3f}",
    sep="\n",
)

# LLM-judged relevance of the paraphrase to a sample question.
print("\n🤖 LLM 기반 평가:")
relevance = llm_evaluator.evaluate_relevance("LangChain이 무엇인가요?", hypothesis)
print(
    f"관련성 점수: {relevance.get('score', 0)}/10",
    f"이유: {relevance.get('reason', 'N/A')}",
    sep="\n",
)

## 3. Graph Evaluation Framework

In [None]:
from langgraph.graph import StateGraph, END, MessagesState
from langchain_core.messages import HumanMessage
import time
from typing import Any

# 그래프 평가 프레임워크
class GraphEvaluator:
    """Runs a compiled graph over every case of a dataset and scores the outputs.

    Per-case records are stored in ``self.results``. Each record holds the
    case id, whether the invocation succeeded, its wall-clock latency, the
    error text (if any) and one entry per custom metric function.
    """

    # Keys of a per-case record that are bookkeeping rather than custom metrics.
    _RESERVED_KEYS = frozenset({"case_id", "success", "latency", "error"})

    def __init__(self, graph, dataset: "EvaluationDataset"):
        """Store the graph and dataset and create the metric helpers."""
        self.graph = graph
        self.dataset = dataset
        self.results = []
        self.metrics = EvaluationMetrics()
        self.llm_evaluator = LLMEvaluator()

    def evaluate(self,
                 eval_functions: List[Callable] = None,
                 verbose: bool = True) -> Dict[str, Any]:
        """Invoke the graph on every case and return the aggregated summary.

        Args:
            eval_functions: Callables ``f(output, expected_output)`` returning
                a numeric score. Each is run only for cases whose invocation
                succeeded and is recorded under the callable's ``__name__``.
            verbose: When True, print progress and the final summary; when
                False, print nothing.

        Returns:
            The summary dict produced by ``_calculate_summary``.
        """
        # Start from a clean slate so a repeated call reports only this run
        # instead of mixing in records from the previous one.
        self.results = []

        total_cases = len(self.dataset.cases)
        if verbose:
            print(f"📊 평가 시작: {total_cases}개 케이스")

        for i, case in enumerate(self.dataset.cases, 1):
            if verbose:
                print(f"\n평가 중 [{i}/{total_cases}]: {case.id}")

            # Time the invocation; a raised exception marks the case as failed.
            start_time = time.time()
            try:
                output = self.graph.invoke(case.input)
                latency = time.time() - start_time
                success = True
                error = None
            except Exception as e:
                output = None
                latency = time.time() - start_time
                success = False
                error = str(e)

            eval_results = {
                "case_id": case.id,
                "success": success,
                "latency": latency,
                "error": error
            }

            if success and eval_functions:
                for eval_func in eval_functions:
                    # Callables such as functools.partial have no __name__.
                    metric_name = getattr(
                        eval_func, "__name__", type(eval_func).__name__
                    )
                    eval_results[metric_name] = eval_func(
                        output, case.expected_output
                    )

            self.results.append(eval_results)

        summary = self._calculate_summary()

        if verbose:
            self._print_summary(summary)

        return summary

    def _calculate_summary(self) -> Dict[str, Any]:
        """Aggregate ``self.results`` into counts, latency stats and metric means.

        Latency statistics cover every case, including failed ones; custom
        metrics are averaged over the successful cases that recorded them.
        """
        successful_results = [r for r in self.results if r["success"]]
        latencies = [r["latency"] for r in self.results]
        total = len(self.results)

        summary = {
            "total_cases": total,
            "successful_cases": len(successful_results),
            "failed_cases": total - len(successful_results),
            "success_rate": len(successful_results) / total if total else 0,
            "avg_latency": np.mean(latencies) if latencies else 0,
            "p95_latency": np.percentile(latencies, 95) if latencies else 0
        }

        # Every key that is not bookkeeping is a custom metric.
        metric_names = set()
        for result in successful_results:
            metric_names.update(
                k for k in result if k not in self._RESERVED_KEYS
            )

        for metric in metric_names:
            values = [r[metric] for r in successful_results if metric in r]
            if values:
                summary[f"avg_{metric}"] = np.mean(values)

        return summary

    def _print_summary(self, summary: Dict[str, Any]):
        """Print the summary: counts, latency in milliseconds, custom metrics."""
        print("\n" + "="*60)
        print("📈 평가 결과 요약")
        print("="*60)

        print(f"총 케이스: {summary['total_cases']}")
        print(f"성공: {summary['successful_cases']} ({summary['success_rate']:.1%})")
        print(f"실패: {summary['failed_cases']}")
        print(f"\n평균 지연시간: {summary['avg_latency']*1000:.2f}ms")
        print(f"P95 지연시간: {summary['p95_latency']*1000:.2f}ms")

        # Exclude only the built-in avg_latency; a custom metric whose name
        # merely ends in "latency" must still be listed.
        custom_metrics = {k: v for k, v in summary.items()
                          if k.startswith("avg_") and k != "avg_latency"}
        if custom_metrics:
            print("\n커스텀 메트릭:")
            for metric, value in custom_metrics.items():
                print(f"  {metric}: {value:.3f}")

    def export_results(self, filepath: str):
        """Write the per-case records to ``filepath`` as CSV (no index column)."""
        df = pd.DataFrame(self.results)
        df.to_csv(filepath, index=False)
        print(f"결과가 {filepath}에 저장되었습니다.")

# 샘플 그래프 생성
def create_sample_graph():
    """Compile a single-node chat graph to run the evaluations against."""

    def process_node(state: MessagesState):
        # Send the conversation to the model; its reply is the node's update.
        reply = ChatOpenAI(model="gpt-4o-mini").invoke(state["messages"])
        return {"messages": [reply]}

    builder = StateGraph(MessagesState)
    builder.add_node("process", process_node)
    builder.set_entry_point("process")
    builder.add_edge("process", END)

    return builder.compile()

# 평가 함수 정의
def evaluate_response_length(output: Dict, expected: Dict) -> float:
    """Score the last message's length, normalized to 100 characters and capped at 1.0.

    Returns 0.0 when ``output`` has no ``messages`` entry or the list is empty.
    ``expected`` is not used.
    """
    if "messages" not in output or not output["messages"]:
        return 0.0

    last_message = output["messages"][-1]
    normalized = len(last_message.content) / 100
    return min(normalized, 1.0)

def evaluate_keyword_presence(output: Dict, expected: Dict) -> float:
    """Return the fraction of expected keywords found in the last message.

    Matching is a case-insensitive substring test against the content of the
    last entry of ``output["messages"]``.

    Returns:
        1.0 when ``expected`` specifies no keywords (key missing or list
        empty), since there is nothing to miss; 0.0 when keywords are expected
        but ``output`` has no messages; otherwise ``found / len(keywords)``.
    """
    keywords = expected.get("keywords") or []
    # No keywords means no constraint. An empty list is treated the same as a
    # missing key, rather than scoring 0.0 as it did before.
    if not keywords:
        return 1.0

    if "messages" not in output or not output["messages"]:
        return 0.0

    response = output["messages"][-1].content.lower()
    present = sum(1 for kw in keywords if kw.lower() in response)
    return present / len(keywords)

# 평가 실행
print("🚀 그래프 평가 시작\n")

# Small two-case dataset: a user prompt plus the keywords the reply should mention.
test_dataset = EvaluationDataset("Test Dataset")
for _prompt, _keywords in (
    ("LangChain이 뭐야?", ["프레임워크", "LLM"]),
    ("Python 설명해줘", ["프로그래밍", "언어"]),
):
    test_dataset.add_case(
        input_data={"messages": [HumanMessage(content=_prompt)]},
        expected_output={"keywords": _keywords},
        tags=["basic"],
    )

# Compile the sample graph and score it with both custom metric functions.
sample_graph = create_sample_graph()
evaluator = GraphEvaluator(sample_graph, test_dataset)

summary = evaluator.evaluate(
    verbose=True,
    eval_functions=[evaluate_response_length, evaluate_keyword_presence],
)

## 4. A/B Testing Framework

In [None]:
import random
from scipy import stats

class ABTestFramework:
    """Compares two graphs on the same dataset with one metric function.

    Every case is run through both graphs. Metric values of the successful
    runs are compared with an independent two-sample t-test
    (``scipy.stats.ttest_ind``).
    """

    def __init__(self, graph_a, graph_b, dataset: "EvaluationDataset"):
        """Store both graphs and the dataset; result lists start empty."""
        self.graph_a = graph_a
        self.graph_b = graph_b
        self.dataset = dataset
        self.results_a = []
        self.results_b = []

    def run_test(self,
                 metric_func: Callable,
                 sample_size: int = None,
                 confidence_level: float = 0.95) -> Dict[str, Any]:
        """Run both graphs on the cases, print the analysis and return it.

        Args:
            metric_func: Callable ``f(output, expected_output)`` returning a
                numeric score.
            sample_size: Number of leading cases to use; ``None`` uses all.
            confidence_level: Confidence level for the significance test.

        Returns:
            The analysis dict produced by ``_analyze_results``.
        """
        # Each run is analysed on its own; results of earlier runs are dropped.
        self.results_a = []
        self.results_b = []

        # "is None" rather than truthiness so that sample_size=0 means zero
        # cases instead of silently meaning all of them.
        if sample_size is None:
            cases = self.dataset.cases
        else:
            cases = self.dataset.cases[:sample_size]

        print(f"🔬 A/B 테스트 시작 (n={len(cases)})")

        for case in cases:
            self.results_a.append(self._run_case(self.graph_a, case, metric_func))
            self.results_b.append(self._run_case(self.graph_b, case, metric_func))

        analysis = self._analyze_results(confidence_level)
        self._print_analysis(analysis)

        return analysis

    @staticmethod
    def _run_case(graph, case, metric_func: Callable) -> Dict[str, Any]:
        """Invoke one graph on one case and return its metric/latency record.

        Any exception from the graph or from ``metric_func`` marks the run as
        failed; failed runs carry zero metric and latency and are excluded
        from the statistics.
        """
        try:
            start = time.time()
            output = graph.invoke(case.input)
            latency = time.time() - start
            metric = metric_func(output, case.expected_output)
        except Exception:
            return {"metric": 0, "latency": 0, "success": False}
        return {"metric": metric, "latency": latency, "success": True}

    @staticmethod
    def _summarize(results: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Return mean/std metric, mean latency and success rate of one graph."""
        metrics = [r["metric"] for r in results if r["success"]]
        latencies = [r["latency"] for r in results if r["success"]]
        return {
            "mean_metric": np.mean(metrics) if metrics else 0,
            "std_metric": np.std(metrics) if metrics else 0,
            "mean_latency": np.mean(latencies) if latencies else 0,
            # Guard against an empty run (empty dataset or sample_size=0).
            "success_rate": (
                sum(r["success"] for r in results) / len(results)
                if results else 0
            )
        }

    def _analyze_results(self, confidence_level: float) -> Dict[str, Any]:
        """Summarize both graphs and test whether their metrics differ."""
        metrics_a = [r["metric"] for r in self.results_a if r["success"]]
        metrics_b = [r["metric"] for r in self.results_b if r["success"]]

        # The t-test needs at least two observations per group.
        if len(metrics_a) > 1 and len(metrics_b) > 1:
            t_stat, p_value = stats.ttest_ind(metrics_a, metrics_b)
            t_stat, p_value = float(t_stat), float(p_value)
        else:
            t_stat, p_value = 0.0, 1.0

        return {
            "graph_a": self._summarize(self.results_a),
            "graph_b": self._summarize(self.results_b),
            "statistical_test": {
                "t_statistic": t_stat,
                "p_value": p_value,
                # A NaN p-value compares False here, i.e. "not significant".
                "significant": bool(p_value < (1 - confidence_level)),
                "confidence_level": confidence_level
            },
            "winner": self._determine_winner(metrics_a, metrics_b, p_value, confidence_level)
        }

    def _determine_winner(self, metrics_a: List, metrics_b: List,
                         p_value: float, confidence_level: float) -> str:
        """Return ``"graph_a"``, ``"graph_b"`` or ``"no_significant_difference"``.

        A winner is declared only when the p-value is a real number below
        ``1 - confidence_level`` and the two means actually differ.
        """
        # The t-test yields NaN when both samples are constant and equal.
        # NaN fails every comparison, so it has to be rejected explicitly,
        # otherwise it would fall through and crown a winner.
        if np.isnan(p_value) or p_value >= (1 - confidence_level):
            return "no_significant_difference"

        mean_a = np.mean(metrics_a) if metrics_a else 0
        mean_b = np.mean(metrics_b) if metrics_b else 0

        if mean_a == mean_b:
            return "no_significant_difference"
        return "graph_a" if mean_a > mean_b else "graph_b"

    def _print_analysis(self, analysis: Dict[str, Any]):
        """Print per-graph statistics, the test result and the winner."""
        print("\n" + "="*60)
        print("📊 A/B 테스트 결과")
        print("="*60)

        print("\n그래프 A:")
        print(f"  평균 메트릭: {analysis['graph_a']['mean_metric']:.3f} (±{analysis['graph_a']['std_metric']:.3f})")
        print(f"  평균 지연시간: {analysis['graph_a']['mean_latency']*1000:.2f}ms")
        print(f"  성공률: {analysis['graph_a']['success_rate']:.1%}")

        print("\n그래프 B:")
        print(f"  평균 메트릭: {analysis['graph_b']['mean_metric']:.3f} (±{analysis['graph_b']['std_metric']:.3f})")
        print(f"  평균 지연시간: {analysis['graph_b']['mean_latency']*1000:.2f}ms")
        print(f"  성공률: {analysis['graph_b']['success_rate']:.1%}")

        print("\n통계 검정:")
        print(f"  t-통계량: {analysis['statistical_test']['t_statistic']:.3f}")
        print(f"  p-value: {analysis['statistical_test']['p_value']:.4f}")
        print(f"  통계적 유의성: {analysis['statistical_test']['significant']}")

        print(f"\n🏆 승자: {analysis['winner']}")

# A/B 테스트용 두 개의 다른 그래프 생성
def create_graph_version_a():
    """Compile variant A (baseline): one node calling gpt-4o-mini at temperature 0.7."""

    def process(state):
        # Pass the conversation through unchanged and return the model's reply.
        model = ChatOpenAI(model="gpt-4o-mini", temperature=0.7)
        return {"messages": [model.invoke(state["messages"])]}

    builder = StateGraph(MessagesState)
    builder.add_node("process", process)
    builder.set_entry_point("process")
    builder.add_edge("process", END)
    return builder.compile()

def create_graph_version_b():
    """Compile variant B (improved): lower temperature plus an extra instruction.

    The single node calls gpt-4o-mini at temperature 0.3 and appends an
    instruction asking for a detailed and accurate response to the
    conversation before invoking the model.
    """
    workflow = StateGraph(MessagesState)
    enhanced_prompt = "Please provide a detailed and accurate response."

    def process(state):
        llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.3)
        # The instruction must be part of what the model receives. It used to
        # be appended and then sliced off again before the call, which left
        # variant B differing from A by temperature only.
        messages = state["messages"] + [HumanMessage(content=enhanced_prompt)]
        response = llm.invoke(messages)
        # Only the reply is returned, so the instruction itself is not
        # written back into the graph state.
        return {"messages": [response]}

    workflow.add_node("process", process)
    workflow.set_entry_point("process")
    workflow.add_edge("process", END)
    return workflow.compile()

# A/B 테스트 실행
print("🔬 A/B 테스트 데모\n")

# Compile both variants and compare them on the shared test dataset,
# using response length as the metric.
graph_a, graph_b = create_graph_version_a(), create_graph_version_b()

ab_tester = ABTestFramework(graph_a, graph_b, test_dataset)
ab_results = ab_tester.run_test(evaluate_response_length, confidence_level=0.95)

## 5. Continuous Evaluation Pipeline

In [None]:
from datetime import datetime, timedelta
import hashlib

class ContinuousEvaluationPipeline:
    """Keeps a history of evaluations for one graph and flags regressions.

    The summary of the first recorded evaluation becomes the baseline. Each
    later evaluation is compared with it, and a metric that worsens by more
    than 10% produces an alert in ``self.alerts``.
    """

    # Summary metrics for which a smaller value is the better outcome.
    _LOWER_IS_BETTER = frozenset({"avg_latency", "p95_latency", "failed_cases"})
    # Relative worsening tolerated before an alert is raised.
    _REGRESSION_TOLERANCE = 0.1

    def __init__(self, graph_name: str):
        """Create an empty pipeline labelled with ``graph_name``."""
        self.graph_name = graph_name
        self.evaluation_history = []
        self.baselines = {}
        self.alerts = []

    def add_evaluation(self,
                      graph,
                      dataset: "EvaluationDataset",
                      version: str = None):
        """Evaluate ``graph`` on ``dataset``, record the result, check for regressions.

        Args:
            graph: Graph object to evaluate.
            dataset: Dataset whose cases are run.
            version: Label for this run; derived from the graph when omitted.

        Returns:
            The stored record, with keys ``timestamp``, ``version``,
            ``dataset_name``, ``results`` (summary) and ``detailed_results``
            (per-case records).
        """
        if version is None:
            version = self._generate_version(graph)

        evaluator = GraphEvaluator(graph, dataset)
        results = evaluator.evaluate(
            eval_functions=[evaluate_response_length, evaluate_keyword_presence],
            verbose=False
        )

        evaluation_record = {
            "timestamp": datetime.now(),
            "version": version,
            "dataset_name": dataset.name,
            "results": results,
            "detailed_results": evaluator.results
        }

        self.evaluation_history.append(evaluation_record)
        self._check_performance_regression(evaluation_record)

        return evaluation_record

    def _generate_version(self, graph) -> str:
        """Return the first 8 hex digits of the MD5 of ``str(graph)``.

        The hash is an identifier only, not a security measure.
        NOTE(review): if ``str(graph)`` falls back to the default object repr
        it will differ between runs for the same graph definition; confirm
        before relying on this label for comparisons.
        """
        graph_str = str(graph)
        hash_obj = hashlib.md5(graph_str.encode())
        return hash_obj.hexdigest()[:8]

    def _check_performance_regression(self, current_eval: Dict):
        """Compare ``current_eval`` with the baseline and raise alerts.

        The first evaluation seen only sets the baseline. Afterwards, a
        numeric metric regresses when it moves more than 10% in its "worse"
        direction: upwards for the metrics in ``_LOWER_IS_BETTER`` (latency,
        failure count), downwards for all others.
        """
        if not self.baselines:
            self.baselines = current_eval["results"]
            return

        tolerance = self._REGRESSION_TOLERANCE
        for metric, baseline_value in self.baselines.items():
            if metric not in current_eval["results"]:
                continue
            if not isinstance(baseline_value, (int, float)):
                continue

            current_value = current_eval["results"][metric]

            # Latency-like metrics get worse as they grow. Treating them as
            # higher-is-better would alert on speed-ups and miss slow-downs.
            if metric in self._LOWER_IS_BETTER:
                regressed = current_value > baseline_value * (1 + tolerance)
            else:
                regressed = current_value < baseline_value * (1 - tolerance)

            if regressed:
                alert = {
                    "type": "performance_regression",
                    "metric": metric,
                    "baseline": baseline_value,
                    "current": current_value,
                    "timestamp": datetime.now(),
                    "version": current_eval["version"]
                }
                self.alerts.append(alert)
                self._send_alert(alert)

    def _send_alert(self, alert: Dict):
        """Print the alert: metric name, baseline, current value, relative change."""
        baseline = alert['baseline']
        current = alert['current']

        print(f"\n⚠️ 성능 알림:")
        print(f"  메트릭: {alert['metric']}")
        print(f"  베이스라인: {baseline:.3f}")
        print(f"  현재: {current:.3f}")

        # A relative change is undefined for a zero baseline, so skip it then.
        if baseline:
            change = abs(current - baseline) / abs(baseline) * 100
            label = "하락률" if current < baseline else "증가율"
            print(f"  {label}: {change:.1f}%")

    def get_trend_analysis(self, metric: str, window: int = 5) -> Dict:
        """Fit a line through the metric's last ``window`` values and label the trend.

        Returns:
            ``{"trend": "insufficient_data"}`` when fewer than two values are
            available. Otherwise a dict with ``trend`` (``"improving"``,
            ``"declining"`` or ``"stable"``), ``slope``, ``values``,
            ``timestamps``, ``mean`` and ``std``. For metrics in
            ``_LOWER_IS_BETTER`` a falling slope counts as improving.
        """
        if len(self.evaluation_history) < 2:
            return {"trend": "insufficient_data"}

        recent_evals = self.evaluation_history[-window:]
        values = []
        timestamps = []

        for eval_record in recent_evals:
            if metric in eval_record["results"]:
                values.append(eval_record["results"][metric])
                timestamps.append(eval_record["timestamp"])

        if len(values) < 2:
            return {"trend": "insufficient_data"}

        # Slope of the least-squares line over the evaluation index.
        x = np.arange(len(values))
        slope = float(np.polyfit(x, values, 1)[0])

        if np.isclose(slope, 0.0):
            trend = "stable"
        else:
            if metric in self._LOWER_IS_BETTER:
                getting_better = slope < 0
            else:
                getting_better = slope > 0
            trend = "improving" if getting_better else "declining"

        return {
            "trend": trend,
            "slope": slope,
            "values": values,
            "timestamps": timestamps,
            "mean": np.mean(values),
            "std": np.std(values)
        }

    def generate_report(self) -> str:
        """Return a text report of the latest evaluation, trends and alert count."""
        if not self.evaluation_history:
            return "평가 기록이 없습니다."

        latest = self.evaluation_history[-1]

        report = f"""
        =====================================
        평가 보고서: {self.graph_name}
        =====================================
        
        최신 평가:
        - 버전: {latest['version']}
        - 시간: {latest['timestamp']}
        - 데이터셋: {latest['dataset_name']}
        
        성능 메트릭:
        - 성공률: {latest['results'].get('success_rate', 0):.1%}
        - 평균 지연시간: {latest['results'].get('avg_latency', 0)*1000:.2f}ms
        
        트렌드 분석:
        """

        # One trend line per headline metric, when enough history exists.
        for metric in ['success_rate', 'avg_latency']:
            trend = self.get_trend_analysis(metric)
            if trend["trend"] != "insufficient_data":
                report += f"\n        - {metric}: {trend['trend']} (기울기: {trend['slope']:.4f})"

        if self.alerts:
            report += f"\n\n        최근 알림: {len(self.alerts)}개"

        return report

# 지속적 평가 파이프라인 데모
print("🔄 지속적 평가 파이프라인 데모\n")

pipeline = ContinuousEvaluationPipeline("sample_graph")

# Compiled graphs paired with the version label recorded for each run.
versions = [
    (create_graph_version_a(), "v1.0.0"),
    (create_graph_version_b(), "v1.1.0"),
    # Goes back to variant A on purpose, to simulate a performance drop.
    (create_graph_version_a(), "v1.2.0"),
]

for graph, version in versions:
    print(f"평가 중: {version}")
    eval_record = pipeline.add_evaluation(
        version=version,
        graph=graph,
        dataset=test_dataset,
    )
    # Pause briefly to simulate time passing between evaluations.
    time.sleep(0.5)

# Full text report for the most recent evaluation.
print(pipeline.generate_report())

# Success-rate trend over the recorded evaluations.
print("\n📈 성공률 트렌드:")
trend = pipeline.get_trend_analysis("success_rate")
if trend["trend"] != "insufficient_data":
    print(
        f"  트렌드: {trend['trend']}",
        f"  평균: {trend['mean']:.3f}",
        f"  표준편차: {trend['std']:.3f}",
        sep="\n",
    )

## 실습 과제

1. 실제 프로덕션 데이터로 평가 데이터셋 구축
2. 커스텀 평가 메트릭 개발 (도메인 특화)
3. 자동화된 평가 파이프라인을 CI/CD에 통합

In [None]:
# 여기에 실습 코드를 작성하세요
