<a href="https://colab.research.google.com/github/Tia-Guo/CS7180_ewaste-idea-generator-project/blob/main/test_CoT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
"""
AI-Generated Email Evaluation System
Implements 6 evaluation metrics based on research framework
"""

# ============================================================================
# INSTALLATION & SETUP
# ============================================================================

# Install required packages
!pip install openai anthropic google-generativeai sentence-transformers scikit-learn pandas numpy tenacity -q

import os
import json
import time
import pandas as pd
import numpy as np
from typing import Dict, List, Tuple
from dataclasses import dataclass
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
import openai
from anthropic import Anthropic
from tenacity import retry, stop_after_attempt, wait_random_exponential

# ============================================================================
# CONFIGURATION
# ============================================================================

class Config:
    """Central configuration: OpenRouter credentials, judge model, and pricing.

    The API key is read from the ``OPENROUTER_API_KEY`` environment variable
    rather than being embedded in the source, so the notebook can be shared or
    committed without leaking a credential.
    """

    # ============================================================================
    # OPENROUTER CONFIGURATION
    # ============================================================================
    # Provide the key through the OPENROUTER_API_KEY environment variable (or
    # assign Config.OPENROUTER_API_KEY at runtime). Never commit the key itself.
    OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "")

    # OpenRouter base URL
    OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"

    # Choose your model from OpenRouter's catalog
    # Examples:
    # - "openai/gpt-4o"
    # - "anthropic/claude-sonnet-4"
    # - "google/gemini-2.5-flash"
    # - "anthropic/claude-3.5-haiku"
    # - "openai/gpt-4o-mini"
    # - "google/gemma-3-4b-it"
    EVALUATION_MODEL = "x-ai/grok-4-fast"

    # Semantic entropy settings
    N_SEMANTIC_SAMPLES = 5  # Number of outputs to generate for semantic entropy

    # API Pricing (USD per 1M tokens) - OpenRouter pricing
    PRICING = {
        # OpenAI Models
        'openai/gpt-5': {'input': 1.25, 'output': 10.0},
        'openai/gpt-5-chat': {'input': 1.25, 'output': 10.0},
        'openai/gpt-4o': {'input': 2.5, 'output': 10.0},
        'openai/gpt-4-turbo': {'input': 10.0, 'output': 30.0},
        'openai/gpt-4o-mini': {'input': 0.15, 'output': 0.6},

        # Anthropic Models
        'anthropic/claude-sonnet-4': {'input': 3.0, 'output': 15.0},
        'anthropic/claude-3.5-haiku': {'input': 0.8, 'output': 4.0},
        'anthropic/claude-3-haiku': {'input': 0.25, 'output': 1.25},

        # Google Models
        'google/gemini-2.5-flash': {'input': 0.3, 'output': 2.5},
        'google/gemini-2.5-pro': {'input': 1.25, 'output': 10.0},
        'google/gemma-3-4b-it': {'input': 0.017, 'output': 0.068},

        # Qwen Models
        'qwen/qwen-2.5-72b-instruct': {'input': 0.35, 'output': 0.4},
        'qwen/qwen2.5-vl-72b-instruct': {'input': 0.0, 'output': 0.0},
        'qwen/qwen3-coder-30b-a3b-instruct': {'input': 0.06, 'output': 0.25},

        # Meta Models
        'meta-llama/llama-3.3-70b-instruct': {'input': 0.35, 'output': 0.4},
        'meta-llama/llama-3.1-8b-instruct': {'input': 0.02, 'output': 0.03},

        # Mistral Models
        'mistralai/mistral-small-3.2-24b-instruct:free': {'input': 0.0, 'output': 0.0},

        # xAI Models
        'x-ai/grok-4-fast': {'input': 0.20, 'output': 0.50},

        # Amazon Models
        'amazon/nova-micro-1.0': {'input': 0.035, 'output': 0.14},

        # DeepSeek Models
        'deepseek/deepseek-chat-v3-0324': {'input': 0.24, 'output': 0.84},
        'deepseek/deepseek-v3': {'input': 0.3, 'output': 0.85},

        # Llama Models:
        'sao10k/l3-lunaris-8b': {'input': 0.04, 'output': 0.05},
    }

    @classmethod
    def setup(cls):
        """Build the OpenRouter client (an OpenAI-SDK client pointed at OpenRouter).

        Returns:
            Dict with keys 'openai' and 'openrouter', both mapped to the same
            client instance.

        Raises:
            ValueError: If no API key has been configured.
        """
        # Fail fast with an actionable message instead of a later HTTP 401.
        if not cls.OPENROUTER_API_KEY:
            raise ValueError(
                "OpenRouter API key is not set. Define the OPENROUTER_API_KEY "
                "environment variable or assign Config.OPENROUTER_API_KEY "
                "before creating a client."
            )
        client = openai.OpenAI(
            api_key=cls.OPENROUTER_API_KEY,
            base_url=cls.OPENROUTER_BASE_URL,
            default_headers={
                "HTTP-Referer": "https://github.com/yourusername/email-eval",
                "X-Title": "Email Evaluation System",
            }
        )
        return {
            'openai': client,
            'openrouter': client
        }

    @classmethod
    def get_model_cost(cls, model: str, input_tokens: int, output_tokens: int) -> float:
        """Return the USD cost of one call, based on the PRICING table.

        Args:
            model: Model identifier used as the PRICING key.
            input_tokens: Number of prompt tokens billed.
            output_tokens: Number of completion tokens billed.

        Returns:
            Cost in USD, or 0.0 (after printing a warning) when the model has
            no PRICING entry.
        """
        pricing = cls.PRICING.get(model)
        if pricing is None:
            print(f"‚ö†Ô∏è  Warning: No pricing info for model '{model}'. Cost tracking disabled.")
            return 0.0
        # Prices are quoted per one million tokens.
        input_cost = (input_tokens / 1_000_000) * pricing['input']
        output_cost = (output_tokens / 1_000_000) * pricing['output']
        return input_cost + output_cost


# ============================================================================
# DATA STRUCTURES
# ============================================================================

@dataclass
class EvaluationResult:
    """Outcome of evaluating one email: per-metric scores, verdict, and API usage."""
    # Per-metric scores as produced by the individual evaluators.
    hallucination_score: int
    cta_quality: int
    language_quality: int
    personalization: int
    human_likeness: int
    instruction_adherence: int
    # Aggregate score and accept/reject verdict computed by the pipeline.
    overall_score: float
    is_acceptable: bool
    # Free-form feedback keyed by metric name.
    detailed_feedback: Dict
    # API usage recorded for this evaluation.
    total_cost: float
    input_tokens: int
    output_tokens: int
    api_calls: int

    def to_dict(self):
        """Return the result as a plain dict.

        ``total_cost`` is exported under the key ``total_cost_usd``; every
        other field keeps its attribute name. ``detailed_feedback`` is the
        same dict object held by the instance (not a copy).
        """
        leading = (
            'hallucination_score',
            'cta_quality',
            'language_quality',
            'personalization',
            'human_likeness',
            'instruction_adherence',
            'overall_score',
            'is_acceptable',
        )
        trailing = ('input_tokens', 'output_tokens', 'api_calls', 'detailed_feedback')

        exported = {name: getattr(self, name) for name in leading}
        exported['total_cost_usd'] = self.total_cost
        for name in trailing:
            exported[name] = getattr(self, name)
        return exported


# ============================================================================
# LLM-AS-JUDGE EVALUATOR
# ============================================================================

class LLMEvaluator:
    """
    Uses LLM-as-judge approach for evaluating email quality
    Works with OpenRouter API
    """

    def __init__(self, model: str = Config.EVALUATION_MODEL):
        """Bind the judge to ``model``, create the API client, and zero the usage counters."""
        self.model = model

        client_map = Config.setup()
        self.clients = client_map
        self.client = client_map['openrouter']

        # Running totals; _call_llm adds to these after each successful call.
        self.total_input_tokens = self.total_output_tokens = self.total_api_calls = 0
        self.total_cost = 0.0

    @retry(wait=wait_random_exponential(multiplier=1, max=60), stop=stop_after_attempt(3))
    def _call_llm(self, prompt: str) -> Tuple[str, int, int]:
        """Send ``prompt`` as a single user message and return the reply with token counts.

        The call requests a JSON-object response at temperature 0.3. On
        success the instance's token, call, and cost totals are updated.
        Failures are printed and re-raised so the ``retry`` decorator
        (stop after 3 attempts, randomized exponential wait) can try again.

        Returns:
            ``(content, input_tokens, output_tokens)`` for the first choice.
        """
        try:
            completion = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.3,
                response_format={"type": "json_object"},
            )
            usage = completion.usage
            prompt_tokens = usage.prompt_tokens
            completion_tokens = usage.completion_tokens
            reply = completion.choices[0].message.content

            # Book-keeping for cost reporting.
            self.total_input_tokens += prompt_tokens
            self.total_output_tokens += completion_tokens
            self.total_api_calls += 1
            self.total_cost += Config.get_model_cost(self.model, prompt_tokens, completion_tokens)

            return reply, prompt_tokens, completion_tokens
        except openai.APIError as api_err:
            print(f"‚ùå API Error calling OpenRouter API: {api_err}")
            raise
        except Exception as err:
            print(f"‚ùå Error calling OpenRouter API: {err}")
            raise

    def reset_usage(self):
        self.total_input_tokens = 0
        self.total_output_tokens = 0
        self.total_api_calls = 0
        self.total_cost = 0.0

    def get_usage_stats(self) -> Dict:
        return {
            'total_input_tokens': self.total_input_tokens,
            'total_output_tokens': self.total_output_tokens,
            'total_api_calls': self.total_api_calls,
            'total_cost_usd': round(self.total_cost, 4)
        }

    @staticmethod
    def safe_json_loads(text: str):
        import json, re
        if not text or not text.strip():
            raise ValueError("Empty LLM response")
        s = text.strip()
        if s.startswith("```"):
            s = s.strip("` \n")
            if s.lower().startswith("json"):
                s = s[4:].strip()
        try:
            return json.loads(s)
        except json.JSONDecodeError:
            pass
        start, end = s.find("{"), s.rfind("}")
        if 0 <= start < end:
            snippet = s[start:end + 1]
            try:
                return json.loads(snippet)
            except json.JSONDecodeError:
                pass
        s2 = re.sub(r"(?<!\\)'", '"', s)
        return json.loads(s2)

# ============================================================================
# HALLUCINATION EVALUATOR
# ============================================================================

    def evaluate_hallucination(self, email: str, user_prompt: str) -> Tuple[int, str]:
        """Evaluate Hallucination (Binary Score: 0 or 1) using Chain-of-Thought Prompting"""

        # === One-Shot  ===
        example_email = """Hi Lubaina,

Saw recent interest in content around targeted advertising and account-based strategies. It seems you may be digging into ways to drive stronger campaign performance.

It‚Äôs hard to act on anonymous site visits when you can‚Äôt tailor the message or re-engage them later.

Demandbase shows which companies visit your site and adapts the experience in real time. That means deeper engagement and less wasted spend.

Open to a 15-min chat to see if this is worth your time?
"""

        example_output = """{
  "score": 0,
  "reasoning": "The email only uses information found in the provided JSON (intent keywords, company, and Demandbase features). No fabricated facts are introduced."
}"""

        # ===  CoT Prompt ===
        prompt = f"""You are an evaluator that detects hallucinations in AI-generated emails.

Let's think step by step before deciding the score.

Follow this reasoning process:
1. Identify all factual statements or claims in the email.
2. Compare these statements with the provided JSON or user prompt.
3. Determine whether any claim lacks source support or introduces new fabricated information.
4. Assess whether such fabrication would affect accuracy or trustworthiness.
5. Decide the binary score and summarize your reasoning briefly.

Example Email:
{example_email}

Example Evaluation Output (JSON):
{example_output}

Now evaluate the following email using the same logic and reasoning steps.

Email:
{email}

{f"user_prompt: {user_prompt}" if user_prompt else ""}

Scoring:
0: No Critical Failure ‚Äî The output contains no fabricated information.
1: Critical Failure Present ‚Äî The output includes false or unsupported information not in the JSON or prompt.

Respond only with a JSON object:
{{
  "score": 0 or 1,
  "reasoning": "Concise explanation of your reasoning."
}}
"""

        # === Ë∞ÉÁî® LLM  ===
        response_text, _, _ = self._call_llm(prompt)
        try:
            result = self.safe_json_loads(response_text)
            return result.get('score', -1), result.get('reasoning', 'No reasoning found')
        except Exception as e:
            print(f"‚ùå JSON Decode Error: {e}")
            print(f"Raw LLM response: {response_text}")
            return -1, f"Error parsing LLM response: {response_text[:200]}..."


# ============================================================================
# CTA EVALUATOR
# ============================================================================

    def evaluate_cta_quality(self, email: str) -> Tuple[int, str]:
        """Evaluate Call-to-Action (CTA) quality (1-5 scale) using Chain-of-Thought Prompting"""

        #===  One-Shot ===
        example_email = """Hi Lubaina,

Saw recent interest in content around targeted advertising and account-based strategies. It seems you may be digging into ways to drive stronger campaign performance.

It‚Äôs hard to act on anonymous site visits when you can‚Äôt tailor the message or re-engage them later.

Demandbase shows which companies visit your site and adapts the experience in real time. That means deeper engagement and less wasted spend.

Open to a 15-min chat to see if this is worth your time?
"""

        example_output = """{
  "score": 5,
  "reasoning": "The CTA ('Would you like to see a quick demo next week?') is clear, low-friction, and directly supports the email's purpose. It fits naturally and encourages an easy next step."
}"""

        # ===  CoT Prompt ===
        prompt = f"""You are an evaluator assessing the quality of a Call-to-Action (CTA) in sales emails.

Let's think step by step before scoring.

Reasoning steps:
1. Identify the CTA phrase or sentence.
2. Check if it clearly communicates the desired next step.
3. Evaluate its relevance to the email‚Äôs purpose.
4. Assess how natural, compelling, and low-friction it feels.
5. Assign a score (1‚Äì5) and summarize your reasoning.

Example Email:
{example_email}

Example Evaluation Output (JSON):
{example_output}

Now analyze this email:

Email:
{email}

Scoring:
5: Excellent ‚Äî Compelling, low-friction, natural CTA aligned with the goal.
4: Good ‚Äî Clear and relevant but could be more persuasive.
3: Average ‚Äî Functional but generic or weak.
2: Poor ‚Äî Vague or contextually mismatched.
1: Very Poor ‚Äî Missing or inappropriate CTA.

Respond only with JSON:
{{
  "score": 1‚Äì5,
  "reasoning": "Brief step-by-step summary of why you assigned this score."
}}
"""

        response_text, _, _ = self._call_llm(prompt)
        try:
            result = self.safe_json_loads(response_text)
            return result.get('score', -1), result.get('reasoning', 'No reasoning found')
        except Exception as e:
            print(f"‚ùå JSON Decode Error: {e}")
            print(f"Raw LLM response: {response_text}")
            return -1, f"Error parsing LLM response: {response_text[:200]}..."


# ============================================================================
# LANGUAGE QUALITY EVALUATOR
# ============================================================================

    def evaluate_language_quality(self, email: str) -> Tuple[int, str]:
        """Evaluate language quality and coherence (1-5 scale) using Chain-of-Thought Prompting"""

        # === One-Shot ===
        example_email = """Hi Lubaina,

Saw recent interest in content around targeted advertising and account-based strategies...
"""

        example_output = """{
  "score": 5,
  "reasoning": "Strong grammar, clear structure, and fluent tone. Sentences are varied and concise."
}"""

        # === CoT Prompt ===
        prompt = f"""You are an evaluator assessing the language quality and coherence of sales emails.

Let's think step by step before giving a score.

Follow this reasoning chain:
1. Identify grammar, spelling, and punctuation issues.
2. Assess vocabulary appropriateness and readability.
3. Evaluate sentence structure and flow.
4. Check for conciseness and formatting clarity.
5. Summarize and assign a score from 1‚Äì5.

Example Email:
{example_email}

Example Evaluation Output (JSON):
{example_output}

Now analyze the following email:

Email:
{email}

Scoring:
5: Excellent ‚Äî Fluent, natural, no major errors.
4: Good ‚Äî Minor errors or slight repetition.
3: Average ‚Äî Some awkward phrasing or structural issues.
2: Poor ‚Äî Noticeable grammatical or stylistic errors.
1: Very Poor ‚Äî Hard to read or poorly structured.

Respond only with:
{{
  "score": 1‚Äì5,
  "reasoning": "Concise explanation summarizing your reasoning chain."
}}
"""

        response_text, _, _ = self._call_llm(prompt)
        try:
            result = self.safe_json_loads(response_text)
            return result.get('score', -1), result.get('reasoning', 'No reasoning found')
        except Exception as e:
            print(f"‚ùå JSON Decode Error: {e}")
            print(f"Raw LLM response: {response_text}")
            return -1, f"Error parsing LLM response: {response_text[:200]}..."


# ============================================================================
# PERSONALIZATION EVALUATOR
# ============================================================================

    def evaluate_personalization(self, email: str, recipient_data: Dict) -> Tuple[int, str]:
        """Evaluate personalization quality (1-5 scale) using Chain-of-Thought Prompting"""

        # === One-Shot ===
        example_email = """Hi Lubaina,

Saw recent interest in content around targeted advertising...
"""

        example_output = """{
  "score": 5,
  "reasoning": "Uses recipient's interests and intent signals naturally, with relevant context and tone."
}"""

        # === CoT Prompt ===
        prompt = f"""You are an evaluator assessing how well this email is personalized.

Let's think step by step before scoring.

Reasoning process:
1. Identify references to recipient or their company.
2. Check whether personalization uses specific, relevant details (e.g., industry, role, behavior).
3. Evaluate if these details are naturally integrated or forced.
4. Assess the relevance and value provided to the recipient.
5. Decide the score (1‚Äì5) and summarize reasoning.

Example Email:
{example_email}

Example Output:
{example_output}

Now evaluate this email.

Email:
{email}

Recipient Data:
{json.dumps(recipient_data, indent=2)}

Scoring:
5: Deep, natural personalization.
4: Good personalization, minor generalization.
3: Some relevance but generic tone.
2: Minimal relevance.
1: None ‚Äî completely generic.

Respond with JSON:
{{
  "score": 1‚Äì5,
  "reasoning": "Concise explanation of your reasoning chain."
}}
"""

        response_text, _, _ = self._call_llm(prompt)
        try:
            result = self.safe_json_loads(response_text)
            return result.get('score', -1), result.get('reasoning', 'No reasoning found')
        except Exception as e:
            print(f"‚ùå JSON Decode Error: {e}")
            print(f"Raw LLM response: {response_text}")
            return -1, f"Error parsing LLM response: {response_text[:200]}..."


# ============================================================================
# HUMAN-LIKENESS EVALUATOR
# ============================================================================

    def evaluate_human_likeness(self, email: str) -> Tuple[int, str]:
        """Evaluate human-likeness (1-5 scale) using Chain-of-Thought Prompting"""

        # === One-Shot ===
        example_email = """Hi Lubaina,
Saw recent interest in targeted advertising...
"""

        example_output = """{
  "score": 5,
  "reasoning": "Tone feels natural and conversational; varied sentence structure and emotional authenticity."
}"""

        # === CoT Prompt ===
        prompt = f"""You are an evaluator determining how human-like this email sounds.

Let's think step by step before scoring.

Reasoning process:
1. Assess tone for conversational naturalness.
2. Check sentence rhythm and word variety.
3. Identify emotional authenticity and balance.
4. Evaluate flow and readability.
5. Assign a final score and summarize your reasoning.

Example Email:
{example_email}

Example Output:
{example_output}

Now analyze the following email:

Email:
{email}

Scoring:
5: Highly human-like ‚Äî smooth, conversational, emotionally real.
4: Mostly human-like ‚Äî natural with small stiffness.
3: Neutral ‚Äî could be human or AI.
2: Likely AI ‚Äî robotic phrasing or formulaic tone.
1: Clearly AI ‚Äî unnatural vocabulary and rhythm.

Respond only with JSON:
{{
  "score": 1‚Äì5,
  "reasoning": "Short explanation summarizing your reasoning steps."
}}
"""

        response_text, _, _ = self._call_llm(prompt)
        try:
            result = self.safe_json_loads(response_text)
            return result.get('score', -1), result.get('reasoning', 'No reasoning found')
        except Exception as e:
            print(f"‚ùå JSON Decode Error: {e}")
            print(f"Raw LLM response: {response_text}")
            return -1, f"Error parsing LLM response: {response_text[:200]}..."


# ============================================================================
# INSTRUCTION ADHERENCE EVALUATOR
# ============================================================================

    def evaluate_instruction_adherence(self, email: str, instructions: str, user_prompt: str) -> Tuple[int, str]:
        """Evaluate adherence to given instructions (1-5 scale) using Chain-of-Thought Prompting"""

        # === One-Shot ===
        example_email = """Hi Lubaina,
Saw recent interest in targeted advertising...
"""

        example_output = """{
  "score": 5,
  "reasoning": "Email follows tone, length, and structure rules perfectly with no extra content."
}"""

        # === CoT Prompt ===
        prompt = f"""You are an evaluator checking how well an email follows the given instructions.

Let's think step by step before giving the score.

Reasoning process:
1. Identify key rules or constraints from the instructions.
2. Compare the email against these requirements.
3. Note any deviations, missing elements, or extra content.
4. Evaluate tone, style, and goal alignment.
5. Assign a score (1‚Äì5) and summarize reasoning.

Example Email:
{example_email}

Example Output:
{example_output}

Now analyze this email.

Email:
{email}

Instructions:
{instructions}

User Prompt:
{user_prompt}

Scoring:
5: Fully adheres to all rules.
4: Mostly adheres with small deviations.
3: Some adherence but noticeable misses.
2: Poor adherence, several rule breaks.
1: Fails to follow instructions.

Respond only with JSON:
{{
  "score": 1‚Äì5,
  "reasoning": "Concise summary of reasoning steps and decision."
}}
"""

        response_text, _, _ = self._call_llm(prompt)
        try:
            result = self.safe_json_loads(response_text)
            return result.get('score', -1), result.get('reasoning', 'No reasoning found')
        except Exception as e:
            print(f"‚ùå JSON Decode Error: {e}")
            print(f"Raw LLM response: {response_text}")
            return -1, f"Error parsing LLM response: {response_text[:200]}..."


# ============================================================================
# MAIN EVALUATION PIPELINE
# ============================================================================

class EmailEvaluationPipeline:
    """
    Complete evaluation pipeline for AI-generated emails
    Works with OpenRouter API
    """

    def __init__(self, model: str = Config.EVALUATION_MODEL):
        """Create the LLM judge for ``model`` and keep a handle on the OpenRouter client."""
        self.llm_eval = LLMEvaluator(model)

        client_map = Config.setup()
        self.clients = client_map
        self.openrouter_client = client_map['openrouter']
    def evaluate_email(
        self,
        email: str,
        instructions: str = "",
        user_prompt: str = "",
        recipient_data: Dict = None,
        context: str = "",
        check_hallucination: bool = True
    ) -> EvaluationResult:
        """Run every metric on one email and aggregate the results.

        Args:
            email: Email text under evaluation.
            instructions: Rules the email should follow. When empty, the
                adherence check is skipped and scored 5.
            user_prompt: Source prompt, passed to the hallucination and
                adherence evaluators.
            recipient_data: Recipient details. When empty or None, the
                personalization check is skipped and scored 5.
            context: Accepted for interface compatibility; no evaluator in
                this method uses it.
            check_hallucination: When False, the hallucination call is skipped
                and the score is recorded as -1 (not evaluated).

        Returns:
            EvaluationResult holding the per-metric scores, the mean of the
            valid 1-5 scores, the accept/reject verdict, per-metric feedback,
            and the API usage of this evaluation.
        """
        start_time = time.time()
        print("Starting evaluation...")
        detailed_feedback = {}
        self.llm_eval.reset_usage()

        # Honor the flag: callers pass False to avoid the extra API call.
        if check_hallucination:
            print("Evaluating Hallucination Detection...")
            hi_score, hi_reasoning = self.llm_eval.evaluate_hallucination(email, user_prompt)
            # str() guards against a judge returning non-string reasoning.
            print(f"HI score: {hi_score} ‚Äî reason: {str(hi_reasoning)[:200]}")
        else:
            print("Skipping hallucination detection (check_hallucination=False)")
            hi_score, hi_reasoning = -1, "Hallucination check skipped"
        detailed_feedback['hallucination'] = hi_reasoning

        print("Evaluating CTA quality...")
        cta_score, cta_reasoning = self.llm_eval.evaluate_cta_quality(email)
        detailed_feedback['cta'] = cta_reasoning

        print("Evaluating language quality...")
        lq_score, lq_reasoning = self.llm_eval.evaluate_language_quality(email)
        detailed_feedback['language_quality'] = lq_reasoning

        print("Evaluating personalization...")
        if recipient_data:
            per_score, per_reasoning = self.llm_eval.evaluate_personalization(email, recipient_data)
        else:
            per_score, per_reasoning = 5, "No recipient data provided"
        detailed_feedback['personalization'] = per_reasoning

        print("Evaluating human-likeness...")
        hl_score, hl_reasoning = self.llm_eval.evaluate_human_likeness(email)
        detailed_feedback['human_likeness'] = hl_reasoning

        print("Evaluating instruction adherence...")
        if instructions:
            ia_score, ia_reasoning = self.llm_eval.evaluate_instruction_adherence(email, instructions, user_prompt)
        else:
            ia_score, ia_reasoning = 5, "No instructions provided"
        detailed_feedback['instruction_adherence'] = ia_reasoning

        usage_stats = self.llm_eval.get_usage_stats()

        # -1 marks a metric whose reply could not be parsed; leave it out of the mean.
        scores = [cta_score, lq_score, per_score, hl_score, ia_score]
        valid_scores = [score for score in scores if score != -1]
        # Cast to builtin float/bool so the result matches the dataclass
        # annotations and stays JSON-serializable (np.mean yields numpy scalars).
        overall_score = float(np.mean(valid_scores)) if valid_scores else 0.0

        # A hallucination score of -1 (skipped or unparseable) does not block
        # acceptance; an unparseable language-quality score (-1) does, because
        # it fails the >= 2 threshold.
        is_acceptable = bool(
            (hi_score == 0 or hi_score == -1)
            and overall_score >= 3
            and lq_score >= 2
        )

        elapsed_time = time.time() - start_time
        detailed_feedback['runtime_seconds'] = elapsed_time

        print(f"\nEvaluation complete!")
        print(f"üí∞ Cost: ${usage_stats['total_cost_usd']:.4f}")
        print(f"üìä Tokens: {usage_stats['total_input_tokens']} in / {usage_stats['total_output_tokens']} out")
        print(f"üîÑ API Calls: {usage_stats['total_api_calls']}")
        print(f"‚è±Ô∏è Runtime: {elapsed_time:.2f} seconds")

        return EvaluationResult(
            hallucination_score=hi_score,
            cta_quality=cta_score,
            language_quality=lq_score,
            personalization=per_score,
            human_likeness=hl_score,
            instruction_adherence=ia_score,
            overall_score=overall_score,
            is_acceptable=is_acceptable,
            detailed_feedback=detailed_feedback,
            total_cost=usage_stats['total_cost_usd'],
            input_tokens=usage_stats['total_input_tokens'],
            output_tokens=usage_stats['total_output_tokens'],
            api_calls=usage_stats['total_api_calls']
        )

    def evaluate_batch(
        self,
        emails: List[Dict],
        check_hallucination: bool = False
    ) -> pd.DataFrame:
        results = []
        total_cost = 0.0
        batch_start = time.time()

        for i, email_data in enumerate(emails):
            print(f"\n{'='*60}")
            print(f"Evaluating email {i+1}/{len(emails)}")
            print(f"{'='*60}")

            result = self.evaluate_email(
                email=email_data['email'],
                instructions=email_data.get('instructions', ''),
                user_prompt=email_data.get('user_prompt', ''),
                recipient_data=email_data.get('recipient_data'),
                context=email_data.get('context', ''),
                check_hallucination=check_hallucination
            )

            total_cost += result.total_cost
            results.append({
                'email_id': i,
                **result.to_dict(),
                'runtime_seconds': result.detailed_feedback.get('runtime_seconds', None)
            })

        print(f"\n{'='*60}")
        print("BATCH EVALUATION SUMMARY")
        print(f"{'='*60}")
        print(f"Total emails evaluated: {len(emails)}")
        print(f"üí∞ Total cost: ${total_cost:.4f}")
        print(f"üí∞ Average cost per email: ${total_cost/len(emails):.4f}")
        print(f"üí∞ Cost for 10,000 emails: ${(total_cost/len(emails))*10000:.2f}")
        batch_elapsed = time.time() - batch_start
        print(f"üïí Total batch runtime: {batch_elapsed:.2f} seconds")
        print(f"‚è≥ Average runtime per email: {batch_elapsed/len(emails):.2f} seconds")

        return pd.DataFrame(results)


[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/357.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[91m‚ï∏[0m[90m‚îÅ[0m [32m348.2/357.5 kB[0m [31m13.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m357.5/357.5 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
# ============================================================================
# DATA LOADING FUNCTIONS
# ============================================================================

def load_test_data_from_file(file_path: str) -> List[Dict]:
    """
    Load and validate test data from an external file.

    Supported formats:
    - JSON (.json): a JSON array of objects, each with an 'email' field

    Args:
        file_path: Path to the data file

    Returns:
        List of test data dictionaries

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If the extension is not .json, the content is not valid
            JSON (json.JSONDecodeError is a ValueError), the top-level value
            is not an array, an item is not an object, or an item lacks a
            required field.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"‚ùå File not found: {file_path}")

    file_ext = os.path.splitext(file_path)[1].lower()

    try:
        if file_ext != '.json':
            raise ValueError(f"‚ùå Unsupported file format: {file_ext}. Supported format: .json")

        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # A top-level object or string would otherwise be iterated key-by-key
        # or character-by-character and could pass the field check by accident.
        if not isinstance(data, list):
            raise ValueError(
                f"Expected a JSON array of test items, got {type(data).__name__}"
            )

        # Validate data structure
        required_fields = ['email']
        for i, item in enumerate(data):
            # On a str item, `field not in item` would be a substring test.
            if not isinstance(item, dict):
                raise ValueError(
                    f"Item {i} must be a JSON object, got {type(item).__name__}"
                )
            for field in required_fields:
                if field not in item:
                    raise ValueError(f"‚ùå Missing required field '{field}' in item {i}")

        print(f"‚úÖ Successfully loaded {len(data)} test samples")
        return data

    except Exception as e:
        # Report the failure for notebook users, then let the caller handle it.
        print(f"‚ùå Failed to load test data: {e}")
        raise


# ============================================================================
# MAIN TEST FUNCTIONS
# ============================================================================

def test_spreadsheet_examples(file_path: str):
    """Run the evaluation pipeline on every example in a JSON test file.

    Prints each email with its context, evaluates it, then prints a summary
    table with cost and score statistics.

    Args:
        file_path: Path to the JSON test data file.

    Returns:
        DataFrame with one row per example, or None when the file holds no
        examples or the pipeline cannot be initialized.
    """

    print("\n" + "="*80)
    print("üß™ TESTING WITH EXTERNAL TEST DATA")
    print("="*80)

    # Load test data
    print(f"üìÅ Loading test data from file: {file_path}")
    test_data = load_test_data_from_file(file_path)

    if not test_data:
        print("‚ùå No test data found")
        return None

    # Initialize the evaluation pipeline
    try:
        pipeline = EmailEvaluationPipeline(model=Config.EVALUATION_MODEL)
    except Exception as e:
        print(f"\n‚ùå Error: Unable to initialize evaluation pipeline - {e}")
        return None

    # Evaluate all examples
    results_list = []

    for example in test_data:
        print(f"\n{'='*80}")
        print(f"üìß EVALUATING EXAMPLE {example.get('id', 'Unknown')}")
        # `or {}` also covers an explicit JSON null, which .get() returns as None.
        recipient_data = example.get('recipient_data') or {}
        print(f"Recipient: {recipient_data.get('name', 'Unknown')} - {recipient_data.get('role', 'Unknown')}")
        print(f"{'='*80}")

        # Print the email being evaluated
        print("\nüìß EMAIL BEING EVALUATED:")
        print("-" * 80)
        print(example['email'])
        print("-" * 80)

        # Print the evaluation context
        print("\nüìã EVALUATION CONTEXT:")
        print(f"Recipient Data: {recipient_data}")
        print(f"Context: {example.get('context', 'No context')}")
        print(f"Instructions: {example.get('instructions', 'No instructions provided')}")
        print(f"User Prompt: {example.get('user_prompt', 'No user prompt provided')}")

        result = pipeline.evaluate_email(
            email=example['email'],
            instructions=example.get('instructions', ''),
            user_prompt=example.get('user_prompt', ''),
            recipient_data=recipient_data,
            context=example.get('context', ''),
            check_hallucination=False  # disable for faster testing
        )

        results_list.append({
            'example_id': example.get('id', 'Unknown'),
            'overall_score': result.overall_score,
            'acceptable': result.is_acceptable,
            'hallucination': result.hallucination_score,
            'cta_quality': result.cta_quality,
            'language_quality': result.language_quality,
            'personalization': result.personalization,
            'human_likeness': result.human_likeness,
            'instruction_adherence': result.instruction_adherence,
            'cost_usd': result.total_cost
        })

    # Summary
    print("\n\n" + "="*80)
    print("üìä TEST RESULTS SUMMARY")
    print("="*80)

    df = pd.DataFrame(results_list)
    print(df.to_string(index=False))

    total_cost = df['cost_usd'].sum()
    avg_score = df['overall_score'].mean()
    acceptable_count = df['acceptable'].sum()

    print(f"\n{'='*80}")
    print(f"üí∞ Total Cost: ${total_cost:.4f}")
    print(f"üí∞ Average Cost per Email: ${total_cost/len(test_data):.4f}")
    print(f"üí∞ Estimated Cost for 10,000 emails: ${(total_cost/len(test_data))*10000:.2f}")
    # The overall score is a mean of 1-5 ratings, so it is reported out of 5.
    print(f"\nüìà Average Overall Score: {avg_score:.2f}/5")
    print(f"‚úÖ Acceptable Emails: {acceptable_count}/{len(test_data)} ({acceptable_count/len(test_data)*100:.0f}%)")
    print(f"{'='*80}\n")

    return df


def analyze_specific_example(file_path: str, example_id: int = 0):
    """Evaluate a single test example and print a detailed report.

    Loads the test data from ``file_path``, selects the first entry whose
    ``'id'`` equals ``example_id``, echoes the email and its context, runs it
    through an ``EmailEvaluationPipeline`` (hallucination check disabled),
    and prints the scores, the cost and the per-criterion feedback.

    Args:
        file_path: Location of the test data, handed to
            ``load_test_data_from_file``.
        example_id: Value compared against each entry's ``'id'`` field.

    Returns:
        The result object returned by ``pipeline.evaluate_email``, or
        ``None`` when no usable matching entry is found.
    """
    banner = "=" * 80
    divider = "-" * 80

    print("\n" + banner)
    print(f"üîç DETAILED ANALYSIS: EXAMPLE {example_id}")
    print(banner)

    # Load the data set and take the first entry carrying the requested id.
    test_data = load_test_data_from_file(file_path)
    example = next(
        (entry for entry in test_data if entry.get('id') == example_id),
        None,
    )

    if not example:
        print(f"‚ùå Example with ID {example_id} not found")
        return None

    # Echo the raw email so the scores below can be read against it.
    print("\nüìß EMAIL BEING EVALUATED:")
    print(divider)
    print(example['email'])
    print(divider)

    print("\nüìã EVALUATION CONTEXT:")
    recipient_data = example.get('recipient_data', {})
    for label, field in (
        ("Recipient", 'name'),
        ("Role", 'role'),
        ("Company", 'company'),
    ):
        print(f"{label}: {recipient_data.get(field, 'Unknown')}")
    print(f"Context: {example.get('context', 'No context')}")

    print("\nüìù INSTRUCTIONS GIVEN:")
    print(example.get('instructions', 'No instructions provided'))

    pipeline = EmailEvaluationPipeline(model=Config.EVALUATION_MODEL)

    evaluation_kwargs = dict(
        email=example['email'],
        instructions=example.get('instructions', ''),
        user_prompt=example.get('user_prompt', ''),
        recipient_data=recipient_data,
        context=example.get('context', ''),
        check_hallucination=False,
    )
    result = pipeline.evaluate_email(**evaluation_kwargs)

    print("\n" + banner)
    print("üìä EVALUATION RESULTS")
    print(banner)
    print(f"Overall Score: {result.overall_score:.1f}/10")
    if result.is_acceptable:
        verdict = '‚úÖ YES'
    else:
        verdict = '‚ùå NO'
    print(f"Acceptable: {verdict}")
    print("\nBreakdown:")
    # Attributes are read one at a time, in display order, right before
    # each line is printed.
    for label, attribute in (
        ("CTA Quality", "cta_quality"),
        ("Language Quality", "language_quality"),
        ("personalization", "personalization"),
        ("Human-likeness", "human_likeness"),
        ("Instruction Adherence", "instruction_adherence"),
    ):
        print(f"  {label}: {getattr(result, attribute)}/10")
    print(f"\nüí∞ Cost: ${result.total_cost:.4f}")

    print("\n" + banner)
    print("üí¨ DETAILED FEEDBACK")
    print(banner)
    for criterion, feedback in result.detailed_feedback.items():
        print(f"\n{criterion.upper()}:")
        if not isinstance(feedback, dict):
            print(f"  {feedback}")
            continue
        # Structured feedback: one indented line per key.
        for key, value in feedback.items():
            print(f"  {key}: {value}")

    return result


# ============================================================================
# MAIN EXECUTION ENTRY POINT
# ============================================================================

if __name__ == "__main__":
    print("\nüöÄ Starting Email Evaluation System...")

    # Manual switches: flip RUN_BATCH_TEST to False to inspect one example
    # in detail; EXAMPLE_ID is only consulted in that case.
    RUN_BATCH_TEST = True
    EXAMPLE_ID = 0

    # Test data location (hardcoded for the Colab runtime).
    file_path = "/content/30_test_long.json"

    if not RUN_BATCH_TEST:
        print(f"\nüîç Running detailed analysis on example ID {EXAMPLE_ID}")
        analyze_specific_example(file_path=file_path, example_id=EXAMPLE_ID)
    else:
        print(f"\nüß™ Running batch test from: {file_path}")
        results_df = test_spreadsheet_examples(file_path=file_path)

[1;30;43mÊµÅÂºèËæìÂá∫ÂÜÖÂÆπË¢´Êà™Êñ≠ÔºåÂè™ËÉΩÊòæÁ§∫ÊúÄÂêé 5000 Ë°åÂÜÖÂÆπ„ÄÇ[0m
I know you’re busy
I know you are busy
Sorry to disturb you
I’ll keep this short
I will keep this short
Just following up
Just follow up
Thoughts?
Waiting on your response
Waiting for your response
Waiting for a response
Add value
If it’s not too much trouble
I apologize in advance for bothering you
Touching base
Catching up
Hoping to connect
Hope to connect
Can I ask a favor?
To whom it may concern
Did you know
My name is
I work for
Did you find what you were looking for?
I’ve been thinking
I have been thinking
Looking forward to hearing from you
Looking forward to chatting
Circling back

The current date is Thursday, July 24, 2025.
User Prompt: 

Prospect's personal info is provided in the following JSON.
{
  "Recipient's Company Information": "Paytronix, part of Access Group, offers a cloud-based digital guest engagement platform for the hospitality sector. Serving over 1,800 brands across 50,00