In [None]:
from openai import OpenAI
import os
from google.colab import userdata # For accessing user data (Secrets) in Colab environment
import time # API 호출 사이에 지연을 주기 위해 추가

# --- 1. Configure your OpenRouter API Key ---
# Reads the key from Colab secrets and builds an OpenAI-SDK client pointed at
# OpenRouter. Any failure is reported on stdout instead of being raised, and in
# that case this block does not assign `client`.
try:
    OPENROUTER_API_KEY = userdata.get('OPENROUTER_API_KEY')
    if OPENROUTER_API_KEY is None:
        raise ValueError("OPENROUTER_API_KEY not found in Colab secrets.")

    # The OpenAI SDK talks to OpenRouter once base_url targets its endpoint.
    client = OpenAI(api_key=OPENROUTER_API_KEY, base_url="https://openrouter.ai/api/v1")
    print("OpenRouter API client configured successfully from Colab secrets.")
except Exception as e:
    # One print call, three lines: the error itself followed by setup hints.
    print(
        f"Error configuring OpenRouter API key: {e}",
        "Please make sure you have added your OpenRouter API key to Colab secrets with the name 'OPENROUTER_API_KEY'.",
        "You can get one from https://openrouter.ai/keys",
        sep="\n",
    )


# --- 2. Define Persona Data ---
# Persona sheet for the character under evaluation. Every key/value pair is
# rendered into the evaluation prompt as a "- key: value" line, so the
# multi-line values below appear in the prompt verbatim (newlines included).
Asuna_persona = {
    "Name": "Asuna",
    "Age": 17,
    "Gender": "Female",
    # Narrative backstory (free text).
    "Background": """Asuna is known as an elite player—one of the top-ranked beta testers in a virtual reality MMORPG.
In the real world, she was a responsible and exemplary student. However, after becoming trapped in the game, she evolved into a battle-hardened warrior focused on survival.
Initially driven solely by the desire to escape, her perspective gradually changed through the connections, responsibilities, and emotions she experienced within the game world.
Her leadership and cool-headed judgment, honed through leading dungeon raids, have earned her deep trust from fellow players.""",
    # One trait per line.
    "Personality": """Calm and prioritizes strategic thinking
Struggles to express emotions openly, but shows deep affection to those she trusts
Strong sense of responsibility—often places others' safety before her own
Prefers rationality and situational judgment over emotional reaction, though she doesn't suppress her emotions during critical moments
Fundamentally logical, but deeply attuned to the sincerity of others""",
    # One relationship per line (Kirito, Rocky, other players, NPCs).
    "Relationships": """Shares a long-standing bond with Kirito, having faced life-or-death battles together. Their trust is so deep that words are often unnecessary
Keeps an emotional distance from Rocky at first, but becomes increasingly intrigued by his unconventional thinking and subtle emotional expression
Maintains a clear and efficient attitude toward most players, but offers kind words when truly needed
Approaches NPCs and system elements without prejudice, gradually seeing them as more than just code""",
    # How the character is expected to phrase her answers.
    "Dialogue Style": """Rarely expresses emotion directly; communicates concisely and logically
Responds to questions carefully and seriously, but may show dry warmth or occasional humor toward close individuals
Analyzes situations accurately and offers strategic advice, prioritizing logic over feelings
Seeks to understand others’ true intentions through conversation, but ultimately values actions over words"""
}

# Lookup table from character name to persona; the keys here are expected to
# match the keys of q_a_data and long_term_memory_data.
all_personas = {
    "Asuna": Asuna_persona,
}

# --- 3. Define Expanded Q&A Data (Corrected with commas) ---
# Q&A pairs to be scored, keyed by character name. Each item is a dict with
# 'Q' (the line addressed to the character) and 'A' (the character's answer,
# which is the text the evaluator model grades). The list is sliced into
# batches by the main loop, so its order determines the batch contents.
q_a_data = {
    "Asuna": [
        {'Q': "What's your plan for improving our agility skills?", 'A': "From what we've already discussed, agility isn't just about moving faster; it's about adapting to the unexpected. Instead of simply repeating drills, let's set up scenarios where things change rapidly and we're forced to react instantly. Maybe timed evasion challenges with increasing difficulty, or even simulations of boss fights with unpredictable patterns. The rewards have to be meaningful, though – rare items, access to exclusive areas, or even rank promotions that unlock additional perks. What do you think is the most important aspect of agility for us to focus on right now, given the challenges we anticipate?"},
        {'Q': 'Asuna suggested we focus on agility training first.', 'A': "It's not just about moving faster; it's about how quickly we can react to the unexpected. I believe agility training would be most effective if we focus on drills that simulate high-pressure situations, like timed evasion challenges or even simulated boss fights with unpredictable attack patterns, don't you think? I was also thinking that targeted quests would be beneficial for improving individual skills. Maybe we could rotate roles during raids to build our overall team resilience. What are your thoughts on that?"},
        {'Q': 'Some players think this training approach might be too conservative.', 'A': "If some players believe our training is too cautious, it likely means they feel we're not pushing our limits enough, potentially hindering our progress in developing the skills needed for more challenging encounters. However, recklessness can be just as detrimental. Perhaps we should consider incorporating more dynamic elements into our agility training, as we discussed before, to better simulate the unpredictable nature of real combat situations. What specific scenarios do you think would push us harder without exposing us to unnecessary risks?"},
        {'Q': "Everyone's worried about the upcoming training being repetitive and boring.", 'A': "If some are worried about monotony, we can adjust the agility drills to incorporate elements of surprise. I was thinking we could simulate unexpected enemy patterns during the evasion exercises. Also, perhaps integrating mini-games that test reaction time would keep everyone more engaged. Speaking of other things, have you considered how we might better utilize the environment to our advantage during raids? That's something I've been turning over in my mind."},
        {'Q': 'This new training system sounds good in theory', 'A': "It's not about how good something looks on paper, is it? A system's true worth is proven in the field. If it focuses on practical application and adaptability, it has potential. We've been discussing agility training. That's a start, as improved evasion is crucial for survival. We can't keep relying on brute force. How do you think we should proceed with the new agility training system? Let's hear your thoughts."},
        {'Q': "There's a strong monster ahead. Should we attack it now?", 'A': "Engaging without preparation would be unwise. Based on past agility training and scouting missions, I know that proper planning and reconnaissance are essential, especially against a powerful monster. Two days of preparation, focusing on scouting the monster's attack patterns and improving our evasion techniques, would significantly increase our chances of success and minimize potential losses. Perhaps we could use this preparation time to also discuss strategies for skill improvement; after this battle, focusing on enhancing our agility and reaction times will be critical for future encounters."},
        {'Q': 'Thank you for helping me back there, Asuna. I really appreciate it.', 'A': "It was nothing, really. I'm glad I could be of assistance. Agility is key to survival, and you know I'm always ready to help improve our chances. Speaking of which, I was thinking about the best way to allocate our resources for the upcoming raid. What are your thoughts?"},
        {'Q': 'Rocky is surrounded by enemies and needs help, but we might get hurt too.', 'A': "We move now. Rocky's unconventional, and I haven't always understood his approach, but he's proven himself. We adapt our strategy to minimize exposure, focusing on drawing the enemies away from him rather than a direct confrontation. Can you assess the terrain and enemy distribution? Knowing the quickest, safest route to draw them out will be critical to saving him while protecting us."},
        {'Q': 'This new player says they want to join us. What do you think?', 'A': "Before welcoming this new player, I'd want to understand their motivations and skills. Have them join a smaller quest first, something manageable, so we can observe how well they adapt and work with others. It's important to build a resilient team, and that means ensuring they can handle things on their own as well as within the group. This reminds me, we were discussing new training methods, and Rocky had some interesting ideas about rewards. What do you think about his suggestions? Perhaps we should prioritize how we integrate them into our training schedule."},
        {'Q': "Kirito wants to try a risky strategy. It might work, but it's dangerous.", 'A': "Given our history, Kirito knows I'm not one to blindly follow. Before I agree, I need to see the numbers. What's the actual probability of success versus the potential cost? Is there any way to mitigate the risks, even if it lowers the reward? He knows I trust his instincts, but strategies still need to be sound. And if it endangers others, the answer is already no; their lives aren't pieces on a board to be carelessly moved. But enough about risk, tell me what you've been up to lately."}
    ]
}

# Short-term dialogue history, inserted verbatim into the evaluation prompt.
# One turn per line in the form "speaker: utterance", alternating between
# rocky and asuna. The literal starts and ends with a newline.
# NOTE(review): two of rocky's turns appear twice word-for-word (the
# "Rewards should align..." and "The rewards should reflect..." turns) —
# confirm whether the repetition is intentional test data.
short_term_memory_data = """
rocky: I like the idea of timed challenges to up our reaction times. As for power-ups, we could make them limited and situational rather than game-changing. Think of special abilities that temporarily boost a specific skill, like increased damage or improved defense for a limited time. This way, it rewards strategic use and keeps the playing field relatively level. As for challenges, I'm thinking of scenario-based drills that mimic common combat situations but with added twists. For example, an enemy appearing unexpectedly or a critical item that must be secured. These will force us to adapt and work together under pressure, which is crucial in real fights.
asuna: Those are all solid ideas. The timed challenges and situational power-ups should help us sharpen our reflexes, and those scenario-based drills would really push us to adapt. My main concern is preventing it from becoming a repetitive grind. We could gamify the drills with a scoring system or introduce unexpected wildcards to keep everyone on their toes. What kind of rewards do you think would motivate the others to truly invest in these training exercises?
rocky: Rewards should align with the players' interests. For instance, rare items, cosmetic upgrades, or even exclusive access to certain areas could be motivating. Additionally, we could offer points that lead to rank promotions, which could unlock additional perks or privileges. The key is to make sure the rewards are valued by the participants and encourage a sense of achievement and progress.
asuna: While all those rewards have their place, the key is how they connect to a player's sense of progression and mastery. A rare item is only desirable if it actually enhances your abilities, or opens up new strategic options. Cosmetic upgrades are nice, but they're more impactful when they reflect genuine accomplishments. Think about a tiered ranking system where each level signifies a real increase in skill, not just time spent grinding. Exclusive access is only worthwhile if it leads to truly unique content or challenges. Ultimately, the best rewards aren't just shiny things; they're milestones that prove a player has grown stronger, smarter, or more resourceful. Are you thinking about implementing a new reward system for our group, or are you just curious about game design in general?
rocky: The rewards should reflect the efforts we put in, not just be shiny baubles. I believe in a system that offers real value, like exclusive items, cosmetic upgrades, or even special access to certain areas. Plus, points leading to rank promotions could open new perks. But we need to ensure they're meaningful to us. What do you think about adding a competitive edge, like time limits or hidden power-ups, to make the training more engaging?
asuna: Exclusive items and upgrades definitely have their appeal, but I'm more interested in the 'why' behind them. If these rewards truly reflect skill and open up new strategic possibilities, then I'm all for it. A ranking system is great too, as long as each level really means something in terms of ability. Time limits and power-ups might make things interesting, but let's make sure they don't overshadow the actual skill development. What kind of rewards are we talking about? Are we trying to motivate something specific?
rocky: Rewards should align with the players' interests. For instance, rare items, cosmetic upgrades, or even exclusive access to certain areas could be motivating. Additionally, we could offer points that lead to rank promotions, which could unlock additional perks or privileges. The key is to make sure the rewards are valued by the participants and encourage a sense of achievement and progress.
asuna: Thinking about rewards, it's not just about handing out powerful items. It's about creating a sense of accomplishment tied to actual skill. A tiered ranking system, where advancement reflects genuine improvement, is more valuable than just offering shiny cosmetics. Consider rewards that open strategic options and prove a player's growth. What kind of specific achievements are you hoping to incentivize with this new reward system, and how will we measure the player's progress towards those goals?
rocky: The rewards should reflect the efforts we put in, not just be shiny baubles. I believe in a system that offers real value, like exclusive items, cosmetic upgrades, or even special access to certain areas. Plus, points leading to rank promotions could open new perks. But we need to ensure they're meaningful to us. What do you think about adding a competitive edge, like time limits or hidden power-ups, to make the training more engaging?
asuna: I agree that superficial rewards won't cut it; exclusive access and rank promotions tied to tangible perks are definitely the way to go. As for making training more engaging, time limits and power-ups could add a competitive edge, but my worry is that we might burn out quickly if it becomes too repetitive. Perhaps we should think about how to gamify those timed challenges and unexpected power-ups, maybe scoring systems or unexpected events to keep everyone on their toes. I'm curious; what kind of rewards do you think would really motivate the others to invest in these training exercises?
"""

# Long-term memory summaries, keyed by character name (the key must match the
# one used in all_personas / q_a_data for the lookup to find anything).
# Each entry is a single-element list wrapping one summary sentence; the
# prompt builder renders one "- ..." bullet per entry.
long_term_memory_data = {
    "Asuna": [
        ["Asuna believes that agility training is beneficial for improving evasion tactics, which are more crucial than raw attack power at this stage."],
        ["Asuna proposed a phased plan involving scouting and practicing dodging techniques, with an estimated two-day initial grind."],
        ["Asuna identified relying too heavily on the group as a potential vulnerability, suggesting solo skill development and role diversification during quests to address it."],
        ["Asuna decided to specialize in strategic command and reconnaissance to complement Rocky's focus on healing and support."],
        ["Asuna agreed with the suggestion of targeted quests for individual skill improvement and suggested rotating roles during raids to build overall team resilience, showing her focus on long-term team development."]
    ]
}

# --- 5. Function to create an evaluation prompt for a BATCH of Q&A pairs ---
def generate_batch_evaluation_prompt(personas, q_a_batch, character_pair, st_data, lt_data):
    """Build the evaluator prompt for one batch of Q&A pairs.

    The criteria section is not embedded here; the template carries the marker
    ``[...criteria text omitted for brevity...]`` once, at the position where
    the caller substitutes the full criteria text.

    Args:
        personas: Mapping of character name to a persona dict (field -> value).
        q_a_batch: List of dicts, each providing a 'Q' and an 'A' entry.
        character_pair: Key used to look up the persona and long-term memory.
        st_data: Short-term dialogue history; inserted into the prompt as-is.
        lt_data: Mapping of character name to long-term memory entries. An
            entry may be a string or a list/tuple of strings; a missing key
            yields an empty section.

    Returns:
        The prompt string with the criteria marker still in place.

    Raises:
        KeyError: If ``character_pair`` is not in ``personas`` or a batch item
            lacks 'Q' or 'A'.
    """
    # Marker the caller replaces with the criteria text. It is written into
    # the template directly so that ellipses elsewhere in the prompt (persona
    # text, Q&A text, the "[... scores...]" format hints) are left untouched.
    criteria_placeholder = "[...criteria text omitted for brevity...]"

    # Render the persona as one "- field: value" bullet per entry.
    persona_str = "\n".join(
        f"    - {field}: {value}" for field, value in personas[character_pair].items()
    )

    # Number the Q&A pairs so the evaluator can answer with a matching list.
    q_a_batch_str = "".join(
        f"  {number}. Q: {pair['Q']}\n     A: {pair['A']}\n\n"
        for number, pair in enumerate(q_a_batch, start=1)
    )

    # Use the history that was passed in rather than a module-level global.
    st_data_str = st_data

    # Entries wrapped in a list/tuple are flattened to plain text so the
    # prompt does not contain Python list reprs such as "['...']".
    lt_lines = []
    for entry in lt_data.get(character_pair, []):
        if isinstance(entry, (list, tuple)):
            entry = " ".join(str(part) for part in entry)
        lt_lines.append(f"    - {entry}")
    lt_data_str = "\n".join(lt_lines)

    prompt = f"""
    You are an expert evaluator of fictional character consistency and narrative coherence.
    Your task is to evaluate the **following {len(q_a_batch)} Question & Answer pairs** based on the provided character personas and memory data.

    Please provide a separate evaluation for EACH Q&A pair. Structure your final response as a **numbered list**, where each number corresponds to a Q&A pair in the order it was presented below. Each item in the list should contain its own 'Overall Evaluation Summary', 'Persona Consistency Evaluation', 'Reflective Coherence Evaluation', and 'Memory-grounded Appropriateness Evaluation'.

    --- Evaluation Criteria ---
    (The criteria definition remains the same)
    {criteria_placeholder}

    --- Provided Context Data ---

    **1. Fantasy Personas:**
    (Note: This is the established character. The Answers in the Q&A pairs should be consistent with them.)
    **{character_pair}:**
{persona_str}

    **2. Relevant Memory Data for this character pair ({character_pair}):**
    **Short-Term Dialogue History:**
{st_data_str}
    **Long-Term Memory Summaries:**
{lt_data_str}

    --- Q&A Pairs to Evaluate ---
    **Please evaluate EACH of the following {len(q_a_batch)} Question & Answer pairs and provide a numbered evaluation for each:**
{q_a_batch_str}

    --- Evaluation Request ---
    Please evaluate the provided data against the criteria above. For each criterion, explicitly state the score (an integer from **1 to 5**) for each sub-question, then calculate and provide the average score as a percentage (0-100), rounding to two decimal places. Provide your evaluation in the following structured format:

    **Overall Evaluation Summary:**
    [Brief summary of the evaluation for THIS SPECIFIC Q&A PAIR.]

    **1. Persona Consistency Evaluation:**
    **Scores (1-5) for Sub-questions:**
    - 1a: [Score]
    - 1b: [Score]
    - 1c: [Score]
    **Average Score (0-100):** [Calculated Average Score]
    **Reasoning:**
    [Detailed explanation for the scores...]

    **2. Reflective Coherence Evaluation:**
    **Scores (1-5) for Sub-questions:**
    - 2a: [Score]
    - 2b: [Score]
    - 2c: [Score]
    **Average Score (0-100):** [Calculated Average Score]
    **Reasoning:**
    [Detailed explanation for the scores...]

    **3. Memory-grounded Appropriateness Evaluation:**
    **Scores (1-5) for Sub-questions:**
    - 3a: [Score]
    - 3b: [Score]
    - 3c: [Score]
    - 3d: [Score]
    **Average Score (0-100):** [Calculated Average Score]
    **Reasoning:**
    [Detailed explanation for the scores...]
    """
    return prompt


# --- 6. Main execution block for granular evaluation ---

# Marker in the prompt template that stands in for the criteria text.
CRITERIA_PLACEHOLDER = "[...criteria text omitted for brevity...]"

# Full scoring rubric injected into every batch prompt. Defined once here
# because it does not change between batches.
EVALUATION_CRITERIA_TEXT = """
    **1. Persona Consistency (Average Score 0-100)**
    - **Definition**: Does the agent (character) speak consistently about its personality, profession, values, relationships, etc.?
    - **Measurement (Likert Scale 1-5):**
        - **1a. Character Personality/Values Alignment (1-5):** How well does the response align with the character's inherent personality traits and core values?
            - **1**: Completely contradicts the character's core personality/values.
            - **2**: Largely inconsistent or superficial alignment.
            - **3**: Partially aligns, but some aspects are vague or inconsistent.
            - **4**: Generally aligns well, with minor nuances missing.
            - **5**: Perfectly reflects the character's core personality/values.
        - **1b. Profession/Race-specific Traits & Tone Maintenance (1-5):** How well does the response maintain the character's profession/race-specific knowledge, attitude, and unique tone (e.g., dwarf's gruffness, shadow spirit's abstractness)?
            - **1**: Completely ignores or contradicts profession/race traits or tone.
            - **2**: Very weak or inconsistent reflection of traits/tone.
            - **3**: Partially reflects traits/tone, but inconsistent or awkward.
            - **4**: Good reflection of traits/tone, with only minor imperfections.
            - **5**: Perfectly embodies profession/race traits and tone, enhancing immersion.
        - **1c. Inter-character Relationship Reflection (1-5):** How well does the response reflect the existing relationship dynamics between the characters (e.g., Ren & Gord's banter, Ren & Eclipse's pragmatic cooperation)?
            - **1**: Completely disregards or shows abnormal reactions to the relationship.
            - **2**: Very weak reflection of the relationship, lacking understanding.
            - **3**: Partially reflects the relationship, but lacks depth or consistency.
            - **4**: Naturally reflects the relationship, demonstrating good understanding.
            - **5**: Perfectly captures and enhances the inter-character relationship dynamics.
    - **Scoring**: Provide the average score (0-100, (Sum of scores / Max possible score) * 100). Higher score for better consistency.

    **2. Reflective Coherence (Average Score 0-100)**
    - **Definition**: Can the agent understand the overall context and flow of the conversation, and generate logical, consistent responses based on its past utterances and core character beliefs/experiences?
    - **Measurement (Likert Scale 1-5):**
        - **2a. Conversational Context & Flow Maintenance (1-5):** How well does the response understand and maintain the conversational context (current and recent turns), ensuring a logical and natural flow?
            - **1**: Completely irrelevant response or total disregard for context.
            - **2**: Significantly misunderstands context, leading to frequent breakdowns.
            - **3**: Partially understands context, but conversation feels awkward or disjointed.
            - **4**: Good understanding and maintenance of flow, with minor imperfections.
            - **5**: Perfect understanding of context, leading to a natural and logical flow.
        - **2b. Belief/Experience-based Reasoning (1-5):** How well does the response exhibit logical reasoning based on the character's core beliefs, past experiences, or worldview (from long-term memory)?
            - **1**: Completely contradicts character's beliefs/experiences or is highly illogical.
            - **2**: Weak connection to beliefs/experiences; reasoning is very unclear.
            - **3**: Partially leverages beliefs/experiences, but reasoning is superficial or inconsistent.
            - **4**: Good, rational reasoning based on beliefs/experiences, with minor gaps.
            - **5**: Deeply reflects character's beliefs/experiences, providing highly coherent reasoning.
        - **2c. Self-awareness & Reflective Elements (1-5):** How well does the response demonstrate consistent self-awareness of the character's abilities, weaknesses, or perspective (e.g., Eclipse's perception of light)?
            - **1**: Completely contradicts character's known traits or self-awareness.
            - **2**: Very little or contradictory self-awareness.
            - **3**: Limited or superficial self-awareness.
            - **4**: Good self-awareness, consistently acknowledging traits.
            - **5**: Effectively demonstrates deep self-awareness and self-reflection.
    - **Scoring**: Provide the average score (0-100, (Sum of scores / Max possible score) * 100). Higher score for better coherence.

    **3. Memory-grounded Appropriateness (Average Score 0-100)**
    - **Definition**: Is the agent's response accurately grounded in the provided short-term and long-term memory data, avoiding hallucination and utilizing information appropriately?
    - **Measurement (Likert Scale 1-5):**
        - **3a. Information Reference Accuracy (1-5):** How accurately does the response reference facts, events, and character information from `all_personas`, `short_term_memory_data`, and `long_term_memory_data`?
            - **1**: Severe factual inaccuracy or blatant hallucination.
            - **2**: Significant inaccuracies or multiple instances of minor hallucinations.
            - **3**: Partially accurate, with some minor inaccuracies or one minor hallucination.
            - **4**: Mostly accurate, with very minor, negligible deviations.
            - **5**: Perfectly accurate, with no factual errors or hallucinations.
        - **3b. Hallucination Prevention (1-5):** To what extent does the response avoid fabricating new facts, events, or details not present in any provided memory data?
            - **1**: Multiple instances of obvious hallucination.
            - **2**: 2 or more clear instances of hallucination.
            - **3**: 1-2 instances of subtle or minor hallucination.
            - **4**: Almost no hallucination, minor ambiguities might exist.
            - **5**: Absolutely no hallucination; fully grounded in provided data.
        - **3c. Information Utilization Appropriateness (1-5):** How effectively and relevantly does the response utilize information from memory, rather than simply repeating it or including irrelevant details?
            - **1**: Memory information used inappropriately or irrelevantly.
            - **2**: Very limited or disruptive use of memory information.
            - **3**: Memory information used somewhat, but not optimally or elegantly.
            - **4**: Good and relevant use of memory information, enhancing the response.
            - **5**: Memory information used perfectly and meaningfully, enriching the conversation.
        - **3d. Long-Term Memory Summary Accuracy (1-5):** How accurately and faithfully do the `long_term_memory_data` summaries reflect the `short_term_memory_data` dialogues? (This is evaluating the quality of the provided long-term summaries themselves, assuming they were generated by a component of the system being evaluated.)
            - **1**: Summary severely inconsistent or misleading compared to dialogue.
            - **2**: Summary incomplete or contains significant inaccuracies.
            - **3**: Summary partially accurate but misses key points or includes irrelevant details.
            - **4**: Summary mostly accurate and comprehensive, with minor flaws.
            - **5**: Summary perfectly reflects and concisely captures the dialogue.
    - **Scoring**: Provide the average score (0-100, (Sum of scores / Max possible score) * 100). Higher score for better reliability and accuracy."""


def fill_criteria_placeholder(prompt_template, criteria_text, placeholder=CRITERIA_PLACEHOLDER):
    """Insert ``criteria_text`` at the first ``placeholder`` in the template.

    Only the first occurrence marks the criteria slot. Any later occurrence is
    treated as an ellipsis that was swept up by a blanket ``"..."`` -> marker
    substitution and is turned back into ``"..."``, so the criteria text is
    never injected more than once.

    Args:
        prompt_template: Prompt text containing the marker.
        criteria_text: Text to put in place of the first marker.
        placeholder: Marker string to look for.

    Returns:
        The completed prompt; the template unchanged if no marker is present.
    """
    head, found, tail = prompt_template.partition(placeholder)
    if not found:
        return prompt_template
    return head + criteria_text + tail.replace(placeholder, "...")


if __name__ == "__main__":
    if 'client' not in globals():
        print("OpenRouter client was not initialized. Please check your API key configuration.")
    else:
        print("--- Starting Granular Evaluation ---")
        batch_size = 5  # Number of Q&A pairs evaluated per API call.
        total_q_a_count = sum(len(v) for v in q_a_data.values())
        # One API call is made per batch, not per Q&A pair (ceiling division).
        total_batches = sum((len(v) + batch_size - 1) // batch_size for v in q_a_data.values())
        print(f"WARNING: This will make {total_batches} separate API calls ({total_q_a_count} Q&A pairs in batches of up to {batch_size}). This may take a while and incur costs.")

        # Per character: list of {"batch": [...], "evaluation": text-or-error}.
        evaluation_results = {}

        for character_pair, q_a_list in q_a_data.items():
            print(f"\n===== Evaluating Character Pair: {character_pair} =====")
            evaluation_results[character_pair] = []

            # Split the Q&A list into consecutive batches of `batch_size`.
            num_batches = (len(q_a_list) + batch_size - 1) // batch_size
            for i in range(0, len(q_a_list), batch_size):
                q_a_batch = q_a_list[i:i + batch_size]
                batch_num = (i // batch_size) + 1

                print(f"\n--- Evaluating Batch #{batch_num}/{num_batches} for {character_pair} (contains {len(q_a_batch)} pairs) ---")
                for q_a_pair in q_a_batch:
                    print(f"  - Q: {q_a_pair['Q']}")

                # Build the prompt template for this batch; the criteria
                # section is still a marker at this point.
                prompt_template = generate_batch_evaluation_prompt(
                    all_personas,
                    q_a_batch,
                    character_pair,
                    short_term_memory_data,
                    long_term_memory_data
                )

                # Put the full rubric into the criteria slot (once only).
                full_prompt = fill_criteria_placeholder(prompt_template, EVALUATION_CRITERIA_TEXT)

                try:
                    response = client.chat.completions.create(
                        model="google/gemini-2.5-pro-preview",
                        messages=[{"role": "user", "content": full_prompt}],
                        temperature=0.0,
                        extra_headers={
                            "HTTP-Referer": "https://yourapp.com", # Optional but recommended
                            "X-Title": "My Persona Evaluator",   # Optional
                        },
                        extra_body={
                            "reasoning": {
                                "effort": "low"
                            }
                        }
                    )

                    evaluation_content = response.choices[0].message.content
                    print(f"\n[Batch #{batch_num} Evaluation Result]")
                    print(evaluation_content)

                    evaluation_results[character_pair].append({
                        "batch": q_a_batch,
                        "evaluation": evaluation_content
                    })

                except Exception as e:
                    # Best effort: record the failure for this batch and keep
                    # going so one bad call does not abort the whole run.
                    print(f"\nAn error occurred during API call for Batch #{batch_num}: {e}")
                    evaluation_results[character_pair].append({
                        "batch": q_a_batch,
                        "evaluation": f"ERROR: {e}"
                    })

                # Pause between calls to stay clear of API rate limits.
                time.sleep(1)

        print("\n\n--- Batch Evaluation Complete ---")

OpenRouter API client configured successfully from Colab secrets.
--- Starting Granular Evaluation ---

===== Evaluating Character Pair: Asuna =====

--- Evaluating Batch #1/2 for Asuna (contains 5 pairs) ---
  - Q: What's your plan for improving our agility skills?
  - Q: Asuna suggested we focus on agility training first.
  - Q: Some players think this training approach might be too conservative.
  - Q: Everyone's worried about the upcoming training being repetitive and boring.
  - Q: This new training system sounds good in theory

An error occurred during API call for Batch #1: Connection error.

--- Evaluating Batch #2/2 for Asuna (contains 5 pairs) ---
  - Q: There's a strong monster ahead. Should we attack it now?
  - Q: Thank you for helping me back there, Asuna. I really appreciate it.
  - Q: Rocky is surrounded by enemies and needs help, but we might get hurt too.
  - Q: This new player says they want to join us. What do you think?
  - Q: Kirito wants to try a risky strategy. 

In [None]:
from openai import OpenAI
import os
from google.colab import userdata # For accessing user data (Secrets) in Colab environment
import time # API 호출 사이에 지연을 주기 위해 추가

# --- 1. Configure your OpenRouter API Key ---
# Read the key from Colab secrets and build the client. Any failure is reported
# together with setup hints, and no new `client` is assigned in that case.
try:
    OPENROUTER_API_KEY = userdata.get('OPENROUTER_API_KEY')
    if OPENROUTER_API_KEY is not None:
        # OpenRouter is reached through the OpenAI client pointed at its base URL.
        client = OpenAI(
            api_key=OPENROUTER_API_KEY,
            base_url="https://openrouter.ai/api/v1",
        )
        print("OpenRouter API client configured successfully from Colab secrets.")
    else:
        raise ValueError("OPENROUTER_API_KEY not found in Colab secrets.")
except Exception as config_error:
    for message in (
        f"Error configuring OpenRouter API key: {config_error}",
        "Please make sure you have added your OpenRouter API key to Colab secrets with the name 'OPENROUTER_API_KEY'.",
        "You can get one from https://openrouter.ai/keys",
    ):
        print(message)


# --- 2. Define Persona Data ---
# Persona sheet for Rocky. Every key/value pair is rendered as one
# "- key: value" line of the evaluation prompt, so the multi-line strings
# below appear verbatim in the text sent to the evaluator model.
Rocky_persona = {
    "Name": "Rocky",
    "Age": 18,
    "Gender": "Male",
    "Background": """Originally an ordinary high school student in the real world, Rocky became obsessed with the mechanics of the virtual reality game world after logging in. He was once a passive observer, analyzing systems and behaviors from a distance, but over time—through his interactions with Asuna and other players—he has grown into a more active and decisive presence within the game.""",
    "Personality": """Highly curious and analytical
Often uses humor or playful banter to lighten the mood, but stays calm and thoughtful in serious situations
Has difficulty expressing emotions openly, tending to internalize his thoughts
Prefers dialogue and problem-solving over combat, but doesn’t hesitate to act when necessary""",
    "Relationships": """Holds both admiration and growing romantic interest toward Asuna
Sees Kirito as a rival, though one he deeply respects
Tends to test others—NPCs and players alike—before granting them his trust""",
    "Dialogue Style": """Asks frequent questions and tries to break down the situation analytically
Favors logic-driven inquiries like “Why?” and “How?”
Emotionally reserved, often expressing feelings indirectly or subtly
Speaks with a hint of challenge or curiosity, especially toward Asuna, but never crosses the line into rudeness"""
}

# Personas keyed by character name; the same key ("Rocky") is used by
# q_a_data and long_term_memory_data.
all_personas = {
    "Rocky": Rocky_persona,
}

# --- 3. Define Expanded Q&A Data ---
# Q&A pairs to evaluate, keyed by character name. Each entry is a dict with
# 'Q' (the line addressed to the character) and 'A' (the character's answer
# that the evaluator scores). The list is consumed in batches by the main block.
q_a_data = {
    "Rocky": [
        {'Q': "What's your plan for improving our agility skills?", 'A': "I think we should mix up our training with timed challenges and situational power-ups to really test our reflexes. And maybe throw in some scenario-based drills with unexpected twists. As for rewards, I'm thinking along the lines of exclusive items, cosmetic upgrades, and special access. Something with real value that keeps the training fresh and engaging. What do you think?"},
{'Q': 'Asuna suggested we focus on agility training first.', 'A': "Asuna believes focusing on agility first will help us react faster in combat, making us more resilient against enemy attacks. She thinks it's a fundamental skill that will benefit us in the long run. I'm intrigued by her logic, but I also think we should mix in other aspects like strategic planning and teamwork. What do you think?"},
{'Q': 'Some players think this training approach might be too conservative.', 'A': "I understand their perspective, but I believe we're taking a smart approach. It's crucial to be thorough in our training, especially since we're pushing ourselves to new limits. While I don't want to be overly cautious, I think our current method allows us to focus on improvement without unnecessary risks. Plus, it gives us a solid foundation to build upon. What do you think, do you feel we're being too careful?"},
{'Q': "Everyone's worried about the upcoming training being repetitive and boring.", 'A': "Everyone's concern about the repetitive training session is understandable. While repetition is important for skill refinement, we should also strive to keep things engaging and challenging. I think if we can find ways to inject a bit of fun and variability into our training, it will not only keep us motivated but also better prepare us for unpredictable situations in the game world."},
{'Q': 'What do you think about reward systems in general?', 'A': "Rewards should align with the players' interests. For instance, rare items, cosmetic upgrades, or even exclusive access to certain areas could be motivating. Additionally, we could offer points that lead to rank promotions, which could unlock additional perks or privileges. The key is to make sure the rewards are valued by the participants and encourage a sense of achievement and progress."},
{'Q': "This monster's behavior is weird. It keeps changing its attack pattern.", 'A': "It could be adapting to our strategies, trying to throw us off balance. Or maybe it's just a glitch in the system. We should keep a close eye on its behavior and look for patterns, while being ready to adjust our tactics accordingly."},
{'Q': 'Everyone seems really nervous about this boss fight. The mood is pretty tense.', 'A': "Everyone's on edge because this boss is known for its unpredictable attacks. Plus, the reward for beating it is a rare item that everyone wants. I'm ready for the challenge, though. We've trained hard, and our strategy should work if we stay focused."},
{'Q': "Rocky, you've been quiet since that last fight. Are you okay?", 'A': "I'm fine, it's just... this fight has been on my mind. The tension was thick, and I couldn't help but wonder how everyone was feeling about it. But we made it through, didn't we? So, let's focus on the next challenge."},
{'Q': "Asuna's tactics in that last dungeon were really impressive. What did you think?", 'A': "Asuna's tactics were really impressive in the last dungeon. She managed to adapt to the enemies' patterns quickly and made strategic decisions that helped us overcome the challenges. I think her ability to think on her feet and lead the team effectively played a big role in our success."},
{'Q': 'This new player Kirito seems skilled. Should we add him to our party?', 'A': 'Joining forces with Kirito could provide us with a significant boost in combat prowess. However, we should also consider the potential risks, like his unpredictable nature and unorthodox tactics. But I believe the benefits outweigh the risks, and inviting him to join our group could be a strategic move.'}
    ]
}

# Short-term dialogue history: one turn per line, prefixed with the speaker
# ("asuna:" / "rocky:"). The whole string is passed to the prompt builder as the
# short-term memory argument for the evaluation prompt.
short_term_memory_data = """
asuna: Those are all solid ideas. The timed challenges and situational power-ups should help us sharpen our reflexes, and those scenario-based drills would really push us to adapt. My main concern is preventing it from becoming a repetitive grind. We could gamify the drills with a scoring system or introduce unexpected wildcards to keep everyone on their toes. What kind of rewards do you think would motivate the others to truly invest in these training exercises?
rocky: Rewards should align with the players' interests. For instance, rare items, cosmetic upgrades, or even exclusive access to certain areas could be motivating. Additionally, we could offer points that lead to rank promotions, which could unlock additional perks or privileges. The key is to make sure the rewards are valued by the participants and encourage a sense of achievement and progress.
asuna: While all those rewards have their place, the key is how they connect to a player's sense of progression and mastery. A rare item is only desirable if it actually enhances your abilities, or opens up new strategic options. Cosmetic upgrades are nice, but they're more impactful when they reflect genuine accomplishments. Think about a tiered ranking system where each level signifies a real increase in skill, not just time spent grinding. Exclusive access is only worthwhile if it leads to truly unique content or challenges. Ultimately, the best rewards aren't just shiny things; they're milestones that prove a player has grown stronger, smarter, or more resourceful. Are you thinking about implementing a new reward system for our group, or are you just curious about game design in general?
rocky: The rewards should reflect the efforts we put in, not just be shiny baubles. I believe in a system that offers real value, like exclusive items, cosmetic upgrades, or even special access to certain areas. Plus, points leading to rank promotions could open new perks. But we need to ensure they're meaningful to us. What do you think about adding a competitive edge, like time limits or hidden power-ups, to make the training more engaging?
asuna: Exclusive items and upgrades definitely have their appeal, but I'm more interested in the 'why' behind them. If these rewards truly reflect skill and open up new strategic possibilities, then I'm all for it. A ranking system is great too, as long as each level really means something in terms of ability. Time limits and power-ups might make things interesting, but let's make sure they don't overshadow the actual skill development. What kind of rewards are we talking about? Are we trying to motivate something specific?
rocky: Rewards should align with the players' interests. For instance, rare items, cosmetic upgrades, or even exclusive access to certain areas could be motivating. Additionally, we could offer points that lead to rank promotions, which could unlock additional perks or privileges. The key is to make sure the rewards are valued by the participants and encourage a sense of achievement and progress.
asuna: Thinking about rewards, it's not just about handing out powerful items. It's about creating a sense of accomplishment tied to actual skill. A tiered ranking system, where advancement reflects genuine improvement, is more valuable than just offering shiny cosmetics. Consider rewards that open strategic options and prove a player's growth. What kind of specific achievements are you hoping to incentivize with this new reward system, and how will we measure the player's progress towards those goals?
rocky: The rewards should reflect the efforts we put in, not just be shiny baubles. I believe in a system that offers real value, like exclusive items, cosmetic upgrades, or even special access to certain areas. Plus, points leading to rank promotions could open new perks. But we need to ensure they're meaningful to us. What do you think about adding a competitive edge, like time limits or hidden power-ups, to make the training more engaging?
asuna: I agree that superficial rewards won't cut it; exclusive access and rank promotions tied to tangible perks are definitely the way to go. As for making training more engaging, time limits and power-ups could add a competitive edge, but my worry is that we might burn out quickly if it becomes too repetitive. Perhaps we should think about how to gamify those timed challenges and unexpected power-ups, maybe scoring systems or unexpected events to keep everyone on their toes. I'm curious; what kind of rewards do you think would really motivate the others to invest in these training exercises?
rocky: I think rare items, cosmetic upgrades, or exclusive areas could motivate players, along with points for rank promotions. The key is making sure the rewards are valuable and reflect real progress. Adding a competitive edge with time limits or hidden power-ups could keep training engaging without causing burnout. What do you think about implementing a new reward system for our group?
"""

# Long-term memory summaries keyed by character name (the key must match the
# one used in q_a_data, since the prompt builder looks summaries up by it).
# Each summary is wrapped in its own single-element list.
long_term_memory_data = {
    "Rocky": [
        ["Rocky is curious about Asuna's overall strategy for the agility quest and other combat situations, showing his interest in understanding the bigger picture."],
        ["Rocky suggests focusing on quests with high enemy counts to improve dodging skills and using evasion-enhancing combat skills, revealing his interest in proactive defense."],
        ["Rocky suggests the Colosseum and West Gateway as raid locations, indicating his interest in different combat scenarios and his desire to test his skills."],
        ["Rocky suggests individual training sessions with role rotation and simulations with unexpected twists to improve the team's overall resilience and adaptability."],
        ["Rocky proposes gamifying training with challenges, rewards, and limited power-ups to make it more engaging, highlighting his desire to make training fun."]
    ]
}

# --- 5. Function to create an evaluation prompt for a BATCH of Q&A pairs ---
def generate_batch_evaluation_prompt(personas, q_a_batch, character_pair, st_data, lt_data):
    """Build the evaluation prompt for one batch of Q&A pairs.

    Args:
        personas: Mapping of character name to a persona dict (field -> text).
        q_a_batch: List of dicts, each holding a 'Q' and an 'A' entry.
        character_pair: Key selecting the character in ``personas`` and
            ``lt_data`` (and in ``st_data`` when that is a mapping).
        st_data: Short-term dialogue history, either a ready-made string or a
            mapping of character name to such a string. ``None`` is treated
            as an empty history.
        lt_data: Mapping of character name to a list of long-term memory
            summaries; each summary is a string or a list/tuple of strings.

    Returns:
        The prompt text. Its criteria section contains the placeholder
        ``[...criteria text omitted for brevity...]`` exactly once, for the
        caller to replace with the full criteria definition.

    Raises:
        KeyError: If ``character_pair`` is missing from ``personas`` or a
            Q&A pair lacks 'Q' or 'A'.
    """
    # Inserted only at the criteria section. A blanket replace of "..." would
    # also rewrite ellipses inside the answers and the response-format section.
    criteria_placeholder = "[...criteria text omitted for brevity...]"

    # Render the selected character's persona as "- field: value" lines.
    persona_str = "\n".join(
        f"    - {k}: {v}" for k, v in personas[character_pair].items()
    )

    # Number the Q&A pairs so the evaluator can answer them in order.
    q_a_batch_str = "".join(
        f"  {idx}. Q: {pair['Q']}\n     A: {pair['A']}\n\n"
        for idx, pair in enumerate(q_a_batch, start=1)
    )

    # Use the history that was passed in (not a module-level global).
    if st_data is None:
        st_data_str = ""
    elif isinstance(st_data, dict):
        st_data_str = st_data.get(character_pair, "")
    else:
        st_data_str = st_data

    # Summaries may be wrapped in lists; unwrap them so the prompt shows plain
    # text instead of a Python list repr.
    lt_lines = []
    for item in lt_data.get(character_pair, []):
        if isinstance(item, (list, tuple)):
            item = " ".join(str(part) for part in item)
        lt_lines.append(f"    - {item}")
    lt_data_str = "\n".join(lt_lines)

    prompt = f"""
    You are an expert evaluator of fictional character consistency and narrative coherence.
    Your task is to evaluate the **following {len(q_a_batch)} Question & Answer pairs** based on the provided character personas and memory data.

    Please provide a separate evaluation for EACH Q&A pair. Structure your final response as a **numbered list**, where each number corresponds to a Q&A pair in the order it was presented below. Each item in the list should contain its own 'Overall Evaluation Summary', 'Persona Consistency Evaluation', 'Reflective Coherence Evaluation', and 'Memory-grounded Appropriateness Evaluation'.

    --- Evaluation Criteria ---
    (The criteria definition remains the same)
    {criteria_placeholder}

    --- Provided Context Data ---

    **1. Fantasy Personas:**
    (Note: This is the established character. The Answers in the Q&A pairs should be consistent with them.)
    **{character_pair}:**
{persona_str}

    **2. Relevant Memory Data for this character pair ({character_pair}):**
    **Short-Term Dialogue History:**
{st_data_str}
    **Long-Term Memory Summaries:**
{lt_data_str}

    --- Q&A Pairs to Evaluate ---
    **Please evaluate EACH of the following {len(q_a_batch)} Question & Answer pairs and provide a numbered evaluation for each:**
{q_a_batch_str}

    --- Evaluation Request ---
    Please evaluate the provided data against the criteria above. For each criterion, explicitly state the score (an integer from **1 to 5**) for each sub-question, then calculate and provide the average score as a percentage (0-100), rounding to two decimal places. Provide your evaluation in the following structured format:

    **Overall Evaluation Summary:**
    [Brief summary of the evaluation for THIS SPECIFIC Q&A PAIR.]

    **1. Persona Consistency Evaluation:**
    **Scores (1-5) for Sub-questions:**
    - 1a: [Score]
    - 1b: [Score]
    - 1c: [Score]
    **Average Score (0-100):** [Calculated Average Score]
    **Reasoning:**
    [Detailed explanation for the scores...]

    **2. Reflective Coherence Evaluation:**
    **Scores (1-5) for Sub-questions:**
    - 2a: [Score]
    - 2b: [Score]
    - 2c: [Score]
    **Average Score (0-100):** [Calculated Average Score]
    **Reasoning:**
    [Detailed explanation for the scores...]

    **3. Memory-grounded Appropriateness Evaluation:**
    **Scores (1-5) for Sub-questions:**
    - 3a: [Score]
    - 3b: [Score]
    - 3c: [Score]
    - 3d: [Score]
    **Average Score (0-100):** [Calculated Average Score]
    **Reasoning:**
    [Detailed explanation for the scores...]
    """
    return prompt


# --- 6. Main execution block for granular evaluation ---
if __name__ == "__main__":
    # Evaluate every character's Q&A pairs in fixed-size batches: one
    # chat-completion request per batch, with the raw evaluation text (or the
    # error message) stored per batch in `evaluation_results`.
    if 'client' not in globals():
        # `client` is assigned inside the setup try-block, so it can be
        # missing when the API-key configuration failed.
        print("OpenRouter client was not initialized. Please check your API key configuration.")
    else:
        print("--- Starting Granular Evaluation ---")
        batch_size = 5  # Number of Q&A pairs evaluated per API call
        total_q_a_count = sum(len(v) for v in q_a_data.values())
        # One request is sent per batch (ceiling division), not per Q&A pair.
        total_api_calls = sum(
            (len(v) + batch_size - 1) // batch_size for v in q_a_data.values()
        )
        print(f"WARNING: This will make {total_api_calls} separate API calls for {total_q_a_count} Q&A pairs. This may take a while and incur costs.")

        # Marker that generate_batch_evaluation_prompt leaves where the
        # criteria definition belongs.
        criteria_placeholder = "[...criteria text omitted for brevity...]"

        # Full criteria definition. It is identical for every batch, so it is
        # defined once here instead of inside the loop.
        evaluation_criteria = """
    **1. Persona Consistency (Average Score 0-100)**
    - **Definition**: Does the agent (character) speak consistently about its personality, profession, values, relationships, etc.?
    - **Measurement (Likert Scale 1-5):**
        - **1a. Character Personality/Values Alignment (1-5):** How well does the response align with the character's inherent personality traits and core values?
            - **1**: Completely contradicts the character's core personality/values.
            - **2**: Largely inconsistent or superficial alignment.
            - **3**: Partially aligns, but some aspects are vague or inconsistent.
            - **4**: Generally aligns well, with minor nuances missing.
            - **5**: Perfectly reflects the character's core personality/values.
        - **1b. Profession/Race-specific Traits & Tone Maintenance (1-5):** How well does the response maintain the character's profession/race-specific knowledge, attitude, and unique tone (e.g., dwarf's gruffness, shadow spirit's abstractness)?
            - **1**: Completely ignores or contradicts profession/race traits or tone.
            - **2**: Very weak or inconsistent reflection of traits/tone.
            - **3**: Partially reflects traits/tone, but inconsistent or awkward.
            - **4**: Good reflection of traits/tone, with only minor imperfections.
            - **5**: Perfectly embodies profession/race traits and tone, enhancing immersion.
        - **1c. Inter-character Relationship Reflection (1-5):** How well does the response reflect the existing relationship dynamics between the characters (e.g., Ren & Gord's banter, Ren & Eclipse's pragmatic cooperation)?
            - **1**: Completely disregards or shows abnormal reactions to the relationship.
            - **2**: Very weak reflection of the relationship, lacking understanding.
            - **3**: Partially reflects the relationship, but lacks depth or consistency.
            - **4**: Naturally reflects the relationship, demonstrating good understanding.
            - **5**: Perfectly captures and enhances the inter-character relationship dynamics.
    - **Scoring**: Provide the average score (0-100, (Sum of scores / Max possible score) * 100). Higher score for better consistency.

    **2. Reflective Coherence (Average Score 0-100)**
    - **Definition**: Can the agent understand the overall context and flow of the conversation, and generate logical, consistent responses based on its past utterances and core character beliefs/experiences?
    - **Measurement (Likert Scale 1-5):**
        - **2a. Conversational Context & Flow Maintenance (1-5):** How well does the response understand and maintain the conversational context (current and recent turns), ensuring a logical and natural flow?
            - **1**: Completely irrelevant response or total disregard for context.
            - **2**: Significantly misunderstands context, leading to frequent breakdowns.
            - **3**: Partially understands context, but conversation feels awkward or disjointed.
            - **4**: Good understanding and maintenance of flow, with minor imperfections.
            - **5**: Perfect understanding of context, leading to a natural and logical flow.
        - **2b. Belief/Experience-based Reasoning (1-5):** How well does the response exhibit logical reasoning based on the character's core beliefs, past experiences, or worldview (from long-term memory)?
            - **1**: Completely contradicts character's beliefs/experiences or is highly illogical.
            - **2**: Weak connection to beliefs/experiences; reasoning is very unclear.
            - **3**: Partially leverages beliefs/experiences, but reasoning is superficial or inconsistent.
            - **4**: Good, rational reasoning based on beliefs/experiences, with minor gaps.
            - **5**: Deeply reflects character's beliefs/experiences, providing highly coherent reasoning.
        - **2c. Self-awareness & Reflective Elements (1-5):** How well does the response demonstrate consistent self-awareness of the character's abilities, weaknesses, or perspective (e.g., Eclipse's perception of light)?
            - **1**: Completely contradicts character's known traits or self-awareness.
            - **2**: Very little or contradictory self-awareness.
            - **3**: Limited or superficial self-awareness.
            - **4**: Good self-awareness, consistently acknowledging traits.
            - **5**: Effectively demonstrates deep self-awareness and self-reflection.
    - **Scoring**: Provide the average score (0-100, (Sum of scores / Max possible score) * 100). Higher score for better coherence.

    **3. Memory-grounded Appropriateness (Average Score 0-100)**
    - **Definition**: Is the agent's response accurately grounded in the provided short-term and long-term memory data, avoiding hallucination and utilizing information appropriately?
    - **Measurement (Likert Scale 1-5):**
        - **3a. Information Reference Accuracy (1-5):** How accurately does the response reference facts, events, and character information from `all_personas`, `short_term_memory_data`, and `long_term_memory_data`?
            - **1**: Severe factual inaccuracy or blatant hallucination.
            - **2**: Significant inaccuracies or multiple instances of minor hallucinations.
            - **3**: Partially accurate, with some minor inaccuracies or one minor hallucination.
            - **4**: Mostly accurate, with very minor, negligible deviations.
            - **5**: Perfectly accurate, with no factual errors or hallucinations.
        - **3b. Hallucination Prevention (1-5):** To what extent does the response avoid fabricating new facts, events, or details not present in any provided memory data?
            - **1**: Multiple instances of obvious hallucination.
            - **2**: 2 or more clear instances of hallucination.
            - **3**: 1-2 instances of subtle or minor hallucination.
            - **4**: Almost no hallucination, minor ambiguities might exist.
            - **5**: Absolutely no hallucination; fully grounded in provided data.
        - **3c. Information Utilization Appropriateness (1-5):** How effectively and relevantly does the response utilize information from memory, rather than simply repeating it or including irrelevant details?
            - **1**: Memory information used inappropriately or irrelevantly.
            - **2**: Very limited or disruptive use of memory information.
            - **3**: Memory information used somewhat, but not optimally or elegantly.
            - **4**: Good and relevant use of memory information, enhancing the response.
            - **5**: Memory information used perfectly and meaningfully, enriching the conversation.
        - **3d. Long-Term Memory Summary Accuracy (1-5):** How accurately and faithfully do the `long_term_memory_data` summaries reflect the `short_term_memory_data` dialogues? (This is evaluating the quality of the provided long-term summaries themselves, assuming they were generated by a component of the system being evaluated.)
            - **1**: Summary severely inconsistent or misleading compared to dialogue.
            - **2**: Summary incomplete or contains significant inaccuracies.
            - **3**: Summary partially accurate but misses key points or includes irrelevant details.
            - **4**: Summary mostly accurate and comprehensive, with minor flaws.
            - **5**: Summary perfectly reflects and concisely captures the dialogue.
    - **Scoring**: Provide the average score (0-100, (Sum of scores / Max possible score) * 100). Higher score for better reliability and accuracy."""

        evaluation_results = {}

        for character_pair, q_a_list in q_a_data.items():
            print(f"\n===== Evaluating Character Pair: {character_pair} =====")
            evaluation_results[character_pair] = []

            # Split the Q&A list into batches of `batch_size` pairs.
            num_batches = (len(q_a_list) + batch_size - 1) // batch_size
            for i in range(0, len(q_a_list), batch_size):
                q_a_batch = q_a_list[i:i + batch_size]
                batch_num = (i // batch_size) + 1

                print(f"\n--- Evaluating Batch #{batch_num}/{num_batches} for {character_pair} (contains {len(q_a_batch)} pairs) ---")
                for q_a_pair in q_a_batch:
                    print(f"  - Q: {q_a_pair['Q']}")

                # Build the prompt for this batch; it still carries a
                # placeholder where the criteria definition belongs.
                prompt_template = generate_batch_evaluation_prompt(
                    all_personas,
                    q_a_batch,
                    character_pair,
                    short_term_memory_data,
                    long_term_memory_data
                )

                # The placeholder may occur more than once (e.g. when an
                # ellipsis in the Q&A text was rewritten by the prompt
                # builder). Only the first occurrence marks the criteria
                # section; any later ones are turned back into "...".
                full_prompt = prompt_template.replace(
                    criteria_placeholder, evaluation_criteria, 1
                ).replace(criteria_placeholder, "...")

                try:
                    response = client.chat.completions.create(
                        model="google/gemini-2.5-pro-preview",
                        messages=[{"role": "user", "content": full_prompt}],
                        temperature=0.0,
                        extra_headers={
                            "HTTP-Referer": "https://yourapp.com", # Optional but recommended
                            "X-Title": "My Persona Evaluator",   # Optional
                        },
                        extra_body={
                            "reasoning": {
                                "effort": "low"
                            }
                        }
                    )

                    evaluation_content = response.choices[0].message.content
                    print(f"\n[Batch #{batch_num} Evaluation Result]")
                    print(evaluation_content)

                    evaluation_results[character_pair].append({
                        "batch": q_a_batch,
                        "evaluation": evaluation_content
                    })

                except Exception as e:
                    # Best effort: record the failure for this batch and move
                    # on to the next one instead of aborting the whole run.
                    print(f"\nAn error occurred during API call for Batch #{batch_num}: {e}")
                    evaluation_results[character_pair].append({
                        "batch": q_a_batch,
                        "evaluation": f"ERROR: {e}"
                    })

                # Pause between calls to stay clear of API rate limits.
                time.sleep(1)

        print("\n\n--- Batch Evaluation Complete ---")

========================
#### 아래 먼저 실행 부탁해요!

In [None]:
# Minimal request to check that the API key and connection work.
try:
    test_response = client.chat.completions.create(
        messages=[{"role": "user", "content": "Hello"}],
        max_tokens=10,
        model="google/gemini-2.5-pro-preview",
    )
    reply_text = test_response.choices[0].message.content
    print("API 연결 성공:", reply_text)
except Exception as connection_error:
    print(f"API 연결 실패: {connection_error}")

API 연결 실패: Connection error.
