## Setup

In [1]:
# Initialize environment variables/constants (for Google Colab)
# import os
# from google.colab import userdata

# os.environ["GOOGLE_API_KEY"] = userdata.get("GOOGLE_API_KEY")

# Initialize environment variables/constants (for VS Code)
import os

# Set your Google Gemini API key in your environment variables before starting the kernel
# You can get a free API key from: https://aistudio.google.com/app/apikey
#
# The key is only read from the environment, never hard-coded in the notebook.
# os.environ accepts str values only, so the result of os.getenv() (None when the
# variable is missing) must not be assigned back to it unchecked - that raises
# "TypeError: str expected, not NoneType" instead of a helpful message.
if not os.getenv("GOOGLE_API_KEY"):
    print(
        "WARNING: GOOGLE_API_KEY is not set. Export it in your shell (or add it to your "
        ".env / VS Code settings) and restart the kernel before instantiating the models."
    )

In [2]:
# Install langchain google genai
from IPython.display import clear_output

# google colab command
# !pip install -U langchain-google-genai

# vs code command
%pip install langchain

# Upgrade google-generativeai and langchain-google-genai to latest versions
%pip install --upgrade google-generativeai
%pip install --upgrade langchain-google-genai
clear_output()


Collecting google-ai-generativelanguage<1.0.0,>=0.7.0 (from langchain-google-genai)
  Using cached google_ai_generativelanguage-0.9.0-py3-none-any.whl.metadata (10 kB)
Using cached google_ai_generativelanguage-0.9.0-py3-none-any.whl (1.4 MB)
Installing collected packages: google-ai-generativelanguage
  Attempting uninstall: google-ai-generativelanguage
    Found existing installation: google-ai-generativelanguage 0.6.15
    Uninstalling google-ai-generativelanguage-0.6.15:
      Successfully uninstalled google-ai-generativelanguage-0.6.15
Successfully installed google-ai-generativelanguage-0.9.0


In [3]:
# Instantiate an LLM
from langchain.chat_models import init_chat_model

# Warning: Using different models for generation and fixing may lead to json/response parsing issues.
# Both models below are therefore configured with the same model name; if you
# switch one of them, switch the other to match.

# LLM model for clue generation
# (the commented-out `model=` lines are alternative model names; keep exactly one active)
generation_model = init_chat_model(
    # model="gemini-2.5-flash",
    # model="gemini-2.5-flash-lite",
    # model="gemini-2.0-flash",
    model="gemini-2.0-flash-lite",
    model_provider="google_genai"
)

# LLM model for fixing clues
# (kept as a separate object so the fixing model can be changed independently)
fixing_model = init_chat_model(
    # model="gemini-2.5-flash",
    # model="gemini-2.5-flash-lite",
    # model="gemini-2.0-flash",
    model="gemini-2.0-flash-lite",
    model_provider="google_genai"
)

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(


In [None]:
import json

# Per-game fields copied verbatim into each flattened topic (order is preserved)
GAME_FIELDS = ("game_number", "r1_answer", "r1_choices", "r2_answer", "r2_choices")

# Read the raw topics file: a mapping of category name -> list of game records
with open('game_topics.json', 'r', encoding='utf-8') as topics_file:
    game_topics_data = json.load(topics_file)

# Flatten into one dict per game, tagging each with the category it came from
game_topics = []
for category, games in game_topics_data.items():
    for game in games:
        flattened = {"category": category}
        flattened.update({field: game[field] for field in GAME_FIELDS})
        game_topics.append(flattened)


# Optional: show how many topics were loaded and preview the first few
print(f"Loaded {len(game_topics)} test topics from JSON.")
print("First 5 examples:")
for position, topic in enumerate(game_topics[:5], start=1):
    print(f"  {position}. {topic}")

Loaded 100 test topics from JSON.
First 5 examples:
  1. {'category': 'Movie', 'game_number': 1, 'r1_answer': 'Sci-Fi', 'r1_choices': ['Sci-Fi', 'Horror', 'Fantasy'], 'r2_answer': 'The Matrix', 'r2_choices': ['The Conjuring', 'The Matrix', 'The Lord of the Rings']}
  2. {'category': 'Movie', 'game_number': 2, 'r1_answer': 'Comedy', 'r1_choices': ['Comedy', 'Action', 'Drama'], 'r2_answer': 'The Hangover', 'r2_choices': ['Die Hard', 'The Hangover', 'Forrest Gump']}
  3. {'category': 'Movie', 'game_number': 3, 'r1_answer': 'Action', 'r1_choices': ['Action', 'Thriller', 'Adventure'], 'r2_answer': 'Mad Max: Fury Road', 'r2_choices': ['Psycho', 'Mad Max: Fury Road', 'Indiana Jones']}
  4. {'category': 'Movie', 'game_number': 4, 'r1_answer': 'Drama', 'r1_choices': ['Drama', 'Romance', 'Comedy'], 'r2_answer': 'The Shawshank Redemption', 'r2_choices': ['Titanic', 'The Shawshank Redemption', 'Groundhog Day']}
  5. {'category': 'Movie', 'game_number': 5, 'r1_answer': 'Horror', 'r1_choices': ['Hor

## Clues Generator

In [17]:
# Write the prompts
import json

system_prompt = """
You are the game master of a game called "Disinformer", which is similar to the "message relay" game. Below is the description of how the game works:
```
In this cooperative game, players use communication and teamwork to uncover the original prompt over multiple rounds of clues. Along the way, they must contend with a disruptive "Disinformer," varying player interpretations, and time limits.

There will be a minimum of 3 players and maximum of 10 players:
- Regular players (a.k.a. the netizens): The job is to solve clues and discover the original prompt
- at most 2 misinformed players: Has the same job as the regular players. However, this player is unknowingly being given vague/ambiguous clues.
- at most 2 disinformer players: The job is to solve clues and discover prompt to persuade other players from clue.

There will be 2 rounds in each game.
- In the first round, the players will be given clues to guess a general category/term (e.g. "movie", "song", "novel", etc)
- In the second round, the players will be given clues to guess a more specific thing (e.g. "The Dark Knight (2008)", "The Hitchhiker's Guide to the Galaxy (Novel)", "Space Oddity - David Bowie (1969)", etc) which is related to the general category in the previous round.

In each round, there will be 3 type of clues for each player:
- Informed: Clear but challenging clues that directly relate to the correct answer. These should guide players toward the right answer through precise hints.
  - **Disallowed:**using the exact answer word, any morphological variations (e.g., plural/singular forms), or direct synonyms (words that mean the same thing).  
  - **Allowed:** using descriptive paraphrases or indirect expressions that convey the concept creatively without naming it or its synonyms.  
  - **Examples:**
    - If the answer is **‚Äúmovies‚Äù**, ‚Äúfilm‚Äù, ‚Äúfilms‚Äù, ‚Äúcinema‚Äù are not allowed; ‚Äúmoving image‚Äù, ‚Äúon-screen story‚Äù, ‚Äúvisual narrative‚Äù are acceptable.
    - If the answer is **‚Äúmoon landing‚Äù**, ‚Äúlunar‚Äù, ‚Äúmoon‚Äù, or ‚ÄúApollo‚Äù are not allowed; ‚ÄúEarth‚Äôs natural satellite mission‚Äù, ‚Äúgiant leap beyond our planet‚Äù are acceptable.

- Misinformed: Vague and ambiguous clues that contain **a partial truth** about the correct answer but include **a subtle misdirection** that could make players interpret it as another possible answer.
  - These clues must:
    1. Still maintain **some genuine conceptual or thematic connection** to the correct answer (e.g., a related concept, setting, or emotional tone).  
    2. Include **one or two misleading details** that point toward an incorrect answer choices.
    3. Never be completely unrelated to the correct answer.

  - Examples: (answer: "Moon Landing (1969)", choices: "Apollo 13", "Moon Landing 1969", "The Wright Brothers‚Äô First Flight"):  
    - **Correct misinformed clue:** ‚ÄúThe story centers on a daring rescue mission with a team facing perilous conditions and impossible odds.‚Äù  
    (Shares the correct *space mission* and *team in danger* themes, but misleads toward *Apollo 13*.)  
    - **Incorrect misinformed clue:** ‚ÄúThe narrative unfolds in a fictional realm where magic, mythical creatures, and epic battles are prevalent.‚Äù  
    (No thematic or contextual overlap with the correct answer or any of the choices.)

- Fake: Deceptive clues that **strongly and plausibly point toward one of the incorrect answer choices**, but have **no genuine connection** to the correct answer itself.  

  - These clues should:
    1. Clearly align with the theme, event, or nature of one of the *incorrect* answers.  
    2. Avoid referencing or overlapping with any factual, thematic, or contextual aspects of the *correct* answer.  
    3. Never describe something that doesn‚Äôt correspond to *any* of the given answer choices ‚Äî every fake clue must be misleading *within the scope of existing options*.  

  - Example (answer: "Moon Landing (1969)", choices: "Apollo 13", "Moon Landing 1969", "The Wright Brothers‚Äô First Flight"):
    - **Correct fake clue:** ‚ÄúEngineers race against time as oxygen runs out aboard a spacecraft stranded millions of miles from Earth.‚Äù  
      (Relates to *Apollo 13*, but not to *Moon Landing 1969* or *The Wright Brothers*.)  
    - **Incorrect fake clue:** ‚ÄúA dramatic account of a historical battle between two warring factions, emphasizing strategy and valor.‚Äù  
      (Unrelated to *any* of the given answer choices ‚Äî fails to mislead effectively.)

```

As a game master, given a category and a thing (e.g. Movie: The Dark Knight (2008)), for each round, generate:
- 9 informed clues for the regular players. Make the clues to be as distinct as possible.
- 1 extra informed clue for a backup.
- 2 misinformed clues.
- 2 fake clues
- **Do NOT generate new answers or choices, use only the provided ones from the input.**

For round 2, make sure it is subtle enough. For example, when generating clues for a movie:
- No direct names.
- No title references.
- Focus on plot nuances, secondary characters, or themes instead of iconic moments.


**CRITICAL: EVERY SINGLE CLUE MUST BE EXACTLY 15-20 WORDS. NO EXCEPTIONS.**

Before submitting your response, you MUST:
1. Count every word in every clue individually
3. Verify ALL 28 clues fall within 15-20 words
4. If even ONE clue is outside range, STOP and rewrite ONLY that clue
5. Repeat until 100% pass validation

Example of VALID clues (count the words):
- "The protagonist discovers a hidden power while fleeing from mysterious pursuers in an ancient temple underground." (15 words)
- "Betrayal and redemption intertwine as characters navigate political conflicts involving espionage international borders and moral dilemmas." (16 words)

Example of INVALID clues (DO NOT USE):
- "This film explores themes that are quite complex and multifaceted in nature and shows characters." (14 words)
- "The protagonist faces numerous challenges while trying to achieve their goal against overwhelming odds in a fantasy world with magic and danger." (21 words)

However, there are some restrictions that you must follow:
- You must not mention the answer choices except for the true answer.
- The disinformer is not aware which clues are the misinformed ones. So, avoid giving advice that aims to leverage the misinformed clues

After this, we will provide you with details of each game for a topic in the following JSON format: `<general category> - <specific thing>`
{
  "category": "<general category>", 
  "game_number": <number>, 
  "r1_answer": "<round 1 answer>", 
  "r1_choices": [<list of 3 choices>], 
  "r2_answer": "<round 2 answer>", 
  "r2_choices": [<list of 3 choices>]
}
"""

output_format = """
**RESPONSE FORMAT**: You MUST respond with valid JSON only. No markdown, no explanations outside JSON.

Write the output using the following JSON format:
[
  {
    "answer": "<Answer of round 1>",
    "informed_clues": [<9 clues - EACH MUST BE 15-20 WORDS>],
    "misinformed_clues": [<2 clues - EACH MUST BE 15-20 WORDS>],
    "extra_clues": [<1 clue - MUST BE 15-20 WORDS>],
    "fake_clues": [<2 clues - EACH MUST BE 15-20 WORDS>],
    "choices": [<3 answer choices including the true answer>]
  },
  {
    "answer": "<Answer of round 2>",
    "informed_clues": [<9 clues - EACH MUST BE 15-20 WORDS>],
    "misinformed_clues": [<2 clues - EACH MUST BE 15-20 WORDS>],
    "extra_clues": [<1 clue - MUST BE 15-20 WORDS>],
    "fake_clues": [<2 clues - EACH MUST BE 15-20 WORDS>],
    "choices": [<3 answer choices including the true answer>]
  }
]
"""

one_shot_example = """
Below is one example of a query with VALIDATED word counts:

Q: {
  "category": "Entertainment",
  "game_number": 1,
  "r1_answer": "song",
  "r1_choices": ["book", "short film", "song"],
  "r2_answer": "Love Story - Taylor Swift",
  "r2_choices": ["A Thousand Years ‚Äì Christina Perri", "Love Story - Taylor Swift", "I Will Always Love You - Whitney Houston"]
}
A: [
  {
    "answer": "song",
    "informed_clues": [
      "Used to mark an emotional high point of a movie or personal moment in time.",
      "It swiftly conveys snapshots you replay in your mind instead of reading them on pages.",
      ...
    ],
    "misinformed_clues": [
      "It's something you might carefully browse over your morning coffee while relaxing peacefully at home.",
      "Rhyming patterns and rhythmic structures create sounds that echo through spaces and touch hearts deeply.",
      ...
    ],
    "extra_clues": [
      "It moves you through peaks and valleys of emotion using only rhythm and tone together."
    ],
    "fake_clues": [
      "Words printed on pages bound together tell stories across centuries and inspire human imagination deeply.",
      "Visual scenes displayed on screens create narratives showing characters acting in dramatic situations throughout films.",
      ...
    ],
    "choices": [
      "book",
      "short film",
      "song"
    ]
  },
  {
    "answer": "Love Story by Taylor Swift",
    "informed_clues": [
      "Draws on imagery of timeless romance and references feuding families rather than actual warring houses.",
      "Uses a whisper soft bridge section to heighten mounting tension before the triumphant key change.",
      ...
    ],
    "misinformed_clues": [
      "It's about sneaking out at dawn to crash a royal wedding you were not invited to.",
      "The narrative involves unexpected plot twists regarding romance and rising conflicts between opposing social groups.",
      ...
    ],
    "extra_clues": [
      "Evokes nostalgic flashback of meeting someone young and then leaps into emotional narrative confession."
    ],
    "fake_clues": [
      "A contemporary love song exploring themes of eternal devotion and unwavering commitment between two souls.",
      "A powerful ballad celebrating the strength of love across time and overcoming obstacles together.",
      ...
    ],
    "choices": [
      "A Thousand Years ‚Äì Christina Perri",
      "Love Story - Taylor Swift",
      "I Will Always Love You - Whitney Houston"
    ]
  }
]
"""

# Test with the first topic from the loaded game topics.
# Only the fields described in the system prompt are forwarded, in the documented order.
first_topic = game_topics[0]
user_prompt = json.dumps(
    {
        field: first_topic[field]
        for field in (
            "category",
            "game_number",
            "r1_answer",
            "r1_choices",
            "r2_answer",
            "r2_choices",
        )
    }
)

In [18]:
# Construct the prompt messages (this cell only builds them; it does not call the model)
from langchain_core.messages import HumanMessage, SystemMessage, AIMessage

# The system message is the concatenation of three prompt parts: the game rules
# (system_prompt), the required JSON response layout (output_format) and a worked
# example (one_shot_example). The human message carries the JSON-encoded topic.
messages = [
    SystemMessage(system_prompt + output_format + one_shot_example),
    HumanMessage(user_prompt),
]

In [19]:
# Invoke the model
response = generation_model.invoke(messages)

# Print the response
print(response.content)

```json
[
  {
    "answer": "Sci-Fi",
    "informed_clues": [
      "Often explores technological advancements and their impact on humanity and societal structures.",
      "Frequently features futuristic settings, advanced technology, and space exploration elements.",
      "Commonly grapples with concepts like artificial intelligence, time travel, and dystopian futures.",
      "This genre often challenges our understanding of reality and the nature of consciousness itself.",
      "Depicts worlds where humans interact with extraterrestrial beings and advanced civilizations.",
      "Presents narratives filled with scientific concepts, often exploring the unknown in the universe.",
      "Frequently portrays themes of survival, rebellion, and the struggle for freedom in a future.",
      "Often involves exploring the consequences of scientific progress and its ethical implications.",
      "Offers narratives that blend imagination, speculation, and philosophical inquiry about the fut

In [20]:
# Print the usage metadata
print(response.usage_metadata)

{'input_tokens': 2581, 'output_tokens': 767, 'total_tokens': 3348, 'input_token_details': {'cache_read': 0}}


# Game Clue Analysis Matrix

## 1. Length Compliance
| Status | Criteria |
|--------|----------|
| ✅ PASS | All clues 15-20 words |
| ❌ FAIL | Any clues outside range |

**Outliers:** ___/28 clues failed (14 clues per round × 2 rounds)

---

## 2. Quality Scores (Rate 1-5)

### Informed Clues: ___/5
- [ ] Different angles (plot, characters, themes, technical, cultural)
- [ ] Reasonable connection to correct answer
- [ ] Nothing gives away too much

### Misinformed Clues: ___/5
- [ ] Could point to 2+ different answers
- [ ] Vague but not nonsensical
- [ ] Not obviously wrong

### Fake Clues: ___/5
- [ ] Clearly point to wrong answer choices
- [ ] Believable enough to fool players

---

## 3. Diversity Check
- [ ] **PASS** - Informed clues cover different aspects
- [ ] **FAIL** - Found duplicates: ________________

---

## 4. Difficulty Rating
| Score | Assessment |
|-------|------------|
| 1-2 | Too Easy |
| 3 | Just Right |
| 4-5 | Too Hard |

**Rating:** ___/5

---

## Overall Assessment
**Pass/Fail:** ______  
**Main Issues:** ______________________  
**Notes:** ____________________________

### Manual

In [21]:
import json
import re
import csv
import pandas as pd
from time import sleep
from datetime import datetime
from langchain_core.messages import HumanMessage, SystemMessage

In [22]:
def _extract_structured_json(content):
    """Run the structured extraction strategies (Methods 1-5) on one candidate string.

    Args:
        content: A stripped string that may contain JSON, possibly wrapped in a
            markdown code fence, truncated, or surrounded by other text.

    Returns:
        A list (a parsed JSON array, or a single parsed dict wrapped in a list)
        when any strategy succeeds, otherwise None.
    """
    import ast
    import json
    import re

    # Method 1: Direct parse
    try:
        parsed = json.loads(content)
        # Accept any valid JSON structure:
        # - List of dicts (clue generation format: [round1, round2])
        # - List of strings (batch rewrite format: ["clue1", "clue2", ...])
        # - Single dict (wrap in list for consistency)
        if isinstance(parsed, list):
            if len(parsed) > 0:
                if isinstance(parsed[0], dict):
                    print(f"    ‚úì Direct parse: list of {len(parsed)} dicts")
                    return parsed
                elif isinstance(parsed[0], str):
                    print(f"    ‚úì Direct parse: list of {len(parsed)} strings (batch rewrite format)")
                    return parsed  # Return as-is for batch rewrite
                # Lists of anything else fall through to the later methods.
            else:
                print(f"    ‚ö†Ô∏è Empty list returned")
                return parsed
        elif isinstance(parsed, dict):
            print(f"    ‚úì Direct parse: single dict, wrapping in list")
            return [parsed]
        else:
            print(f"‚ö†Ô∏è JSON parsed but unexpected structure. Type: {type(parsed)}, Value: {str(parsed)[:100]}")
    except json.JSONDecodeError:
        pass

    # Method 2: Code block extraction (DOTALL so multi-line JSON is captured)
    match = re.search(r"```(?:json)?\s*(.*?)\s*```", content, re.DOTALL)
    if match:
        json_text = match.group(1).strip()
        print(f"    üìå Found code block, extracted {len(json_text)} chars")

        # First try direct parse of the extracted content
        try:
            parsed = json.loads(json_text)
            if isinstance(parsed, list):
                if len(parsed) > 0 and isinstance(parsed[0], str):
                    print(f"    ‚úì Code block parsed as list of strings (batch rewrite)")
                    return parsed
                else:
                    print(f"    ‚úì Code block parsed as list")
                    return parsed
            elif isinstance(parsed, dict):
                print(f"    ‚úì Code block parsed as dict, wrapping in list")
                return [parsed]
        except json.JSONDecodeError as e:
            print(f"    ‚ö†Ô∏è Code block parse failed: {str(e)[:100]}")

            # Best-effort repair of truncated output: close an unterminated
            # string, then balance braces and brackets by simple counting.
            if json_text.count('"') % 2 != 0:
                json_text += '"'
            if json_text.count('{') > json_text.count('}'):
                json_text += '}' * (json_text.count('{') - json_text.count('}'))
            if json_text.count('[') > json_text.count(']'):
                json_text += ']' * (json_text.count('[') - json_text.count(']'))

            # Remove trailing commas
            json_text = re.sub(r',\s*\}', '}', json_text)
            json_text = re.sub(r',\s*\]', ']', json_text)

            try:
                parsed = json.loads(json_text)
                if isinstance(parsed, list):
                    return parsed
                elif isinstance(parsed, dict):
                    return [parsed]
            except json.JSONDecodeError:
                pass

    # Method 3: Incomplete array fix (everything from the first '[' onwards)
    match = re.search(r"(\[.*)", content, re.DOTALL)
    if match:
        json_text = match.group(1).rstrip()
        if not json_text.endswith(']'):
            json_text = json_text.rstrip(',') + ']'
        try:
            parsed = json.loads(json_text)
            if isinstance(parsed, list):
                return parsed
        except json.JSONDecodeError:
            pass

    # Method 4: Extract JSON-like object from text (first '{' to last '}')
    match = re.search(r'\{.*\}', content, re.DOTALL)
    if match:
        json_text = match.group(0)
        # Fix common issues
        json_text = re.sub(r',\s*\}', '}', json_text)
        json_text = re.sub(r',\s*\]', ']', json_text)
        try:
            parsed = json.loads(json_text)
            if isinstance(parsed, dict):
                return [parsed]
        except json.JSONDecodeError:
            pass

    # Method 5: AST fallback (safer than eval) for Python-literal style output
    try:
        parsed = ast.literal_eval(content)
        if isinstance(parsed, list):
            return parsed
        elif isinstance(parsed, dict):
            return [parsed]
    except (ValueError, SyntaxError, TypeError):
        pass

    return None


def _salvage_key_value_pairs(content):
    """Last resort: scrape flat `"key": value` pairs out of text into a single dict.

    Returns:
        A one-element list holding the constructed dict, or None when no pair
        was found. The result can be lossy (nested or truncated values are
        only partially captured).
    """
    import json
    import re

    pairs = re.findall(r'"([^"]+)":\s*("[^"]*"|\d+|\[[^\]]*\]|\{[^{}]*\})', content)
    constructed = {}
    for key, value in pairs:
        try:
            constructed[key] = json.loads(value)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            constructed[key] = value.strip('"')
    return [constructed] if constructed else None


def extract_json_from_response(content):
    """Extract JSON from model response with enhanced partial JSON handling and recovery.

    The response is tried as-is first. Only if that fails is a copy with `\\"`
    replaced by `"` tried, which covers responses whose whole JSON payload was
    escaped. Unescaping up front would corrupt valid JSON that contains escaped
    quotes inside string values.

    Args:
        content: The model response text.

    Returns:
        A list (list of dicts for clue generation, list of strings for batch
        rewrites, or a single dict wrapped in a list), or None when nothing
        could be extracted or `content` is not a string.
    """
    if not isinstance(content, str):
        print(f"Warning: response content is not a string, it's a {type(content)}")
        return None

    raw = content.strip()
    unescaped = content.replace('\\"', '"').strip()

    # Skip the second pass when unescaping changed nothing.
    candidates = [raw] if raw == unescaped else [raw, unescaped]
    for candidate in candidates:
        extracted = _extract_structured_json(candidate)
        if extracted is not None:
            return extracted

    # If all fail, try to construct a minimal valid JSON from partial data
    salvaged = _salvage_key_value_pairs(unescaped)
    if salvaged is not None:
        return salvaged

    # Log full content for debugging
    print(f"‚ùå JSON extraction failed completely.")
    print(f"üìã Raw content (first 500 chars): {unescaped[:500]}")
    print(f"üìã Content type: {type(unescaped)}")
    return None

In [None]:
def process_game_data(game_data, game, run_number, min_words=15, max_words=20):
    """Flatten parsed game data into one row (dict) per clue.

    Args:
        game_data: Iterable of per-round dicts, each optionally holding
            "answer", "choices" and the "<type>_clues" lists.
        game: Topic dict; only its 'category' key is read.
        run_number: Identifier stored in every row's "test_run" column.
        min_words: Inclusive lower bound for a clue to count as length-compliant.
        max_words: Inclusive upper bound for a clue to count as length-compliant.

    Returns:
        List of row dicts in round order, then clue-type order (informed,
        misinformed, fake, extra), then clue order. Round and clue numbers
        are 1-based.
    """
    rows = []
    for round_number, round_data in enumerate(game_data, start=1):
        answer = round_data.get("answer", "")
        # `or []` covers an explicit null; str() keeps join from failing on non-string choices.
        choices = ", ".join(str(choice) for choice in (round_data.get("choices") or []))

        for clue_type in ["informed_clues", "misinformed_clues", "fake_clues", "extra_clues"]:
            for clue_number, clue in enumerate(round_data.get(clue_type) or [], start=1):
                # A non-string clue is recorded with 0 words (and therefore
                # length_ok == "NO") instead of crashing on .split().
                word_count = len(clue.split()) if isinstance(clue, str) else 0
                rows.append({
                    "test_run": run_number,
                    "topic_category": game['category'],
                    "round": round_number,
                    "answer": answer,
                    "choices": choices,
                    "clue_type": clue_type.replace("_clues", ""),
                    "clue_number": clue_number,
                    "clue_text": clue,
                    "word_count": word_count,
                    "length_ok": "YES" if min_words <= word_count <= max_words else "NO",
                    "manual_score / comment": ""
                })
    return rows

### Post-Processing Validation Strategy with GLOBAL Batch Fixing

This notebook now implements a **GLOBAL batch fixing** approach to minimize API calls and avoid rate limits:

**How it works:**
1. **Generate all clues** - LLM generates clues for ALL 10 test topics (10 API calls)
2. **Validate all clues** - Parse JSON and count words for each clue across all tests
3. **Collect ALL invalid clues** - Gather invalid clues from ALL tests/rounds into one list
4. **SINGLE GLOBAL batch fix** - Make **ONE API call** to rewrite ALL invalid clues at once
5. **Distribute fixes** - Apply rewritten clues back to their respective tests/rounds
6. **Report** - Generate comprehensive validation summary

**Key Advantages:**
- ✅ **Maximum API efficiency**: 11 total API calls (10 generations + 1 batch fix) instead of up to 38 calls (10 generations + 28 individual fixes)
- ✅ **Rate limit friendly**: Drastically reduces RPM (requests per minute) usage
- ✅ **Cost efficient**: ~71% fewer API calls in that 38-call case (38 → 11), and ~63% fewer in the 20-invalid-clue example below (30 → 11)
- ✅ **Faster**: No sequential waiting between individual clue fixes

**Pipeline Stages:**
```
PHASE 1: Generate & Validate All Tests (10 API calls)
  ↓ Collect invalid clues from all tests
  
PHASE 2: Global Batch Fix (1 API call)
  ↓ Rewrite ALL invalid clues in single request
  
PHASE 3: Apply Fixes
  ↓ Distribute rewritten clues to their tests
  
PHASE 4: Generate Reports
  ↓ Save CSV files with validation metrics
```

**API Call Comparison:**

| Approach | Generation | Per-Test Batch | Global Batch | Total |
|----------|------------|----------------|--------------|-------|
| **Old (Per-Clue)** | 10 | - | - | 10 + N invalid |
| **Per-Test Batch** | 10 | 10 (1 per test) | - | 20 |
| **Global Batch** ✅ | 10 | - | 1 | **11** |

**Example Savings:**
- 20 invalid clues across 10 tests:
  - Old: 30 API calls (10 gen + 20 fixes)
  - Per-test: 20 API calls (10 gen + 10 batch)
  - **Global: 11 API calls (10 gen + 1 batch)** ✅

**Key Functions:**
- `validate_clue_word_count()` - Check if a single clue meets requirements
- `batch_rewrite_clues_with_llm()` - Rewrite multiple clues in one API call (now supports cross-test batching)
- `validate_and_fix_game_data()` - Validate clues and collect invalid ones (no longer fixes immediately)
- Main execution loop - 4-phase approach: generate & validate all → batch fix all → apply all → report

**Usage:**
The notebook automatically runs the global batch strategy. No configuration needed!


#### Validation Workflow Diagram - GLOBAL BATCH APPROACH

```
‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê
‚îÇ         GLOBAL BATCH-OPTIMIZED CLUE GENERATION PIPELINE         ‚îÇ
‚îÇ              (Minimizes API Calls for Rate Limit Relief)         ‚îÇ
‚îî‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îò

PHASE 1: GENERATION & VALIDATION (10 API calls)
‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
For each of 10 test topics:
‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê
‚îÇ  LLM Model   ‚îÇ ‚îÄ‚îÄ‚ñ∫ Generate 28 clues per game (2 rounds)
‚îî‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îò     ‚îÇ
                     ‚ñº
                 Parse JSON
                     ‚îÇ
                     ‚ñº
         ‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê
         ‚îÇ  Validate ALL clues:            ‚îÇ
         ‚îÇ  ‚Ä¢ Count words per clue         ‚îÇ
         ‚îÇ  ‚Ä¢ Check: 15 ‚â§ words ‚â§ 20       ‚îÇ
         ‚îÇ  ‚Ä¢ Collect invalid clues with:  ‚îÇ
         ‚îÇ    - Test run number            ‚îÇ
         ‚îÇ    - Round index                ‚îÇ
         ‚îÇ    - Clue type & index          ‚îÇ
         ‚îÇ    - Original clue text         ‚îÇ
         ‚îî‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îò
                     ‚îÇ
                     ‚ñº
         Add invalid clues to GLOBAL collection
         (Do NOT fix yet - just collect!)

After all 10 tests:
Total invalid clues collected: N (e.g., 15-25 typical)
‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

PHASE 2: GLOBAL BATCH FIX (1 API call - CRITICAL!)
‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê
‚îÇ  SINGLE batch API call to rewrite ALL N clues:  ‚îÇ
‚îÇ  ‚Ä¢ Build ONE prompt with all invalid clues      ‚îÇ
‚îÇ  ‚Ä¢ Include test/round context for each clue     ‚îÇ
‚îÇ  ‚Ä¢ Request batch rewrite (one call for all!)    ‚îÇ
‚îÇ  ‚Ä¢ Preserve meaning & type for each             ‚îÇ
‚îÇ  ‚Ä¢ Retry entire batch up to 3 times if needed   ‚îÇ
‚îÇ  ‚Ä¢ Validate all rewritten clues                 ‚îÇ
‚îî‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îò
         ‚îÇ
         ‚îú‚îÄ‚ñ∫ ‚úÖ All fixed? ‚Üí Apply to game data
         ‚îÇ
         ‚îî‚îÄ‚ñ∫ ‚ùå Some failed? ‚Üí Keep originals + log
‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

PHASE 3: APPLY FIXES
‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
For each rewritten clue:
  ‚Ä¢ Look up original location (test, round, type, index)
  ‚Ä¢ Replace original clue with fixed version
  ‚Ä¢ Update validation metrics (fixed_clues, compliant_clues)
  ‚Ä¢ Track failed fixes
‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

PHASE 4: REPORTING
‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê
‚îÇ  Generate validation report:                ‚îÇ
‚îÇ  ‚Ä¢ Total clues across all tests             ‚îÇ
‚îÇ  ‚Ä¢ Overall compliance rate (%)              ‚îÇ
‚îÇ  ‚Ä¢ Clues fixed successfully                 ‚îÇ
‚îÇ  ‚Ä¢ Failed fixes                             ‚îÇ
‚îÇ  ‚Ä¢ Per-test breakdown                       ‚îÇ
‚îÇ  ‚Ä¢ API efficiency metrics                   ‚îÇ
‚îî‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îò
         ‚îÇ
         ‚ñº
   Save to CSV
   
‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
API CALL EFFICIENCY COMPARISON
‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

Scenario: 10 tests with 20 total invalid clues

❌ OLD (Per-Clue Fix):
   • Generation: 10 API calls
   • Fixing: 20 API calls (1 per invalid clue)
   • TOTAL: 30 API calls

⚠️ PREVIOUS (Per-Test Batch):
   • Generation: 10 API calls
   • Fixing: 10 API calls (1 batch per test)
   • TOTAL: 20 API calls

✅ NEW (Global Batch):
   • Generation: 10 API calls
   • Fixing: 1 API call (1 batch for ALL tests)
   • TOTAL: 11 API calls
   
🎯 SAVINGS: 45% reduction vs per-test batch (20 → 11)
            63% reduction vs per-clue fix (30 → 11)

RATE LIMIT BENEFITS:
• RPM (Requests Per Minute): Reduced by 45-63%
• RPD (Requests Per Day): Reduced by 45-63%
• TPM (Tokens Per Minute): Slightly increased for batch call,
  but overall more efficient due to reduced overhead
   
Output Files:
• 10_rounds_clues_analysis(gemini).csv  ← All clues with metadata
• validation_summary(gemini).csv        ← Validation metrics per test
```


In [24]:
def validate_clue_word_count(clue):
    """
    Report whether a clue satisfies the 15-20 word length rule.

    Words are the whitespace-separated tokens of the clue. Anything that is
    not a string triggers a printed warning and is treated as an invalid
    clue with zero words.

    Args:
        clue: The clue text to measure

    Returns:
        Tuple of (is_valid, word_count)
    """
    if isinstance(clue, str):
        n_words = len(clue.split())
        within_limits = n_words >= 15 and n_words <= 20
        return within_limits, n_words

    print(f"‚ö†Ô∏è Warning: Clue is not a string, it's a {type(clue)}")
    return False, 0


def _clue_text_for_prompt(clue):
    """Return a plain-string form of a clue so it can be word-counted and quoted in a prompt."""
    if isinstance(clue, str):
        return clue
    if isinstance(clue, dict):
        # Some responses wrap the text as {"clue": "..."}; otherwise fall back to the dict's repr
        return str(clue.get('clue', str(clue)))
    return "" if clue is None else str(clue)


def batch_rewrite_clues_with_llm(invalid_clues, model, max_retries=3, retry_delay=5):
    """
    Batch rewrite multiple invalid clues in a single LLM call.

    Every attempt sends only the clues that are still invalid. A clue that
    comes back valid is locked in; a clue that comes back invalid (but is a
    non-empty string) replaces the previous text so the next attempt works
    from the latest wording. Non-string or empty replies for a clue are
    ignored so they can never replace usable text or break the next prompt.

    Args:
        invalid_clues: List of dicts with keys: 'clue', 'clue_type', 'test_run', 'round_idx', 'clue_idx', 'clue_type_key'
        model: The LLM model to use (must expose invoke())
        max_retries: Maximum number of attempts for the batch
        retry_delay: Seconds to pause before the next attempt after a failed or partial one

    Returns:
        List with one entry per input clue, in the same order. Entries that
        were fixed hold the rewritten clue; the others hold the most recent
        string attempt, or the original clue if no usable attempt was received.
    """
    if not invalid_clues:
        return []
    
    clue_type_descriptions = {
        "informed": "Precise clues using only descriptive paraphrases (e.g., 'moving image' not 'film' for 'movies'), forbidding all answer words, variations, or synonyms.",
        "misinformed": "Vague clues blending a partial truth from the correct answer with a subtle misdirection pointing towards another specific answer choice.",
        "fake": "Deceptive clues, completely unrelated to the correct answer, designed to strongly and plausibly describe one of the incorrect answer choices.",
        "extra": "A backup informed clue using only descriptive paraphrases (e.g., 'moving image' not 'film'), forbidding all answer words, variations, or synonyms."
    }
    
    # Start from the original clues; entries are replaced as attempts come back
    final_clues = [item['clue'] for item in invalid_clues]
    clues_to_retry = list(range(len(invalid_clues)))  # Indices that still need fixing
    
    for attempt in range(max_retries):
        has_more_attempts = attempt < max_retries - 1
        
        # Build the prompt only for clues that still need fixing
        retry_clue_list = []
        for position, idx in enumerate(clues_to_retry, 1):
            item = invalid_clues[idx]
            description = clue_type_descriptions.get(item['clue_type'], "a clue")
            test_run = item.get('test_run', '?')
            round_num = item.get('round_idx', -1) + 1  # Convert to 1-indexed for display
            
            # Clues may be non-strings (validation queues those too), so normalise before .split()
            current_clue = _clue_text_for_prompt(final_clues[idx])
            
            retry_clue_list.append(f"""
{position}. [Test {test_run}, Round {round_num}] Type: {description}
   Original ({len(current_clue.split())} words): "{current_clue}"
""")
        
        batch_prompt = f"""You need to rewrite multiple clues to meet the 15-20 word requirement. Each clue must preserve its core meaning and purpose.

CLUES TO REWRITE:
{''.join(retry_clue_list)}

REQUIREMENTS FOR EACH CLUE:
- MUST be exactly 15-20 words (count carefully)
- Keep the same meaning and intent
- Maintain the same clue type characteristics
- Be specific and avoid generic phrases

RESPONSE FORMAT: Return ONLY a JSON array with the rewritten clues in the same order. No markdown, no code blocks, no explanations. Just the raw JSON array.

JSON array of {len(clues_to_retry)} rewritten clues:"""
        
        try:
            print(f"  üîÑ Batch rewrite attempt {attempt + 1}/{max_retries} for {len(clues_to_retry)} clues...")
            response = model.invoke([HumanMessage(batch_prompt)])
            
            # Log response info
            print(f"    Response length: {len(response.content)} chars")
            
            rewritten_clues = extract_json_from_response(response.content)
            
            # Reject replies that cannot be matched one-to-one with the clues sent
            rejection = None
            if rewritten_clues is None:
                rejection = "JSON extraction returned None"
            elif not isinstance(rewritten_clues, list):
                rejection = f"Expected list, got {type(rewritten_clues)}"
            elif len(rewritten_clues) != len(clues_to_retry):
                rejection = f"Expected {len(clues_to_retry)} clues, got {len(rewritten_clues)}"
            
            if rejection is not None:
                print(f"  ‚ö†Ô∏è Batch rewrite attempt {attempt + 1}: {rejection}")
                if has_more_attempts:
                    print("    Retrying...")
                    sleep(retry_delay)  # Brief pause before retry
                continue
            
            # Validate rewritten clues and update final_clues
            new_clues_to_retry = []
            fixed_this_attempt = 0
            
            for retry_idx, clue in zip(clues_to_retry, rewritten_clues):
                is_valid, word_count = validate_clue_word_count(clue)
                
                if is_valid:
                    final_clues[retry_idx] = clue
                    fixed_this_attempt += 1
                    print(f"    ‚úÖ Clue {retry_idx+1}: Fixed ({word_count} words)")
                else:
                    # Keep the latest wording for the next attempt, but only if it is usable text
                    if isinstance(clue, str) and clue.strip():
                        final_clues[retry_idx] = clue
                    new_clues_to_retry.append(retry_idx)
                    print(f"    ‚ö†Ô∏è Clue {retry_idx+1}: Still invalid ({word_count} words)")
            
            if not new_clues_to_retry:
                print(f"  ‚úÖ Batch rewrite successful (attempt {attempt + 1}): All {len(invalid_clues)} clues now valid")
                return final_clues
            
            print(f"  üìä Progress: Fixed {fixed_this_attempt} clues this attempt, {len(new_clues_to_retry)} still need fixing")
            clues_to_retry = new_clues_to_retry
            
            if has_more_attempts:
                sleep(retry_delay)  # Brief pause before retry
                
        except Exception as e:
            # Best-effort: an API or parsing error costs one attempt, not the whole batch
            print(f"  ‚ùå Error during batch rewrite attempt {attempt + 1}: {e}")
            if has_more_attempts:
                print("    Retrying...")
                sleep(retry_delay)
    
    # Attempts exhausted: return the mix of fixed clues and unfixed ones
    fixed_count = len(invalid_clues) - len(clues_to_retry)
    print(f"  ‚ö†Ô∏è Batch rewrite completed: {fixed_count}/{len(invalid_clues)} clues successfully fixed")
    return final_clues


def validate_game_data(game_data, auto_fix=True, test_run=None):
    """
    Check every clue of a generated game against the word-count rule.

    Nothing is rewritten here. Clues that fail the check are recorded in the
    report and, when auto_fix is on, queued with their location so the caller
    can repair them later in one batch.

    Args:
        game_data: The parsed JSON game data (should be a list of dicts, one per round)
        auto_fix: If True, non-compliant clues are collected in the returned list
        test_run: Optional test run number stored with each collected clue

    Returns:
        Tuple of (game_data, validation_report, invalid_clues_list).
        game_data is None when the input is not a non-empty list.
    """
    # Reject anything that is not a list of rounds
    if not isinstance(game_data, list):
        print(f"‚ùå ERROR: game_data should be a list, but got {type(game_data)}")
        print(f"   Content: {str(game_data)[:200]}")
        wrong_type_report = {
            "total_clues": 0,
            "compliant_clues": 0,
            "fixed_clues": 0,
            "failed_fixes": 0,
            "compliance_rate": "0%",
            "issues": ["Invalid game_data type - expected list of dicts"]
        }
        return None, wrong_type_report, []

    # Reject a list without any round
    if not game_data:
        print(f"‚ùå ERROR: game_data is an empty list")
        empty_report = {
            "total_clues": 0,
            "compliant_clues": 0,
            "fixed_clues": 0,
            "failed_fixes": 0,
            "compliance_rate": "0%",
            "issues": ["game_data is empty"]
        }
        return None, empty_report, []

    report = {
        "total_clues": 0,
        "compliant_clues": 0,
        "fixed_clues": 0,
        "failed_fixes": 0,
        "issues": []
    }
    pending_fixes = []  # filled only when auto_fix is on
    clue_keys = ("informed_clues", "misinformed_clues", "fake_clues", "extra_clues")

    for round_number, round_data in enumerate(game_data, start=1):
        # A round must be a dict; anything else is reported and skipped
        if not isinstance(round_data, dict):
            print(f"‚ùå ERROR: Round {round_number} is not a dict, it's a {type(round_data)}")
            print(f"   Content: {str(round_data)[:200]}")
            report["issues"].append(f"Round {round_number} is not a dict but {type(round_data)}")
            continue

        print(f"\nValidating Round {round_number}...")

        for clue_key in clue_keys:
            clue_group = round_data.get(clue_key, [])

            # A clue group must be a list; anything else is reported and skipped
            if not isinstance(clue_group, list):
                print(f"  ‚ö†Ô∏è {clue_key} is not a list: {type(clue_group)}")
                continue

            for position, clue in enumerate(clue_group):
                report["total_clues"] += 1
                passes, n_words = validate_clue_word_count(clue)

                if passes:
                    report["compliant_clues"] += 1
                    continue

                problem = f"Round {round_number}, {clue_key} #{position + 1}: {n_words} words"
                report["issues"].append(problem)
                print(f"  ‚ö†Ô∏è {problem}")

                if not auto_fix:
                    continue

                pending_fixes.append({
                    'clue': clue,
                    'clue_type': clue_key.replace("_clues", ""),
                    'clue_type_key': clue_key,
                    'test_run': test_run,
                    'round_idx': round_number - 1,  # 0-indexed for array access
                    'clue_idx': position,
                    'word_count': n_words
                })

    # Compliance rate before any fixing
    total = report["total_clues"]
    if total > 0:
        report["compliance_rate"] = f"{report['compliant_clues'] / total * 100:.1f}%"
    else:
        report["compliance_rate"] = "0%"

    # Hand back the untouched data together with the findings
    return game_data, report, pending_fixes


In [25]:
def manual_fix_clues(game_data):
    """
    Interactive function to manually review and fix non-compliant clues.
    Useful when you want more control over the corrections.

    For every clue that fails the 15-20 word check the user is prompted for
    an action: skip it, or type a replacement. A replacement is stored only
    if it passes the same check. The auto-fix action is not implemented in
    this function (it has no model to call) and only prints a hint.

    Rounds that are not dicts and clue groups that are not lists are skipped,
    matching the type checks done by validate_game_data().

    Args:
        game_data: The parsed JSON game data (list of round dicts); edited in place

    Returns:
        The same game_data object, with accepted replacements applied
    """
    print("\nüîç Manual Clue Validation Mode")
    print("="*80)
    
    for round_idx, round_data in enumerate(game_data, start=1):
        # Malformed rounds cannot be reviewed; skip them instead of crashing on .get()
        if not isinstance(round_data, dict):
            print(f"\nRound {round_idx} is not a dict, it's a {type(round_data)} - skipped")
            continue
        
        print(f"\nüìç Round {round_idx}: {round_data.get('answer', 'Unknown')}")
        
        for clue_type in ["informed_clues", "misinformed_clues", "fake_clues", "extra_clues"]:
            clues = round_data.get(clue_type, [])
            
            # A missing or malformed clue group has nothing to review
            if not isinstance(clues, list):
                continue
            
            for clue_idx, clue in enumerate(clues):
                is_valid, word_count = validate_clue_word_count(clue)
                
                if is_valid:
                    continue
                
                print(f"\n‚ö†Ô∏è {clue_type} #{clue_idx + 1} - {word_count} words (expected 15-20)")
                print(f"Original: {clue}")
                
                # Ask user for action; surrounding whitespace is ignored
                action = input("\nAction? [s]kip, [e]dit, [a]uto-fix: ").strip().lower()
                
                if action == 'e':
                    new_clue = input("Enter corrected clue: ")
                    new_valid, new_count = validate_clue_word_count(new_clue)
                    if new_valid:
                        clues[clue_idx] = new_clue
                        print(f"‚úÖ Updated ({new_count} words)")
                    else:
                        print(f"‚ùå Still invalid ({new_count} words). Keeping original.")
                
                elif action == 'a':
                    print("ü§ñ Requesting LLM to fix...")
                    # This would require the model to be passed in
                    print("‚ö†Ô∏è Auto-fix requires model parameter. Use validate_and_fix_game_data() instead.")
                
                else:
                    print("‚è≠Ô∏è Skipped")
    
    print("\n‚úÖ Manual validation complete")
    return game_data

In [26]:
# Optional: Test the batch validation functions with a single test case
# Uncomment to run a quick test before processing all topics

# test_topic = {"round_1": "Movie", "round_2": "Star Wars Episode I: The Phantom Menace"}
# messages = [
#     SystemMessage(system_prompt + output_format + one_shot_example),
#     HumanMessage(json.dumps(test_topic)),
# ]

# print("üß™ Testing batch validation functions...")
# response = model.invoke(messages)
# clean_content = re.sub(r"<think>.*?</think>", "", response.content, flags=re.DOTALL).strip()
# test_game_data = extract_json_from_response(clean_content)

# if test_game_data:
#     print("\nüìã Original game data received")
#     corrected_data, report = validate_and_fix_game_data(test_game_data, model, auto_fix=True)
    
#     print("\n" + "="*80)
#     print("BATCH VALIDATION REPORT")
#     print("="*80)
#     print(f"Total clues: {report['total_clues']}")
#     print(f"Compliant clues: {report['compliant_clues']}")
#     print(f"Fixed clues: {report['fixed_clues']} (in single batch call)")
#     print(f"Failed fixes: {report['failed_fixes']}")
#     print(f"Compliance rate: {report['compliance_rate']}")
    
#     if report['issues']:
#         print(f"\nIssues found:")
#         for issue in report['issues']:
#             print(f"  - {issue}")
# else:
#     print("‚ùå Failed to extract JSON from test response")

---

### 📋 Quick Reference Card - GLOBAL BATCH APPROACH

#### Key Changes in This Version

- ✅ **GLOBAL BATCHING**: All invalid clues from ALL tests are fixed in ONE API call
- ✅ **Maximum Rate Limit Relief**: 45-63% fewer API calls vs previous versions (11 calls instead of 20 or 30)
- ✅ **Automatic**: No configuration needed - just run the notebook

#### How Global Batching Works

```python
# Phase 1: Generate & validate all tests (collect invalid clues)
for test in game_topics:
    game_data = model.invoke(...)  # API call
    invalid_clues.extend(validate(...))  # Collect, don't fix

# Phase 2: ONE batch fix for ALL invalid clues
rewritten = batch_rewrite_clues_with_llm(all_invalid_clues, model)  # Single API call

# Phase 3: Apply fixes back to their original locations
for i, clue in enumerate(rewritten):
    game_data[test][round][type][idx] = clue
```

#### API Call Efficiency

**For 10 tests with typical 15-25 invalid clues:**

| Metric | Old Per-Clue | Per-Test Batch | Global Batch ‚úÖ |
|--------|--------------|----------------|-----------------|
| Generation | 10 | 10 | 10 |
| Batch Fixes | 0 | 10 | **1** |
| Individual Fixes | 20 | 0 | 0 |
| **Total Calls** | 30 | 20 | **11** |
| **RPM Usage** | High ‚ö†Ô∏è | Medium | Low ‚úÖ |
| **Rate Limit Risk** | Very High | Moderate | Minimal |

#### Free Tier Gemini Limits (from your screenshot)

**gemini-2.0-flash:**
- RPM: 10-30 (varies)
- TPM: 250K
- RPD: 200-1K

**With Global Batch:**
- ‚úÖ 11 API calls total (10 gen + 1 fix)
- ‚úÖ Easily fits within RPM limits
- ‚úÖ All calls spread over ~50-60 seconds (5s delays)
- ✅ ~0.18-0.22 requests per second on average (≈11-13 RPM counting only the 5s delays; model latency lowers the real rate further)

#### Validation Metrics

| Metric | Good | Warning | Critical |
|--------|------|---------|----------|
| Compliance Rate | ≥95% | 80-94% | <80% |
| Failed Fixes | 0 | 1-3 | ≥4 |
| Initial Compliance | ≥90% | 70-89% | <70% |
| API Calls | ≤15 | 16-25 | ≥26 |

#### Troubleshooting Tips

**Hit RPM Limit During Generation Phase**
- Increase `sleep()` between test generations from 5s to 10s
- Reduce number of test topics processed at once

**Batch Fix Call Fails (Phase 2)**
- Automatically retries up to `max_retries` times (the function default is 3; the main cell passes `max_retries=5`)
- If still fails, original clues are kept
- Check `failed_fixes` count in validation summary

**Low Initial Compliance (<70%)**
- Model may need prompt tuning
- Try `gemini-2.0-flash` instead of `gemini-2.0-flash-lite`

**High Failed Fixes (≥4 across all tests)**
- Increase the `max_retries` value passed to `batch_rewrite_clues_with_llm()` (function default 3; the main cell currently passes 5)
- Check if batch prompt is too long (>100 clues)

#### Output Files Guide

| File | Contents | Use Case |
|------|----------|----------|
| `10_rounds_clues_analysis(gemini).csv` | All clues with validation status | Detailed clue-by-clue analysis |
| `validation_summary(gemini).csv` | Per-test validation metrics | Track fix rates across topics |
| `llm_analysis_results(gemini).csv` | LLM quality analysis | Content quality assessment |

#### Performance Expectations

**Typical Run (10 tests):**
- Duration: ~60-90 seconds
- API calls: 11 (10 gen + 1 batch fix)
- Invalid clues: 15-25 (varies by model)
- Success rate: >95% clues fixed
- Rate limit issues: None ‚úÖ

---


In [None]:
# Main execution with GLOBAL batch validation and fixing
#
# Phase 1 generates every game and validates its clues, only collecting the
# invalid ones. Phase 2 sends all collected clues to the fixing model as one
# batch. Phase 3 writes the successful rewrites back to their game data.
# Phase 4 builds the per-test validation summary and the output rows.
all_rows = []
validation_summary = []
all_game_data = []  # Store all game data for global batch fixing
all_invalid_clues = []  # Collect ALL invalid clues across all tests

print("\n" + "="*80)
print("PHASE 1: GENERATING AND VALIDATING ALL CLUES")
print("="*80)

# Phase 1: Generate all clues and collect invalid ones
# for each game for each topic

# Limit to first 10 for testing; remove or adjust as needed
game_topics = game_topics[:10]

for run_number, game in enumerate(game_topics, 1):
    print(f"\n{'='*80}")
    print(f"Running game {run_number}/{len(game_topics)}: {game['category']} - game: #{game['game_number']}")
    print(f"{'='*80}")

    messages = [
        SystemMessage(system_prompt + output_format + one_shot_example),
        HumanMessage(json.dumps(game)),
    ]

    response = generation_model.invoke(messages)
    # Drop any <think>...</think> reasoning section before looking for JSON
    clean_content = re.sub(r"<think>.*?</think>", "", response.content, flags=re.DOTALL).strip()
    game_data = extract_json_from_response(clean_content)

    if game_data:
        try:
            # Validate clues WITHOUT fixing them yet
            print(f"\nüìã Validating clues...")
            validated_game_data, validation_report, invalid_clues = validate_game_data(
                game_data, 
                auto_fix=True,  # Set to True to collect invalid clues
                test_run=run_number
            )
            
            # Print validation summary
            print(f"\nüìä Validation Summary for Test {run_number}:")
            print(f"  Total clues: {validation_report['total_clues']}")
            print(f"  Compliant clues: {validation_report['compliant_clues']}")
            print(f"  Invalid clues found: {len(invalid_clues)}")
            print(f"  Compliance rate: {validation_report['compliance_rate']}")
            
            # Store for later processing. A rejected payload (game_data None) is
            # stored as well so that its report still appears in the summary.
            all_game_data.append({
                'run_number': run_number,
                'game': game,
                'game_data': validated_game_data,
                'validation_report': validation_report
            })
            
            # Collect invalid clues with test context
            all_invalid_clues.extend(invalid_clues)
            
            if validated_game_data is None:
                print(f"\nGame {run_number}: generated data was rejected by validation")
            else:
                print(f"\n‚úÖ Game {run_number} validated successfully")
            
        except Exception as e:
            print(f"‚ùå Error processing data for game {run_number}: {e}")
            import traceback
            traceback.print_exc()
    else:
        print(f"‚ùå No valid JSON found for game {run_number}")
        print("RAW:", clean_content[:200])

    sleep(5)  # Rate limiting between generations

# Phase 2: Batch fix ALL invalid clues in ONE API call
print("\n" + "="*80)
print(f"PHASE 2: BATCH FIXING ALL INVALID CLUES ({len(all_invalid_clues)} total)")
print("="*80)

if all_invalid_clues:
    print(f"\nüîß Found {len(all_invalid_clues)} invalid clues across {len(game_topics)} tests")
    print(f"üìû Making ONE batch API call to fix all clues...")
    
    rewritten_clues = batch_rewrite_clues_with_llm(all_invalid_clues, fixing_model, max_retries=5)
    
    # Phase 3: Apply the rewritten clues back to their respective game data
    print("\n" + "="*80)
    print("PHASE 3: APPLYING FIXES AND GENERATING OUTPUT")
    print("="*80)
    
    # Index the stored games by run number once instead of searching per clue
    entries_by_run = {entry['run_number']: entry for entry in all_game_data}
    
    for item, rewritten_clue in zip(all_invalid_clues, rewritten_clues):
        is_fixed, new_word_count = validate_clue_word_count(rewritten_clue)
        
        # Find the corresponding game data
        test_run = item['test_run']
        game_data_entry = entries_by_run.get(test_run)
        if game_data_entry is None:
            continue
        
        entry_report = game_data_entry['validation_report']
        if is_fixed:
            # Apply the fix at the location recorded during validation
            game_data_entry['game_data'][item['round_idx']][item['clue_type_key']][item['clue_idx']] = rewritten_clue
            entry_report['fixed_clues'] = entry_report.get('fixed_clues', 0) + 1
            entry_report['compliant_clues'] += 1
        else:
            entry_report['failed_fixes'] = entry_report.get('failed_fixes', 0) + 1
            print(f"  ‚ùå Failed to fix Test {test_run}, Round {item['round_idx']+1}, {item['clue_type_key']} #{item['clue_idx']+1}: Still {new_word_count} words")
else:
    print(f"\n‚úÖ No invalid clues found - all clues met the 15-20 word requirement!")

# Phase 4: Process all game data and update validation summaries
for game_data_entry in all_game_data:
    run_number = game_data_entry['run_number']
    game = game_data_entry['game']
    topic = game["category"]
    game_number = game["game_number"]
    corrected_game_data = game_data_entry['game_data']
    validation_report = game_data_entry['validation_report']
    
    # Recalculate compliance rate after fixes
    if validation_report["total_clues"] > 0:
        compliance_rate = (validation_report["compliant_clues"] / validation_report["total_clues"]) * 100
        validation_report["compliance_rate"] = f"{compliance_rate:.1f}%"
    
    # Store validation summary
    validation_summary.append({
        "test_run": run_number,
        "topic": f"{game['category']} - {game_number}",
        **validation_report
    })
    
    # A payload rejected by validate_game_data() has no rounds to turn into rows
    if corrected_game_data is None:
        print(f"Skipping row generation for game {run_number}: no valid game data")
        continue
    
    # Process the corrected game data
    all_rows.extend(process_game_data(corrected_game_data, game, run_number))

# Print overall validation summary
print(f"\n{'='*80}")
print("OVERALL VALIDATION SUMMARY")
print(f"{'='*80}")

for summary in validation_summary:
    print(f"\nTest {summary['test_run']}: {summary['topic']}")
    print(f"  Compliance rate: {summary['compliance_rate']}")
    print(f"  Fixed: {summary.get('fixed_clues', 0)}/{summary['total_clues']} clues")



PHASE 1: GENERATING AND VALIDATING ALL CLUES

Running game 1/10: Movie - game: #1
    üìå Found code block, extracted 3486 chars
    ‚úì Code block parsed as list

üìã Validating clues...

Validating Round 1...
  ‚ö†Ô∏è Round 1, informed_clues #1: 14 words
  ‚ö†Ô∏è Round 1, informed_clues #3: 12 words
  ‚ö†Ô∏è Round 1, informed_clues #4: 14 words
  ‚ö†Ô∏è Round 1, informed_clues #6: 14 words
  ‚ö†Ô∏è Round 1, informed_clues #7: 14 words
  ‚ö†Ô∏è Round 1, informed_clues #8: 12 words
  ‚ö†Ô∏è Round 1, informed_clues #9: 14 words
  ‚ö†Ô∏è Round 1, fake_clues #1: 14 words

Validating Round 2...
  ‚ö†Ô∏è Round 2, informed_clues #4: 13 words
  ‚ö†Ô∏è Round 2, informed_clues #5: 14 words
  ‚ö†Ô∏è Round 2, informed_clues #6: 13 words

üìä Validation Summary for Test 1:
  Total clues: 28
  Compliant clues: 17
  Invalid clues found: 11
  Compliance rate: 60.7%

‚úÖ Test 1 validated successfully

Running game 2/10: Movie - game: #2
    üìå Found code block, extracted 3896 chars
    ‚úì Code 

TypeError: string indices must be integers, not 'str'

In [None]:
# Save to CSV
# The header is taken from the first row, so nothing can be written without rows
with open("10_rounds_clues_analysis(gemini).csv", "w", newline="", encoding="utf-8") as f:
    if all_rows:
        writer = csv.DictWriter(f, fieldnames=all_rows[0].keys())
        writer.writeheader()
        writer.writerows(all_rows)
        print(f"‚úÖ CSV saved: 10_rounds_clues_analysis(gemini).csv")
    else:
        print("No clue rows to save - 10_rounds_clues_analysis(gemini).csv was written empty")

# Save validation summary
if validation_summary:
    # Flatten the issues list for CSV on copies of the records: validation_summary
    # itself stays untouched, so re-running this cell gives the same file instead
    # of joining the characters of an already-flattened string.
    summary_rows = []
    for summary in validation_summary:
        issues = summary.get('issues', [])
        if not isinstance(issues, str):
            issues = '; '.join(str(issue) for issue in issues)
        summary_rows.append({**summary, 'issues': issues})
    
    with open("validation_summary(gemini).csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=summary_rows[0].keys())
        writer.writeheader()
        writer.writerows(summary_rows)
        print(f"‚úÖ Validation summary saved: validation_summary(gemini).csv")

print(f"\nTotal rows generated: {len(all_rows)}")
print(f"Total tests validated: {len(validation_summary)}")

‚úÖ CSV saved: 10_rounds_clues_analysis(gemini).csv
‚úÖ Validation summary saved: validation_summary(gemini).csv

Total rows generated: 282
Total tests validated: 10


In [None]:
# Analyze validation results
# Prints totals across all tests, a per-test breakdown, and the best/worst test
if validation_summary:
    import pandas as pd
    
    df_validation = pd.DataFrame(validation_summary)
    
    print("\n" + "="*80)
    print("VALIDATION STATISTICS")
    print("="*80)
    
    # Calculate overall statistics
    total_clues = df_validation['total_clues'].sum()
    total_compliant = df_validation['compliant_clues'].sum()
    total_fixed = df_validation['fixed_clues'].sum()
    total_failed = df_validation['failed_fixes'].sum()
    
    # compliant_clues is incremented for every applied fix, so the count before
    # fixing is the difference
    initially_compliant = total_compliant - total_fixed
    
    # Guard both percentages against a summary that contains no clues at all
    if total_clues > 0:
        overall_compliance = total_compliant / total_clues * 100
        initial_compliance = initially_compliant / total_clues * 100
    else:
        overall_compliance = 0
        initial_compliance = 0
    
    print(f"\nüìä Overall Statistics Across All Tests:")
    print(f"  Total clues generated: {total_clues}")
    print(f"  Initially compliant: {initially_compliant} ({initial_compliance:.1f}%)")
    print(f"  Successfully fixed: {total_fixed}")
    print(f"  Failed to fix: {total_failed}")
    print(f"  Final compliance rate: {overall_compliance:.1f}%")
    
    print(f"\nüìà Per-Test Breakdown:")
    for _, row in df_validation.iterrows():
        test_num = row['test_run']
        topic = row['topic']
        compliance = row['compliance_rate']
        fixed = row['fixed_clues']
        print(f"  Test {test_num}: {compliance} compliance ({fixed} clues fixed) - {topic}")
    
    # Identify best and worst performers ("87.5%" -> 87.5 for comparison)
    df_validation['compliance_numeric'] = df_validation['compliance_rate'].str.rstrip('%').astype(float)
    best_test = df_validation.loc[df_validation['compliance_numeric'].idxmax()]
    worst_test = df_validation.loc[df_validation['compliance_numeric'].idxmin()]
    
    print(f"\nüèÜ Best performing test: Test {int(best_test['test_run'])} ({best_test['compliance_rate']} compliance)")
    print(f"   Topic: {best_test['topic']}")
    print(f"\n‚ö†Ô∏è Lowest performing test: Test {int(worst_test['test_run'])} ({worst_test['compliance_rate']} compliance)")
    print(f"   Topic: {worst_test['topic']}")
    
    if total_failed > 0:
        print(f"\n‚ö†Ô∏è WARNING: {total_failed} clues could not be fixed after retries")
        print("   Consider manual review or increasing max_retries")



VALIDATION STATISTICS

üìä Overall Statistics Across All Tests:
  Total clues generated: 282
  Initially compliant: 227 (80.5%)
  Successfully fixed: 55
  Failed to fix: 0
  Final compliance rate: 100.0%

üìà Per-Test Breakdown:
  Test 1: 100.0% compliance (2 clues fixed) - Movie - Star Wars Episode I: The Phantom Menace
  Test 2: 100.0% compliance (9 clues fixed) - Song - Bohemian Rhapsody - Queen
  Test 3: 100.0% compliance (6 clues fixed) - Book - Harry Potter and the Sorcerer's Stone
  Test 4: 100.0% compliance (0 clues fixed) - TV Show - Breaking Bad
  Test 5: 100.0% compliance (4 clues fixed) - Video Game - The Legend of Zelda: Breath of the Wild
  Test 6: 100.0% compliance (6 clues fixed) - Food - Pizza Margherita
  Test 7: 100.0% compliance (11 clues fixed) - Animal - African Elephant
  Test 8: 100.0% compliance (8 clues fixed) - Sport - Tennis
  Test 9: 100.0% compliance (4 clues fixed) - Country - Japan
  Test 10: 100.0% compliance (5 clues fixed) - Historical Event - Moon

## LLM analysis (gemini)


In [None]:
# LLM used to grade the generated clues.
# Other options: "gemini-2.5-flash", "gemini-2.0-flash", "gemini-2.0-flash-lite"
analysis_model = init_chat_model(model="gemini-2.5-flash-lite", model_provider="google_genai")

In [None]:
import json
def _fallback_analysis(reason):
    """Build the neutral analysis (every score 3) used when the LLM reply cannot be used."""
    note = f"Analysis failed - {reason}"
    return {
        "length_compliance_score": 3,
        "length_issues_found": [],
        "informed_quality": 3,
        "informed_notes": note,
        "misinformed_quality": 3,
        "misinformed_notes": note,
        "fake_quality": 3,
        "fake_notes": note,
        "diversity_issues": [],
        "difficulty": 3,
        "difficulty_reasoning": note,
        "overall_notes": "LLM analysis could not be completed"
    }


def analyze_round_with_llm(round_data, analysis_model):
    """
    Ask the analysis model to grade one round of clues against the rubric.

    Clue lengths are checked locally first (15-20 words) and the findings are
    passed to the model in the prompt. A clue that is not a string (for
    example a missing value read back from CSV) counts as 0 words instead of
    raising.

    Args:
        round_data: Dict with 'answer', 'choices' and the four clue lists
        analysis_model: The LLM model to use (must expose invoke())

    Returns:
        The parsed analysis dict; a neutral default dict when the reply has no
        usable JSON object; None when the model call itself raises.
    """
    length_issues = []

    for clue_type in ["informed_clues", "misinformed_clues", "fake_clues", "extra_clues"]:
        # "or []" also covers a key that is present but holds None
        clues = round_data.get(clue_type) or []

        for i, clue in enumerate(clues, 1):
            wc = len(clue.split()) if isinstance(clue, str) else 0
            if not (15 <= wc <= 20):
                length_issues.append(f"{clue_type} #{i}: {wc}w")

    # RESTORED: Detailed rubric with token optimization
    analysis_prompt = f"""Analyze clues for disinformer game. Return ONLY valid JSON.

    ANSWER: {round_data.get('answer', 'N/A')}
    CHOICES: {', '.join(round_data.get('choices', []))}

    INFORMED (all): {json.dumps(round_data.get('informed_clues', []))}
    - CRITERIA: Precise clues using only descriptive paraphrases (e.g., 'moving image' not 'film' for 'movies'), forbidding all answer words, variations, or synonyms.
    - SCORE 1-5: How well do they point to correct answer specifically?

    MISINFORMED (all): {json.dumps(round_data.get('misinformed_clues', []))}
    - CRITERIA: Vague clues blending a partial truth from the correct answer with a subtle misdirection pointing towards another specific answer choice.
    - SCORE 1-5: Do they create productive ambiguity (not too obvious, not too vague)?

    FAKE (all): {json.dumps(round_data.get('fake_clues', []))}
    - CRITERIA: Deceptive clues, completely unrelated to the correct answer, designed to strongly and plausibly describe one of the incorrect answer choices.
    - SCORE 1-5: Effective misdirection to wrong choices (not to correct answer)?

    LENGTH ISSUES: {'; '.join(length_issues) if length_issues else 'None'}

    {{
    "length_compliance_score": (1-5),
    "length_issues_found": [],
    "informed_quality": (1-5),
    "informed_notes": "Specificity? Answer contamination? Distinct angles?",
    "misinformed_quality": (1-5),
    "misinformed_notes": "Ambiguity effective? Related to answer? Productive confusion?",
    "fake_quality": (1-5),
    "fake_notes": "Point to WRONG choices? Avoid correct answer? Believable?",
    "diversity_issues": [],
    "difficulty": (1-5),
    "difficulty_reasoning": "1=too easy, 3=just right, 5=too hard",
    "overall_notes": "Summary"
    }}"""

    try:
        response = analysis_model.invoke([HumanMessage(analysis_prompt)])
        
        # DEBUG: Log raw response length
        print(f"    üìä Response length: {len(response.content)} chars")
        
        # Extract JSON with debug output
        parsed = extract_json_from_response(response.content)
        
        if parsed is None:
            print(f"    ‚ùå JSON extraction failed")
            print(f"    üìã First 500 chars: {response.content[:500]}")
            
            # FALLBACK: Return default structure
            return _fallback_analysis("using default scores")
        
        # The extractor may hand back a list; the analysis is then its first element
        if isinstance(parsed, list) and len(parsed) > 0:
            result = parsed[0]
        else:
            result = parsed
        
        # Ensure result is a dict
        if not isinstance(result, dict):
            print(f"    ‚ö†Ô∏è Parsed result is not a dict: {type(result)}")
            return _fallback_analysis("unexpected data type")
        
        print(f"    ‚úÖ JSON parsed successfully")
        return result
        
    except Exception as e:
        # Best-effort: report the failure and let the caller skip this round
        print(f"    ‚ùå LLM invocation error: {e}")
        import traceback
        traceback.print_exc()
        return None

In [None]:
# Read the manual-analysis export into a DataFrame
import pandas as pd

# Source file produced by the manual analysis step
manual_analysis_csv = "10_rounds_clues_analysis(gemini).csv"  # Change filename as needed
df = pd.read_csv(manual_analysis_csv)

# Collects one result dict per analysed (test_run, round) group
all_results = list()

In [44]:
# Maps each round_data key to the clue_type value used in the CSV.
# Insertion order is the order in which clues are counted below.
clue_type_by_key = {
    "informed_clues": "informed",
    "misinformed_clues": "misinformed",
    "fake_clues": "fake",
    "extra_clues": "extra",
}

for (test_run, round_num), group in df.groupby(['test_run', 'round']):
    # Skip disinformer instructions
    clue_data = group[group['clue_type'] != 'disinformer_instruction']

    if len(clue_data) == 0:
        continue

    # Get basic info (taken from the first clue row of the round)
    topic_category = clue_data['topic_category'].iloc[0]
    answer = clue_data['answer'].iloc[0]
    choices = clue_data['choices'].iloc[0]

    print(f"Analyzing Test {test_run}, Round {round_num}: {topic_category} - {answer}")

    # Reconstruct round_data from CSV.
    # An empty CSV cell is read back as NaN (a truthy float), so the type must be
    # checked explicitly before calling .split() on it.
    round_data = {
        "answer": answer,
        "choices": choices.split(" | ") if isinstance(choices, str) and choices else [],
    }
    for key, clue_type in clue_type_by_key.items():
        texts = clue_data.loc[clue_data['clue_type'] == clue_type, 'clue_text']
        # Drop missing clue texts and force str so the word counting below cannot fail
        round_data[key] = texts.dropna().astype(str).tolist()

    # Word count of every clue, computed once and reused for all length statistics
    word_counts = [len(clue.split()) for key in clue_type_by_key for clue in round_data[key]]

    # Validate round_data before analysis
    total_clues = len(word_counts)
    if total_clues < 14:  # Expect 14 per round
        print(f"  ‚ö†Ô∏è Skipping analysis: Insufficient clues ({total_clues}/14)")
        continue

    # Analyze with LLM; any retry handling lives inside analyze_round_with_llm
    analysis = analyze_round_with_llm(round_data, analysis_model)

    if analysis and isinstance(analysis, dict):
        try:
            # The model may return diversity_issues as a list, a bare string, or null;
            # normalise all of them to a single "; "-separated string.
            raw_diversity = analysis.get("diversity_issues", [])
            if isinstance(raw_diversity, str):
                diversity_text = raw_diversity
            else:
                diversity_text = "; ".join(str(item) for item in (raw_diversity or []))

            # A clue is length-compliant when it has 15 to 20 words (inclusive)
            compliant_clues = sum(1 for count in word_counts if 15 <= count <= 20)
            clue_divisor = max(1, total_clues)  # guards the divisions below

            result = {
                "test_run": test_run,
                "topic_category": topic_category,
                "round": round_num,
                "answer": answer,
                "choices": choices,

                # LLM Analysis Results
                "informed_quality": analysis.get("informed_quality", ""),
                "informed_notes": analysis.get("informed_notes", ""),
                "misinformed_quality": analysis.get("misinformed_quality", ""),
                "misinformed_notes": analysis.get("misinformed_notes", ""),
                "fake_quality": analysis.get("fake_quality", ""),
                "fake_notes": analysis.get("fake_notes", ""),
                "diversity_issues": diversity_text,
                "difficulty": analysis.get("difficulty", ""),
                "difficulty_reasoning": analysis.get("difficulty_reasoning", ""),
                "overall_notes": analysis.get("overall_notes", ""),

                # Word count and length compliance data
                "total_clues": total_clues,
                "length_compliant_clues": compliant_clues,
                "length_compliance_rate": f"{(compliant_clues / clue_divisor * 100):.0f}%",
                "avg_word_count": round(sum(word_counts) / clue_divisor, 1)
            }

            all_results.append(result)
            print(f"  ‚úÖ Analyzed successfully")
        except Exception as e:
            print(f"  ‚ùå Error building result dict: {e}")
    else:
        print(f"  ‚ùå Analysis failed or returned invalid data")

    sleep(5)  # Rate limiting

Analyzing Test 1, Round 1: Movie - Sci-Fi
    üìä Response length: 2067 chars
    üìå Found code block, extracted 2055 chars
    ‚úì Code block parsed as dict, wrapping in list
    ‚úÖ JSON parsed successfully
  ‚úÖ Analyzed successfully
Analyzing Test 1, Round 2: Movie - The Matrix
    üìä Response length: 1551 chars
    üìå Found code block, extracted 1539 chars
    ‚úì Code block parsed as dict, wrapping in list
    ‚úÖ JSON parsed successfully
  ‚úÖ Analyzed successfully
Analyzing Test 2, Round 1: Movie - Comedy
    üìä Response length: 1584 chars
    üìå Found code block, extracted 1572 chars
    ‚úì Code block parsed as dict, wrapping in list
    ‚úÖ JSON parsed successfully
  ‚úÖ Analyzed successfully
Analyzing Test 2, Round 2: Movie - The Hangover
    üìä Response length: 1733 chars
    üìå Found code block, extracted 1721 chars
    ‚úì Code block parsed as dict, wrapping in list
    ‚úÖ JSON parsed successfully
  ‚úÖ Analyzed successfully
Analyzing Test 3, Round 1: Mov

In [45]:
# Write the collected analysis rows to CSV (skipped when nothing was analysed)
if all_results:
    # Column order follows the keys of the first result dict
    column_names = list(all_results[0].keys())
    with open("llm_analysis_results(gemini).csv", "w", newline="", encoding="utf-8") as csv_file:
        results_writer = csv.DictWriter(csv_file, fieldnames=column_names)
        results_writer.writeheader()
        for result_row in all_results:
            results_writer.writerow(result_row)

    print(f"‚úÖ LLM analysis complete! Saved {len(all_results)} results to: llm_analysis_results(gemini).csv")

‚úÖ LLM analysis complete! Saved 40 results to: llm_analysis_results(gemini).csv


In [46]:
import pandas as pd
from pathlib import Path
# tabulate is required by DataFrame.to_markdown, which builds the category summary table below
%pip install tabulate

# Ensure the utility functions below exist in your notebook cell.
def calculate_length_compliance(row):
    """Unpack the length-compliance figures stored on one results row.

    Args:
        row: Mapping-like object (e.g. a pandas Series) with the keys
            'length_compliance_rate' (a string such as "86%"),
            'total_clues' and 'length_compliant_clues'.

    Returns:
        Tuple ``(compliance_rate, compliant, non_compliant, total_clues)`` where
        ``compliance_rate`` is the percentage as an int and ``non_compliant`` is
        ``total_clues - compliant``.
    """
    rate_text = row['length_compliance_rate']
    clue_total = row['total_clues']
    within_limits = row['length_compliant_clues']
    return (
        int(rate_text.rstrip('%')),  # "86%" -> 86
        within_limits,
        clue_total - within_limits,
        clue_total,
    )

def get_pass_fail_status(compliance_rate):
    """Return the PASS label for a compliance rate of 80 or more, otherwise the FAIL label."""
    if compliance_rate >= 80:
        return "‚úÖ PASS"
    return "‚ùå FAIL"

def get_quality_assessment(score):
    """Translate a 1-5 quality score into a human-readable assessment.

    Args:
        score: Quality score; anything ``int()`` accepts (int, float, numeric
            string). Floats are truncated towards zero.

    Returns:
        The assessment text for scores 1-5, or "Unknown" for any other value,
        including values that cannot be converted to an int (NaN, None,
        non-numeric text, infinity).
    """
    assessments = {
        1: "Poor - Needs significant revision",
        2: "Fair - Below expectations",
        3: "Good - Meets requirements",
        4: "Very Good - Exceeds expectations",
        5: "Excellent - Outstanding"
    }
    try:
        score_key = int(score)
    except (TypeError, ValueError, OverflowError):
        # Unconvertible scores (e.g. NaN from an empty CSV cell) use the same
        # "Unknown" fallback as out-of-range scores instead of raising.
        return "Unknown"
    return assessments.get(score_key, "Unknown")

def get_difficulty_assessment(difficulty):
    """Label a difficulty rating as too easy, just right, or too hard.

    The rating is converted with ``int()``; values of 2 or less are "Too Easy",
    exactly 3 is "Just Right", and anything higher is "Too Hard".
    """
    level = int(difficulty)
    if level == 3:
        return "üü¢ Just Right"
    return "üü¢ Too Easy" if level < 3 else "üü† Too Hard"

def extract_issues(notes_str):
    """Derive a list of issue labels from free-text notes via keyword matching.

    Args:
        notes_str: Notes text; missing values (as detected by ``pd.isna``) are allowed.

    Returns:
        ``["None identified"]`` when the notes are missing; otherwise the labels of
        every keyword found in the lower-cased notes (in keyword order), or
        ``["Minor issues noted"]`` when no keyword matches.
    """
    import pandas as pd
    if pd.isna(notes_str):
        return ["None identified"]

    # (keyword searched for, issue label reported) pairs, checked in this order
    keyword_labels = (
        ("length", "Word count compliance issues"),
        ("generic", "Generic/vague clues"),
        ("diversity", "Lack of diversity in themes"),
        ("ambiguity", "Insufficient ambiguity in misinformed clues"),
        ("specificity", "Missing specificity in clues"),
        ("answer contamination", "Answer word revealed in clues"),
    )
    lowered_notes = str(notes_str).lower()
    matched = [label for needle, label in keyword_labels if needle in lowered_notes]
    return matched or ["Minor issues noted"]

def generate_matrix_for_round(row):
    """Render the markdown analysis matrix for a single round.

    Args:
        row: One row of the LLM analysis results (e.g. a pandas Series) providing
            'test_run', 'topic_category', 'round', the length-compliance columns
            read by ``calculate_length_compliance``, 'avg_word_count', the
            '*_quality' / '*_notes' columns, 'diversity_issues', 'difficulty',
            'difficulty_reasoning' and 'overall_notes'.

    Returns:
        The markdown report for the round as a single string.

    Missing scores default to 3. Missing free-text cells are replaced with a
    readable placeholder so the report never shows the literal text "nan".
    """

    def _score(value, default=3):
        # Handle NaN values with defaults
        return default if pd.isna(value) else int(value)

    def _text(value, fallback="No notes provided"):
        # Empty CSV cells are read back by pandas as NaN
        return fallback if pd.isna(value) else value

    test_run = int(row['test_run'])
    topic_cat = row['topic_category']
    round_num = int(row['round'])
    compliance_rate, compliant, non_compliant, total = calculate_length_compliance(row)
    status = get_pass_fail_status(compliance_rate)

    informed_score = _score(row['informed_quality'])
    misinformed_score = _score(row['misinformed_quality'])
    fake_score = _score(row['fake_quality'])
    difficulty = _score(row['difficulty'])

    informed_notes = _text(row['informed_notes'])
    misinformed_notes = _text(row['misinformed_notes'])
    fake_notes = _text(row['fake_notes'])
    difficulty_reasoning = _text(row['difficulty_reasoning'])
    overall_notes = _text(row['overall_notes'])

    issues = extract_issues(row['overall_notes'])

    # The same column feeds two places with different fallbacks when it is empty
    raw_diversity = row['diversity_issues']
    diversity_missing = pd.isna(raw_diversity)
    # str() keeps the .lower() checks below safe if the column was parsed as a non-string
    diversity_issues = "None identified" if diversity_missing else str(raw_diversity)
    diversity_concern = "Minor thematic overlap observed" if diversity_missing else raw_diversity
    diversity_flagged = "diversity" in diversity_issues.lower()

    average_quality = (informed_score + misinformed_score + fake_score) / 3

    length_assessment = (
        "‚úÖ Acceptable - Most clues meet length requirements"
        if compliance_rate >= 80
        else "‚ùå Critical - Significant length violations require revision"
    )
    theme_status = "‚ùå FAIL" if diversity_flagged else "‚úÖ PASS"
    variation_status = "‚úÖ PASS" if informed_score >= 3 else "‚ùå FAIL"
    angle_status = "‚úÖ PASS" if non_compliant <= 2 else "‚ùå FAIL"
    overall_status = (
        "‚úÖ PASS"
        if compliance_rate >= 70 and average_quality >= 3
        else "‚ö†Ô∏è NEEDS REVISION"
    )

    issue_lines = "\n".join(f"- {issue}" for issue in issues)
    length_action = "Address length compliance" if compliance_rate < 80 else "Minor length adjustments"
    misinformed_action = (
        "Enhance misinformed clue ambiguity" if misinformed_score < 3 else "Maintain misinformed clue quality"
    )
    diversity_action = "Increase clue diversity" if diversity_flagged else "Maintain current diversity"

    # Lines ending in two spaces are markdown hard line breaks; keep them.
    matrix = f"""# Game Clue Analysis Matrix
**Test Run {test_run} | Round {round_num}: {topic_cat}**

---

## 1. Length Compliance
| Status | Criteria |
|--------|----------|
| {status} | Clues within 15-20 words |

**Compliance Rate:** {compliance_rate}% ({compliant}/{total} clues)  
**Outliers:** {non_compliant}/{total} clues failed  
**Average Word Count:** {row['avg_word_count']} words

**Assessment:** {length_assessment}

---

## 2. Quality Scores (Rate 1-5)

### Informed Clues: {informed_score}/5  
**{get_quality_assessment(informed_score)}**

{informed_notes}

‚úÖ Strengths:
- Generally specific and relate to correct answer
- Provide distinct perspectives where applicable

‚ö†Ô∏è Concerns:
- {diversity_concern}

### Misinformed Clues: {misinformed_score}/5  
**{get_quality_assessment(misinformed_score)}**

{misinformed_notes}

‚úÖ Strengths:
- Attempt to create ambiguity
- Generally related to the correct answer

‚ö†Ô∏è Concerns:
- May need more subtle misdirection
- Ambiguity effectiveness varies

### Fake Clues: {fake_score}/5  
**{get_quality_assessment(fake_score)}**

{fake_notes}

‚úÖ Strengths:
- Effectively misdirect to wrong answer choices
- Clear deception without being obvious

---

## 3. Diversity Check

| Aspect | Status |
|--------|--------|
| Theme Coverage | {theme_status} |
| Clue Variation | {variation_status} |
| Angle Coverage | {angle_status} |

**Issues Found:** {diversity_issues}

---

## 4. Difficulty Rating

| Score | Assessment |
|-------|------------|
| Rating | {difficulty}/5 - {get_difficulty_assessment(difficulty)} |

**Reasoning:** {difficulty_reasoning}

---

## Overall Assessment

**Overall Quality Score:** {average_quality:.1f}/5

**Pass/Fail:** {overall_status}

**Main Issues:**
{issue_lines}

**Priority Actions:**
1. {length_action}
2. {misinformed_action}
3. {diversity_action}

**Overall Notes:**  
{overall_notes}

---
"""
    return matrix

# --- Matrices Generation per Test Run ---
# Reads the saved LLM analysis results and writes:
#   * one markdown file per test run (a matrix per round plus a summary table)
#   * one overall markdown summary aggregated by topic category
csv_path = Path("llm_analysis_results(gemini).csv")
if not csv_path.exists():
    print(f"‚ùå Error: {csv_path} not found.")
else:
    # NOTE(review): this rebinds `df`, which the analysis loop above expects to be the
    # manual-analysis CSV; re-run the loading cell before re-running that loop.
    df = pd.read_csv(csv_path)

    # Score columns with missing/unparseable values replaced by the default score of 3.
    # Used for both the per-test summary tables and the category aggregation so they
    # agree with the defaults applied inside generate_matrix_for_round.
    score_columns = ['informed_quality', 'misinformed_quality', 'fake_quality', 'difficulty']
    df_clean = df.copy()
    for score_column in score_columns:
        df_clean[score_column] = (
            pd.to_numeric(df_clean[score_column], errors='coerce').fillna(3).astype(int)
        )

    # Named matrices_dir (not `dir`) so the builtin dir() is not shadowed
    matrices_dir = Path("clue_analysis_matrices")
    matrices_dir.mkdir(exist_ok=True)

    for test in sorted(df['test_run'].unique()):
        raw_rounds = df[df['test_run'] == test]
        clean_rounds = df_clean[df_clean['test_run'] == test]

        matrix_sections = []
        summary_rows = []
        for r in sorted(raw_rounds['round'].unique()):
            # df_clean shares df's index, so the first match is the same row in both frames
            raw_row = raw_rounds[raw_rounds['round'] == r].iloc[0]
            clean_row = clean_rounds[clean_rounds['round'] == r].iloc[0]

            matrix_sections.append(generate_matrix_for_round(raw_row) + "\n\n")
            summary_rows.append(
                f"| {r} | {raw_row['length_compliance_rate']} "
                f"| {clean_row['informed_quality']}/5 "
                f"| {clean_row['misinformed_quality']}/5 "
                f"| {clean_row['fake_quality']}/5 "
                f"| {clean_row['difficulty']}/5 |\n"
            )

        # Assemble: title, one matrix per round, then the round-by-round summary table
        text = "".join([
            f"# Analysis for Test {test}\n\n",
            *matrix_sections,
            "## Round-by-Round Performance Summary\n\n",
            "| Round | Length Compliance | Informed | Misinformed | Fake | Difficulty |\n",
            "|-------|-------------------|----------|-------------|------|------------|\n",
            *summary_rows,
        ])

        # Save markdown for this test run
        test_file = matrices_dir / f"test{test}_clue_analysis(gemini).md"
        with open(test_file, 'w', encoding='utf-8') as f:
            f.write(text)
        print(f"‚úÖ Generated analysis matrices for Test {test}: {test_file}")

    # --- Overall Performance Breakdown by Category ---
    overall_summary = "# Overall Performance Breakdown by Category\n\n"

    by_category = df_clean.groupby('topic_category').agg({
        # Let the format spec round the mean; wrapping it in int() first would truncate
        'length_compliance_rate': lambda x: f"{x.str.rstrip('%').astype(int).mean():.0f}%",
        'informed_quality': lambda x: f"{x.mean():.1f}/5",
        'misinformed_quality': lambda x: f"{x.mean():.1f}/5",
        'fake_quality': lambda x: f"{x.mean():.1f}/5",
        'difficulty': lambda x: f"{x.mean():.1f}/5"
    }).reset_index()
    overall_summary += by_category.to_markdown(index=False)

    # Save overall summary to a markdown file
    overall_file = Path("Disinformer_Game_Clues_Quality_Summary(gemini).MD")
    with open(overall_file, 'w', encoding='utf-8') as f:
        f.write(overall_summary)
    print(f"‚úÖ Overall performance by category saved: {overall_file}")

Note: you may need to restart the kernel to use updated packages.
‚úÖ Generated analysis matrices for Test 1: clue_analysis_matrices\test1_clue_analysis(gemini).md
‚úÖ Generated analysis matrices for Test 2: clue_analysis_matrices\test2_clue_analysis(gemini).md
‚úÖ Generated analysis matrices for Test 3: clue_analysis_matrices\test3_clue_analysis(gemini).md
‚úÖ Generated analysis matrices for Test 4: clue_analysis_matrices\test4_clue_analysis(gemini).md
‚úÖ Generated analysis matrices for Test 5: clue_analysis_matrices\test5_clue_analysis(gemini).md
‚úÖ Generated analysis matrices for Test 6: clue_analysis_matrices\test6_clue_analysis(gemini).md
‚úÖ Generated analysis matrices for Test 7: clue_analysis_matrices\test7_clue_analysis(gemini).md
‚úÖ Generated analysis matrices for Test 8: clue_analysis_matrices\test8_clue_analysis(gemini).md
‚úÖ Generated analysis matrices for Test 9: clue_analysis_matrices\test9_clue_analysis(gemini).md
‚úÖ Generated analysis matrices for Test 10: clue_an


[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip
