In [2]:
# Environment Setup for Kaggle
# The '!' tells Kaggle to run this as a terminal command
!pip install -qU langgraph langchain langchain_community langchain_huggingface
!pip install -qU bitsandbytes accelerate transformers
!pip install -qU tiktoken faiss-cpu

print("Libraries installed successfully!")

[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m157.3/157.3 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m101.8/101.8 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m2.5/2.5 MB[0m [31m43.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.0/1.0 MB[0m [31m61.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m473.8/473.8 kB[0m [31m33.8 MB/s[0m eta [36

In [3]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

# # Option A: Interactive Login (Easiest - just paste token when asked)
# login()

# Option B: Automatic (If you don't want to paste every time)
# 1. In Kaggle menu, go to "Add-ons" -> "Secrets"
# 2. Add a new secret called "HF_TOKEN" and paste your token there.
# 3. Uncomment the lines below:
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_TOKEN")
login(token=hf_token)

In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from langchain_huggingface import HuggingFacePipeline

# 1. Configuration for 4-Bit Quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# 2. Load Model
# Make sure you have accepted the license on the HuggingFace Llama-3 page!
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

print("Loading Model... (This takes about 2-3 mins on Kaggle)")

try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    
    # We load the model onto the first GPU
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        quantization_config=quantization_config,
        device_map="auto"
    )

    # 3. Create the Pipeline
    text_generation_pipeline = HuggingFacePipeline(pipeline=pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=1024,
        temperature=0.1,
        return_full_text=False
    ))
    
    print("SUCCESS: Model Loaded on GPU.")

except Exception as e:
    print(f"Error loading model: {e}")
    print("Tip: Did you turn on the GPU in the sidebar?")

Loading Model... (This takes about 2-3 mins on Kaggle)


tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

Device set to use cuda:0


SUCCESS: Model Loaded on GPU.


In [6]:
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser

# --- 1. The Analyst's Persona (System Prompt) ---
# We force Llama-3 to act as a strict critic.
analyst_template = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are an Expert Prompt Analyst. Your job is to critique user queries for Large Language Models.
You do NOT rewrite the prompt. You only identify weaknesses.

Analyze the user's input for:
1. Ambiguity (Vague intent)
2. Missing Context (Target audience, format, specific technologies)
3. Constructive Feedback (What specific details need to be added?)

Output your analysis in this exact JSON format:
{{
    "is_ambiguous": "yes/no",
    "missing_context": ["list", "of", "missing", "items"],
    "critique": "A brief summary of what is wrong."
}}
<|eot_id|><|start_header_id|>user<|end_header_id|>
User Input: {user_input}
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

analyst_prompt = PromptTemplate(
    template=analyst_template,
    input_variables=["user_input"]
)

# --- 2. The Node Function ---
def analyst_node(state):
    """
    Node 1: Analyzes the raw input and updates the state with a critique.
    """
    print(f"\n--- [Node 1] Analyst is thinking... ---")
    
    # 1. Get user input from the state
    user_query = state["user_input"]
    
    # 2. Invoke the LLM
    # We chain the prompt -> model -> string parser
    chain = analyst_prompt | text_generation_pipeline
    
    response = chain.invoke({"user_input": user_query})
    
    # 3. Simple parsing (Handling potential extra text from Llama-3)
    # Sometimes models chatter before the JSON. We find the JSON structure.
    import json
    import re
    
    try:
        # Extract JSON using regex (robustness for M.Tech projects)
        json_match = re.search(r"\{.*\}", response, re.DOTALL)
        if json_match:
            critique_data = json.loads(json_match.group())
        else:
            # Fallback if model fails JSON format
            critique_data = {"error": "Failed to parse JSON", "raw": response}
            
    except Exception as e:
        critique_data = {"error": str(e)}

    # 4. Update the state
    # We save the critique so the next agent (Architect) can see it.
    state["critique"] = critique_data
    
    print(f"Critique Generated: {critique_data['critique']}")
    return state

not need every time to run below cell


In [7]:
# --- TEST ZONE ---
# Let's test with a purposefully bad prompt
test_state = {
    "user_input": "write a code for snake game",  # Vague: No language, no library specified
    "retry_count": 0,
    "critique": None,
    "refined_prompt": "",
    "score": 0,
    "feedback_logs": []
}

# Run the node
result_state = analyst_node(test_state)

# Display the result
print("\n--- TEST RESULTS ---")
print(f"Input: {result_state['user_input']}")
print(f"Analyst Findings: {result_state['critique']}")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



--- [Node 1] Analyst is thinking... ---
Critique Generated: The prompt is too vague. It's unclear what kind of code the user wants to write (e.g., console-based, graphical, mobile app), what features they want to include (e.g., scoring, levels, multiplayer), and what programming language they want to use. Providing more context would help the model generate a more accurate and relevant response.

--- TEST RESULTS ---
Input: write a code for snake game
Analyst Findings: {'is_ambiguous': 'yes', 'missing_context': ['specific programming language', 'desired features', 'target platform'], 'critique': "The prompt is too vague. It's unclear what kind of code the user wants to write (e.g., console-based, graphical, mobile app), what features they want to include (e.g., scoring, levels, multiplayer), and what programming language they want to use. Providing more context would help the model generate a more accurate and relevant response."}


In [27]:
import re
from langchain_core.prompts import PromptTemplate

# --- 1. The Architect's Persona (Updated for Flexibility) ---
architect_template = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are the "Architect," an advanced prompt engineering agent.
Your goal is to rewrite the user's raw prompt into a high-fidelity instruction using the CO-STAR framework.

INPUT DATA:
1. User Raw Input: {user_input}
2. Analyst's Critique: {critique}

INSTRUCTIONS:
- Fix the issues identified in the critique.
- Use the CO-STAR format for the rewrite:
  (C)ontext: Provide background.
  (O)bjective: Define the specific task.
  (S)tyle: Define the style (e.g., PEP-8 for code, Professional for text).
  (T)one: Professional and technical.
  (A)udience: Define the audience (e.g., Expert Developer or Hiring Manager).
  (R)esponse: Define the format (e.g., Single Code Block, Bullet Points).

IMPORTANT:
- If the user asks for CODE, assume Python unless specified otherwise.
- If the user asks for TEXT (Essay, Interview, Email), do NOT mention code standards like PEP-8.
- OUTPUT ONLY THE REWRITTEN PROMPT. Do not say "Here is the prompt".

<|eot_id|><|start_header_id|>user<|end_header_id|>
Refine this prompt.
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

architect_prompt = PromptTemplate(
    template=architect_template,
    input_variables=["user_input", "critique"]
)

# --- 2. The Node Function (With Auto-Cleaning) ---
def architect_node(state):
    """
    Node 2: Rewrites the prompt based on the critique.
    Includes Regex cleaning to prevent 'blank output' errors.
    """
    print(f"\n--- [Node 2] Architect is designing... (Attempt {state.get('retry_count', 0) + 1}) ---")
    
    # 1. Get data
    user_query = state["user_input"]
    critique_data = state["critique"]
    
    # 2. Invoke LLM
    chain = architect_prompt | text_generation_pipeline
    raw_output = chain.invoke({
        "user_input": user_query,
        "critique": str(critique_data)
    })
    
    # --- 3. CLEAN THE OUTPUT (The Fix) ---
    # We strip out conversational filler to ensure Llama-3 answers the prompt later.
    
    # Pattern A: Look for text inside double quotes at the end (Common Llama-3 habit)
    match_quotes = re.search(r'"([^"]*)"\s*$', raw_output, re.DOTALL)
    
    # Pattern B: Look for text after common intro phrases
    match_text = re.search(r"(?:Here is the rewritten prompt:|Rewritten Prompt:)\s*(.*)", raw_output, re.DOTALL | re.IGNORECASE)
    
    if match_quotes:
        cleaned_prompt = match_quotes.group(1)
        print(">> DEBUG: Extracted prompt from quotes.")
    elif match_text:
        cleaned_prompt = match_text.group(1)
        print(">> DEBUG: Extracted prompt after filler text.")
    else:
        # Fallback: Use the whole thing if no filler detected
        cleaned_prompt = raw_output

    # Clean whitespace
    cleaned_prompt = cleaned_prompt.strip()
    
    # 4. Update State
    state["refined_prompt"] = cleaned_prompt
    
    # Increment Retry Counter
    state["retry_count"] = state.get("retry_count", 0) + 1
    
    print(f"Prompt Rewritten (Length: {len(cleaned_prompt)} chars).")
    return state

below cell not need to rum everytime

In [16]:
# --- TEST ZONE ---
# We continue from the previous 'result_state'
# It already has 'user_input' and 'critique' inside.

final_state = architect_node(result_state)

print("\n--- ARCHITECT OUTPUT ---")
print(final_state["refined_prompt"])

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



--- [Node 2] Architect is designing... (Attempt 1) ---
Prompt Rewritten.

--- ARCHITECT OUTPUT ---
Here is the rewritten prompt in the CO-STAR format:

**(C)ontext:** The goal is to create a console-based Snake game using Python.

**(O)bjective:** Write a Python code that implements a basic Snake game, featuring a snake that moves around a grid, eats food pellets, and grows in length.

**(S)tyle:** Clean, following PEP-8 guidelines.

**(T)one:** Professional and technical.

**(A)udience:** Expert Python developer.

**(R)esponse:** Single Code Block

Please provide the rewritten prompt.


In [28]:
# --- 1. The Judge's Persona (G-Eval Logic) ---
judge_template = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are the "Supreme Judge" of prompt quality.
Evaluate the "Refined Prompt" based on a strict rubric:
1. Clarity (Is the objective clear?)
2. Specificity (Are languages/tools defined?)
3. Robustness (Does it handle constraints?)

Give a score from 0 to 10 (10 is perfect).
If the score is below 8, provide specific feedback on what to fix.

Format your response as valid JSON:
{{
    "score": 8,
    "feedback": "Excellent prompt, no changes needed."
}}
<|eot_id|><|start_header_id|>user<|end_header_id|>
Refined Prompt: {refined_prompt}
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

judge_prompt = PromptTemplate(
    template=judge_template,
    input_variables=["refined_prompt"]
)

# --- 2. The Node Function ---
def judge_node(state):
    """
    Node 3: Scores the prompt.
    """
    print(f"\n--- [Node 3] Judge is evaluating... ---")
    
    refined_prompt = state["refined_prompt"]
    
    # Invoke
    chain = judge_prompt | text_generation_pipeline
    response = chain.invoke({"refined_prompt": refined_prompt})
    
    # Parse JSON
    import json
    import re
    
    try:
        json_match = re.search(r"\{.*\}", response, re.DOTALL)
        if json_match:
            judge_data = json.loads(json_match.group())
            score = int(judge_data.get("score", 0))
            feedback = judge_data.get("feedback", "No feedback.")
        else:
            score = 5 # Penalty for bad formatting
            feedback = "Judge could not parse output."
            
    except Exception as e:
        score = 0
        feedback = str(e)

    # Update State
    state["score"] = score
    state["feedback_logs"].append(f"Score: {score} | Feedback: {feedback}")
    
    print(f"Verdict: Score {score}/10")
    print(f"Feedback: {feedback}")
    
    return state

In [29]:
# --- The Router Function ---
def decide_next_step(state):
    """
    Determines the path:
    - If Score > 8: End
    - If Score < 8 AND Retry Count < 3: Retry (Back to Architect)
    - Else: End (Give up to prevent infinite loops)
    """
    score = state["score"]
    retry_count = state["retry_count"]
    
    if score >= 8:
        print(">>> Decision: APPROVED. Proceeding to Output.")
        return "end"
    
    if retry_count < 3:
        print(f">>> Decision: REJECTED (Score {score}). Retrying... (Attempt {retry_count+1}/3)")
        return "retry"
    
    print(">>> Decision: MAX RETRIES REACHED. Proceeding anyway.")
    return "end"

In [30]:
from typing import TypedDict, List, Optional
from langgraph.graph import StateGraph, END

# --- 1. RE-DEFINE THE STATE (The Memory) ---
class AgentState(TypedDict):
    user_input: str             # The raw prompt from the user
    target_model: str           # e.g., "Llama-3"
    
    critique: Optional[dict]    # Analyst output
    refined_prompt: str         # Architect output
    score: int                  # Judge score
    feedback_logs: List[str]    # History
    
    retry_count: int            # Loop limiter

# --- 2. RE-DEFINE THE ROUTER (The Decision Logic) ---
def decide_next_step(state):
    """
    Determines if we loop back or finish.
    """
    score = state.get("score", 0)
    retry_count = state.get("retry_count", 0)
    
    # If score is good (8+), we stop.
    if score >= 8:
        print(">>> Decision: APPROVED. Proceeding to Output.")
        return "end"
    
    # If score is bad, but we have retries left, we loop.
    if retry_count < 3:
        # Increment retry count manually here if needed, 
        # or handle it in the Architect node (we'll update state below)
        print(f">>> Decision: REJECTED (Score {score}). Retrying... (Attempt {retry_count+1}/3)")
        return "retry"
    
    # If out of retries, we force stop.
    print(">>> Decision: MAX RETRIES REACHED. Proceeding anyway.")
    return "end"

# --- 3. BUILD THE GRAPH ---
workflow = StateGraph(AgentState)

# Add Nodes (Make sure analyst_node, architect_node, judge_node are defined above!)
workflow.add_node("analyst", analyst_node)
workflow.add_node("architect", architect_node)
workflow.add_node("judge", judge_node)

# Set the Entry Point
workflow.set_entry_point("analyst")

# Connect the Nodes (Normal flow)
workflow.add_edge("analyst", "architect")
workflow.add_edge("architect", "judge")

# Connect the Conditional Edge (The Loop)
workflow.add_conditional_edges(
    "judge",
    decide_next_step,
    {
        "retry": "architect", 
        "end": END
    }
)

# Compile
app = workflow.compile()
print("Graph Compiled Successfully!")

Graph Compiled Successfully!


In [20]:
# --- FINAL EXECUTION ---
initial_input = {
    "user_input": "make a snake game code",
    "retry_count": 0,
    "feedback_logs": []
}

final_output = app.invoke(initial_input, config={"recursion_limit": 10})

print("\n======================================")
print("FINAL OPTIMIZED PROMPT:")
print("======================================")
print(final_output["refined_prompt"])
print("\nFinal Score:", final_output["score"])

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



--- [Node 1] Analyst is thinking... ---


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Critique Generated: The prompt is too vague. It's unclear what kind of snake game is being requested (e.g., console-based, graphical, 2D, 3D), what features are required (e.g., scoring, levels, multiplayer), and what programming language or platform is preferred. Providing more specific details would help generate a more accurate and relevant response.

--- [Node 2] Architect is designing... (Attempt 1) ---


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Prompt Rewritten.

--- [Node 3] Judge is evaluating... ---
Verdict: Score 9/10
Feedback: Excellent prompt! The CO-STAR format is well-structured, and the requirements are clear and concise. The only minor suggestion is to consider adding a specific constraint on the game's difficulty level or a target score for the game. This would make the prompt more robust and challenging for the developer.
>>> Decision: APPROVED. Proceeding to Output.

FINAL OPTIMIZED PROMPT:
Here is the rewritten prompt in the CO-STAR format:

**(C)ontext:** A console-based, 2D snake game is to be developed using Python.

**(O)bjective:** Write a Python code that implements a console-based, 2D snake game with the following features: scoring, levels, and basic game logic.

**(S)tyle:** PEP-8 compliant code with proper indentation and comments.

**(T)one:** Professional and technical.

**(A)udience:** Expert Python developer.

**(R)esponse:** A single code block in Markdown format.

Here is the rewritten prompt:
```

In [22]:
import pandas as pd
import time

# --- 1. The Test Dataset (3 Distinct Categories) ---
test_prompts = [
    # 1. Coding (Ambiguous)
    "write a python script for scraping data",
    
    # 2. Reasoning (Missing Context)
    "explain quantum physics",
    
    # 3. Creative/Business (Tone Mismatch)
    "write an email to my boss asking for leave"
]

results = []

print(f"--- STARTING TWIN-TEST BENCHMARK (n={len(test_prompts)}) ---\n")

for i, raw_prompt in enumerate(test_prompts):
    print(f"Processing Prompt {i+1}: '{raw_prompt}'...")
    
    # --- PATH A: THE "DUMB" MODEL (Control) ---
    start_time = time.time()
    
    # FIX: Use .invoke() instead of calling it directly
    response_a = text_generation_pipeline.invoke(raw_prompt)
    
    time_a = time.time() - start_time
    
    # --- PATH B: THE "APE" SYSTEM (Experiment) ---
    start_time = time.time()
    
    # 1. Optimize the prompt
    initial_input = {"user_input": raw_prompt, "retry_count": 0, "feedback_logs": []}
    ape_result = app.invoke(initial_input, config={"recursion_limit": 10})
    optimized_prompt = ape_result["refined_prompt"]
    score = ape_result["score"]
    
    # 2. Generate final answer using the optimized prompt
    # FIX: Use .invoke() here as well
    response_b = text_generation_pipeline.invoke(optimized_prompt)
    
    time_b = time.time() - start_time
    
    # --- SAVE RESULTS ---
    # We truncate strings to 100 chars to keep the table clean
    results.append({
        "Original Prompt": raw_prompt,
        "Optimized Prompt": optimized_prompt[:100] + "...", 
        "Judge Score": score,
        "Response A (Raw)": response_a[:100] + "...",      
        "Response B (APE)": response_b[:100] + "..."       
    })

print("\n--- BENCHMARK COMPLETE ---")

# --- DISPLAY AS TABLE ---
df = pd.DataFrame(results)
pd.set_option('display.max_colwidth', None) # Show full text if needed
display(df)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


--- STARTING TWIN-TEST BENCHMARK (n=3) ---

Processing Prompt 1: 'write a python script for scraping data'...


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



--- [Node 1] Analyst is thinking... ---


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Critique Generated: The prompt is too vague and lacks essential details. It's unclear what data the user wants to scrape, from where, and in what format. Providing more context about the data source, target data, and desired output format would help the model generate a more accurate and relevant script.

--- [Node 2] Architect is designing... (Attempt 1) ---


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Prompt Rewritten.

--- [Node 3] Judge is evaluating... ---


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Verdict: Score 9/10
Feedback: Excellent prompt! The CO-STAR format is well-structured, and the objective is clear. The only suggestion is to specify the target data, specific data source, and desired output format in the prompt itself, rather than leaving them as placeholders. This will help the developer understand the requirements better and provide a more accurate response.
>>> Decision: APPROVED. Proceeding to Output.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing Prompt 2: 'explain quantum physics'...


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



--- [Node 1] Analyst is thinking... ---


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Critique Generated: The prompt is too broad and open-ended, making it difficult to provide a concise and meaningful response. Without specifying a particular topic or concept within quantum physics, it's hard to know where to start. Additionally, the lack of context about the target audience and desired format makes it challenging to tailor the response to the user's needs.

--- [Node 2] Architect is designing... (Attempt 1) ---


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Prompt Rewritten.

--- [Node 3] Judge is evaluating... ---


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Verdict: Score 9/10
Feedback: Excellent prompt! The CO-STAR format is well-structured, and the objective is clear. The tone and style are well-defined, and the audience is accurately targeted. The only minor suggestion is to consider adding a specific constraint on the length of the response, as it's not explicitly mentioned. Overall, this prompt is well-crafted and ready for use.
>>> Decision: APPROVED. Proceeding to Output.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing Prompt 3: 'write an email to my boss asking for leave'...


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



--- [Node 1] Analyst is thinking... ---


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Critique Generated: The prompt is too vague and lacks specific details. It's unclear who the target audience is (e.g., specific boss, HR department), what type of leave is being requested (e.g., vacation, sick leave), and what format the email should take (e.g., formal, informal). Providing more context and details would help the model generate a more effective email.

--- [Node 2] Architect is designing... (Attempt 1) ---


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Prompt Rewritten.

--- [Node 3] Judge is evaluating... ---


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Verdict: Score 9/10
Feedback: Excellent prompt! The CO-STAR format is well-structured, and the objective is clear. However, there's room for improvement in the 'Style' and 'Response' sections. Consider adding more specific guidelines for the tone and response format to make it more robust.
>>> Decision: APPROVED. Proceeding to Output.

--- BENCHMARK COMPLETE ---


Unnamed: 0,Original Prompt,Optimized Prompt,Judge Score,Response A (Raw),Response B (APE)
0,write a python script for scraping data,Here is the rewritten prompt in the CO-STAR format:\n\n**(C)ontext:** We aim to develop a Python scrip...,9,from a website\n```\nimport requests\nfrom bs4 import BeautifulSoup\n\n# Send a GET request\nresponse = r...,...
1,explain quantum physics,Here is the rewritten prompt in the CO-STAR format:\n\n(C)ontext: Quantum physics is a fundamental bra...,9,in simple terms\nQuantum physics is a branch of physics that deals with the behavior of matter and e...,"Use a code block to illustrate the concept, and provide a brief example of how superposition is use..."
2,write an email to my boss asking for leave,Here is the rewritten prompt in the CO-STAR format:\n\n(C)ontext: I am an employee seeking to request ...,9,\nHere is an example email you could send to your boss:\nSubject: Request for Leave on [Date]\n\nDear [B...,The email should be written in a professional tone and include the following information:\n\n* A clea...


In [23]:
!pip install -q gradio

[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m68.6/68.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m444.8/444.8 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m00:01[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m2.0/2.0 MB[0m [31m51.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.[0m[31m
[0m

In [31]:
import gradio as gr

# --- 1. The Wrapper Function ---
# This connects the UI inputs to your LangGraph logic
def process_prompt(user_input):
    status_msg = "Status: Initializing Agents..."
    
    # A. Run the APE System (Analyst -> Architect -> Judge)
    inputs = {"user_input": user_input, "retry_count": 0, "feedback_logs": []}
    
    # Run the graph
    result = app.invoke(inputs, config={"recursion_limit": 10})
    
    refined_prompt = result["refined_prompt"]
    score = result["score"]
    critique = result["critique"]
    logs = "\n".join(result["feedback_logs"])
    
    # B. Generate Actual Responses (The "Twin-Test")
    # 1. Raw Response (Control)
    raw_response = text_generation_pipeline.invoke(user_input)
    
    # 2. Optimized Response (Experiment)
    optimized_response = text_generation_pipeline.invoke(refined_prompt)
    
    # Format the critique for display
    critique_text = f"Ambiguity: {critique.get('is_ambiguous', 'N/A')}\nMissing: {critique.get('missing_context', [])}\n\nAnalyst Notes: {critique.get('critique', '')}"
    
    return (
        raw_response,           # Output 1: Raw Result
        refined_prompt,         # Output 2: The New Prompt
        f"Score: {score}/10",   # Output 3: Judge Score
        critique_text,          # Output 4: Critique
        optimized_response      # Output 5: Final Optimized Result
    )

# --- 2. The Dashboard Layout ---
custom_css = """
#heading {text-align: center; color: #2d3748;}
#score {font-size: 24px; font-weight: bold; color: #38a169;}
"""

with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
    
    gr.Markdown("# ü¶ç APE: Autonomous Prompt Engineer", elem_id="heading")
    gr.Markdown("### An Agentic Framework for Iterative Prompt Optimization")
    
    with gr.Row():
        input_box = gr.Textbox(
            label="Enter Raw Prompt", 
            placeholder="e.g., write a code for snake game...",
            lines=2
        )
        submit_btn = gr.Button("üöÄ Optimize & Run", variant="primary")

    # --- RESULTS SECTION ---
    with gr.Row():
        # LEFT COLUMN: THE BEFORE
        with gr.Column(variant="panel"):
            gr.Markdown("### üî¥ Control (Raw Input)")
            out_raw_res = gr.Textbox(label="Llama-3 Response (Raw)", lines=10, interactive=False)

        # RIGHT COLUMN: THE AFTER
        with gr.Column(variant="panel"):
            gr.Markdown("### üü¢ Experimental (APE Agent)")
            
            # Internal Agent Details (Accordion to keep it clean)
            with gr.Accordion("See Agent Thinking Process", open=False):
                with gr.Row():
                    out_critique = gr.Textbox(label="Agent 1: Analyst Critique", lines=4)
                    out_score = gr.Textbox(label="Agent 3: Judge Score", lines=1, elem_id="score")
                out_refined = gr.Textbox(label="Agent 2: Architect's Rewritten Prompt", lines=4)
            
            out_opt_res = gr.Textbox(label="Llama-3 Response (Optimized)", lines=10, interactive=False)

    # --- CLICK EVENT ---
    submit_btn.click(
        fn=process_prompt,
        inputs=[input_box],
        outputs=[out_raw_res, out_refined, out_score, out_critique, out_opt_res]
    )

# --- 3. Launch ---
# share=True creates a public link (expires in 72h)
demo.launch(share=True, debug=True)

* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://6d2db1371f9279030b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



--- [Node 1] Analyst is thinking... ---


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Critique Generated: The prompt is too vague and open-ended. It's unclear what type of game the user wants to build (e.g. a video game, a board game, a card game, etc.), what aspect of Indian culture they want to focus on (e.g. mythology, festivals, daily life, etc.), and who the target audience is (e.g. children, adults, educational, entertainment, etc.). Providing more specific details would help the model better understand the user's intent and provide a more accurate response.

--- [Node 2] Architect is designing... (Attempt 1) ---


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Prompt Rewritten (Length: 629 chars).

--- [Node 3] Judge is evaluating... ---


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Verdict: Score 9/10
Feedback: Excellent prompt! The objective is clear, and the specificity is high. The tone and style are professional and technical, and the audience is well-defined. The only suggestion I have is to consider adding more context about the specific aspect of Indian culture to focus on, as it is currently left open-ended. For example, you could specify a particular region, era, or theme to make the prompt more concrete.
>>> Decision: APPROVED. Proceeding to Output.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://6d2db1371f9279030b.gradio.live


