In [1]:
# Step 1
import os, time, random, hashlib, json, re
from typing import List, Dict, Optional, Tuple
from collections import Counter

try:
    from dotenv import load_dotenv
    load_dotenv(override=True)
except Exception:
    pass

from openai import OpenAI
from openai import RateLimitError, APIError, APITimeoutError, APIConnectionError

PROVIDER   = os.getenv("PROVIDER", "groq").lower()
BASE_URL   = os.getenv("BASE_URL", "https://api.groq.com/openai/v1")
MODEL_NAME = os.getenv("MODEL_NAME", "llama3-8b-8192")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

assert PROVIDER == "groq", "Set PROVIDER=groq in your .env"
assert GROQ_API_KEY, "Missing GROQ_API_KEY in .env"
assert BASE_URL and MODEL_NAME, "BASE_URL / MODEL_NAME missing"

_client = OpenAI(api_key=GROQ_API_KEY, base_url=BASE_URL)

# Retry/abort knobs
class TooManyRateLimits(RuntimeError): ...
MAX_RETRIES = 3
PAUSE_ON_RATE_LIMIT_SEC = 5
BASE_DELAY = 1.0
BACKOFF_BASE = 1.6
BACKOFF_CAP = 8.0
CONSECUTIVE_429_ABORT = 3

_llm_cache = {}
def _hash_messages(messages, model, temperature, max_tokens):
    payload = {"m": messages, "model": model, "t": temperature, "max_tokens": max_tokens}
    return hashlib.sha256(json.dumps(payload, sort_keys=True).encode()).hexdigest()

def chat_complete(messages: List[Dict], model: str = MODEL_NAME, temperature: float = 0.2,
                  max_tokens: int = 512, timeout: int = 60, use_cache: bool = True) -> str:
    if use_cache and temperature <= 0.2:
        key = _hash_messages(messages, model, temperature, max_tokens)
        if key in _llm_cache:
            return _llm_cache[key]

    delay = BASE_DELAY
    last_err = None
    consecutive_429 = 0

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            resp = _client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
                timeout=timeout,
                n=1,
            )
            text = resp.choices[0].message.content
            if use_cache and temperature <= 0.2:
                _llm_cache[_hash_messages(messages, model, temperature, max_tokens)] = text
            return text

        except RateLimitError as e:
            msg = str(e).lower()
            if "insufficient_quota" in msg or "quota" in msg:
                raise RuntimeError("Groq reports insufficient quota for this key.") from e
            consecutive_429 += 1
            if consecutive_429 >= CONSECUTIVE_429_ABORT:
                raise TooManyRateLimits("Aborting stage due to repeated 429s") from e
            print(f"[429] {consecutive_429}/{CONSECUTIVE_429_ABORT} → cooling {PAUSE_ON_RATE_LIMIT_SEC}s")
            time.sleep(PAUSE_ON_RATE_LIMIT_SEC)
            last_err = e

        except (APITimeoutError, APIConnectionError, APIError) as e:
            msg = str(e).lower()
            if any(bad in msg for bad in ["invalid api key","unsupported model","model_not_found","invalid_request_error","insufficient_quota"]):
                raise
            sleep_for = min(BACKOFF_CAP, delay) * (1 + random.random()*0.3)
            print(f"[Retryable] attempt {attempt}/{MAX_RETRIES} → sleeping {sleep_for:.1f}s")
            time.sleep(sleep_for)
            delay *= BACKOFF_BASE
            last_err = e

    raise RuntimeError(f"chat_complete failed after retries. Last error: {last_err}")

print("Groq client ready. MODEL_NAME:", MODEL_NAME)


Groq client ready. MODEL_NAME: llama-3.1-8b-instant


In [2]:
#Step 2: GSM8K loader with retries/streaming + local fallback
import os, re, pandas as pd
from typing import Optional
from datasets import load_dataset, DownloadConfig

CSV_PATH = "gsm8k_subset.csv"
os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")

def extract_final_number(text: str) -> Optional[str]:
    m = re.findall(r"####\s*([+-]?\d[\d,]*(?:\.\d+)?)", str(text))
    if not m: return None
    return m[-1].replace(",", "").strip()

def to_float_str(s: str) -> Optional[str]:
    try:
        f = float(str(s).strip())
        if abs(f) < 1e-12: f = 0.0
        if abs(f - round(f)) < 1e-9: return str(int(round(f)))
        return str(f)
    except: return None

def build_df_from_hf_split(split, take: int, seed: int = 42) -> pd.DataFrame:
    rows = []
    split = split.shuffle(seed=seed).select(range(min(take, len(split))))
    for r in split:
        q = r["question"]
        gt = extract_final_number(r["answer"])
        gt_norm = to_float_str(gt) if gt is not None else None
        if gt_norm is not None:
            rows.append({"question": q, "gt_answer": gt_norm})
    return pd.DataFrame(rows)

def try_load_gsm8k(max_questions: int, seed: int = 42) -> pd.DataFrame:
    candidates = [("gsm8k", "main"), ("openai/gsm8k", "main")]
    last_err = None

    # 1) normal download (portable DownloadConfig)
    for ds_id, name in candidates:
        try:
            print(f"Trying '{ds_id}', split '{name}' ...")
            cfg = DownloadConfig(max_retries=5)
            ds = load_dataset(ds_id, name, download_config=cfg)
            df = build_df_from_hf_split(ds["train"], max_questions, seed=seed)
            if len(df) == 0: raise RuntimeError("Dataset loaded but parsed 0 numeric rows.")
            print(f"Loaded {len(df)} items from '{ds_id}'.")
            return df
        except Exception as e:
            print(f"  Failed: {type(e).__name__}: {e}")
            last_err = e

    # 2) streaming fallback
    for ds_id, name in candidates:
        try:
            print(f"Trying '{ds_id}' with streaming=True ...")
            ds = load_dataset(ds_id, name, streaming=True)
            rows = []
            for i, r in enumerate(ds["train"]):
                if i >= max_questions: break
                q = r["question"]
                gt = extract_final_number(r["answer"])
                gt_norm = to_float_str(gt) if gt is not None else None
                if gt_norm is not None:
                    rows.append({"question": q, "gt_answer": gt_norm})
            df = pd.DataFrame(rows)
            if len(df) == 0: raise RuntimeError("Streaming worked but produced 0 numeric rows.")
            print(f"Loaded {len(df)} items from '{ds_id}' (streaming).")
            return df
        except Exception as e:
            print(f"  Streaming failed: {type(e).__name__}: {e}")
            last_err = e

    # 3) ultimate fallback: small built-in sample
    print("\n  Could not load GSM8K (network/pipe issue). Using small built-in sample.")
    local_sample = [
        {"question": "Sarah has 3 apples and buys 4 more. How many apples total?", "gt_answer": "7"},
        {"question": "A box holds 6 pens. There are 5 boxes. How many pens total?", "gt_answer": "30"},
        {"question": "Tom read 12 pages on Monday and 13 on Tuesday. How many total?", "gt_answer": "25"},
        {"question": "A train travels 60 miles in 1.5 hours. What is the speed (mph)?", "gt_answer": "40"},
        {"question": "There are 8 rows of 7 chairs. How many chairs?", "gt_answer": "56"},
    ]
    return pd.DataFrame(local_sample)

# knobs
import random
MAX_QUESTIONS = int(os.getenv("MAX_QUESTIONS", "300"))
random.seed(42)

data_df = try_load_gsm8k(MAX_QUESTIONS, seed=42)
data_df["gt_answer"] = data_df["gt_answer"].apply(to_float_str)
data_df = data_df.dropna(subset=["gt_answer"]).reset_index(drop=True)
data_df.to_csv(CSV_PATH, index=False)
print(f"\n Saved fresh subset to {CSV_PATH} ({len(data_df)} rows)")
display(data_df.head(3))


Trying 'gsm8k', split 'main' ...


  Failed: ValueError: Fast download using 'hf_transfer' is enabled (HF_HUB_ENABLE_HF_TRANSFER=1) but 'hf_transfer' package is not available in your environment. Try `pip install hf_transfer`.
Trying 'openai/gsm8k', split 'main' ...
  Failed: ValueError: Fast download using 'hf_transfer' is enabled (HF_HUB_ENABLE_HF_TRANSFER=1) but 'hf_transfer' package is not available in your environment. Try `pip install hf_transfer`.
Trying 'gsm8k' with streaming=True ...


  Streaming failed: ValueError: Fast download using 'hf_transfer' is enabled (HF_HUB_ENABLE_HF_TRANSFER=1) but 'hf_transfer' package is not available in your environment. Try `pip install hf_transfer`.
Trying 'openai/gsm8k' with streaming=True ...
  Streaming failed: ValueError: Fast download using 'hf_transfer' is enabled (HF_HUB_ENABLE_HF_TRANSFER=1) but 'hf_transfer' package is not available in your environment. Try `pip install hf_transfer`.

  Could not load GSM8K (network/pipe issue). Using small built-in sample.

 Saved fresh subset to gsm8k_subset.csv (5 rows)


Unnamed: 0,question,gt_answer
0,Sarah has 3 apples and buys 4 more. How many a...,7
1,A box holds 6 pens. There are 5 boxes. How man...,30
2,Tom read 12 pages on Monday and 13 on Tuesday....,25


In [3]:
#Step 3
import os, time, random, hashlib, json, re
from typing import List, Dict, Optional, Tuple
from collections import Counter

try:
    from dotenv import load_dotenv
    load_dotenv(override=True)
except Exception:
    pass

from openai import OpenAI
from openai import RateLimitError, APIError, APITimeoutError, APIConnectionError

PROVIDER   = os.getenv("PROVIDER", "groq").lower()
BASE_URL   = os.getenv("BASE_URL", "https://api.groq.com/openai/v1")
MODEL_NAME = os.getenv("MODEL_NAME", "llama-3.1-8b-instant")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

assert PROVIDER == "groq", "Set PROVIDER=groq in your .env"
assert GROQ_API_KEY, "Missing GROQ_API_KEY in .env"
assert BASE_URL and MODEL_NAME, "BASE_URL / MODEL_NAME missing"

_client = OpenAI(api_key=GROQ_API_KEY, base_url=BASE_URL)

class TooManyRateLimits(RuntimeError): ...
# conservative knobs
MAX_RETRIES = 3
PAUSE_ON_RATE_LIMIT_SEC = 5
BASE_DELAY = 1.0
BACKOFF_BASE = 1.6
BACKOFF_CAP = 8.0
CONSECUTIVE_429_ABORT = 3

# low-temp cache
_llm_cache = {}
def _hash_messages(messages, model, temperature, max_tokens):
    payload = {"m": messages, "model": model, "t": temperature, "max_tokens": max_tokens}
    return hashlib.sha256(json.dumps(payload, sort_keys=True).encode()).hexdigest()

def chat_complete(messages: List[Dict], model: str = MODEL_NAME, temperature: float = 0.2,
                  max_tokens: int = 512, timeout: int = 60, use_cache: bool = True) -> str:
    if use_cache and temperature <= 0.2:
        key = _hash_messages(messages, model, temperature, max_tokens)
        if key in _llm_cache:
            return _llm_cache[key]

    delay = BASE_DELAY
    last_err = None
    consecutive_429 = 0
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            resp = _client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
                timeout=timeout,
                n=1,
            )
            text = resp.choices[0].message.content
            if use_cache and temperature <= 0.2:
                _llm_cache[_hash_messages(messages, model, temperature, max_tokens)] = text
            return text

        except RateLimitError as e:
            msg = str(e).lower()
            if "insufficient_quota" in msg or "quota" in msg:
                raise RuntimeError("Groq reports insufficient quota for this key.") from e
            consecutive_429 += 1
            if consecutive_429 >= CONSECUTIVE_429_ABORT:
                raise TooManyRateLimits("Aborting stage due to repeated 429s") from e
            print(f"[429] {consecutive_429}/{CONSECUTIVE_429_ABORT} → cooling {PAUSE_ON_RATE_LIMIT_SEC}s")
            time.sleep(PAUSE_ON_RATE_LIMIT_SEC)
            last_err = e

        except (APITimeoutError, APIConnectionError, APIError) as e:
            msg = str(e).lower()
            if any(bad in msg for bad in ["invalid api key","unsupported model","model_not_found","invalid_request_error","insufficient_quota"]):
                raise
            sleep_for = min(BACKOFF_CAP, delay) * (1 + random.random()*0.3)
            print(f"[Retryable] attempt {attempt}/{MAX_RETRIES} → sleeping {sleep_for:.1f}s")
            time.sleep(sleep_for)
            delay *= BACKOFF_BASE
            last_err = e

    raise RuntimeError(f"chat_complete failed after retries. Last error: {last_err}")

print("Groq client ready. MODEL_NAME:", MODEL_NAME)


Groq client ready. MODEL_NAME: llama-3.1-8b-instant


In [4]:
#Step 4: Helpers (parser + prompt builder)
def to_float_str(s: str) -> Optional[str]:
    try:
        f = float(str(s).strip())
        if abs(f) < 1e-12: f = 0.0
        if abs(f - round(f)) < 1e-9: return str(int(round(f)))
        return str(f)
    except: return None

def parse_pred_answer(text: str) -> Optional[str]:
    s = str(text)
    m = re.findall(r"####\s*([+-]?\d[\d,]*(?:\.\d+)?)", s)
    if m: return to_float_str(m[-1].replace(",", ""))
    m = re.findall(r"(?:final answer|answer)\s*[:\-]?\s*([+-]?\d[\d,]*(?:\.\d+)?)", s, flags=re.I)
    if m: return to_float_str(m[-1].replace(",", ""))
    m = re.findall(r"([+-]?\d[\d,]*(?:\.\d+)?)", s)
    if m: return to_float_str(m[-1].replace(",", ""))
    return None

SYSTEM_PRIME = "Follow the user’s instructions exactly. Only produce the format requested."

def build_query(prompt_header: str, question: str) -> List[Dict]:
    content = f"""{prompt_header}

Q: {question}

STRICT OUTPUT RULES:
- Put only the final numeric answer on a new line prefixed exactly with four hashes and a space.
- Example: #### 40
- Do not add units or extra text after the number.
"""
    return [
        {"role": "system", "content": SYSTEM_PRIME},
        {"role": "user",   "content": content},
    ]


In [5]:
#Step 5:Prompt templates
BASE_PROMPT = """You will answer an elementary math word problem.
Provide only the final numeric answer at the end on a new line as: #### ANSWER
Do not include units in the final answer.
"""

MANUAL_IMPROVED_PROMPT = """You are an expert math tutor. Think step by step and verify each calculation carefully.
Solve the problem using clear reasoning, compute the final value, and double-check arithmetic.
At the end, output only the final numeric answer on a new line as: #### ANSWER
Do not include units in the final answer.
"""

FEW_SHOT_BLOCK = """
Here are a few examples:

Q: Sarah has 3 apples and buys 4 more. How many apples does she have now?
Reasoning: 3 + 4 = 7
#### 7

Q: A box holds 6 pens. If there are 5 boxes, how many pens in total?
Reasoning: 6 * 5 = 30
#### 30

Q: Tom read 12 pages on Monday and 13 on Tuesday. How many pages total?
Reasoning: 12 + 13 = 25
#### 25

Now solve the next problem.
"""

FEW_SHOT_PROMPT = f"""You are an expert math tutor. Think step by step and verify each calculation carefully.
{FEW_SHOT_BLOCK}
At the end, output only the final numeric answer on a new line as: #### ANSWER
Do not include units in the final answer.
"""


In [6]:
#Step 6:Evaluators
TEMPERATURE      = float(os.getenv("TEMPERATURE", "0.2"))
SC_TEMPERATURE   = float(os.getenv("SC_TEMPERATURE", "0.7"))
SC_SAMPLES       = int(os.getenv("SC_SAMPLES", "3"))
RATE_LIMIT_SLEEP = float(os.getenv("RATE_LIMIT_SLEEP", "1.0"))

from typing import Tuple

def score_one(question: str, gt_answer: str, prompt_header: str, 
              temperature: float = TEMPERATURE, model: str = MODEL_NAME, 
              sleep_sec: float = RATE_LIMIT_SLEEP) -> Tuple[int, str]:
    try:
        msgs = build_query(prompt_header, question)
        text = chat_complete(msgs, model=model, temperature=temperature)
        pred = parse_pred_answer(text)
        time.sleep(sleep_sec)
        if pred is None:
            return 0, text
        return int(pred == gt_answer), text
    except Exception as e:
        time.sleep(sleep_sec)
        return 0, f"[ERROR] {type(e).__name__}: {e}"

def evaluate_dataset(df: pd.DataFrame, prompt_header: str, temperature=TEMPERATURE, 
                     model=MODEL_NAME, max_eval: Optional[int] = None):
    recs = []
    total = len(df) if max_eval is None else min(max_eval, len(df))
    try:
        for i, row in enumerate(df.itertuples(index=False), start=1):
            if max_eval is not None and i > max_eval: break
            ok, raw = score_one(row.question, row.gt_answer, prompt_header, temperature=temperature, model=model)
            recs.append({"idx": i, "question": row.question, "gt_answer": row.gt_answer, "correct": ok, "raw_output": raw})
            if i % 5 == 0:
                got = sum(r["correct"] for r in recs)
                print(f"  progress: {i}/{total} • running acc={got/i:.3f}")
    except TooManyRateLimits as e:
        print(" Stage aborted:", e)

    out = pd.DataFrame(recs)
    acc = out["correct"].mean() if len(out) else 0.0
    return acc, out

from collections import Counter

def self_consistency_vote(question: str, prompt_header: str, k: int = SC_SAMPLES, 
                          temperature: float = SC_TEMPERATURE, model: str = MODEL_NAME):
    preds, raws = [], []
    for _ in range(k):
        msgs = build_query(prompt_header, question)
        text = chat_complete(msgs, model=model, temperature=temperature)
        raws.append(text)
        pred = parse_pred_answer(text)
        if pred is not None:
            preds.append(pred)
        time.sleep(RATE_LIMIT_SLEEP)
    if not preds:
        return None, raws
    vote = Counter(preds).most_common(1)[0][0]
    return vote, raws

def evaluate_self_consistency(df: pd.DataFrame, prompt_header: str, k=SC_SAMPLES, 
                              temperature=SC_TEMPERATURE, model=MODEL_NAME, max_eval: Optional[int]=None):
    recs = []
    total = len(df) if max_eval is None else min(max_eval, len(df))
    try:
        for i, row in enumerate(df.itertuples(index=False), start=1):
            if max_eval is not None and i > max_eval: break
            vote, raws = self_consistency_vote(row.question, prompt_header, k=k, temperature=temperature, model=model)
            ok = int(vote == row.gt_answer) if vote is not None else 0
            recs.append({"idx": i, "question": row.question, "gt_answer": row.gt_answer, "pred_vote": vote, "correct": ok, "raw_samples": raws})
            if i % 5 == 0:
                got = sum(r["correct"] for r in recs)
                print(f"  SC progress: {i}/{total} • running acc={got/i:.3f}")
    except TooManyRateLimits as e:
        print(" Self-consistency aborted:", e)

    out = pd.DataFrame(recs)
    acc = out["correct"].mean() if len(out) else 0.0
    return acc, out


In [7]:
#Step 7:Baseline -> Manual -> Few-shot
RESULTS = {}
PER_STAGE_LIMIT = int(os.getenv("PER_STAGE_LIMIT", "5"))  # try 5–10 first

print("== Baseline (minimal prompt) ==")
acc_base, df_base = evaluate_dataset(
    data_df, BASE_PROMPT, temperature=0.0, model=MODEL_NAME, max_eval=PER_STAGE_LIMIT
)
print(f"Baseline accuracy: {acc_base:.3f}")
RESULTS["baseline"] = acc_base
df_base.to_csv("results_baseline.csv", index=False)

print("\n== Manual improved prompt (role + step-by-step) ==")
acc_manual, df_manual = evaluate_dataset(
    data_df, MANUAL_IMPROVED_PROMPT, temperature=0.0, model=MODEL_NAME, max_eval=PER_STAGE_LIMIT
)
print(f"Manual improved accuracy: {acc_manual:.3f}")
RESULTS["manual_improved"] = acc_manual
df_manual.to_csv("results_manual_improved.csv", index=False)

print("\n== Few-shot prompt (short exemplars + CoT) ==")
acc_fewshot, df_few_shot = evaluate_dataset(
    data_df, FEW_SHOT_PROMPT, temperature=0.2, model=MODEL_NAME, max_eval=PER_STAGE_LIMIT
)
print(f"Few-shot accuracy: {acc_fewshot:.3f}")
RESULTS["few_shot"] = acc_fewshot
df_few_shot.to_csv("results_few_shot.csv", index=False)


== Baseline (minimal prompt) ==


  progress: 5/5 • running acc=1.000
Baseline accuracy: 1.000

== Manual improved prompt (role + step-by-step) ==


  progress: 5/5 • running acc=1.000
Manual improved accuracy: 1.000

== Few-shot prompt (short exemplars + CoT) ==


  progress: 5/5 • running acc=1.000
Few-shot accuracy: 1.000


In [8]:
#Step 8: Self-consistency
print("\n== Self-consistency over few-shot prompt (k samples, majority vote) ==")
SC_SAMPLES = max(3, int(os.getenv("SC_SAMPLES", "3")))  # 3 to start; 5–7 later
SC_EVAL_LIMIT = int(os.getenv("SC_EVAL_LIMIT", "3"))

acc_sc, df_sc = evaluate_self_consistency(
    data_df, FEW_SHOT_PROMPT, k=SC_SAMPLES, temperature=0.7, model=MODEL_NAME, max_eval=SC_EVAL_LIMIT
)
print(f"Self-consistency accuracy: {acc_sc:.3f}")
RESULTS["self_consistency"] = acc_sc
df_sc.to_csv("results_self_consistency.csv", index=False)

print("\n== Summary so far ==")
for k, v in RESULTS.items():
    print(f"{k:>20}: {v:.3f}")



== Self-consistency over few-shot prompt (k samples, majority vote) ==


Self-consistency accuracy: 1.000

== Summary so far ==
            baseline: 1.000
     manual_improved: 1.000
            few_shot: 1.000
    self_consistency: 1.000


In [9]:
#Step 9: Automated prompt search (OPRO-style) on a subset
AUTO_MAX_EVAL = min(50, len(data_df))   # keep small first
CANDIDATES_PER_GEN = 2
GENERATIONS = 1

OPRO_META_PROMPT = """You are optimizing the instruction that precedes math questions to improve accuracy.
Propose ONE improved instruction block (2-4 sentences) that helps a small model solve grade-school math word problems.
Emphasize step-by-step reasoning, estimation checks, and a final numeric answer formatting exactly like '#### ANSWER'.
Return ONLY the instruction text, no explanation.
"""

def propose_instruction_blocks(n: int) -> List[str]:
    props = []
    for _ in range(n):
        text = chat_complete([{"role":"user","content": OPRO_META_PROMPT}],
                             model=MODEL_NAME, temperature=0.7, max_tokens=250)
        if "#### ANSWER" not in text:
            text = text.strip() + "\nAlways end with the final numeric answer line formatted exactly as: #### ANSWER"
        props.append(text.strip())
        time.sleep(RATE_LIMIT_SLEEP)
    return props

def evaluate_instruction_block(instr: str, max_eval=AUTO_MAX_EVAL):
    acc, out = evaluate_dataset(data_df, instr, temperature=0.2, model=MODEL_NAME, max_eval=max_eval)
    return acc, out

pool = [BASE_PROMPT.strip(), MANUAL_IMPROVED_PROMPT.strip(), FEW_SHOT_PROMPT.strip()]
history = []

print(f"\n=== Automated Prompt Search: {GENERATIONS} gen × {CANDIDATES_PER_GEN} candidates ===")
new_instrs = propose_instruction_blocks(CANDIDATES_PER_GEN)
pool = pool + new_instrs

scored = []
for i, instr in enumerate(pool):
    print(f"\nEvaluating candidate {i+1}/{len(pool)}")
    acc, out = evaluate_instruction_block(instr, max_eval=AUTO_MAX_EVAL)
    print(f"  -> accuracy (subset {AUTO_MAX_EVAL}): {acc:.3f}")
    scored.append((acc, instr))
scored.sort(key=lambda x: x[0], reverse=True)

best_auto_prompt = scored[0][1]
print("\nBest automated instruction block:\n")
print(best_auto_prompt)

#Final full evaluation of automated-best (still capped by PER_STAGE_LIMIT if set)
acc_auto_full, df_auto_full = evaluate_dataset(data_df, best_auto_prompt, temperature=0.2, model=MODEL_NAME)
print(f"\nAutomated-best (full set) accuracy: {acc_auto_full:.3f}")
RESULTS["automated_best_full"] = acc_auto_full
df_auto_full.to_csv("results_automated_best_full.csv", index=False)

print("\n== Final Summary ==")
for k, v in RESULTS.items():
    print(f"{k:>24}: {v:.3f}")



=== Automated Prompt Search: 1 gen × 2 candidates ===



Evaluating candidate 1/5


  progress: 5/5 • running acc=1.000
  -> accuracy (subset 5): 1.000

Evaluating candidate 2/5


  progress: 5/5 • running acc=1.000
  -> accuracy (subset 5): 1.000

Evaluating candidate 3/5


  progress: 5/5 • running acc=1.000
  -> accuracy (subset 5): 1.000

Evaluating candidate 4/5


  progress: 5/5 • running acc=1.000
  -> accuracy (subset 5): 1.000

Evaluating candidate 5/5


  progress: 5/5 • running acc=1.000
  -> accuracy (subset 5): 1.000

Best automated instruction block:

You will answer an elementary math word problem.
Provide only the final numeric answer at the end on a new line as: #### ANSWER
Do not include units in the final answer.


  progress: 5/5 • running acc=1.000

Automated-best (full set) accuracy: 1.000

== Final Summary ==
                baseline: 1.000
         manual_improved: 1.000
                few_shot: 1.000
        self_consistency: 1.000
     automated_best_full: 1.000


In [10]:
#Step 10: Save accuracy table + preview CSVs
summary_rows = [{"prompt_type": name, "accuracy": float(acc)} for name, acc in RESULTS.items()]
summary_df = pd.DataFrame(summary_rows).sort_values("accuracy", ascending=False)
summary_df.to_csv("summary_accuracies.csv", index=False)
display(summary_df)

def sample_preview(fname: str, df: pd.DataFrame, n=10):
    cols = [c for c in ["idx","question","gt_answer","correct","raw_output","pred_vote","raw_samples"] if c in df.columns]
    df.head(n)[cols].to_csv(fname, index=False)

sample_preview("preview_baseline.csv", df_base)
sample_preview("preview_manual_improved.csv", df_manual)
sample_preview("preview_few_shot.csv", df_few_shot)
if 'df_sc' in globals():
    df_sc.head(10)[["idx","question","gt_answer","pred_vote","correct"]].to_csv("preview_self_consistency.csv", index=False)
if 'df_auto_full' in globals():
    df_auto_full.head(10)[["idx","question","gt_answer","correct","raw_output"]].to_csv("preview_automated_best.csv", index=False)

print("Wrote: summary_accuracies.csv + preview_*.csv")


Unnamed: 0,prompt_type,accuracy
0,baseline,1.0
1,manual_improved,1.0
2,few_shot,1.0
3,self_consistency,1.0
4,automated_best_full,1.0


Wrote: summary_accuracies.csv + preview_*.csv
