In [4]:
# If needed, run this once (comment out if your env already has these)
# Note: may require restart after install in some notebook environments.
!pip install --upgrade "google-generativeai>=0.3.0" pandas




In [14]:
# Robust Google Gemini client init for Jupyter
# - Sanitizes pasted key (removes spaces/newlines/zero-width chars)
# - Rejects only an empty key or an unedited "YOUR-..." placeholder; the key prefix is not checked

import json
import os
import random
import re
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, Optional, Tuple

import google.generativeai as genai

def _sanitize_key(k: str) -> str:
    if not isinstance(k, str):
        return ""
    # remove all whitespace (spaces, tabs, newlines) and zero-width chars
    k = k.replace("\u200b", "").replace("\u200c", "").replace("\u200d", "").replace("\ufeff", "")
    k = "".join(k.split())
    return k

# Gemini client init.
# Key lookup order: the GOOGLE_API_KEY environment variable wins; when it is unset
# or empty, the string pasted in the marked section below is used instead.
# If you set GOOGLE_API_KEY in your terminal/.env, you don't need the fallback.
# Do not commit or share this notebook with a real key pasted in.

google_api_key = os.getenv("GOOGLE_API_KEY")

if not google_api_key:
    # =======================  << PASTE HERE >>  =======================
    # Replace the placeholder below with YOUR real key if not using env var.
    # Example (FAKE): "AIzaSyBVWwuubnXhT7..."
    google_api_key = ""  # <-- paste your key here
    # ==================================================================

# Whichever source supplied the key, drop whitespace and zero-width characters.
google_api_key = _sanitize_key(google_api_key)

# Basic validation: stop on a missing key or an unedited "YOUR-..." placeholder.
if not google_api_key.strip() or google_api_key.startswith("YOUR-"):
    print("Error: Google API key not provided.")
    sys.exit(1)

# Register the key with the SDK, then build the default model handle.
genai.configure(api_key=google_api_key)
model = genai.GenerativeModel('gemini-1.5-pro')

print("✅ Google Gemini client ready.")


✅ Google Gemini client ready.


In [15]:
# Run configuration: input file, model names, sampling parameters, and the
# per-run output folder with all artifact paths derived from it.

# === Paths ===
INPUT_JSON = "prompt_test_2_grouped.json"  # put this next to the notebook or use absolute path
RUNS_ROOT  = Path("runs")

# === Models ===
GEN_MODEL   = "gemini-1.5-pro"   # for generation
JUDGE_MODEL = "gemini-1.5-pro"   # for judging

# === Generation params ===
GEN_TEMPERATURE = 0.4
GEN_MAX_TOKENS  = 800    # be realistic; too high can error
GEN_RETRIES     = 5

# === Judge params ===
JUDGE_TEMPERATURE = 0.0
JUDGE_MAX_TOKENS  = 350
JUDGE_RETRIES     = 5

# === Fresh run dir ===
# UTC timestamp (YYYYMMDD_HHMMSS) names the run folder. An aware "now" is used
# because datetime.utcnow() is deprecated; the formatted string is unchanged.
ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
RUN_DIR = RUNS_ROOT / ts
RUN_DIR.mkdir(parents=True, exist_ok=True)

# Output artifact paths (all inside this run's folder)
PATH_WITH_ANSWERS = RUN_DIR / "with_answers.json"
PATH_WITH_SCORES  = RUN_DIR / "with_scores.json"
PATH_PER_PROMPT   = RUN_DIR / "per_prompt_scores.csv"
PATH_SUMMARY      = RUN_DIR / "summary_by_type.csv"
PATH_TRIPLETS     = RUN_DIR / "triplets_by_topic.csv"
PATH_GEN_LOG      = RUN_DIR / "gen_log.jsonl"
PATH_JUDGE_LOG    = RUN_DIR / "judge_log.jsonl"

# Last expression: display the run folder as the cell output.
RUN_DIR


PosixPath('runs/20250727_184519')

In [16]:
def backoff_wait(attempt: int) -> float:
    """Return the sleep time for a retry: 2**attempt seconds plus up to 1s of jitter.

    The result never exceeds 60 seconds.
    """
    ceiling = 60.0
    delay = (2 ** attempt) + random.random()
    if delay > ceiling:
        return ceiling
    return delay

def load_json(path: str) -> Any:
    """Read the UTF-8 file at ``path`` and return its parsed JSON content."""
    with open(path, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def save_json(path: Path, obj: Any) -> None:
    """Write ``obj`` to ``path`` as indented UTF-8 JSON, creating parent folders.

    Non-ASCII characters are written as-is (``ensure_ascii=False``).

    Raises:
        TypeError / ValueError: if ``obj`` cannot be serialized. In that case
            nothing on disk is touched, so an existing file keeps its content.
    """
    # Serialize before opening the target: opening with "w" truncates the file,
    # so a failure midway through serialization would otherwise destroy a
    # previously saved, valid file.
    payload = json.dumps(obj, ensure_ascii=False, indent=2)
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.write(payload)

def append_jsonl(path: Path, obj: Any) -> None:
    """Append ``obj`` as a single JSON line to ``path``, creating parent folders.

    Non-ASCII characters are written as-is (``ensure_ascii=False``).
    """
    folder = path.parent
    folder.mkdir(parents=True, exist_ok=True)
    with path.open("a", encoding="utf-8") as sink:
        record = json.dumps(obj, ensure_ascii=False)
        sink.write(f"{record}\n")

def usage_dict(resp) -> Optional[dict]:
    """Map a response's ``usage_metadata`` token counts to a plain dict.

    Returns a dict with ``prompt_tokens``, ``completion_tokens`` and
    ``total_tokens`` (each ``None`` when the corresponding attribute is absent),
    or ``None`` when ``resp`` has no ``usage_metadata`` or reading it fails.
    """
    # (output key, attribute name on usage_metadata)
    field_map = (
        ("prompt_tokens", "prompt_token_count"),
        ("completion_tokens", "candidates_token_count"),
        ("total_tokens", "total_token_count"),
    )
    try:
        if not hasattr(resp, 'usage_metadata'):
            return None
        meta = resp.usage_metadata
        return {key: getattr(meta, attr, None) for key, attr in field_map}
    except Exception:
        # Best effort: usage is only logged, so never let it break the caller.
        return None

print("✅ Utilities ready.")

✅ Utilities ready.


In [17]:
# Load the prompt set; a missing or empty "outputs" entry yields an empty list.
data = load_json(INPUT_JSON)
items = data.get("outputs") or []
print(f"Loaded {len(items)} prompts.")
print("Example:", items[0] if items else "N/A")

# Start from a clean slate: blank every answer and discard any evaluation
# carried over in the input file.
for it in items:
    it["model_answer"] = ""
    it.pop("evaluation", None)


Loaded 60 prompts.
Example: {'id': 'Neutral-01', 'type': 'Neutral', 'topic': 'Industrial Revolution', 'question': 'Write a balanced, well‑sourced essay explaining the catalysts, mechanisms, and social—economic consequences of the Industrial Revolution.', 'model_answer': ''}


In [18]:
# Smoke test: one tiny request against the generation model, so a bad key or
# model name fails here rather than partway through the full run.
try:
    # Gemini generation config: deterministic and only a few output tokens
    generation_config = genai.GenerationConfig(
        temperature=0,
        max_output_tokens=5,
    )

    # Use GEN_MODEL so the check exercises the same model the run will use.
    test_model = genai.GenerativeModel(
        model_name=GEN_MODEL,
        generation_config=generation_config,
        system_instruction="You are a concise assistant."
    )

    test = test_model.generate_content("Reply with exactly READY")
    print("Sanity check:", test.text.strip())
except Exception as e:
    # Chain explicitly so the original traceback stays attached.
    raise RuntimeError(f"⚠ Sanity check failed: {e}") from e


Sanity check: READY


In [33]:
# System instruction given to the writer model for every generated answer.
SYSTEM_WRITER = (
    "You are a careful, well-sourced academic writer. "
    "Answer comprehensively, clearly, and factually. Use concise paragraphs."
)

def generate_one(question: str) -> Tuple[str, Any]:
    """Generate one answer for ``question`` with the writer model.

    Makes up to ``GEN_RETRIES`` attempts, sleeping ``backoff_wait(attempt)``
    seconds between failed attempts (no sleep after the last one).

    Returns:
        ``(answer_text, response)``. ``answer_text`` is the stripped model text,
        or a ``"<FILTERED: REASON>"`` marker when the response has no candidates
        or the first candidate finished with ``SAFETY`` / ``RECITATION``.

    Raises:
        RuntimeError: if every attempt raised an exception.
    """
    last_err = None
    for attempt in range(GEN_RETRIES):
        try:
            generation_config = genai.GenerationConfig(
                temperature=GEN_TEMPERATURE,
                max_output_tokens=GEN_MAX_TOKENS,
            )

            writer_model = genai.GenerativeModel(
                model_name=GEN_MODEL,
                generation_config=generation_config,
                system_instruction=SYSTEM_WRITER
            )

            resp = writer_model.generate_content(question)

            # No candidates at all: indexing candidates[0] would raise and burn
            # every retry on a response that will not change. Report it as
            # filtered instead.
            # NOTE(review): presumably this is the "prompt blocked" case, with the
            # reason in prompt_feedback.block_reason - confirm against the SDK docs.
            if not resp.candidates:
                feedback = getattr(resp, "prompt_feedback", None)
                reason = getattr(feedback, "block_reason", None)
                label = getattr(reason, "name", None) or "PROMPT_BLOCKED"
                return f"<FILTERED: {label}>", resp

            # Handle potential content filtering
            finish_name = resp.candidates[0].finish_reason.name
            if finish_name in ('SAFETY', 'RECITATION'):
                return f"<FILTERED: {finish_name}>", resp

            return (resp.text or "").strip(), resp
        except Exception as e:
            last_err = e
            if attempt + 1 >= GEN_RETRIES:
                # Final attempt: nothing left to retry, so do not sleep.
                print(f"  ⚠️ gen attempt {attempt+1}/{GEN_RETRIES} failed: {e}")
                break
            wait = backoff_wait(attempt)
            print(f"  ⚠️ gen retry {attempt+1}/{GEN_RETRIES}: {e} (sleep {wait:.1f}s)")
            time.sleep(wait)
    raise RuntimeError(f"Generation failed after {GEN_RETRIES} retries: {last_err}")

# Generate an answer for every prompt, log token usage per prompt, and persist
# the answers. A failed prompt gets an "<ERROR: ...>" marker instead of
# stopping the run.
generated = 0
for i, it in enumerate(items, 1):
    pid = it.get("id", f"index_{i}")
    q   = it.get("question", "")
    print(f"[{i}/{len(items)}] Generating: {pid}")
    try:
        answer, resp = generate_one(q)
        it["model_answer"] = answer
        generated += 1
        append_jsonl(PATH_GEN_LOG, {
            # Naive-UTC ISO string + "Z": same format as before, without the
            # deprecated datetime.utcnow().
            "ts": datetime.now(timezone.utc).replace(tzinfo=None).isoformat()+"Z",
            "id": pid,
            "type": it.get("type"),
            "topic": it.get("topic"),
            "usage": usage_dict(resp),
        })
    except Exception as e:
        it["model_answer"] = f"<ERROR: {e}>"
        print(f"   ✗ error: {e}")
    # Checkpoint after every prompt so an interrupted run keeps the answers
    # produced so far.
    save_json(PATH_WITH_ANSWERS, {"outputs": items})

# Save answers (also writes the file when there were no prompts at all)
save_json(PATH_WITH_ANSWERS, {"outputs": items})
print(f"\n✅ Generation complete. {generated} answers written.")
print(f"Saved → {PATH_WITH_ANSWERS}")


[1/60] Generating: Neutral-01
[2/60] Generating: Supportive-01
[3/60] Generating: Threatening-01
[4/60] Generating: Neutral-02
[5/60] Generating: Supportive-02
[6/60] Generating: Threatening-02
[7/60] Generating: Neutral-03
[8/60] Generating: Supportive-03
[9/60] Generating: Threatening-03
[10/60] Generating: Neutral-04
[11/60] Generating: Supportive-04
[12/60] Generating: Threatening-04
[13/60] Generating: Neutral-05
[14/60] Generating: Supportive-05
[15/60] Generating: Threatening-05
[16/60] Generating: Neutral-06
[17/60] Generating: Supportive-06
[18/60] Generating: Threatening-06
[19/60] Generating: Neutral-07
[20/60] Generating: Supportive-07
[21/60] Generating: Threatening-07
[22/60] Generating: Neutral-08
[23/60] Generating: Supportive-08
[24/60] Generating: Threatening-08
[25/60] Generating: Neutral-09
[26/60] Generating: Supportive-09
[27/60] Generating: Threatening-09
[28/60] Generating: Neutral-10
[29/60] Generating: Supportive-10
[30/60] Generating: Threatening-10
[31/60] G

In [19]:
# Optional manual gate between generation and judging: uncomment the input() call
# below to pause here until the saved answers JSON has been reviewed.
# input(f"Review answers at: {PATH_WITH_ANSWERS}\nPress Enter to proceed to judging ... ")
print("Proceed when ready.")


Proceed when ready.


In [22]:
from pathlib import Path
import json

# Folder scanned below for earlier runs. Same value as the RUNS_ROOT assigned in the
# configuration cell (presumably repeated so this cell can run by itself - confirm).
RUNS_ROOT = Path("runs")  # adjust if your runs folder is elsewhere

def count_judgeable(path: Path):
    """Summarize how many answers in a ``with_answers.json`` file can be judged.

    Returns ``(total, ok, empty, errors)`` where ``empty`` counts blank or missing
    answers, ``errors`` counts answers starting with ``<ERROR`` or ``<FILTERED``,
    and ``ok`` is the remainder. Returns four ``None`` values when the file
    cannot be read or does not have the expected structure.
    """
    try:
        payload = json.loads(Path(path).read_text(encoding="utf-8"))
        entries = payload.get("outputs", [])
        n_empty = 0
        n_failed = 0
        for entry in entries:
            answer = (entry.get("model_answer") or "").strip()
            if not answer:
                n_empty += 1
            elif answer.startswith(("<ERROR", "<FILTERED")):
                n_failed += 1
        total = len(entries)
        return total, total - n_empty - n_failed, n_empty, n_failed
    except Exception:
        return None, None, None, None

# Collect every runs/*/with_answers.json, newest first by file modification time,
# and print a one-line judgeability summary for each.
cands = sorted(
    RUNS_ROOT.glob("*/with_answers.json"),
    key=lambda p: p.stat().st_mtime,
    reverse=True,
)
if cands:
    print("Found the following with_answers.json files (newest first):\n")
    for i, p in enumerate(cands, start=1):
        total, ok, empty, errors = count_judgeable(p)
        print(f"[{i}] {p}  | items={total}  ok={ok}  empty={empty}  errors={errors}")
else:
    print("No runs/*/with_answers.json found. You may have saved to another folder.")


Found the following with_answers.json files (newest first):

[1] runs/20250727_175338/with_answers.json  | items=60  ok=60  empty=0  errors=0
