In [None]:
import csv
import math
import time

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hugging Face checkpoint used for every generation in this script.
model_name = "google/flan-t5-base"
# Run on the GPU when PyTorch reports one is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the tokenizer and the seq2seq model once; the model is moved to `device`
# so that inputs sent to the same device can be used with it directly.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

def scoring_function(prompt, max_new_tokens=80):
    """Generate text for *prompt* with the module-level model and return it.

    Despite its name, this function does not compute a score: it tokenizes
    the prompt, calls ``model.generate`` and decodes the first returned
    sequence with special tokens stripped.

    Args:
        prompt: Text passed to the tokenizer.
        max_new_tokens: Forwarded to ``model.generate`` as the generation cap.

    Returns:
        The decoded text of the first generated sequence.
    """
    encoded = tokenizer(prompt, return_tensors="pt")
    # Inputs must live on the same device the model was moved to.
    encoded = encoded.to(device)
    generated_ids = model.generate(**encoded, max_new_tokens=max_new_tokens)
    first_sequence = generated_ids[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

def make_variations(base):
    """Build the three prompt variations derived from *base*.

    Each entry is a ``(name, prompt_text, target_length, extras)`` tuple, where
    ``target_length`` is the word count the length heuristic aims for and
    ``extras`` is a fresh empty list.

    Args:
        base: The base prompt string.

    Returns:
        A list with the "direct", "contextual" and "constrained" variations,
        in that order.
    """
    contextual_prompt = (
        "You are an expert AI instructor. Answer concisely and clearly:\n\n" + base
    )
    constrained_prompt = (
        base
        + "\n\nGive 3 short numbered points (1-2 sentences each) and a one-line summary at the end."
    )
    return [
        ("direct", base, 20, []),
        ("contextual", contextual_prompt, 25, []),
        ("constrained", constrained_prompt, 40, []),
    ]

def keyword_score(text, keywords):
    """Return the fraction of *keywords* that occur in *text*.

    Matching is a case-insensitive substring test, so a keyword also counts
    when it appears inside a longer word.

    Args:
        text: The text to search.
        keywords: Keyword strings to look for.

    Returns:
        ``0.0`` when *keywords* is empty, otherwise ``hits / len(keywords)``.
    """
    if not keywords:
        return 0.0
    haystack = text.lower()
    matched = [kw for kw in keywords if kw.lower() in haystack]
    return len(matched) / len(keywords)

def length_score(text, target_tokens):
    tokens = len(text.split())
    sigma = max(3, target_tokens * 0.6)
    return math.exp(-0.5 * ((tokens - target_tokens) / sigma) ** 2)

def combined_score(text, keywords, target_tokens):
    """Blend the keyword and length heuristics into a single score.

    With keywords: ``0.6 * keyword_score + 0.4 * length_score``.
    Without keywords: ``0.35 * length_score`` plus ``0.65`` times a character
    fill ratio, ``len(text) / (target_tokens * 4)``, capped at 1.0.

    Args:
        text: The generated text to evaluate.
        keywords: Expected keywords; may be empty.
        target_tokens: The desired word count.

    Returns:
        The weighted score as a float.
    """
    length_part = length_score(text, target_tokens)
    if keywords:
        keyword_part = keyword_score(text, keywords)
        return 0.6 * keyword_part + 0.4 * length_part

    # No keywords to match: fall back to how much of the character budget
    # (target_tokens * 4, at least 1) the text fills.
    char_budget = max(1, target_tokens * 4)
    fill_ratio = min(1.0, len(text) / char_budget)
    return 0.35 * length_part + 0.65 * fill_ratio

# Prompts to evaluate, in the order they are run and reported.
base_prompts = [
    "Explain LangChain in one line.",
    "Explain LangGraph simply.",
    "What is MLOps for LLMs?",
    "Translate to Hindi: Hello, how are you?",
]

# Keywords a good answer to each base prompt is expected to mention; these
# feed the keyword heuristic. Prompts missing from this mapping are scored
# without keywords.
keyword_guidance = {
    "Explain LangChain in one line.": ["chains", "agent", "component", "LLM", "prompts"],
    "Explain LangGraph simply.": ["graph", "nodes", "edges", "flow", "pipeline"],
    "What is MLOps for LLMs?": ["deployment", "monitoring", "model", "data", "scaling"],
    "Translate to Hindi: Hello, how are you?": ["नमस्ते", "कैसे", "हो"],
}

# Per-generation latency budget; compared against the measured latency of
# each generation to label it WITHIN_LIMIT or EXCEEDED.
THRESHOLD = 1.5

# Destination file for the per-variation results.
output_csv = "prompt_variation_results.csv"

# One dict per (base prompt, variation) run, appended by the evaluation loop.
all_results = []

print("\nPrompt Engineering Monitoring\n")

# Run every variation of every base prompt through the model, time it, score
# the output with the heuristics, and collect one result row per run.
for base in base_prompts:
    # Prompts without an entry in keyword_guidance are scored without keywords.
    keywords = keyword_guidance.get(base, [])
    for vname, vprompt, target_len, _ in make_variations(base):
        failed = False
        start = time.perf_counter()
        try:
            gen = scoring_function(vprompt, max_new_tokens=80)
        except Exception as e:
            # Best-effort: record the failure in the results instead of
            # aborting the whole run.
            gen = f"[ERROR: {e}]"
            failed = True
        latency = time.perf_counter() - start
        status = "WITHIN_LIMIT" if latency <= THRESHOLD else "EXCEEDED"

        if failed:
            # An error message is not model output: scoring it could award
            # keyword/length credit (e.g. the word "model" in a traceback)
            # and let a failed run win the recommendation.
            kw_score = len_score = score = 0.0
        else:
            kw_score = keyword_score(gen, keywords)
            len_score = length_score(gen, target_len)
            score = combined_score(gen, keywords, target_len)

        row = {
            "base_prompt": base,
            "variation": vname,
            "prompt_text": vprompt,
            "generated": gen,
            "latency_sec": round(latency, 3),
            "latency_status": status,
            "keyword_hits": kw_score,
            "length_score": round(len_score, 4),
            "combined_score": round(score, 4)
        }
        all_results.append(row)

        print(f"Base: {base}")
        print(f"Variation: {vname}  |  Latency: {latency:.3f}s  ({status})")
        print("Generated:\n", gen)
        print("Heuristic scores -> keyword:", row["keyword_hits"], " length:", row["length_score"], " combined:", row["combined_score"])
        print("-" * 70)

# Column order of the CSV export; these are the keys of each result row.
fieldnames = [
    "base_prompt",
    "variation",
    "prompt_text",
    "generated",
    "latency_sec",
    "latency_status",
    "keyword_hits",
    "length_score",
    "combined_score",
]

# newline="" lets the csv module control line endings; UTF-8 keeps the
# non-ASCII prompt and output text intact.
with open(output_csv, "w", newline="", encoding="utf-8") as f:
    # restval fills columns missing from a row with "", and
    # extrasaction="ignore" drops keys that are not listed in fieldnames.
    writer = csv.DictWriter(f, fieldnames=fieldnames, restval="", extrasaction="ignore")
    writer.writeheader()
    writer.writerows(all_results)

print("\nRecommendation summary:\n")

# For each base prompt, report the variation with the highest combined score
# and list which heuristics support that choice.
for base in base_prompts:
    rows = [r for r in all_results if r["base_prompt"] == base]
    best = max(rows, key=lambda r: r["combined_score"])
    # Built outside the f-string: a backslash inside an f-string expression is
    # a SyntaxError before Python 3.12, and the escape must be a real newline
    # ("\n") for multi-line output to be flattened onto one line.
    preview = best["generated"][:120].replace("\n", " ")
    print(f"- {base}")
    print(f"  Best variation: {best['variation']}  (score {best['combined_score']})")
    print(f"  Generated (first 120 chars): {preview}")
    reasons = []
    if best["keyword_hits"] > 0:
        reasons.append("matched important keywords")
    if best["length_score"] > 0.6:
        reasons.append("sensible length")
    if best["latency_status"] == "WITHIN_LIMIT":
        reasons.append("fast enough")
    print("  Why:", ", ".join(reasons) if reasons else "Preferable structure/clarity.")
    print()

print(f"Full results saved to: {output_csv}")


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]


Prompt Engineering Monitoring
Threshold-based latency still applies per generation.

Base: Explain LangChain in one line.
Variation: direct  |  Latency: 1.741s  (EXCEEDED)
Generated:
 a syllable of a word
Heuristic scores -> keyword: 0.0  length: 0.4578  combined: 0.1831
----------------------------------------------------------------------
Base: Explain LangChain in one line.
Variation: contextual  |  Latency: 1.330s  (WITHIN_LIMIT)
Generated:
 LangChain is a generative machine learning algorithm.
Heuristic scores -> keyword: 0.0  length: 0.4868  combined: 0.1947
----------------------------------------------------------------------
Base: Explain LangChain in one line.
Variation: constrained  |  Latency: 2.045s  (EXCEEDED)
Generated:
 LangChain is a physicist who specializes in physics.
Heuristic scores -> keyword: 0.0  length: 0.4111  combined: 0.1644
----------------------------------------------------------------------
Base: Explain LangGraph simply.
Variation: direct  |  Latency: