In [None]:
# Setup: Installs and Imports (Run once – ~1-2 min first time)
!pip install transformers torch nltk rouge-score boto3 -q  # Transformers for GPT-2; NLTK/Rouge for metrics; Boto3 for future Bedrock

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import nltk
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
# nltk.word_tokenize needs the Punkt sentence tokenizer data. Recent NLTK releases look it up
# as 'punkt_tab', older ones as 'punkt'; request both so the notebook works on either.
# An unknown resource name is reported by the downloader but does not raise.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource, quiet=True)

# Every demo below samples (do_sample=True); seed torch once so Restart & Run All is repeatable.
SEED = 42
torch.manual_seed(SEED)

# Load GPT-2 (local, free model – 'gpt2' is the smallest checkpoint, ~124M params; runs on CPU/GPU)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token; reuse EOS so padding works in generation

# ROUGE Scorer instance (unigram overlap + longest common subsequence, with stemming)
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

print("Setup done! GPT-2 ready for prompting demos.")

In [None]:
# Zero-Shot Demo: Classify API error log (No examples – direct instruction)
# Ref: Ideal output for scoring (like a 'gold standard' unit test)
ref_output = "Critical. 500 error indicates server-side failure, risking downtime."

# Prompt: System-like instruction + user query
system_prompt = "You are an API log classifier for full-stack monitoring. Output: Label (Critical/Warning/Info) + 1-sentence reason."
user_prompt = "Log: 'HTTP 500: Database connection failed during user query.'"
full_prompt = f"{system_prompt}\n{user_prompt}\n"  # Concat for GPT-2 input

# Invoke GPT-2: Generate response (tune params as needed)
print("GPT-2 Zero-Shot Output:")
# Call the tokenizer directly so the attention mask comes back too: pad and EOS share one id in
# this notebook, so generate() cannot reliably infer the mask from the ids alone.
encoded = tokenizer(full_prompt, return_tensors='pt')
inputs = encoded['input_ids']
outputs = model.generate(
    inputs,
    attention_mask=encoded['attention_mask'],
    max_new_tokens=50,  # budget for the answer only; max_length would also count the prompt tokens
    temperature=0.1,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
)
# Drop the prompt by token position rather than by string replace: decoded text is not
# guaranteed to reproduce the prompt byte-for-byte, in which case replace() removes nothing.
prompt_len = inputs.shape[-1]
gen_output = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()
print(gen_output)

# Scoring Function: BLEU/ROUGE (Run this after generation)
def explain_and_score(gen, ref):
    """Print BLEU and ROUGE-1 / ROUGE-L F1 for a generated string against a reference.

    Both strings are lower-cased and word-tokenized with NLTK for BLEU; ROUGE is
    computed by the notebook-level `scorer` on the raw strings.

    Args:
        gen: Generated text (str). May be empty; BLEU is then reported as 0.
        ref: Reference ("gold standard") text (str).

    Returns:
        None. The three scores are printed.
    """
    # Local import: the notebook's import cell only brings in sentence_bleu.
    from nltk.translate.bleu_score import SmoothingFunction

    gen_tokens = nltk.word_tokenize(gen.lower())  # Tokenize for metrics
    ref_tokens = nltk.word_tokenize(ref.lower())

    # BLEU Explained: Like a 'precision checker' for translations – measures how many word sequences (n-grams) in your generated output match the reference exactly.
    # How it Scores: Geometric average of n-gram precisions (e.g., unigram match % * bigram %...) + penalty if output too short. Range: 0-1 (1=identical).
    # When to Use: For exact-match tasks like classifications or code gen (e.g., dev bug labels) – important to detect if model hallucinates wrong terms.
    # Importance: Quantifies 'accuracy' without manual review; e.g., in prod, threshold BLEU >0.6 for auto-approval. Penalizes synonyms (e.g., 'failure' vs. 'crash' lowers score), so treat it as a rough signal.
    #
    # Default BLEU uses 1- to 4-grams. A text shorter than 4 tokens has no 4-grams at all, so an
    # exact match on a one-token reference such as "500" could never score 1. Cap the n-gram
    # order at the shorter of the two token lists and spread the weights uniformly over it.
    max_order = min(4, len(gen_tokens), len(ref_tokens))
    if max_order == 0:
        # Nothing to compare (empty generation or reference).
        bleu_score = 0.0
    else:
        weights = tuple(1.0 / max_order for _ in range(max_order))
        bleu_score = sentence_bleu(
            [ref_tokens],
            gen_tokens,
            weights=weights,
            # Smoothing keeps a single missing n-gram order from zeroing the whole score.
            smoothing_function=SmoothingFunction().method1,
        )
    print(f"\nBLEU Score: {bleu_score:.3f} (Higher = better exact phrase match; use for precise dev tasks like error labels.)")

    # ROUGE Explained: Like a 'recall checker' for summaries – measures how much content from the ref is captured in your output.
    # How it Scores: F1 (precision + recall balance) for overlaps. ROUGE-1: Single words; ROUGE-L: Longest common sequences. Range: 0-1.
    # When to Use: For explanatory outputs like reasons or summaries (e.g., data eng log reports) – important to ensure key details aren't missed.
    # Importance: Helps scale eval in pipelines; e.g., if ROUGE-L <0.5, prompt needs more guidance. Combine with BLEU for balance.
    rouge_scores = scorer.score(ref, gen)  # argument order is (target, prediction)
    print(f"ROUGE-1 F1: {rouge_scores['rouge1'].fmeasure:.3f} (Word-level coverage)")
    print(f"ROUGE-L F1: {rouge_scores['rougeL'].fmeasure:.3f} (Sequence completeness)")

explain_and_score(gen_output, ref_output)

In [None]:
# One-Shot Demo: Extract API status code (1 example for guidance)
ref_output = "500"

system_prompt = "You are an API response parser for backend devs. Output only the status code number."
user_prompt = "Example: 'Response: 404 Not Found' → 404\nNow: 'Response: 500 Internal Server Error'"
full_prompt = f"{system_prompt}\n{user_prompt}\n"

# Invoke GPT-2
print("GPT-2 One-Shot Output:")
# Tokenizer __call__ returns the attention mask alongside the ids; pass it explicitly because
# pad and EOS share one id in this notebook.
encoded = tokenizer(full_prompt, return_tensors='pt')
inputs = encoded['input_ids']
outputs = model.generate(
    inputs,
    attention_mask=encoded['attention_mask'],
    # max_length counts the prompt as well, and this prompt alone uses up most of a 50-token
    # budget. max_new_tokens limits only the generated continuation.
    max_new_tokens=10,
    temperature=0.1,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
)
# Strip the prompt by token count; a string replace only works if decode() reproduces the
# prompt exactly.
prompt_len = inputs.shape[-1]
gen_output = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()
print(gen_output)

explain_and_score(gen_output, ref_output)  # Reuse function from above

In [None]:
# Few-Shot Demo: Extract KV pairs from ETL log (2-3 examples for patterns)
ref_output = "{'job_id': 'etl-123', 'duration_sec': 45, 'rows_processed': 10000}"

system_prompt = "You are a log KV extractor for data pipelines. Output as dict: {'key': value}"
user_prompt = """Ex1: '{"task": "load", "time": 10}' → {'task': 'load', 'time': 10}
Ex2: '{"error": "overflow", "code": 123}' → {'error': 'overflow', 'code': 123}
Ex3: '{"batch": "daily", "size": 5000}' → {'batch': 'daily', 'size': 5000}
Now: '{"job_id": "etl-123", "duration_sec": 45, "rows_processed": 10000}'"""
full_prompt = f"{system_prompt}\n{user_prompt}\n"

# Invoke GPT-2
print("GPT-2 Few-Shot Output:")
# Tokenizer __call__ returns the attention mask alongside the ids; pass it explicitly because
# pad and EOS share one id in this notebook.
encoded = tokenizer(full_prompt, return_tensors='pt')
inputs = encoded['input_ids']
outputs = model.generate(
    inputs,
    attention_mask=encoded['attention_mask'],
    # The few-shot prompt is long (punctuation-heavy JSON tokenizes into many pieces) and can
    # by itself exceed a max_length of 100, leaving no room to generate. max_new_tokens
    # budgets the continuation independently of prompt size.
    max_new_tokens=60,
    temperature=0.1,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
)
# Strip the prompt by token count. decode() can normalise spacing (e.g. around apostrophes),
# so the decoded prompt need not equal full_prompt and a string replace would then leave the
# whole prompt inside gen_output, skewing the scores.
prompt_len = inputs.shape[-1]
gen_output = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()
print(gen_output)

explain_and_score(gen_output, ref_output)

In [None]:
# CoT Demo: Optimize slow SQL query (Step-by-step reasoning)
ref_output = "Step 1: Full scan slow – add index on user_id. Step 2: Limit columns. Fixed: SELECT name, email FROM users WHERE user_id > 1000 LIMIT 50;"

system_prompt = "You are a SQL optimizer for data engineers. Think step-by-step, then output fixed query."
user_prompt = "Query: SELECT * FROM users WHERE user_id > 1000; Issue: Slow on large table."
full_prompt = f"{system_prompt}\n{user_prompt}\n"

# Invoke GPT-2 (larger token budget for the reasoning chain)
print("GPT-2 CoT Output:")
# Tokenizer __call__ returns the attention mask alongside the ids; pass it explicitly because
# pad and EOS share one id in this notebook.
encoded = tokenizer(full_prompt, return_tensors='pt')
inputs = encoded['input_ids']
outputs = model.generate(
    inputs,
    attention_mask=encoded['attention_mask'],
    max_new_tokens=100,  # counts generated tokens only; max_length would include the prompt
    temperature=0.1,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
)
# Strip the prompt by token count; a string replace only works if decode() reproduces the
# prompt exactly.
prompt_len = inputs.shape[-1]
gen_output = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()
print(gen_output)

explain_and_score(gen_output, ref_output)

In [None]:
# Self-Consistency Demo: Estimate API latency (3 paths + consensus)
ref_output = "Consensus: 150 ms (base 100 ms + 50 ms overhead)."

system_prompt = "You are a latency estimator for microservices. Generate 3 step-by-step paths, then consensus in ms."
user_prompt = "Scenario: Base GET request 100 ms, with DB query adding 20-80 ms overhead."
full_prompt = f"{system_prompt}\n{user_prompt}\n"

# Invoke GPT-2 3x (vary temp for 'multiple paths')
print("GPT-2 Self-Consistency Outputs:")
# The prompt is identical for every path, so tokenize it once outside the loop. Tokenizer
# __call__ also returns the attention mask, passed explicitly because pad and EOS share one id.
encoded = tokenizer(full_prompt, return_tensors='pt')
inputs = encoded['input_ids']
prompt_len = inputs.shape[-1]

paths = []
for temp in [0.1, 0.3, 0.5]:  # Lower temp = consistent; higher = varied
    outputs = model.generate(
        inputs,
        attention_mask=encoded['attention_mask'],
        max_new_tokens=100,  # counts generated tokens only; max_length would include the prompt
        temperature=temp,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Strip the prompt by token count; a string replace only works if decode() reproduces
    # the prompt exactly.
    path = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()
    paths.append(path)
    print(f"Path (Temp {temp}): {path}")

gen_output = ' '.join(paths)  # Concat paths for scoring (or manual pick consensus)
explain_and_score(gen_output, ref_output)

In [1]:
# NOTE(review): looks like a leftover debugging cell – bare `pwd` relies on IPython automagic to
# show the working directory, and nothing else in the notebook uses the result. The saved output
# exposes an absolute local path; consider removing this cell before sharing.
pwd

'C:\\Users\\admin'