In [None]:
# Cell 1 — bash: install libs (run once)
# The leading `!` hands each line to the notebook's shell; transformers is pinned, the other packages are not.
!pip -q install transformers==4.35.2 accelerate bitsandbytes datasets evaluate sentence-transformers peft safetensors wandb gradio
# Optional: install optimum and onnxruntime if doing ONNX export later
!pip -q install optimum[onnx] onnxruntime
# Cell 2 — python: mount Google Drive and make sure the project folder exists
import os

from google.colab import drive

drive.mount('/content/drive')

# Root folder for this project on Drive; change if you want.
BASE_DIR = "/content/drive/MyDrive/llm_project"
os.makedirs(BASE_DIR, exist_ok=True)
print("base dir:", BASE_DIR)
# Cell 3 — python: collect API tokens interactively
# NOTE: Never hardcode tokens in a public notebook. Prompt for them (as done
# here) or use Colab secrets.
import getpass

HF_TOKEN = getpass.getpass("Hugging Face token (or ENTER to skip): ")
OPENAI_KEY = getpass.getpass("OpenAI key (or ENTER to skip): ")

# Only when a Hugging Face token was entered: set HF_HOME to a `.hf` folder
# under BASE_DIR, export the token, then log in to the Hub.
if HF_TOKEN:
    import os

    # The environment is populated before huggingface_hub is imported,
    # mirroring the original statement order.
    os.environ.update({
        'HF_HOME': os.path.join(BASE_DIR, ".hf"),
        'HF_TOKEN': HF_TOKEN,
    })
    from huggingface_hub import login

    login(token=HF_TOKEN)
# Safe Step 4A — auto device selection, CPU fallback, and helpful warnings
#
# Defines: MODEL_NAME, device (pipeline device index), device_str,
# tokenizer, model, and `inference` (a text2text-generation pipeline).
import os
import subprocess

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

MODEL_NAME = "google/flan-t5-large"  # change to smaller if you don't have GPU (e.g., "google/flan-t5-small")

# detect device: `device` is the index handed to pipeline() (0 = first CUDA
# GPU, -1 = CPU); `device_str` is the matching torch device name.
if torch.cuda.is_available():
    device = 0
    device_str = "cuda"
else:
    device = -1
    device_str = "cpu"

print(f"torch.cuda.is_available() -> {torch.cuda.is_available()}, selected device: {device_str}")

# helper to show GPU info if available
if device_str == "cuda":
    print("nvidia-smi output:")
    # Capture and print the output explicitly: os.system() writes straight to
    # the process's stdout, which a notebook front-end may not display, and it
    # never raises, so the old try/except around it could not trigger.
    try:
        smi = subprocess.run(["nvidia-smi"], capture_output=True, text=True, check=False)
    except OSError as exc:
        # Best effort only: a missing binary must not block model loading.
        print(f"(could not run nvidia-smi: {exc})")
    else:
        print(smi.stdout or smi.stderr)
else:
    print("No GPU detected. Inference on CPU will be slow for large models. Consider switching runtime to GPU or using a smaller model (flan-t5-small or flan-t5-base) or API mode.")

print("loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print("loading model (will use fp16 on CUDA if available)...")
if device_str == "cuda":
    # Load the weights directly as fp16 instead of loading fp32 and calling
    # .half() afterwards, which holds a full fp32 copy in memory first.
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16).to("cuda")
else:
    # CPU fallback (use float32); for big models this may be slow or OOM
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
    model.to("cpu")

# Same pipeline construction for both branches; only `model`/`device` differ.
inference = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=device)

print("ready — device:", device_str)
# Portable loader (macOS friendly): picks CUDA, MPS (Apple Silicon), or CPU.
# Loads only the small checkpoint to avoid OOMs on CPU/MPS.
# NOTE: MPS requires a PyTorch build with MPS support (PyTorch 1.12+).

import torch
print("torch version:", torch.__version__)
print("cuda available:", torch.cuda.is_available())
print(
    "mps available:",
    getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available(),
)

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
MODEL_NAME = "google/flan-t5-small"  # small & friendly for CPU/MPS

print("Selected model:", MODEL_NAME)

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print("Loading model (this may take a moment)...")
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Device preference order: cuda first, then mps, otherwise cpu.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available()
    else "cpu"
)

print("Moving model to device:", device)
# NOTE: weights stay float32 here; .half() is not recommended on MPS.
model = model.to(device)
# simple generation helper that handles device placement
def generate(prompt: str, max_new_tokens: int = 128):
    """Run the module-level ``model`` on one prompt and return decoded text.

    The prompt is tokenized with the module-level ``tokenizer``, the tensors
    are moved to the module-level ``device``, and generation runs with
    sampling disabled (``do_sample=False``) under ``torch.no_grad()``.

    Args:
        prompt: Input text passed to the tokenizer.
        max_new_tokens: Forwarded to ``model.generate``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    model.eval()
    with torch.no_grad():
        encoded = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True)
        # every input tensor has to live on the same device as the model
        on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
        sequences = model.generate(**on_device, max_new_tokens=max_new_tokens, do_sample=False)
        return tokenizer.decode(sequences[0], skip_special_tokens=True)

# quick smoke test of the generation helper
print("\n-- quick test --")
print(f"Device used: {device}")
print("Example output:")
print(generate("Summarize: Transformers changed NLP because they introduced attention.", max_new_tokens=50))
from datasets import Dataset

# Three-example toy set: each record carries an id, the prompt ("input"),
# and the reference answer ("target").
data = [
    {"id": example_id, "input": prompt_text, "target": reference_text}
    for example_id, prompt_text, reference_text in (
        (
            "1",
            "Explain what a language model does in simple words.",
            "A language model predicts the next words or generates text based on patterns it learned.",
        ),
        (
            "2",
            "Summarize: Transformers changed NLP by enabling parallel processing and attention.",
            "Transformers improved NLP by using attention and processing text in parallel.",
        ),
        (
            "3",
            "Paraphrase: Machine learning helps computers learn patterns from data.",
            "Machine learning allows computers to discover patterns from examples.",
        ),
    )
]

ds = Dataset.from_list(data)
ds
# Collect one model output and one reference per dataset row, echoing both
# the prompt and the output as we go.
predictions, references = [], []

for row in ds:
    prompt = row["input"]
    print(f"\nPROMPT: {prompt}")

    pred = generate(prompt)
    print(f"MODEL OUTPUT: {pred}")

    references.append(row["target"])
    predictions.append(pred)

print("\nAll predictions done.")
!pip -q install evaluate sentence-transformers
!pip install rouge_score
import evaluate
from sentence_transformers import SentenceTransformer, util

bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")

bleu_score = bleu.compute(
    predictions=predictions,
    references=[[ref] for ref in references]
)

rouge_scores = rouge.compute(
    predictions=predictions,
    references=references
)

embedder = SentenceTransformer('all-MiniLM-L6-v2')
emb_pred = embedder.encode(predictions, convert_to_tensor=True)
emb_ref = embedder.encode(references, convert_to_tensor=True)

semantic_scores = util.cos_sim(emb_pred, emb_ref).diag().cpu().tolist()

print("\n===== EVALUATION RESULTS =====")
print("BLEU:", bleu_score)
print("ROUGE:", {k: rouge_scores[k] for k in ['rouge1','rouge2','rougeL']})
print("Semantic similarity:", semantic_scores)
# Visualization for model eval results
# Requirements: matplotlib, rouge_score, sentence-transformers
# Install if missing (uncomment to run)
# !pip install matplotlib rouge_score sentence-transformers

import matplotlib.pyplot as plt
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util
import numpy as np
import math

# ---- INPUTS (should already exist from your pipeline) ----
# predictions: list of strings (model outputs)
# references:  list of strings (gold targets)
# If semantic_scores already computed, you can set semantic_scores = [...] here.
# Otherwise we'll compute it below.

# sanity check: the inference cell must have defined both lists
try:
    preds = predictions
    refs = references
except NameError as exc:
    # Chain the original NameError so the traceback shows which name is missing.
    raise RuntimeError("You must run the inference step first so `predictions` and `references` exist.") from exc

if len(preds) != len(refs):
    raise RuntimeError("predictions and references must have the same length.")

# ---- compute semantic similarity per example (if not present) ----
# A `semantic_scores` left over from an earlier run is reused only when it has
# one score per current prediction; a stale list of a different length would
# otherwise be plotted against the wrong examples (or fail inside matplotlib).
try:
    _reuse_semantic = len(semantic_scores) == len(preds)
except (NameError, TypeError):
    _reuse_semantic = False

if not _reuse_semantic:
    if preds:
        embedder = SentenceTransformer('all-MiniLM-L6-v2')
        emb_pred = embedder.encode(preds, convert_to_tensor=True)
        emb_ref = embedder.encode(refs, convert_to_tensor=True)
        # Pairwise similarity of matched rows; avoids the full N x N matrix.
        semantic_scores = util.pairwise_cos_sim(emb_pred, emb_ref).cpu().tolist()
    else:
        # Nothing to embed.
        semantic_scores = []

# ---- compute per-example ROUGE-1/2/L (F1) using rouge_score ----
scorer = rouge_scorer.RougeScorer(['rouge1','rouge2','rougeL'], use_stemmer=True)
rouge1_f1 = []
rouge2_f1 = []
rougeL_f1 = []
for p, r in zip(preds, refs):
    scores = scorer.score(r, p)   # note: rouge scorer signature is (target, prediction)
    rouge1_f1.append(scores['rouge1'].fmeasure)
    rouge2_f1.append(scores['rouge2'].fmeasure)
    rougeL_f1.append(scores['rougeL'].fmeasure)

# ---- compute aggregate BLEU & ROUGE via evaluate if available (optional) ----
agg_bleu = None
agg_rouge = None
try:
    import evaluate
    bleu = evaluate.load("bleu")
    agg_bleu = bleu.compute(predictions=preds, references=[[r] for r in refs])
    rouge = evaluate.load("rouge")
    agg_rouge = rouge.compute(predictions=preds, references=refs)
except Exception as exc:
    # Still best-effort (per-example ROUGE from rouge_score is already
    # available), but say why the aggregate numbers are missing instead of
    # swallowing the error silently.
    print(f"Aggregate BLEU/ROUGE via `evaluate` skipped: {type(exc).__name__}: {exc}")

# ---- plotting ----
# x positions: one integer per example
indices = np.arange(len(preds), dtype=int)

# Figure 1: per-example ROUGE-1 F1 as a line plot
plt.figure(figsize=(10, 4))
plt.plot(indices, rouge1_f1, marker="o")
plt.xlabel("Example index")
plt.ylabel("ROUGE-1 F1")
plt.title("Per-example ROUGE-1 (F1) across dataset")
plt.grid(True)
plt.tight_layout()
plt.show()

# Figure 2: per-example semantic similarity, y axis fixed to [-0.05, 1.05]
plt.figure(figsize=(10, 4))
plt.plot(indices, semantic_scores, marker="o")
plt.xlabel("Example index")
plt.ylabel("Cosine similarity")
plt.title("Per-example Semantic Similarity (cosine)")
plt.ylim(-0.05, 1.05)
plt.grid(True)
plt.tight_layout()
plt.show()

# Figure 3: side-by-side bars, ROUGE-1 left and semantic similarity right
width = 0.35
plt.figure(figsize=(10, 5))
plt.bar(indices - width/2, rouge1_f1, width=width)
plt.bar(indices + width/2, semantic_scores, width=width)
plt.xlabel("Example index")
plt.title("ROUGE-1 (F1) vs Semantic Similarity per example")
plt.legend(["ROUGE-1 F1","Semantic sim"])
plt.grid(axis="y")
plt.tight_layout()
plt.show()

# ---- print summary stats ----
def pct(x):
    """Format the fraction *x* as a percentage string with two decimals."""
    scaled = x * 100
    return "{:.2f}%".format(scaled)

# Report the aggregate numbers computed earlier (agg_bleu / agg_rouge may be
# None when the optional `evaluate` step did not run), then the mean semantic
# similarity, and finally save one example figure to the working directory.
print("\n=== Aggregate metrics ===")
if agg_bleu is not None:
    # Plain print of an existing value; the former try/except around it
    # guarded nothing.
    print("BLEU:", agg_bleu)
else:
    print("BLEU: (not computed — install `evaluate` to compute overall BLEU)")

if agg_rouge is not None:
    print("ROUGE (aggregate):")
    # print some keys if present
    for k in ['rouge1','rouge2','rougeL']:
        if k in agg_rouge:
            print(f"  {k}:", agg_rouge[k])
else:
    # fallback: print mean per-example rouge1 (nan when there are no examples)
    mean_r1 = sum(rouge1_f1) / len(rouge1_f1) if rouge1_f1 else float('nan')
    print("ROUGE (mean per-example):")
    print("  ROUGE-1 (mean F1):", f"{mean_r1:.4f}", pct(mean_r1))

# nan when there are no examples, so the division never fails
mean_sem = sum(semantic_scores) / len(semantic_scores) if semantic_scores else float('nan')
print("Mean semantic similarity:", f"{mean_sem:.4f}", pct(mean_sem))

# ---- save figures (optional) ----
plt.figure(figsize=(6,3))
plt.plot(indices, semantic_scores, marker='o')
plt.title("Semantic similarity")
plt.tight_layout()
plt.savefig("semantic_similarity.png", dpi=150)
print("\nSaved example figure: semantic_similarity.png")
# Cell: install required packages (run if not installed)
# NOTE: In Colab uncomment the installs. On local mac, install once via pip.
# !pip install -q transformers==4.35.2 sentence-transformers evaluate rouge_score matplotlib datasets

# Imports
import os, sys, time, math
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import Dataset
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer, util
import evaluate
from rouge_score import rouge_scorer
# Device detection (works on mac MPS, Colab CUDA, or CPU)
print("torch version:", torch.__version__)
cuda_avail = torch.cuda.is_available()
mps_avail = getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available()
print("cuda:", cuda_avail, "mps:", mps_avail)

# Preference order: cuda, then mps, otherwise cpu.
if cuda_avail:
    device = "cuda"
elif mps_avail:
    device = "mps"
else:
    device = "cpu"

print(f"Using device: {device}")

MODEL = "google/flan-t5-small"
print(f"Loading {MODEL} ...")
tokenizer = AutoTokenizer.from_pretrained(MODEL)
# load, move to the selected device, and switch to eval mode in one chain
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL).to(device).eval()
print("Model loaded.")
# Simple generate helper
def generate_text(prompt, max_new_tokens=128):
    """Generate text for *prompt* with the module-level model and tokenizer.

    The prompt is tokenized, its tensors are moved to the module-level
    ``device``, and ``model.generate`` runs under ``torch.no_grad()``.

    Args:
        prompt: Input text passed to the tokenizer.
        max_new_tokens: Forwarded to ``model.generate``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    batch = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True)
    model_inputs = {key: value.to(device) for key, value in batch.items()}
    with torch.no_grad():
        generated = model.generate(**model_inputs, max_new_tokens=max_new_tokens)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)
# Probe set: one example per task type. Each record carries an id, a task
# label, the prompt ("input"), and the reference answer ("target").
data = [
    {"id": example_id, "task": task_name, "input": prompt_text, "target": reference_text}
    for example_id, task_name, prompt_text, reference_text in (
        (
            "q1",
            "summarize",
            "Summarize: Transformers changed NLP by enabling attention and parallel processing.",
            "Transformers improved NLP through attention and parallel processing.",
        ),
        (
            "q2",
            "paraphrase",
            "Paraphrase: What does a language model do?",
            "A language model predicts or generates text by learning patterns from data.",
        ),
        (
            "q3",
            "qa",
            "Q: What is overfitting? A:",
            "Overfitting occurs when a model learns noise and performs poorly on new data.",
        ),
        (
            "q4",
            "context",
            "Given: Alice went to the store. She bought apples. Question: Who bought apples?",
            "Alice bought apples.",
        ),
        (
            "q5",
            "creativity",
            "Write a short, creative two-sentence future-tech pitch about energy-saving drones.",
            "<creative - freeform>",
        ),
        (
            "q6",
            "domain",
            "Explain 'backpropagation' in simple words.",
            "Backpropagation updates model weights by propagating error gradients backward through the network.",
        ),
    )
]
ds = Dataset.from_list(data)
ds
# Run the model over every probe example, collecting:
#   preds - model outputs
#   refs  - reference targets
#   meta  - per-example record (id, task, prompt actually sent, output)
preds = []
refs = []
meta = []

print("Running inference on dataset...")
for item in ds:
    prompt = item["input"]
    task = item["task"]
    # If summarization/paraphrase tasks, give explicit instruction (FLAN-T5 benefits from instruction prompts)
    if task in ("summarize", "paraphrase"):
        # Replace the leading "Label:" with the lowercase task instruction.
        # partition() keeps the whole prompt when there is no colon, where
        # split(':', 1)[1] would raise IndexError.
        _, sep, remainder = prompt.partition(":")
        body = remainder.strip() if sep else prompt.strip()
        full_prompt = f"{task}: {body}"
    else:
        full_prompt = prompt

    out = generate_text(full_prompt, max_new_tokens=64)
    preds.append(out)
    refs.append(item["target"])
    meta.append({"id":item["id"], "task": item["task"], "prompt": full_prompt, "output": out})
    print(f"\n[{item['id']} - {item['task']}]")
    print("PROMPT:", full_prompt)
    print("OUTPUT: ", out)
    print("TARGET:", item["target"])
print("\nDone.")
# Multi-turn / context probe: a short chat-style transcript ending in "Answer:"
multi_prompt = (
    "System: You are a helpful assistant.\n"
    "User: John gave Mary a book. Later John asked Mary to return it.\n"
    "User: Who originally gave the book?\n"
    "Answer:"
)
print(f"Multi-turn context probe:\n {generate_text(multi_prompt, max_new_tokens=40)}")

# Prompt sensitivity: the same question phrased two ways, each sent with an
# "explain: " prefix
p1 = "Explain backpropagation in simple words."
p2 = "In simple words, what is backpropagation?"
print(f"\nPrompt A: {p1} \n-> {generate_text('explain: ' + p1)}")
print(f"\nPrompt B: {p2} \n-> {generate_text('explain: ' + p2)}")
# Compute BLEU (via evaluate), ROUGE per example (rouge_score), and semantic similarity via SBERT
bleu = evaluate.load("bleu")
# compute BLEU:
# Free-form examples use a placeholder reference wrapped in angle brackets
# (e.g. "<creative - freeform>"). Scoring against that text would only drag
# the corpus-level score down, so such pairs are left out of BLEU.
# NOTE(review): the angle-bracket test is a heuristic for this dataset —
# confirm if real references may also be wrapped in "<...>".
bleu_pairs = [
    (p, r) for p, r in zip(preds, refs)
    if not (r.startswith("<") and r.endswith(">"))
]
if bleu_pairs:
    bleu_res = bleu.compute(
        predictions=[p for p, _ in bleu_pairs],
        references=[[r] for _, r in bleu_pairs],
    )
else:
    # No example has a usable reference.
    bleu_res = None
print("BLEU:", bleu_res)

# ROUGE per example using rouge_scorer
# The per-example lists keep one entry per prediction so they stay aligned
# with `preds`/`meta`; entries for placeholder references are not meaningful.
scorer = rouge_scorer.RougeScorer(['rouge1','rouge2','rougeL'], use_stemmer=True)
rouge1_f1 = []
rougeL_f1 = []
for p,r in zip(preds, refs):
    s = scorer.score(r, p)  # signature is (target, prediction)
    rouge1_f1.append(s['rouge1'].fmeasure)
    rougeL_f1.append(s['rougeL'].fmeasure)

print("Per-example ROUGE-1 F1:", rouge1_f1)

# Semantic similarity
embed = SentenceTransformer('all-MiniLM-L6-v2')
emb_p = embed.encode(preds, convert_to_tensor=True)
emb_r = embed.encode(refs, convert_to_tensor=True)
# Matched-row similarity only; avoids building the full N x N matrix.
sims = util.pairwise_cos_sim(emb_p, emb_r).cpu().tolist()
print("Semantic similarities:", sims)
indices = list(range(len(preds)))
plt.figure(figsize=(8,3))
plt.plot(indices, rouge1_f1, marker='o', label='ROUGE-1 F1')
plt.plot(indices, sims, marker='x', label='Semantic sim (cosine)')
plt.title('Per-example performance: ROUGE-1 vs Semantic sim')
plt.xlabel('Example index')
plt.legend()
plt.grid(True)
plt.show()
