<a href="https://colab.research.google.com/github/Pran9177r/info7375_Self_Improving_Ai/blob/main/week__5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:


!pip install -q transformers accelerate datasets safetensors einops peft evaluate


from getpass import getpass
import os, random, re, torch, json, time
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.optim import AdamW

# Optional Hugging Face authentication: an empty entry (just pressing Enter)
# leaves the environment untouched.
HF_TOKEN = getpass(" Paste your Hugging Face token (press Enter to skip): ")
if HF_TOKEN:
    os.environ.update({"HUGGING_FACE_HUB_TOKEN": HF_TOKEN})

# Use the GPU whenever torch can see one; everything else runs on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print("Running on:", device)


    math_ds = load_dataset("HuggingFaceH4/MATH", split="train[:1%]")
    print("Loaded MATH dataset from HuggingFaceH4/MATH")
except Exception as e:
    print(" Could not load HuggingFaceH4/MATH:", e)
    print("Using small synthetic data instead.")
    math_ds = load_dataset("json", data_files={"train": [
        {"problem": "What is 2 + 2?", "solution": "4"},
        {"problem": "Simplify 3 * (2 + 4)", "solution": "18"},
        {"problem": "If x = 5, what is 2x + 3?", "solution": "13"},
        {"problem": "Find 7 * 8", "solution": "56"}
    ]})["train"]

# Evaluation data: MATH-500 only publishes a "test" split (asking for "train"
# fails with: Unknown split "train". Should be one of ['test']).
try:
    math500 = load_dataset("HuggingFaceH4/MATH-500", split="test")
    print("Loaded MATH-500 dataset.")
except Exception as e:
    print(" Could not load MATH-500 dataset:", e)
    # Best-effort fallback: evaluate on (at most) the first 50 training rows.
    math500 = math_ds.select(range(min(50, len(math_ds))))

def build_prompt(example):
    """Format one dataset row as a "Solve: ... Answer:" prompt.

    The question text is read from the "problem" key, falling back to
    "question" and finally to an empty string when neither key is present.
    """
    fallback = example.get("question", "")
    question = example.get("problem", fallback)
    return "Solve: {}\n\nAnswer:".format(question)

def _final_answer(example):
    """Return the short final answer for one dataset row.

    Preference order:
      1. a non-empty "answer" field, when the row has one;
      2. the contents of the last ``\\boxed{...}`` in the "solution" text;
      3. the whole "solution" text (covers rows whose solution already is
         the bare answer, e.g. "4").

    Comparing a model's short reply against an entire worked solution can
    never match after normalization, so the gold label has to be reduced to
    the final answer first.
    """
    answer = example.get("answer")
    if answer:
        return str(answer).strip()

    solution = example.get("solution", "") or ""
    marker = "\\boxed{"
    start = solution.rfind(marker)
    if start == -1:
        return solution.strip()

    # Walk forward from the opening brace, tracking nesting so that answers
    # such as \boxed{\frac{1}{2}} are captured in full.
    begin = start + len(marker)
    depth = 1
    for pos in range(begin, len(solution)):
        ch = solution[pos]
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return solution[begin:pos].strip()
    # Unbalanced braces: fall back to the raw solution text.
    return solution.strip()


# Parallel lists: prompt i is paired with gold answer i.
train_prompts = [build_prompt(x) for x in math_ds]
train_answers = [_final_answer(x) for x in math_ds]
eval_prompts = [build_prompt(x) for x in math500]
eval_answers = [_final_answer(x) for x in math500]

print(f" Dataset ready. Train: {len(train_prompts)}, Eval: {len(eval_prompts)}")
print("Example Prompt:", train_prompts[0])
print("Example Answer:", train_answers[0])


# Primary checkpoint; MODEL_NAME is reassigned below if the fallback is used.
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
print("Loading model... (may take a few minutes)")

try:
    # Slow tokenizer + float16 weights, with placement delegated to
    # device_map="auto".
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_NAME,
        use_fast=False,
        trust_remote_code=True,
    )
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )
    print("Model loaded successfully.")
except Exception as load_error:
    # Any failure above (download, memory, ...) switches to the OPT fallback,
    # loaded with default settings and moved onto `device` explicitly.
    print("Could not load Qwen model:", load_error)
    print("Using smaller fallback model (facebook/opt-1.3b)")
    MODEL_NAME = "facebook/opt-1.3b"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
    model = model.to(device)

# Start in inference mode; the training function toggles train/eval itself.
model.eval()


def normalize_answer(ans):
    """Reduce an answer string to its numeric skeleton.

    Only ASCII digits and the characters ".", "-" and "/" survive, in their
    original order; everything else (letters, spaces, LaTeX, ...) is dropped.
    """
    allowed = "0123456789.-/"
    kept = [ch for ch in ans.lower() if ch in allowed]
    return "".join(kept).strip()

def compute_reward(pred, gold):
    """Binary exact-match reward on normalized answers.

    Returns 1.0 when both normalized strings are non-empty and equal,
    otherwise 0.0.
    """
    guess, target = normalize_answer(pred), normalize_answer(gold)
    # An empty normalization on either side can never count as a match.
    if not guess or not target:
        return 0.0
    if guess == target:
        return 1.0
    return 0.0

def generate_once(prompt, max_new_tokens=64, temperature=0.7, top_p=0.9):
    """Sample one completion for `prompt` and return only the new text.

    Uses the module-level `tokenizer`, `model` and `device`. Sampling is
    always enabled (temperature / nucleus sampling), and the prompt tokens
    are sliced off before decoding.
    """
    encoded = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_len = encoded["input_ids"].shape[-1]
    sampling_args = dict(
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        pad_token_id=tokenizer.eos_token_id,
    )
    with torch.no_grad():
        sequences = model.generate(**encoded, **sampling_args)
    # Keep only what was generated after the prompt.
    completion_ids = sequences[0][prompt_len:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)


def estimate_Vstar(prompts, answers, k=3):
    """Estimate a per-prompt value as the best reward over `k` samples.

    For each prompt, `k` completions are sampled with `generate_once` and
    scored with `compute_reward`; the maximum reward (never below 0.0) is
    recorded. Prompts without a matching entry in `answers` are scored
    against an empty gold answer.

    Returns a list of floats, one per prompt, in prompt order.
    """
    values = []
    for idx, prompt in enumerate(prompts):
        gold = answers[idx] if idx < len(answers) else ""
        rewards = [compute_reward(generate_once(prompt), gold) for _ in range(k)]
        # The 0.0 floor also covers k == 0, where nothing is sampled.
        values.append(max([0.0, *rewards]))
    return values

print("Estimating V* on small subset...")
# Keep the demo cheap: at most the first ten training problems, two samples
# per problem for the V* estimate.
subset_prompts, subset_answers = train_prompts[:10], train_answers[:10]
Vstar_values = estimate_Vstar(subset_prompts, subset_answers, k=2)
print("Example V* values:", Vstar_values[:10])


# One optimizer over every model parameter, reused across update steps.
optimizer = AdamW(model.parameters(), lr=1e-5)

def a_star_po_update(prompts, answers, Vstar):
    """Run one advantage-weighted policy-gradient pass over `prompts`.

    For each prompt a single completion is sampled, scored with
    `compute_reward`, and turned into an advantage ``A = r - Vstar[i]``.
    The model is then updated with the loss ``A * NLL(completion | prompt)``,
    so completions that beat the baseline become more likely and completions
    that fall short become less likely.

    Args:
        prompts: prompt strings.
        answers: gold answers, indexed in parallel with `prompts`.
        Vstar: baseline values, indexed in parallel with `prompts`.

    Uses the module-level `model`, `tokenizer`, `optimizer` and `device`.
    The model is put in train mode for the pass and back in eval mode at the
    end. Returns None.
    """
    model.train()
    for i, prompt in enumerate(prompts):
        # Keep the whole encoding so generate() receives the attention mask.
        encoded = tokenizer(prompt, return_tensors="pt").to(device)
        prompt_len = encoded["input_ids"].shape[-1]
        if prompt_len == 0:
            # Nothing to condition on; no logit would predict the first token.
            continue

        with torch.no_grad():
            out = model.generate(**encoded, max_new_tokens=64, do_sample=True,
                                 temperature=0.7, top_p=0.9,
                                 pad_token_id=tokenizer.eos_token_id)
        gen_ids = out[:, prompt_len:]
        if gen_ids.shape[-1] == 0:
            # Empty completion: the mean below would be NaN.
            continue

        gen_text = tokenizer.decode(gen_ids[0], skip_special_tokens=True)
        r = compute_reward(gen_text, answers[i])
        A = r - Vstar[i]
        if A == 0:
            # Zero advantage means a zero gradient; skip the forward/backward
            # pass (and the optimizer step it would otherwise trigger).
            continue

        # `out` already is prompt + completion. The logit at position t
        # predicts token t + 1, so the completion tokens out[:, prompt_len:]
        # are predicted by logits[:, prompt_len - 1 : -1].
        outputs = model(out)
        logits = outputs.logits[:, prompt_len - 1:-1, :]
        # Upcast before the softmax so half-precision weights do not lose
        # precision over the vocabulary dimension.
        log_probs = torch.nn.functional.log_softmax(logits.float(), dim=-1)
        token_log_probs = torch.gather(log_probs, 2, gen_ids.unsqueeze(-1)).squeeze(-1)
        nll = -token_log_probs.mean()

        # Policy gradient: minimize A * NLL == maximize A * log-likelihood.
        loss = A * nll
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
    model.eval()

# Single update pass over the same subset (and baselines) used for the V*
# estimate above.
print(" Running one quick training pass...")
a_star_po_update(subset_prompts, subset_answers, Vstar_values)
print("Training pass complete.")


def evaluate(prompts, answers, n=5):
    """Score the model on the first `n` prompts and return the accuracy.

    One completion is sampled per prompt with `generate_once` and scored
    with `compute_reward`; each question, prediction, gold answer and reward
    is printed along the way.

    Args:
        prompts: prompt strings.
        answers: gold answers, indexed in parallel with `prompts`.
        n: maximum number of examples to evaluate.

    Returns:
        Mean reward over the examples that were actually evaluated, or 0.0
        when there were none.
    """
    # Never evaluate more examples than exist, and divide by that same count
    # so a short prompt list (or n <= 0) cannot skew or crash the average.
    count = max(0, min(n, len(prompts)))
    correct = 0.0
    for i in range(count):
        gen = generate_once(prompts[i])
        r = compute_reward(gen, answers[i])
        correct += r
        print(f"\nQ{i+1}: {prompts[i]}")
        print("Pred:", gen.strip())
        print("Gold:", answers[i], "| Reward:", r)
    acc = correct / count if count else 0.0
    print(f"\n Eval accuracy on {count} samples: {acc:.2f}")
    return acc

# Spot-check the model on the first five evaluation problems.
evaluate(eval_prompts, eval_answers, n=5)


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m81.9/84.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hRunning on: cpu


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/351k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/240k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/746 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/546 [00:00<?, ? examples/s]

✅ Loaded MATH dataset from HuggingFaceH4/MATH


README.md:   0%|          | 0.00/412 [00:00<?, ?B/s]

test.jsonl: 0.00B [00:00, ?B/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

⚠️ Could not load MATH-500 dataset: Unknown split "train". Should be one of ['test'].
✅ Dataset ready. Train: 7, Eval: 7
Example Prompt: Solve: The matrix for reflecting over a certain line $\ell,$ which passes through the origin, is given by
\[\begin{pmatrix} \frac{7}{25} & -\frac{24}{25} \\ -\frac{24}{25} & -\frac{7}{25} \end{pmatrix}.\]Find the direction vector of line $\ell.$  Enter your answer in the form $\begin{pmatrix} a \\ b \end{pmatrix},$ where $a,$ and $b$ are integers, $a > 0,$ and $\gcd(|a|,|b|) = 1.$

Answer:
Example Answer: Since $\begin{pmatrix} a \\ b \end{pmatrix}$ actually lies on $\ell,$ the reflection takes this vector to itself.

[asy]
unitsize(1.5 cm);

pair D = (4,-3), V = (2,1), P = (V + reflect((0,0),D)*(V))/2;

draw((4,-3)/2--(-4,3)/2,dashed);
draw((-2,0)--(2,0));
draw((0,-2)--(0,2));
draw((0,0)--P,Arrow(6));

label("$\ell$", (4,-3)/2, SE);
[/asy]

Then
\[\begin{pmatrix} \frac{7}{25} & -\frac{24}{25} \\ -\frac{24}{25} & -\frac{7}{25} \end{pmatrix} \begin{pma

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

✅ Model loaded successfully.
Estimating V* on small subset...


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


✅ Example V* values: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
✅ Running one quick training pass...
