### Step 1: Install necesscary packages

In [2]:
!pip install matplotlib
!pip install torch numpy transformers datasets tiktoken wandb tqdm



In [3]:
!pip3 install tqdm



In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
%cd /content/drive/MyDrive/NanoGPT-Math-1/dpo

/content/drive/MyDrive/NanoGPT-Math-1/dpo


### Step 2: Package imports and configuration

In [7]:
import sys
import os
sys.path.append(os.path.abspath(".."))
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR
import torch.nn.functional as F
import random
import pickle
from model import GPT, GPTConfig
import random
from tqdm import tqdm
import time
import json
import matplotlib.pyplot as plt
# Configuration
beta = 0.5
if torch.cuda.is_available():
    device = "cuda"
    print("cuda")
elif getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

base_lr = 0.0002
epochs = 30
batch_size = 128
max_length =80
num_samples = 1
max_new_tokens = 200
temperature = 0.8
top_k = 200
# tokenizer
with open("../sft/meta.pkl", "rb") as f:
    meta = pickle.load(f)
stoi, itos = meta["stoi"], meta["itos"]
def encode(s): return [stoi[c] for c in s]
def decode(l): return ''.join([itos[i] for i in l])

cuda


In [8]:
import torch
print(torch.__version__, torch.version.cuda, torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))

2.8.0+cu126 12.6 True
Tesla T4


### Step 3: Define helper functions

In [9]:
def compute_logprob(input_ids):
    inputs = input_ids[:, :-1]
    targets = input_ids[:, 1:]
    logits, _ = gpt(inputs, full_seq=True)
    B, T, V = logits.size()
    logits_flat = logits.reshape(-1, V)
    targets_flat = targets.reshape(-1)
    loss = F.cross_entropy(logits_flat, targets_flat, ignore_index=0, reduction='none')
    loss = loss.reshape(B, T)
    attention_mask = (targets != 0).float()
    loss = (loss * attention_mask).sum(dim=1) / attention_mask.sum(dim=1)
    return -loss

def pad_or_truncate(seq, max_length):
    return seq[:max_length] if len(seq) > max_length else seq + [0] * (max_length - len(seq))

def get_batches(lines, batch_size):
    # random.shuffle(lines)
    #for l in lines:
    #    print(l[1])
    for i in range(0, len(lines), batch_size):
        batch = lines[i:i+batch_size]
        if len(batch) < batch_size:
            continue
        neg_inputs = [pad_or_truncate(encode(p['negative'] + '\n\n\n\n'), max_length) for p in batch]
        pos_inputs = [pad_or_truncate(encode(p['positive'] + '\n\n\n\n'), max_length) for p in batch]
        neg_tensor = torch.tensor(neg_inputs, dtype=torch.long, device=device)
        pos_tensor = torch.tensor(pos_inputs, dtype=torch.long, device=device)
        yield neg_tensor, pos_tensor

### Step 4: Load the pretrained NanoGPT model

In [10]:
ckpt = torch.load("../sft/gpt.pt", map_location=device)
gptconf = GPTConfig(**ckpt['model_args'])
gpt = GPT(gptconf)
state_dict = ckpt['model']
unwanted_prefix = '_orig_mod.'
for k in list(state_dict.keys()):
    if k.startswith(unwanted_prefix):
        state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
gpt.load_state_dict(state_dict)
gpt.to(device).train()

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(74, 348)
    (wpe): Embedding(256, 348)
    (drop): Dropout(p=0.2, inplace=False)
    (h): ModuleList(
      (0-5): 6 x Block(
        (ln_1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=348, out_features=1044, bias=False)
          (c_proj): Linear(in_features=348, out_features=348, bias=False)
          (attn_dropout): Dropout(p=0.2, inplace=False)
          (resid_dropout): Dropout(p=0.2, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=348, out_features=1392, bias=False)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=1392, out_features=348, bias=False)
          (dropout): Dropout(p=0.2, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Linear(in_features=348, out_features=74, bias=False)
)

### Step 5: Load Data (**students are required to complete this part!**)

In [78]:

# STEP 5: dataset + tokenization for DPO
import json, random, math, re
from datasets import Dataset, DatasetDict

pairs_path ="/content/drive/MyDrive/NanoGPT-Math-1/dpo/pos_neg_pairs.json"  # keep your path
with open(pairs_path, "r") as f:
    rows = json.load(f)

# rows are dicts: {"positive": "...", "negative": "..."}
# DPO needs (prompt, chosen, rejected). Extract prompt as the text before the model's answer.
def split_prompt(answer_text):
    # Heuristic: prompt is everything up to first "The answer is"
    m = re.search(r"\bThe answer is\b", answer_text)
    if not m:
        # fallback: take up to first sentence end
        cut = answer_text.find("?")
        return answer_text[: cut + 1 if cut != -1 else len(answer_text)]
    return answer_text[:m.start()].strip()

records = []
for r in rows:
    pos, neg = r["positive"].strip(), r["negative"].strip()
    prompt = split_prompt(pos) if split_prompt(pos) else split_prompt(neg)
    records.append({"prompt": prompt, "chosen": pos, "rejected": neg})

random.seed(42)
random.shuffle(records)

# 90/10 split (you can try 85/15 if data is plentiful)
n = len(records)
split = max(1, int(0.9 * n))
train, val = records[:split], records[split:]

ds = DatasetDict({
    "train": Dataset.from_list(train),
    "validation": Dataset.from_list(val)
})

print(ds)

# Keep your existing Step 5, then add this small filter right after building `ds`
def _ok(r):
    return bool(r["prompt"]) and bool(r["chosen"]) and bool(r["rejected"]) and (r["chosen"] != r["rejected"])

ds = ds.filter(_ok)
print("After filter ->", ds)


DatasetDict({
    train: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 47844
    })
    validation: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 5316
    })
})


Filter:   0%|          | 0/47844 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5316 [00:00<?, ? examples/s]

After filter -> DatasetDict({
    train: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 47844
    })
    validation: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 5316
    })
})


### Step 6: Build the optimizer and scheduler (**students are required to complete this part!**)

In [93]:
# --- SAFETY GUARD to avoid strategy mismatch on older transformers ---
def supports(arg): return arg in sig_ta

# If eval strategy isn't supported, force NO eval + NO save steps + NO "best model"
if not ( "evaluation_strategy" in sig_ta or "eval_strategy" in sig_ta ):
    if supports("save_strategy"):
        ta_kwargs["save_strategy"] = "no"      # no step/epoch saving policy
    if supports("evaluation_strategy"):
        ta_kwargs["evaluation_strategy"] = "no"
    if supports("eval_strategy"):
        ta_kwargs["eval_strategy"] = "no"
    if supports("load_best_model_at_end"):
        ta_kwargs["load_best_model_at_end"] = False
    # These depend on step-based saving/eval; remove if present
    for k in ("eval_steps","save_steps","save_total_limit",
              "metric_for_best_model","greater_is_better"):
        if k in ta_kwargs: ta_kwargs.pop(k, None)
else:
    # Eval strategy is supported: make them MATCH to avoid the error
    # Set BOTH names if available, so older forks are happy too.
    if supports("evaluation_strategy"):
        ta_kwargs["evaluation_strategy"] = "steps"
    if supports("eval_strategy"):
        ta_kwargs["eval_strategy"] = "steps"
    if supports("save_strategy"):
        ta_kwargs["save_strategy"] = "steps"
    if supports("load_best_model_at_end"):
        ta_kwargs["load_best_model_at_end"] = True
    # Keep step-based knobs if supported

# === STEP 6 SAFETY PATCH: tokenizer/model alignment ===
# Ensure pad token exists
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Ensure tokenizer size matches model embeddings (prevents OOV id crashes)
tok_len = len(tokenizer)
emb_len = model.get_input_embeddings().weight.size(0)
if tok_len != emb_len:
    print(f"Resizing token embeddings: {emb_len} -> {tok_len}")
    model.resize_token_embeddings(tok_len)

# (Optional) be explicit about dtype/precision to avoid mixed-precision edge cases later
import torch
use_bf16 = False  # force off for stability across GPUs




### Step 7: Begin training (**students are required to complete this part!**)

In [96]:
# STEP 7 — force CPU, version-safe DPOConfig + DPOTrainer, then train

import os, inspect, torch
os.environ["CUDA_VISIBLE_DEVICES"] = ""          # hard-disable CUDA
os.environ["ACCELERATE_USE_CPU"] = "true"        # tell Accelerate to use CPU
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"         # clearer errors if any

from trl import DPOTrainer, DPOConfig

def _sig(fn):
    import inspect
    return set(inspect.signature(fn).parameters.keys())

def _maybe(d, name, value, allowed):
    if name in allowed: d[name] = value

# --- 1) Minimal, version-safe DPOConfig (CPU, no mixed precision, no eval/save strategies) ---
sig_cfg = _sig(DPOConfig.__init__)
cfg = {}
_maybe(cfg, "output_dir", "./dpo_out", sig_cfg)
_maybe(cfg, "per_device_train_batch_size", 2, sig_cfg)    # small & safe on CPU
_maybe(cfg, "per_device_eval_batch_size", 2, sig_cfg)
_maybe(cfg, "gradient_accumulation_steps", 4, sig_cfg)
_maybe(cfg, "learning_rate", 5e-5, sig_cfg)
_maybe(cfg, "lr_scheduler_type", "cosine", sig_cfg)
_maybe(cfg, "warmup_ratio", 0.1, sig_cfg)
_maybe(cfg, "max_grad_norm", 0.8, sig_cfg)
_maybe(cfg, "num_train_epochs", 2, sig_cfg)
_maybe(cfg, "logging_steps", 25, sig_cfg)
_maybe(cfg, "bf16", False, sig_cfg)
_maybe(cfg, "fp16", False, sig_cfg)
_maybe(cfg, "remove_unused_columns", False, sig_cfg)
_maybe(cfg, "gradient_checkpointing", False, sig_cfg)
_maybe(cfg, "report_to", "none", sig_cfg)

dpo_args = DPOConfig(**cfg)

# --- 2) Ensure tokenizer/model alignment (pad + vocab) ---
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tok_len = len(tokenizer)
emb_len = model.get_input_embeddings().weight.size(0)
if tok_len != emb_len:
    print(f"Resizing token embeddings: {emb_len} -> {tok_len}")
    model.resize_token_embeddings(tok_len)

# move model to CPU explicitly
try:
    model = model.to("cpu")
except Exception:
    model = model.cpu()

# --- 3) Build DPOTrainer with only supported kwargs ---
sig_tr = _sig(DPOTrainer.__init__)
tr_kwargs = {
    "model": model,
    "args": dpo_args,
    "train_dataset": ds["train"],
    "eval_dataset": ds["validation"],
}
# tokenizer vs processing_class (older TRL uses processing_class)
_maybe(tr_kwargs, "tokenizer", tokenizer, sig_tr)
_maybe(tr_kwargs, "processing_class", tokenizer, sig_tr)
# reference model (None)
if "ref_model" in sig_tr:
    tr_kwargs["ref_model"] = None
elif "reference_model" in sig_tr:
    tr_kwargs["reference_model"] = None
# DPO knobs (only if accepted)
_maybe(tr_kwargs, "beta", 0.1, sig_tr)
_maybe(tr_kwargs, "loss_type", "sigmoid", sig_tr)
_maybe(tr_kwargs, "label_smoothing", 0.0, sig_tr)
_maybe(tr_kwargs, "reference_free", False, sig_tr)
# Length limits (pass only if supported in your TRL)
_maybe(tr_kwargs, "max_length", 256, sig_tr)
_maybe(tr_kwargs, "max_prompt_length", 96, sig_tr)
_maybe(tr_kwargs, "max_target_length", 160, sig_tr)

trainer = DPOTrainer(**tr_kwargs)

# --- 4) Train on CPU and save ---
train_result = trainer.train()
trainer.save_model("./dpo_out/best")
tokenizer.save_pretrained("./dpo_out/best")
print("CPU training completed.")
print(train_result)




Extracting prompt in train dataset:   0%|          | 0/47844 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/47844 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/47844 [00:00<?, ? examples/s]

Extracting prompt in eval dataset:   0%|          | 0/5316 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/5316 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/5316 [00:00<?, ? examples/s]

AcceleratorError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


# Step 8: Begin testing (**students are required to complete this part!**)

In [48]:
# Load the fine-tuned model
ckpt_path = "../dpo/dpo.pt"
checkpoint = torch.load(ckpt_path, map_location=device)
gptconf = GPTConfig(**checkpoint['model_args'])
gpt = GPT(gptconf).to(device)
try:
    state_dict = checkpoint['model']
except:
    state_dict = checkpoint['model_state_dict']
unwanted_prefix = '_orig_mod.'
for k,v in list(state_dict.items()):
    if k.startswith(unwanted_prefix):
        state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
gpt.load_state_dict(state_dict)
# Test
gpt.eval()
test_set = ["17+19=?", "3*17=?", "72/4=?", "72-x=34,x=?", "x*11=44,x=?", "3*17=?", "72/4=?", "72-x=34,x=?"]
test_set += ["x-15=27,x=?", "95+x=142,x=?", "x/7=9,x=?", "3x=24,x=?", "x+4=18,x=?", "x-7=23,x=?", "2x+5=15,x=?"]

with torch.no_grad():
    for prompt in test_set:
        prompt_ids = encode(prompt)
        ###########################################################
        # Please complete the test code here!
        # ...
        # gpt.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
        # ...
        x = torch.tensor([prompt_ids], dtype=torch.long, device=device)  # shape [1, T]

        y = gpt.generate(
            x,
            max_new_tokens=max_new_tokens,      # you defined these above
            temperature=temperature,
            top_k=top_k
        )
        out_full = decode(y[0].cpu().flatten().tolist())
        generated = out_full[len(prompt):].strip()

        print(f"Q: {prompt}")
        print(f"A: {generated}\n")
        ###########################################################

Q: 17+19=?
A: The answer is 116 because 17+19 equals 116.

Q: 3*17=?
A: The answer is 171 because 3*17 equals 171.

Q: 72/4=?
A: The answer is 21 because 72 divided by 4 equals 21.

Q: 72-x=34,x=?
A: The answer is 68 because 72 minus 3 equals 68.

Q: x*11=44,x=?
A: The answer is 44 because 44 divided by 1 equals 44.

Q: 3*17=?
A: The answer is 191 because 3*17 equals 191.

Q: 72/4=?
A: The answer is 23 because 72 divided by 4 equals 23.

Q: 72-x=34,x=?
A: The answer is 68 because 72 minus 3 equals 68.

Q: x-15=27,x=?
A: The answer is 21 because 27 plus 15 equals 211.

Q: 95+x=142,x=?
A: The answer is 47 because 142 minus 95 equals 4.

Q: x/7=9,x=?
A: The answer is 1 because 7 divided by 7 equals 11.

Q: 3x=24,x=?
A: The answer is 26 because 3 divided by 2 equals 2.

Q: x+4=18,x=?
A: The answer is 1 because 18 minus 4 equals 1.

Q: x-7=23,x=?
A: The answer is 90 because 23 plus 77 equals 90.

Q: 2x+5=15,x=?
A: The answer is 20 because 15 minus 2 equals 20.

