### Step 1: Install necesscary packages

In [2]:
!pip install matplotlib
!pip install torch numpy transformers datasets tiktoken wandb tqdm

Collecting matplotlib
  Using cached matplotlib-3.10.7-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Using cached contourpy-1.3.3-cp312-cp312-win_amd64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Using cached fonttools-4.60.1-cp312-cp312-win_amd64.whl.metadata (114 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Using cached kiwisolver-1.4.9-cp312-cp312-win_amd64.whl.metadata (6.4 kB)
Collecting pyparsing>=3 (from matplotlib)
  Using cached pyparsing-3.2.5-py3-none-any.whl.metadata (5.0 kB)
Using cached matplotlib-3.10.7-cp312-cp312-win_amd64.whl (8.1 MB)
Using cached contourpy-1.3.3-cp312-cp312-win_amd64.whl (226 kB)
Using cached cycler-0.12.1-py3-none-any.whl (8.3 kB)
Using cached fonttools-4.60.1-cp312-cp312-win_amd64.whl (2.3 MB)
Using cached kiwisolver-1.4.9-cp312-cp312-win_amd64.whl (73 kB)
Using cac


[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting transformers
  Using cached transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
Collecting datasets
  Downloading datasets-4.3.0-py3-none-any.whl.metadata (18 kB)
Collecting tiktoken
  Using cached tiktoken-0.12.0-cp312-cp312-win_amd64.whl.metadata (6.9 kB)
Collecting wandb
  Using cached wandb-0.22.2-py3-none-win_amd64.whl.metadata (10 kB)
Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Using cached huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting pyyaml>=5.1 (from transformers)
  Using cached pyyaml-6.0.3-cp312-cp312-win_amd64.whl.metadata (2.4 kB)
Collecting regex!=2019.12.17 (from transformers)
  Using cached regex-2025.10.23-cp312-cp312-win_amd64.whl.metadata (41 kB)
Collecting requests (from transformers)
  Using cached requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Using cached tokenizers-0.22.


[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


### Step 2: Package imports and configuration

In [1]:
import sys
import os
sys.path.append(os.path.abspath("..")) 
# os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["TORCH_USE_CUDA_DSA"] = "1"
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import pickle
from model import GPT, GPTConfig
import random
from tqdm import tqdm
import time
import json
import matplotlib.pyplot as plt
# import ollama
# Configuration
beta = 0.5
device = 'cuda' if torch.cuda.is_available() else 'cpu'
base_lr = 1e-6
epochs = 5
batch_size = 128
max_length =64
num_samples = 1
max_new_tokens = 200
temperature = 0.8
top_k = 200
# tokenizer

with open("../sft/meta.pkl", "rb") as f:
    meta = pickle.load(f)
stoi, itos = meta["stoi"], meta["itos"]

extra_chars = list("+-*/=xy?!,.' ")
for ch in extra_chars:
    if ch not in stoi:
        new_index = len(stoi)
        stoi[ch] = new_index
        itos[new_index] = ch

def encode(s): return [stoi[c] for c in s]
def decode(l): return ''.join([itos[i] for i in l])

### Step 3: Define helper functions

In [2]:
print(torch.__version__)
print(torch.version.cuda)
print(torch.cuda.device_count())
print(torch.cuda.is_available())
device


2.9.0+cu130
13.0
1
True


'cuda'

In [3]:
def compute_logprob(input_ids):
    inputs = input_ids[:, :-1]
    targets = input_ids[:, 1:]
    logits, _ = gpt(inputs, full_seq=True)
    B, T, V = logits.size()
    logits_flat = logits.reshape(-1, V)
    targets_flat = targets.reshape(-1)
    loss = F.cross_entropy(logits_flat, targets_flat, ignore_index=0, reduction='none')
    loss = loss.reshape(B, T)
    attention_mask = (targets != 0).float()
    loss = (loss * attention_mask).sum(dim=1) / attention_mask.sum(dim=1)
    return -loss 

def pad_or_truncate(seq, max_length):
    return seq[-max_length:] if len(seq) > max_length else seq + [0] * (max_length - len(seq))

def get_batches(lines, batch_size):
    random.shuffle(lines)
    #for l in lines:
    #    print(l[1])
    for i in range(0, len(lines), batch_size):
        batch = lines[i:i+batch_size]
        if len(batch) < batch_size:
            continue
        neg_inputs = [pad_or_truncate(encode(p['negative'] + '\n\n\n\n'), max_length) for p in batch]
        pos_inputs = [pad_or_truncate(encode(p['positive'] + '\n\n\n\n'), max_length) for p in batch]
        neg_tensor = torch.tensor(neg_inputs, dtype=torch.long, device=device)
        pos_tensor = torch.tensor(pos_inputs, dtype=torch.long, device=device)
        yield neg_tensor, pos_tensor

### Step 4: Load the pretrained NanoGPT model

In [4]:
ckpt = torch.load("../sft/gpt.pt", map_location=device)
gptconf = GPTConfig(**ckpt['model_args'])
gpt = GPT(gptconf)
state_dict = ckpt['model']
unwanted_prefix = '_orig_mod.'
for k in list(state_dict.keys()):
    if k.startswith(unwanted_prefix):
        state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
gpt.load_state_dict(state_dict)
gpt.to(device).train()

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(74, 348)
    (wpe): Embedding(256, 348)
    (drop): Dropout(p=0.2, inplace=False)
    (h): ModuleList(
      (0-5): 6 x Block(
        (ln_1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=348, out_features=1044, bias=False)
          (c_proj): Linear(in_features=348, out_features=348, bias=False)
          (attn_dropout): Dropout(p=0.2, inplace=False)
          (resid_dropout): Dropout(p=0.2, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=348, out_features=1392, bias=False)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=1392, out_features=348, bias=False)
          (dropout): Dropout(p=0.2, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Linear(in_features=348, out_features=74, bias=False)
)

### Step 5: Load Data (**students are required to complete this part!**)

In [5]:
# Load data from ./data/pos_neg_pairs.json
import json

with open("pos_neg_pairs_100k.json", "r", encoding="utf-8") as f:
    data = json.load(f)

print(f"Loaded {len(data)} pairs.")
print("Example:")
print(data[0])

sample = data[0]
print("\nEncoded positive example:")
print(encode(sample["positive"])[:50]) 

batches = get_batches(data, batch_size=batch_size)

neg_batch, pos_batch = next(batches)
print("\nNegative batch shape:", neg_batch.shape)
print("Positive batch shape:", pos_batch.shape)

Loaded 100000 pairs.
Example:
{'negative': '79-7=? Sorry, I do not know.', 'positive': '79-7=? The answer is 72 because 79-7 equals 72.'}

Encoded positive example:
[19, 21, 6, 19, 9, 10, 1, 41, 55, 52, 1, 48, 61, 66, 70, 52, 65, 1, 56, 66, 1, 19, 14, 1, 49, 52, 50, 48, 68, 66, 52, 1, 19, 21, 6, 19, 1, 52, 64, 68, 48, 59, 66, 1, 19, 14, 7]

Negative batch shape: torch.Size([128, 64])
Positive batch shape: torch.Size([128, 64])


### Step 6: Build the optimizer and scheduler (**students are required to complete this part!**)

In [6]:
# recommend to use the AdamW optimizer 
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

optimizer = AdamW(
    gpt.parameters(),
    lr=base_lr,
    betas=(0.9,0.95),
    eps=1e-8,
    weight_decay=0.01
)

scheduler = CosineAnnealingWarmRestarts(
    optimizer,
    T_0=10,          # restart every 10 epochs
    T_mult=1,        # no expansion of cycle length
    eta_min=1e-6
)

### Step 7: Begin training (**students are required to complete this part!**)

In [7]:
from torch.nn.utils import clip_grad_norm_

train_losses = []

for epoch in range(epochs):
    gpt.train()
    total_loss = 0
    pbar = tqdm(get_batches(data, batch_size), desc=f"Epoch {epoch+1}/{epochs}")

    for step, (neg_tensor, pos_tensor) in enumerate(pbar):
        optimizer.zero_grad()

        # Compute log probabilities
        pos_logprob = compute_logprob(pos_tensor)
        neg_logprob = compute_logprob(neg_tensor)

        # DPO loss (main preference objective)
        dpo_term = -F.logsigmoid((pos_logprob - neg_logprob) / beta).mean()

        # Auxiliary supervised loss — reinforces correct answers
        supervised_term = -pos_logprob.mean() * 0.05

        # Combined loss
        loss = dpo_term + supervised_term
        loss.backward()

        # Gradient clipping to prevent explosion
        clip_grad_norm_(gpt.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        train_losses.append(loss.item())
        pbar.set_postfix(loss=f"{loss.item():.4f}")

    avg_loss = total_loss / (step + 1)
    print(f"✅ Epoch {epoch+1} complete | Avg loss: {avg_loss:.4f}")

    # Save checkpoints every epoch
    ckpt_path = f"./dpo_epoch{epoch+1}.pt"
    torch.save({
        "model_state_dict": gpt.state_dict(),
        "model_args": ckpt['model_args']
    }, ckpt_path)
    print(f"💾 Saved checkpoint: {ckpt_path}")

Epoch 1/5: 781it [03:00,  4.32it/s, loss=0.0765]


✅ Epoch 1 complete | Avg loss: 1.0880
💾 Saved checkpoint: ./dpo_epoch1.pt


Epoch 2/5: 781it [03:14,  4.01it/s, loss=0.0546]


✅ Epoch 2 complete | Avg loss: 0.0623
💾 Saved checkpoint: ./dpo_epoch2.pt


Epoch 3/5: 781it [03:11,  4.07it/s, loss=0.0507]


✅ Epoch 3 complete | Avg loss: 0.0528
💾 Saved checkpoint: ./dpo_epoch3.pt


Epoch 4/5: 781it [03:19,  3.92it/s, loss=0.0488]


✅ Epoch 4 complete | Avg loss: 0.0500
💾 Saved checkpoint: ./dpo_epoch4.pt


Epoch 5/5: 781it [03:21,  3.87it/s, loss=0.0468]

✅ Epoch 5 complete | Avg loss: 0.0481
💾 Saved checkpoint: ./dpo_epoch5.pt





### Step 8: Begin testing (**students are required to complete this part!**)

In [None]:
# Load the fine-tuned model
ckpt_path = "../dpo/dpo_epoch5.pt"
checkpoint = torch.load(ckpt_path, map_location=device)
gptconf = GPTConfig(**checkpoint['model_args'])
gpt = GPT(gptconf).to(device)
try:
    state_dict = checkpoint['model']
except:
    state_dict = checkpoint['model_state_dict']
unwanted_prefix = '_orig_mod.'
for k,v in list(state_dict.items()):
    if k.startswith(unwanted_prefix):
        state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
gpt.load_state_dict(state_dict)
# Test
gpt.eval()
test_set = ["17+19=?", "3*17=?", "72/4=?", "72-x=34,x=?", "x*11=44,x=?", "3*17=?", "72/4=?", "72-x=34,x=?"]
with torch.no_grad():
    for prompt in test_set: 
        # prompt_ids = encode(prompt)
        prompt_ids = torch.tensor([encode(prompt)], dtype=torch.long, device=device)
        ###########################################################
        # Please complete the test code here!
        # ...
        # gpt.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
        # ...
        ###########################################################
        output = gpt.generate(
            prompt_ids,
            max_new_tokens=max_new_tokens,
            temperature=0.01,
            top_k=200
        )
        result = decode(output[0][0].tolist())
        print(f"Prompt: {prompt}\nModel Output: {result}\n")

FileNotFoundError: [Errno 2] No such file or directory: '../dpo/dpo3.pt'