### Step 1: Install necesscary packages

In [1]:
!pip install matplotlib torch numpy transformers datasets tiktoken wandb tqdm --quiet

### Step 2: Package imports and configuration

- Tokeniser appears to not have !

In [2]:
import sys
import os
sys.path.append(os.path.abspath("..")) 
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import pickle
from model import GPT, GPTConfig
import random
from tqdm import tqdm
import time
import json
import matplotlib.pyplot as plt

# Configuration
## Hyperparameters for training and generation
beta = 0.5
device = 'cuda' if torch.cuda.is_available() else 'cpu' # Choose GPU if it exists
base_lr = 1e-4
epochs = 5
batch_size = 64
max_length =64
num_samples = 1
max_new_tokens = 200
temperature = 0.8
top_k = 200

# Tokenizer - Loads a pickled dictionary, which is a saved dict
## Contains stoi and iots
with open("../sft/meta.pkl", "rb") as f:
    meta = pickle.load(f)
stoi, itos = meta["stoi"], meta["itos"]
# Converts a string into a list of token IDs
def encode(s): 
    # TODO Drop unknown chars like ! for now
    return [stoi[c] for c in s if c in stoi]
# Convert a list of token IDs back into a string
def decode(l): 
    return ''.join([itos[i] for i in l])

### Step 3: Define helper functions

In [3]:
"""
Used for scoring various sequences of tokens generated by the GPT to decide which sequence should be returned
"""
def compute_logprob(input_ids):
    # All tokens except for the last token is used as input
    inputs = input_ids[:, :-1]
    # All tokens except for the first, i.e. the next target token that should be predicted
    targets = input_ids[:, 1:]
    # Runs the model to get the logits (unnormalized scores) for each possible next token at each position
    logits, _ = gpt(inputs, full_seq=True)

    # Reshape both the logits and targets to the correct shape, i.e. (batch_size * seq_length)
    B, T, V = logits.size()
    logits_flat = logits.reshape(-1, V)
    targets_flat = targets.reshape(-1)
    # Calculates the cross-entropy loss (negative log-probability) for each token
    loss = F.cross_entropy(logits_flat, targets_flat, ignore_index=0, reduction='none')
    loss = loss.reshape(B, T)
    # Creates a mask to ignore padding tokens
    attention_mask = (targets != 0).float()
    loss = (loss * attention_mask).sum(dim=1) / attention_mask.sum(dim=1)
    # Returns the negative loss, which is the average log-probability per sequence (higher is better, since log-probs are negative)
    return -loss 

"""
Ensures that every sequence is exactly max_length tokens long, either by truncating longer sequences or padding shorter ones with zeros (the padding token)

Either keep the last max_length tokens if length exceeds.
Or pads with 0s to reach the end
"""
def pad_or_truncate(seq, max_length):
    return seq[-max_length:] if len(seq) > max_length else seq + [0] * (max_length - len(seq))

"""
Prepares batches of positive and negative examples for training or evaluation, yielding them as tensors ready for the model
"""
def get_batches(lines, batch_size):
    # Randomise the order of data for each epoch
    random.shuffle(lines)
    #for l in lines:
    #    print(l[1])
    for i in range(0, len(lines), batch_size):
        batch = lines[i:i+batch_size]
        # Skips the last batch if the size is too small to make sure all batches have the same size
        if len(batch) < batch_size:
            continue
        # Encode the negative and positive strings and pad/truncate to maxLenght
        neg_inputs = [pad_or_truncate(encode(p['negative'] + '\n\n\n\n'), max_length) for p in batch]
        pos_inputs = [pad_or_truncate(encode(p['positive'] + '\n\n\n\n'), max_length) for p in batch]
        # Converts the lists of token IDs into PyTorch tensors on the correct device.
        neg_tensor = torch.tensor(neg_inputs, dtype=torch.long, device=device)
        pos_tensor = torch.tensor(pos_inputs, dtype=torch.long, device=device)
        # Yields a tuple (neg_tensor, pos_tensor) for each batch, as tensors are used for model input
        yield neg_tensor, pos_tensor

### Step 4: Load the pretrained NanoGPT model

In [4]:
# Load the pre-trained model, i.e. the checkpoint, which has all the model weights
ckpt = torch.load("../sft/gpt.pt", map_location=device)
# Recreate the model configuration with the saved details
gptconf = GPTConfig(**ckpt['model_args'])
# Create a new instance of the model using the pre-trained configs
gpt = GPT(gptconf)

# Clean up any keys that might have unwanted prefixes while saving
state_dict = ckpt['model']
unwanted_prefix = '_orig_mod.'
for k in list(state_dict.keys()):
    if k.startswith(unwanted_prefix):
        state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)

# Load the model weights from the checkpoint to the new model instance
gpt.load_state_dict(state_dict)
# Set it to training mode after ensuring it runs on GPU if it exists
gpt.to(device).train()

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(74, 348)
    (wpe): Embedding(256, 348)
    (drop): Dropout(p=0.2, inplace=False)
    (h): ModuleList(
      (0-5): 6 x Block(
        (ln_1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=348, out_features=1044, bias=False)
          (c_proj): Linear(in_features=348, out_features=348, bias=False)
          (attn_dropout): Dropout(p=0.2, inplace=False)
          (resid_dropout): Dropout(p=0.2, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=348, out_features=1392, bias=False)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=1392, out_features=348, bias=False)
          (dropout): Dropout(p=0.2, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Linear(in_features=348, out_features=74, bias=False)
)

### Step 5: Load Data (**students are required to complete this part!**)

- Currently I generated 5000 simple arithmetic like the ones provided, and 5000 more 2 step arithmetic like 2*(5+3)=?
- Would need to generate more until at least 100k in 1:1 ratio
- Then maybe 50k more for ones that uses brackets like (x+5)*2=30,x=?. Need about 50k to maintain the ratio

In [13]:
# Load the positive and negative pairs
with open("./pos_neg_pairs.json") as file:
    lines = json.load(file)
print(f"lines' size:  {len(lines)}")

lines' size:  10024


### Step 6: Build the optimizer and scheduler (**students are required to complete this part!**)

In [6]:
# recommend to use the AdamW optimizer
adamw_optimizer = torch.optim.AdamW(gpt.parameters(), lr=base_lr)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(adamw_optimizer, T_max=epochs)

### Step 7: Begin training (**students are required to complete this part!**)

In [7]:
# Lines refer to the number of lines of training data we have so we know how much steps/batches is req to go thru dataset once
total_steps = len(lines) // batch_size

# Loop over the dataset for different epochs
for epoch in range(epochs):
    # Retrieve the batches of positive and negative tensors from the helper function we saw earlier
    pbar = tqdm(get_batches(lines, batch_size))
    for step, (neg_tensor,pos_tensor) in enumerate(pbar):
        # 1. Zero the accumulated gradients from previous batch of training or epoch of training. Do once at the start of every batch
        adamw_optimizer.zero_grad()
        # 2. Calculate the actual loss using the loss function in helper. We use DPO
        ## First calculate the log probs using the helper
        positive_log_prob = compute_logprob(pos_tensor)
        negative_log_prob = compute_logprob(neg_tensor)
        ## Assign higher weights to positive completions in this case, so the model knows what to do.
        dpo_loss = -F.logsigmoid((positive_log_prob - negative_log_prob) / beta).mean() - positive_log_prob.mean() * 0.1
        # 3. Backward Propagation, i.e. compute gradient of loss w.r.t model params
        dpo_loss.backward()
        # 4. Use optimiser to update the model params using new gradients to reduce dpo_loss
        adamw_optimizer.step()
        # 5. Update the progress bar so we can see how the training is going
        pbar.set_description(f"Epoch {epoch+1} Step {step+1} Loss {dpo_loss.item():.4f}")

    # Specify where to save our trained model
    ckpt_path = f"./dpo.pt"
    # Saves the model's weights and configuration after each epoch, so you can resume training or use the model later
    torch.save({
        "model_state_dict": gpt.state_dict(),
        "model_args": ckpt['model_args'],
    }, ckpt_path)
    print(f"Saved checkpoint to {ckpt_path}")

Epoch 1 Step 156 Loss 0.1232: : 156it [03:11,  1.23s/it]


Saved checkpoint to ./dpo.pt


Epoch 2 Step 156 Loss 0.0952: : 156it [03:00,  1.16s/it]


Saved checkpoint to ./dpo.pt


Epoch 3 Step 156 Loss 0.0866: : 156it [02:50,  1.09s/it]


Saved checkpoint to ./dpo.pt


Epoch 4 Step 156 Loss 0.0754: : 156it [03:02,  1.17s/it]


Saved checkpoint to ./dpo.pt


Epoch 5 Step 156 Loss 0.0695: : 156it [02:59,  1.15s/it]

Saved checkpoint to ./dpo.pt





- The loss never stopped decreasing, so perhaps we can increase the number of epochs, since it have not overfitted on the train set yet. Even though we are using 10k only for now.
- can increase epoch since the dpo loss did not plateau or increase
- can consider increase batch size to 128

### Step 8: Begin testing (**students are required to complete this part!**)

In [8]:
# Check to see where the weights are stored, cfm all keys
checkpoint = torch.load(ckpt_path, map_location=device)
print(list(checkpoint.keys()))

['model_state_dict', 'model_args']


In [9]:
# Load the fine-tuned model that we trained earlier above
ckpt_path = "../dpo/dpo.pt"
checkpoint = torch.load(ckpt_path, map_location=device)
# Load the saved model args
gptconf = GPTConfig(**checkpoint['model_args'])
# gpt = GPT(gptconf).cuda()
gpt = GPT(gptconf).to(device)

# TODO Loading the model's save weights from model key first, then fall back to the second key (I think not necessary to try since we save under the 2nd key?)
# try:
#     state_dict = checkpoint['model']
# except:
#     state_dict = checkpoint['model_state_dict']
state_dict = checkpoint['model_state_dict']

# Clean the data keys in dict again. Remember the keys might get affected whenever we save a checkpoint
unwanted_prefix = '_orig_mod.'
for k,v in list(state_dict.items()):
    if k.startswith(unwanted_prefix):
        state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
gpt.load_state_dict(state_dict)

<All keys matched successfully>

In [11]:
# Test - Set the Model to Eval mode for deterministic testing behaviur
gpt.eval()
# Load the test set
test_set = ["17+19=?"]
# test_set = ["17+19=?", "3*17=?", "72/4=?", "72-x=34,x=?", "x*11=44,x=?", "3*17=?", "72/4=?", "72-x=34,x=?"]

# Disable gradient computation since we are not training the model anymore
with torch.no_grad():
    for prompt in test_set: 
        # Encode each string into a list of string IDs, which are the numerical representations of the string using the tokeniser
        # Allow us to provide as input
        prompt_ids = pad_or_truncate(encode(prompt), max_length)
        # Convert to tensors so the model can use them. Note that there is only 1 tensor
        test_tensor = torch.tensor(prompt_ids, dtype=torch.long, device=device).unsqueeze(0)
        # Use the test tensors to generate an answer
        y, _ = gpt.generate(test_tensor, max_new_tokens=max_new_tokens, temperature=temperature, top_k=top_k)
        # Decode back for human understanding
        output = decode(y[0].tolist())
        print(f"Prompt: {prompt}\nOutput: {output}\n")

Prompt: 17+19=?
Output: 17+19=?
























































Whherhe ansere anss is yo   be e you  okeak muak 09Dhe any pek Erish? uandewed 2300 *9 = 887 and 91493.

