### Aligning Transformer Architecture to GPT2 State Dict

In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
from load_gpt2_weights import convert_gpt2_weights, load_gpt2_weights, run_inference
from gpt2 import TransformerSampler, ModelConfig, GenerationConfig

# Build default model and generation configs, hand them to the project's
# weight-loading helper, and run inference on the object it returns.
model_cfg = ModelConfig()
gen_cfg = GenerationConfig()
# NOTE(review): presumably returns a TransformerSampler populated with the
# pretrained GPT-2 weights — confirm against load_gpt2_weights.py.
sampler = load_gpt2_weights(model_cfg, gen_cfg)
run_inference(sampler)

# Train GPT-2 from scratch

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datasets
from gpt2 import GPT2, ModelConfig, GenerationConfig
from torch.utils.data import DataLoader

In [5]:
class TrainingConfig():
    batch_size = 4096
    epochs = 1
    lr: float = 1e-3
    weight_decay: float = 1e-2
    wandb_project: str | None = "training_gpt2"
    wandb_name: str | None = None

In [9]:
from transformers import GPT2Tokenizer

# Load the tokenizer registered under the name 'gpt2'.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Reuse the end-of-sequence token as the padding token so that padded batches
# can be produced later (the tokenizer is called with padding=True below).
tokenizer.pad_token = tokenizer.eos_token

# Start from the default model config and size the vocabulary to the tokenizer's.
# Note: this rebinds `model_cfg` / `gen_cfg` from the first cell.
model_cfg = ModelConfig()
model_cfg.vocab_size = tokenizer.vocab_size

gen_cfg = GenerationConfig()
# Model constructed directly from the config.
# NOTE(review): presumably randomly initialised (no weight loading is visible
# here) — confirm against gpt2.py.
model = GPT2(model_cfg)

In [None]:
class Trainer():
    """Minimal next-token-prediction training loop.

    Owns an Adam optimizer and a cosine-annealing learning-rate schedule whose
    length (`T_max`) is the configured number of epochs, so the schedule is
    advanced once per epoch.
    """

    def __init__(self, training_config: "TrainingConfig", model: "GPT2"):
        """Store the config and model, and build the optimizer and scheduler.

        Args:
            training_config: Supplies `batch_size`, `epochs`, `lr` and
                `weight_decay`.
            model: Module whose parameters are optimized.
        """
        self.training_config = training_config
        self.model = model
        self.optimizer = optim.Adam(self.model.parameters(), lr=training_config.lr, weight_decay=training_config.weight_decay)
        self.scheduler = optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=training_config.epochs)

    @staticmethod
    def _token_ids(batch):
        """Return the token-id tensor from a collated batch.

        A dataset of dict-like examples collates to a dict; in that case the
        `input_ids` entry is used. Anything else is passed through unchanged.
        """
        if isinstance(batch, dict):
            # NOTE(review): assumes the dataset yields tensors for `input_ids`
            # (e.g. after `set_format("torch")`) — confirm with the caller.
            return batch["input_ids"]
        return batch

    def train(self, dataset: "datasets.Dataset"):
        """Run `epochs` passes over `dataset`, one optimizer step per batch.

        Args:
            dataset: Anything `DataLoader` accepts. Each collated batch must be
                a token-id tensor, or a dict holding one under `input_ids`.
        """
        dataloader = DataLoader(dataset, batch_size=self.training_config.batch_size, shuffle=True)
        self.model.train()
        for _ in range(self.training_config.epochs):
            for batch in dataloader:
                tokens = self._token_ids(batch)
                self.optimizer.zero_grad()
                # Call the module rather than `.forward` so registered hooks run.
                logits = self.model(tokens)
                loss = self.compute_loss(logits, tokens)
                loss.backward()
                self.optimizer.step()
            # T_max is measured in epochs, so advance the schedule once per epoch.
            self.scheduler.step()

    def compute_loss(self, logits: torch.Tensor, batch: torch.Tensor):
        """Cross-entropy between the logits and the tokens they should predict.

        Args:
            logits: Either `(batch, seq, vocab)` per-position logits, or any
                other shape `F.cross_entropy` accepts directly.
            batch: Token ids; `(batch, seq)` in the three-dimensional case.

        Returns:
            Scalar loss tensor (mean over all predicted positions).
        """
        if logits.dim() != 3:
            # Not a sequence of per-position logits: keep the plain call.
            return F.cross_entropy(logits, batch)
        # The logits at position t predict the token at position t + 1, so drop
        # the last position's logits and the first token of the targets.
        shifted_logits = logits[:, :-1, :]
        targets = batch[:, 1:]
        # F.cross_entropy wants (N, C) inputs with (N,) targets: flatten the
        # batch and sequence dimensions together.
        loss = F.cross_entropy(
            shifted_logits.reshape(-1, shifted_logits.size(-1)),
            targets.reshape(-1),
        )
        return loss

    def save_model(self, path: str):
        """Write the model's `state_dict` to `path` with `torch.save`."""
        torch.save(self.model.state_dict(), path)

In [None]:
def apply_chat_format(dataset: "datasets.Dataset"):
    """Add a `prompt` column wrapping each `text` entry in a chat template.

    Args:
        dataset: Dataset with a `text` column.

    Returns:
        The result of `dataset.map`, where `prompt` holds
        ``"Human: <text>\\n\\nAssistant:"`` for every example.
    """
    def _format_batch(batch):
        # With batched=True the mapped function receives a dict of lists, so
        # build one prompt per entry instead of formatting the list itself.
        return {'prompt': [f"Human: {text}\n\nAssistant:" for text in batch['text']]}

    return dataset.map(_format_batch, batched=True)

def prepare_data(dataset: datasets.Dataset):
    """Tokenize the `text` column of `dataset` in batches.

    Uses the notebook-level `tokenizer`, with truncation enabled and
    left-side padding, requesting PyTorch tensors from the tokenizer.

    Returns:
        The result of `dataset.map` over the tokenizer output.
    """
    def _tokenize(examples):
        texts = examples['text']
        return tokenizer(
            texts,
            return_tensors='pt',
            truncation=True,
            padding=True,
            padding_side='left',
        )

    return dataset.map(_tokenize, batched=True)


In [None]:
# Load the "train" split of the children-stories dataset from a local directory.
# NOTE(review): absolute, machine-specific path — the notebook will not run
# elsewhere unless this is made relative or configurable.
dataset = datasets.load_dataset("/home/ubuntu/MechInter/GPT-2/datasets/children-stories", split="train")

In [17]:
# Inspect the `prompt` field of the first example.
dataset[0]['prompt']

"Write an educational story (3-5 paragraphs) targeted at young children using simple words. The story should be inspired from this text snippet: \n“How does a Majority Leader prioritize their policy goals when working with members of their party and those across the aisle, and how do they determine which goals to focus on first?\nAs an AI language model, I don't have personal beliefs or experiences, but I can provide general information on how a Majority Leader may prioritize policy goals while working with members of their party and those across the aisle. Here are some steps that a Majority Leader may follow:\n\n1. Identify the key policy areas and issues that need to be addressed - Majority Leaders typically have an idea about the priority areas that need attention. They may consider the challenges facing their party, their constituents, or the nation as a whole.\n\n2. Assess the level of support for each policy goal - Majority Leaders need to gauge which policy goals have broad sup

# Single-layer transformer model

In [2]:
from transformer_lens import HookedTransformer, utils
import torch
# Configuration dictionary for this section of the notebook.
# NOTE(review): only "batch_size", "buffer_mult", "seq_len" and "enc_dtype" are
# read in this cell; the remaining keys (including "seed", which is not applied
# to any RNG here) are presumably consumed by later cells — confirm.
cfg = {
    "seed": 49,
    "batch_size": 4096,
    "buffer_mult": 384,
    "lr": 1e-4,
    "num_tokens": int(2e9),
    "l1_coeff": 3e-4,
    "beta1": 0.9,
    "beta2": 0.99,
    "dict_mult": 8,
    "seq_len": 128,
    "d_mlp": 2048,
    "enc_dtype":"fp32",
    "remove_rare_dir": False,
}
cfg["model_batch_size"] = 64
# Derived sizes: buffer_size = batch_size * buffer_mult, and
# buffer_batches = buffer_size // seq_len (integer division).
cfg["buffer_size"] = cfg["batch_size"] * cfg["buffer_mult"]
cfg["buffer_batches"] = cfg["buffer_size"] // cfg["seq_len"]
# Maps the short dtype names used for cfg["enc_dtype"] to torch dtypes.
DTYPES = {"fp32": torch.float32, "fp16": torch.float16, "bf16": torch.bfloat16}

# Load the pretrained "gelu-1l" model and cast it to the configured dtype.
# Note: this rebinds `model`, replacing the GPT2 instance created earlier.
model = HookedTransformer.from_pretrained("gelu-1l").to(DTYPES[cfg["enc_dtype"]])


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Loaded pretrained model gelu-1l into HookedTransformer
Changing model dtype to torch.float32
