# Enhancing Adversarial Robustness of Nano GPT

### ECS 189G Final Project

#### Sujash Barman, Jeffrey Wang, Dehui Chen

# Training with DailyDialog

In [2]:
# replace this line with your own directory
cwd = "/content/drive/MyDrive/ECS 189G/"

import os
import pickle
import requests
import numpy as np

import time
import math
import inspect
import tiktoken
from contextlib import nullcontext
from dataclasses import dataclass

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group

# settings
out_dir = os.path.join(cwd, "out")
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

always_save_checkpoint = False # we expect to overfit on this small dataset, so only save when val improves
eval_only = False # if True, script exits right after the first eval
init_from = 'scratch' # 'scratch' or 'resume' or 'gpt2*'

wandb_log = False # override via command line if you like
wandb_project = 'dial-char'
wandb_run_name = 'mini-gpt'

dataset = 'daily-dialog'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2
bias = False

learning_rate = 1e-3 # with baby networks can afford to go a bit higher
max_iters = 5000
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.99 # make a bit bigger because number of tokens per iter is small
grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0

# learning rate decay settings
decay_lr = True # whether to decay the learning rate
warmup_iters = 100 # not super necessary potentially
lr_decay_iters = 5000 # make equal to max_iters usually
min_lr = 1e-4 # learning_rate / 10 usually

# DDP settings
backend = 'nccl' # 'nccl', 'gloo', etc.

# system
device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
dtype = 'float16' # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler
compile = True # use PyTorch 2.0 to compile the model to be faster

#Model Architecture

In [9]:
class LayerNorm(nn.Module):
    """ LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False """

    def __init__(self, ndim, bias):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None

    def forward(self, input):
        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)

class CausalSelfAttention(nn.Module):

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        # output projection
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        # regularization
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.dropout = config.dropout
        # flash attention make GPU go brrrrr but support is only in PyTorch >= 2.0
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        if not self.flash:
            print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
            # causal mask to ensure that attention is only applied to the left in the input sequence
            self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
                                        .view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        q, k, v  = self.c_attn(x).split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)

        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        if self.flash:
            # efficient attention using Flash Attention CUDA kernels
            y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout if self.training else 0, is_causal=True)
        else:
            # manual implementation of attention
            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
            att = F.softmax(att, dim=-1)
            att = self.attn_dropout(att)
            y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side

        # output projection
        y = self.resid_dropout(self.c_proj(y))
        return y

class MLP(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.c_fc    = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
        self.gelu    = nn.GELU()
        self.c_proj  = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        x = self.dropout(x)
        return x

class Block(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x

@dataclass
class GPTConfig:
    block_size: int = 1024
    vocab_size: int = 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
    n_layer: int = 12
    n_head: int = 12
    n_embd: int = 768
    dropout: float = 0.0
    bias: bool = True # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster

class GPT(nn.Module):

    def __init__(self, config):
        super().__init__()
        assert config.vocab_size is not None
        assert config.block_size is not None
        self.config = config

        self.transformer = nn.ModuleDict(dict(
          wte = nn.Embedding(config.vocab_size, config.n_embd),
          wpe = nn.Embedding(config.block_size, config.n_embd),
          drop = nn.Dropout(config.dropout),
          # Use ARTM‐augmented blocks here:
          h   = nn.ModuleList([TransformerBlockWithARTM(config) for _ in range(config.n_layer)]),
          ln_f = LayerNorm(config.n_embd, bias=config.bias),
        ))


        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        # with weight tying when using torch.compile() some warnings get generated:
        # "UserWarning: functional_call was passed multiple values for tied weights.
        # This behavior is deprecated and will be an error in future versions"
        # not 100% sure what this is, so far seems to be harmless. TODO investigate
        self.transformer.wte.weight = self.lm_head.weight # https://paperswithcode.com/method/weight-tying

        # init all weights
        self.apply(self._init_weights)
        # apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))

        # report number of parameters
        print("number of parameters: %.2fM" % (self.get_num_params()/1e6,))

    def get_num_params(self, non_embedding=True):
        """
        Return the number of parameters in the model.
        For non-embedding count (default), the position embeddings get subtracted.
        The token embeddings would too, except due to the parameter sharing these
        params are actually used as weights in the final layer, so we include them.
        """
        n_params = sum(p.numel() for p in self.parameters())
        if non_embedding:
            n_params -= self.transformer.wpe.weight.numel()
        return n_params

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        device = idx.device
        b, t = idx.size()
        assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
        pos = torch.arange(0, t, dtype=torch.long, device=device) # shape (t)

        # forward the GPT model itself
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd)
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        if targets is not None:
            # if we are given some desired targets also calculate the loss
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            # inference-time mini-optimization: only forward the lm_head on the very last position
            logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
            loss = None

        return logits, loss

    def crop_block_size(self, block_size):
        # model surgery to decrease the block size if necessary
        # e.g. we may load the GPT2 pretrained model checkpoint (block size 1024)
        # but want to use a smaller block size for some smaller, simpler model
        assert block_size <= self.config.block_size
        self.config.block_size = block_size
        self.transformer.wpe.weight = nn.Parameter(self.transformer.wpe.weight[:block_size])
        for block in self.transformer.h:
            if hasattr(block.attn, 'bias'):
                block.attn.bias = block.attn.bias[:,:,:block_size,:block_size]

    @classmethod
    def from_pretrained(cls, model_type, override_args=None):
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
        override_args = override_args or {} # default to empty dict
        # only dropout can be overridden see more notes below
        assert all(k == 'dropout' for k in override_args)
        from transformers import GPT2LMHeadModel
        print("loading weights from pretrained gpt: %s" % model_type)

        # n_layer, n_head and n_embd are determined from model_type
        config_args = {
            'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
            'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
            'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
            'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
        }[model_type]
        print("forcing vocab_size=50257, block_size=1024, bias=True")
        config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
        config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
        config_args['bias'] = True # always True for GPT model checkpoints
        # we can override the dropout rate, if desired
        if 'dropout' in override_args:
            print(f"overriding dropout rate to {override_args['dropout']}")
            config_args['dropout'] = override_args['dropout']
        # create a from-scratch initialized minGPT model
        config = GPTConfig(**config_args)
        model = GPT(config)
        sd = model.state_dict()
        sd_keys = sd.keys()
        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param

        # init a huggingface/transformers model
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # copy while ensuring all of the parameters are aligned and match in names and shapes
        sd_keys_hf = sd_hf.keys()
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
        # this means that we have to transpose these weights when we import them
        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
        for k in sd_keys_hf:
            if any(k.endswith(w) for w in transposed):
                # special treatment for the Conv1D weights we need to transpose
                assert sd_hf[k].shape[::-1] == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                # vanilla copy over the other parameters
                assert sd_hf[k].shape == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])

        return model

    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
        # start with all of the candidate parameters
        param_dict = {pn: p for pn, p in self.named_parameters()}
        # filter out those that do not require grad
        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
        # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)
        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
        # Create AdamW optimizer and use the fused version if it is available
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == 'cuda'
        extra_args = dict(fused=True) if use_fused else dict()
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
        print(f"using fused AdamW: {use_fused}")

        return optimizer

    def estimate_mfu(self, fwdbwd_per_iter, dt):
        """ estimate model flops utilization (MFU) in units of A100 bfloat16 peak FLOPS """
        # first estimate the number of flops we do per iteration.
        # see PaLM paper Appendix B as ref: https://arxiv.org/abs/2204.02311
        N = self.get_num_params()
        cfg = self.config
        L, H, Q, T = cfg.n_layer, cfg.n_head, cfg.n_embd//cfg.n_head, cfg.block_size
        flops_per_token = 6*N + 12*L*H*Q*T
        flops_per_fwdbwd = flops_per_token * T
        flops_per_iter = flops_per_fwdbwd * fwdbwd_per_iter
        # express our flops throughput as ratio of A100 bfloat16 peak flops
        flops_achieved = flops_per_iter * (1.0/dt) # per second
        flops_promised = 312e12 # A100 GPU bfloat16 peak flops is 312 TFLOPS
        mfu = flops_achieved / flops_promised
        return mfu

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.
        """
        for _ in range(max_new_tokens):
            # if the sequence context is growing too long we must crop it at block_size
            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
            # forward the model to get the logits for the index in the sequence
            logits, _ = self(idx_cond)
            # pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature
            # optionally crop the logits to only the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)

        return idx

# ----------------------------------------------------------------------------
class GPTForClassification(GPT):
    """
    Extends NanoGPT so we can use it as a classifier (SST-2 / QQP / MNLI).
    We simply take the hidden state at the final position (T-1) and
    add a tiny linear head to predict num_labels classes.
    """
    def __init__(self, config: GPTConfig, num_labels: int):
        super().__init__(config)
        self.num_labels = num_labels
        self.classifier = nn.Linear(config.n_embd, num_labels, bias=True)
        # Initialize classification head
        nn.init.normal_(self.classifier.weight, mean=0.0, std=0.02)
        if self.classifier.bias is not None:
            nn.init.zeros_(self.classifier.bias)

    def forward(self, idx, labels=None):
        """
        idx: LongTensor, shape (B, T)
        labels: LongTensor (B,) or None
        Returns:
          - if labels is None: (logits, None)
          - if labels is provided: (logits, loss)
        """
        device = idx.device
        b, t = idx.size()
        pos = torch.arange(0, t, dtype=torch.long, device=device)

        # 1) Token + Positional Embeddings
        tok_emb = self.transformer.wte(idx)        # (B, T, n_embd)
        pos_emb = self.transformer.wpe(pos)        # (T, n_embd)
        x = self.transformer.drop(tok_emb + pos_emb)

        # 2) Pass through Transformer blocks
        for block in self.transformer.h:
            x = block(x)                            # (B, T, n_embd)

        x = self.transformer.ln_f(x)               # (B, T, n_embd)

        # 3) Pool hidden state at final position (T-1)
        pooled = x[:, -1, :]                       # (B, n_embd)

        # 4) Classification head
        logits = self.classifier(pooled)           # (B, num_labels)

        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels),
                            labels.view(-1))
        else:
            loss = None

        return logits, loss
# ----------------------------------------------------------------------------

In [6]:
def compute_head_coherence_scores(attn_layer: CausalSelfAttention, x_norm: torch.Tensor, pos_idx: int) -> torch.Tensor:
    """
    Placeholder for per‐position “head coherence” features.
    In practice, you would extract the raw attention weights inside attn_layer
    (e.g. by modifying CausalSelfAttention to store 'att' before softmax). Here we return zeros.

    Args:
      attn_layer: instance of CausalSelfAttention (already invoked on x_norm)
      x_norm:      normalized pre-attention input (shape: B, T, n_embd)
      pos_idx:     integer index t for which we want features

    Returns:
      torch.Tensor of shape (B, head_feat_dim). We use head_feat_dim=1 for simplicity.
    """
    B, T, C = x_norm.size()
    # Example: just return a zero scalar per batch element
    return torch.zeros(B, 1, device=x_norm.device)

In [7]:
class TransformerBlockWithARTM(nn.Module):
    """
    A Transformer block that includes the Adversarial-Repair Thinking Module (ARTM)
    right after the masked MHSA output.

    ARTM inspects both:
      - h_att = MHSA(LayerNorm(x_in))
      - z_norm = LayerNorm(x_in + pos_emb)

    and emits a correction Δh for each position.
    """
    def __init__(self, config: GPTConfig):
        super().__init__()
        self.config = config
        # LayerNorm before attention
        self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
        # Causal masked self-attention (possibly PAAF-patched already)
        self.attn = CausalSelfAttention(config)
        # LayerNorm before MLP
        self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
        # MLP feed-forward
        self.mlp = MLP(config)

        # Compute feature dimension for ARTM_FFN:
        #   - z_norm[t] (n_embd)
        #   - h_att[t]  (n_embd)
        #   - d_prev (1)
        #   - d_next (1)
        #   - head_coh (1, or more if you extract per-head features)
        feat_dim = 2 * config.n_embd + 3  # last “3” = (d_prev, d_next, head_coh_dim=1)
        self.artm_ffn = ARTM_FFN(config.n_embd, feat_dim)

        # Positional embedding is stored in GPT's `wpe`, so we will pull from there.

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        x: (B, T, n_embd) input to this block
        returns: x_out = x + h_corr + MLP(LN(x + h_corr))
        """
        B, T, C = x.size()
        device = x.device

        # 1) Pre-attention LayerNorm
        x_norm = self.ln_1(x)  # (B, T, C)

        # 2) Masked self-attention
        h_att = self.attn(x_norm)  # (B, T, C)

        # 3) Positional-encoded input and its normalization
        #    We need the positional embeddings from the parent GPT model.
        #    Assuming the parent GPT made `pos_emb = self.wpe(pos_idx)` earlier,
        #    we just reconstruct here:
        pos = torch.arange(0, T, dtype=torch.long, device=device)  # (T,)
        pos_emb = self.attn.c_attn.weight.new_zeros(B, T, C)
        # Actually, we need to reach back into the GPT module to get wpe:
        # But in a typical usage, GPT’s forward has already done: z = tok_emb + pos_emb.
        # So here we assume x (the input) already includes pos_emb. If not, you can accept pos_emb via closure.

        # For clarity: assume x was computed as (tok_emb + pos_emb) before passing to this block.
        z = x  # if x = (tok_emb + pos_emb) downstream from GPT’s embedding + dropout
        z_norm = LayerNorm(C, bias=self.config.bias)(z)  # (B, T, C)

        # 4) ARTM: compute Δh for each position
        #    We will build a feature tensor of shape (B, T, feat_dim)
        #    then apply artm_ffn in one shot.
        #    To do so, we need to gather d_prev, d_next, and head_coh.

        # 4a) Compute d_prev & d_next:
        #     Create two tensors of shape (B, T, 1)
        d_prev = torch.zeros(B, T, 1, device=device)
        d_next = torch.zeros(B, T, 1, device=device)

        if T > 1:
            # distance between z_norm[:, t] and z_norm[:, t-1]
            d_prev[:, 1:, 0] = torch.norm(
                z_norm[:, 1:, :] - z_norm[:, :-1, :],
                dim=-1
            )
            # distance between z_norm[:, t] and z_norm[:, t+1]
            d_next[:, :-1, 0] = torch.norm(
                z_norm[:, :-1, :] - z_norm[:, 1:, :],
                dim=-1
            )
        # 4b) Compute head_coh for all positions at once
        #     We will simply call compute_head_coherence_scores in a loop for each t,
        #     because typically head_coh is scalar per position.

        head_coh = torch.zeros(B, T, 1, device=device)
        # If you want to vectorize, implement a batch-version of compute_head_coherence_scores.
        for t in range(T):
            # Pass both x_norm and x_norm (or Q,K) to extract head features at t
            head_coh[:, t, :] = compute_head_coherence_scores(self.attn, x_norm, t)

        # 4c) Concatenate features along last dim: (B, T, 2*C + 3)
        feat = torch.cat([
            z_norm,         # (B, T, C)
            h_att,          # (B, T, C)
            d_prev,         # (B, T, 1)
            d_next,         # (B, T, 1)
            head_coh        # (B, T, 1)
        ], dim=-1)           # results in (B, T, 2*C + 3)

        # 4d) Run through ARTM_FFN to get Δh: (B, T, C)
        delta_h = self.artm_ffn(feat)  # (B, T, C)

        # 4e) Corrected attention output
        h_corr = h_att + delta_h  # (B, T, C)

        # 5) Residual + MLP path
        x1 = x + h_corr            # (B, T, C)
        x2 = x1 + self.mlp(self.ln_2(x1))  # (B, T, C)

        return x2

#Preparing Training Data

In [None]:
# download the dataset
input_file_path = os.path.join(cwd, 'dial.txt')
with open(input_file_path, 'r') as f:
    data = f.read()
print(f"length of dataset in characters: {len(data):,}")

# get all the unique characters that occur in this text
chars = sorted(list(set(data)))
vocab_size = len(chars)
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
def encode(s):
    return [stoi[c] for c in s] # encoder: take a string, output a list of integers
def decode(l):
    return ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# create the train and test splits
n = len(data)
train_data = data[:int(n*0.9)]
val_data = data[int(n*0.9):]

# encode both to integers
train_ids = encode(train_data)
val_ids = encode(val_data)
print(f"train has {len(train_ids):,} tokens")
print(f"val has {len(val_ids):,} tokens")

# export to bin files
train_ids = np.array(train_ids, dtype=np.uint16)
val_ids = np.array(val_ids, dtype=np.uint16)
train_ids.tofile(os.path.join(cwd, 'train.bin'))
val_ids.tofile(os.path.join(cwd, 'val.bin'))

# save the meta information as well, to help us encode/decode later
meta = {
    'vocab_size': vocab_size,
    'itos': itos,
    'stoi': stoi,
}
with open(os.path.join(cwd, 'meta.pkl'), 'wb') as f:
    pickle.dump(meta, f)

length of dataset in characters: 5,319,059
all the unique characters: 
 !"#$%&'()*+,-./0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ\_abcdefghijklmnopqrstuvwxyz~£¥°–—‘’“”′、。
vocab size: 100
train has 4,787,153 tokens
val has 531,906 tokens


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#Training Loop

In [13]:
# -----------------------------------------------------------------------------
config_keys = [k for k,v in globals().items() if not k.startswith('_') and isinstance(v, (int, float, bool, str))]
config = {k: globals()[k] for k in config_keys} # will be useful for logging
# -----------------------------------------------------------------------------

# various inits, derived attributes, I/O setup
ddp = int(os.environ.get('RANK', -1)) != -1 # is this a ddp run?
if ddp:
    init_process_group(backend=backend)
    ddp_rank = int(os.environ['RANK'])
    ddp_local_rank = int(os.environ['LOCAL_RANK'])
    ddp_world_size = int(os.environ['WORLD_SIZE'])
    device = f'cuda:{ddp_local_rank}'
    torch.cuda.set_device(device)
    master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
    seed_offset = ddp_rank # each process gets a different seed
    # world_size number of processes will be training simultaneously, so we can scale
    # down the desired gradient accumulation iterations per process proportionally
    assert gradient_accumulation_steps % ddp_world_size == 0
    gradient_accumulation_steps //= ddp_world_size
else:
    # if not ddp, we are running on a single gpu, and one process
    master_process = True
    seed_offset = 0
    ddp_world_size = 1
tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * block_size
print(f"tokens per iteration will be: {tokens_per_iter:,}")

if master_process:
    os.makedirs(out_dir, exist_ok=True)
torch.manual_seed(1337 + seed_offset)
torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
# note: float16 data type will automatically use a GradScaler
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)

# poor man's data loader
data_dir = cwd
def get_batch(split):
    # We recreate np.memmap every batch to avoid a memory leak, as per
    # https://stackoverflow.com/questions/45132940/numpy-memmap-memory-usage-want-to-iterate-once/61472122#61472122
    if split == 'train':
        data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
    else:
        data = np.memmap(os.path.join(data_dir, 'val.bin'), dtype=np.uint16, mode='r')
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
    y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
    if device_type == 'cuda':
        # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
        x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
    else:
        x, y = x.to(device), y.to(device)
    return x, y

# init these up here, can override if init_from='resume' (i.e. from a checkpoint)
iter_num = 0
best_val_loss = 1e9

# attempt to derive vocab_size from the dataset
meta_path = os.path.join(data_dir, 'meta.pkl')
meta_vocab_size = None
if os.path.exists(meta_path):
    with open(meta_path, 'rb') as f:
        meta = pickle.load(f)
    meta_vocab_size = meta['vocab_size']
    print(f"found vocab_size = {meta_vocab_size} (inside {meta_path})")

# model init
model_args = dict(n_layer=n_layer, n_head=n_head, n_embd=n_embd, block_size=block_size,
                  bias=bias, vocab_size=None, dropout=dropout) # start with model_args from command line
if init_from == 'scratch':
    # init a new model from scratch
    print("Initializing a new model from scratch")
    # determine the vocab size we'll use for from-scratch training
    if meta_vocab_size is None:
        print("defaulting to vocab_size of GPT-2 to 50304 (50257 rounded up for efficiency)")
    model_args['vocab_size'] = meta_vocab_size if meta_vocab_size is not None else 50304
    gptconf = GPTConfig(**model_args)
    model = GPT(gptconf)
elif init_from == 'resume':
    print(f"Resuming training from {out_dir}")
    # resume training from a checkpoint.
    ckpt_path = os.path.join(out_dir, 'ckpt.pt')
    checkpoint = torch.load(ckpt_path, map_location=device)
    checkpoint_model_args = checkpoint['model_args']
    # force these config attributes to be equal otherwise we can't even resume training
    # the rest of the attributes (e.g. dropout) can stay as desired from command line
    for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
        model_args[k] = checkpoint_model_args[k]
    # create the model
    gptconf = GPTConfig(**model_args)
    model = GPT(gptconf)
    state_dict = checkpoint['model']
    # fix the keys of the state dictionary :(
    # honestly no idea how checkpoints sometimes get this prefix, have to debug more
    unwanted_prefix = '_orig_mod.'
    for k,v in list(state_dict.items()):
        if k.startswith(unwanted_prefix):
            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
    model.load_state_dict(state_dict)
    iter_num = checkpoint['iter_num']
    best_val_loss = checkpoint['best_val_loss']
elif init_from.startswith('gpt2'):
    print(f"Initializing from OpenAI GPT-2 weights: {init_from}")
    # initialize from OpenAI GPT-2 weights
    override_args = dict(dropout=dropout)
    model = GPT.from_pretrained(init_from, override_args)
    # read off the created config params, so we can store them into checkpoint correctly
    for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
        model_args[k] = getattr(model.config, k)
# crop down the model block size if desired, using model surgery
if block_size < model.config.block_size:
    model.crop_block_size(block_size)
    model_args['block_size'] = block_size # so that the checkpoint will have the right value
model.to(device)

# initialize a GradScaler. If enabled=False scaler is a no-op
scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))

# optimizer
optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type)
if init_from == 'resume':
    optimizer.load_state_dict(checkpoint['optimizer'])
checkpoint = None # free up memory

# compile the model
if compile:
    print("compiling the model... (takes a ~minute)")
    unoptimized_model = model
    model = torch.compile(model) # requires PyTorch 2.0

# wrap model into DDP container
if ddp:
    model = DDP(model, device_ids=[ddp_local_rank])

# helps estimate an arbitrarily accurate loss over either split using many batches
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            with ctx:
                logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

# learning rate decay scheduler (cosine with warmup)
def get_lr(it):
    # 1) linear warmup for warmup_iters steps
    if it < warmup_iters:
        return learning_rate * (it + 1) / (warmup_iters + 1)
    # 2) if it > lr_decay_iters, return min learning rate
    if it > lr_decay_iters:
        return min_lr
    # 3) in between, use cosine decay down to min learning rate
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff ranges 0..1
    return min_lr + coeff * (learning_rate - min_lr)

# logging
if wandb_log and master_process:
    import wandb
    wandb.init(project=wandb_project, name=wandb_run_name, config=config)

# training loop
X, Y = get_batch('train') # fetch the very first batch
t0 = time.time()
local_iter_num = 0 # number of iterations in the lifetime of this process
raw_model = model.module if ddp else model # unwrap DDP container if needed
running_mfu = -1.0
while True:

    # determine and set the learning rate for this iteration
    lr = get_lr(iter_num) if decay_lr else learning_rate
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    # evaluate the loss on train/val sets and write checkpoints
    if iter_num % eval_interval == 0 and master_process:
        losses = estimate_loss()
        print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        if wandb_log:
            wandb.log({
                "iter": iter_num,
                "train/loss": losses['train'],
                "val/loss": losses['val'],
                "lr": lr,
                "mfu": running_mfu*100, # convert to percentage
            })
        if losses['val'] < best_val_loss or always_save_checkpoint:
            best_val_loss = losses['val']
            if iter_num > 0:
                checkpoint = {
                    'model': raw_model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'model_args': model_args,
                    'iter_num': iter_num,
                    'best_val_loss': best_val_loss,
                    'config': config,
                }
                print(f"saving checkpoint to {out_dir}")
                torch.save(checkpoint, os.path.join(out_dir, 'ckpt.pt'))
    if iter_num == 0 and eval_only:
        break

    # forward backward update, with optional gradient accumulation to simulate larger batch size
    # and using the GradScaler if data type is float16
    for micro_step in range(gradient_accumulation_steps):
        if ddp:
            # in DDP training we only need to sync gradients at the last micro step.
            # the official way to do this is with model.no_sync() context manager, but
            # I really dislike that this bloats the code and forces us to repeat code
            # looking at the source of that context manager, it just toggles this variable
            model.require_backward_grad_sync = (micro_step == gradient_accumulation_steps - 1)
        with ctx:
            logits, loss = model(X, Y)
            loss = loss / gradient_accumulation_steps # scale the loss to account for gradient accumulation
        # immediately async prefetch next batch while model is doing the forward pass on the GPU
        X, Y = get_batch('train')
        # backward pass, with gradient scaling if training in fp16
        scaler.scale(loss).backward()
    # clip the gradient
    if grad_clip != 0.0:
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    # step the optimizer and scaler if training in fp16
    scaler.step(optimizer)
    scaler.update()
    # flush the gradients as soon as we can, no need for this memory anymore
    optimizer.zero_grad(set_to_none=True)

    # timing and logging
    t1 = time.time()
    dt = t1 - t0
    t0 = t1
    if iter_num % log_interval == 0 and master_process:
        # get loss as float. note: this is a CPU-GPU sync point
        # scale up to undo the division above, approximating the true total loss (exact would have been a sum)
        lossf = loss.item() * gradient_accumulation_steps
        if local_iter_num >= 5: # let the training loop settle a bit
            mfu = raw_model.estimate_mfu(batch_size * gradient_accumulation_steps, dt)
            running_mfu = mfu if running_mfu == -1.0 else 0.9*running_mfu + 0.1*mfu
        print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms, mfu {running_mfu*100:.2f}%")
    iter_num += 1
    local_iter_num += 1

    # termination conditions
    if iter_num > max_iters:
        break

if ddp:
    destroy_process_group()


tokens per iteration will be: 16,384
Initializing a new model from scratch
defaulting to vocab_size of GPT-2 to 50304 (50257 rounded up for efficiency)
number of parameters: 32.60M


RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

#Generating Result Samples

In [None]:
# -----------------------------------------------------------------------------
init_from = 'resume' # either 'resume' (from an out_dir) or a gpt2 variant (e.g. 'gpt2-xl')
start = "\n" # or "<|endoftext|>" or etc. Can also specify a file, use as: "FILE:prompt.txt"
num_samples = 10 # number of samples to draw
max_new_tokens = 500 # number of tokens generated in each sample
temperature = 0.8 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
top_k = 200 # retain only the top_k most likely tokens, clamp others to have 0 probability
seed = 1337
# -----------------------------------------------------------------------------

torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)

# model
if init_from == 'resume':
    # init from a model saved in a specific directory
    ckpt_path = os.path.join(out_dir, 'ckpt.pt')
    checkpoint = torch.load(ckpt_path, map_location=device)
    gptconf = GPTConfig(**checkpoint['model_args'])
    model = GPT(gptconf)
    state_dict = checkpoint['model']
    unwanted_prefix = '_orig_mod.'
    for k,v in list(state_dict.items()):
        if k.startswith(unwanted_prefix):
            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
    model.load_state_dict(state_dict)
elif init_from.startswith('gpt2'):
    # init from a given GPT-2 model
    model = GPT.from_pretrained(init_from, dict(dropout=0.0))

model.eval()
model.to(device)
if compile:
    model = torch.compile(model) # requires PyTorch 2.0 (optional)

# look for the meta pickle in case it is available in the dataset folder
load_meta = False
if init_from == 'resume' and 'config' in checkpoint and 'dataset' in checkpoint['config']: # older checkpoints might not have these...
    meta_path = os.path.join(cwd, 'meta.pkl')
    load_meta = os.path.exists(meta_path)
if load_meta:
    print(f"Loading meta from {meta_path}...")
    with open(meta_path, 'rb') as f:
        meta = pickle.load(f)
    # TODO want to make this more general to arbitrary encoder/decoder schemes
    stoi, itos = meta['stoi'], meta['itos']
    encode = lambda s: [stoi[c] for c in s]
    decode = lambda l: ''.join([itos[i] for i in l])
else:
    # ok let's assume gpt-2 encodings by default
    print("No meta.pkl found, assuming GPT-2 encodings...")
    enc = tiktoken.get_encoding("gpt2")
    encode = lambda s: enc.encode(s, allowed_special={"<|endoftext|>"})
    decode = lambda l: enc.decode(l)

# encode the beginning of the prompt
if start.startswith('FILE:'):
    with open(start[5:], 'r', encoding='utf-8') as f:
        start = f.read()
start_ids = encode(start)
x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])

# run generation
with torch.no_grad():
    with ctx:
        for k in range(num_samples):
            y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
            print(decode(y[0].tolist()))
            print('---------------')

number of parameters: 10.66M
Loading meta from /content/drive/MyDrive/ECS 189G/meta.pkl...

What is the matter him going ?
It is a big problem with my battroom .
How unexpecting ?
It ’ s the matter of the lottery .
That ’ s a good idea . Can you come in ?
I'd like to come in at the park at 2:00 .
Is there a battery account ?
I want a car in the bad location .
You can call the table for lunch ?
Yes , that would be fine .
Good luck . Can I help you ?
Yes , that would be great ! Thanks . Taxi .
And no problem .
That's great .
I'm from China and I need a Promise .
You don't mind saying tha
---------------

Really ? Really ? I must have been working for it .
I appreciate that .
That was a private cover .
John , do you have any other covers ?
Yes . For example , a single cover can give me a call from the next step , and I would appreciate you to make my country .
Ok , I ’ ll teach you on . When do you think you ’ re awful now ?
Well , I ’ ll change it .
How long will it take you to the compu

# Load AdvGLUE Dataset

In [None]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
import json

# ----------------------------------------------------------------------------
class AdvGLUEDataset(Dataset):
    """
    Loads adversarial SST-2, QQP, and MNLI examples from a JSON of the form:
      {
        "SST2": [...],
        "QQP":  [...],
        "MNLI": [...]
      }
    Each example dict has fields:
      - SST2: 'sentence', 'label'
      - QQP:  'question1', 'question2', 'label'
      - MNLI: 'premise', 'hypothesis', 'label'
    We convert each to a single text string and a numeric label.
    """
    def __init__(self, json_path, stoi, max_length=128):
        super().__init__()
        with open(json_path, 'r') as f:
            data = json.load(f)
        self.samples = []
        for task, examples in data.items():
            for ex in examples:
                if task.lower() == 'sst2':
                    text = ex['sentence']
                    label = ex['label']
                elif task.lower() == 'qqp':
                    text = ex['question1'] + " [SEP] " + ex['question2']
                    label = ex['label']
                elif task.lower() == 'mnli':
                    text = ex['premise'] + " [SEP] " + ex['hypothesis']
                    label = ex['label']  # 0,1,2
                else:
                    continue
                self.samples.append({'text': text, 'label': label})

        self.stoi = stoi
        self.max_length = max_length

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        ex = self.samples[idx]
        text = ex['text']
        label = ex['label']
        # Convert to int IDs via the character‐level stoi (pad/truncate to max_length)
        ids = [self.stoi.get(ch, self.stoi.get(' ', 0)) for ch in text][:self.max_length]
        if len(ids) < self.max_length:
            ids += [0] * (self.max_length - len(ids))
        return torch.tensor(ids, dtype=torch.long), torch.tensor(label, dtype=torch.long)

# ----------------------------------------------------------------------------
# If cwd = "/content/drive/MyDrive/YEAR 4/ECS 189G/"
# and your JSON is at "/content/drive/MyDrive/YEAR 4/ECS 189G/dev_ann.json", do:

json_path = os.path.join(cwd, 'dev_ann.json')
# e.g. "/content/drive/MyDrive/YEAR 4/ECS 189G/dev_ann.json"

adv_dataset = AdvGLUEDataset(json_path, stoi, max_length=128)
adv_loader = DataLoader(adv_dataset, batch_size=32, shuffle=False, num_workers=2)


# Shared Training Function

In [None]:
# Cell A: Shared training function to produce a checkpoint with optional LoRA/DAR/PAAF

import os
import time
import math
import torch
import torch.nn.functional as F
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group

def train_and_save_variant(
    out_dir: str,
    ckpt_filename: str,
    use_lora: bool = False,
    use_paaf: bool = False,
    use_dar: bool = False,
    dar_weight: float = 0.01,
    lora_rank: int = 8,
    lora_alpha: float = 16.0,
    max_iters: int = 5000
):
    """
    Trains NanoGPT from scratch (or from `init_from='scratch'`) with the given defenses,
    and saves the final checkpoint to `os.path.join(out_dir, ckpt_filename)`.

    - use_lora: wraps relevant Linear layers in LoRA adapters before training.
    - use_paaf: assumes you have already applied the PAAF monkey-patch to CausalSelfAttention.forward earlier.
    - use_dar: adds DAR penalty to the cross‐entropy loss.
    - dar_weight: weight on DAR penalty.
    - lora_rank / lora_alpha: LoRA hyperparameters.
    - max_iters: number of training iterations.
    """
    # 1) Create the model from scratch
    model_args = dict(
        n_layer=n_layer,
        n_head=n_head,
        n_embd=n_embd,
        block_size=block_size,
        bias=bias,
        vocab_size=meta_vocab_size if meta_vocab_size is not None else 50304,
        dropout=dropout
    )
    gptconf = GPTConfig(**model_args)
    model = GPT(gptconf)
    model.to(device)

    # 2) LoRA injection if requested
    if use_lora:
        print(f"--> Injecting LoRA adapters (rank={lora_rank}, alpha={lora_alpha})")
        model = inject_lora(model, rank=lora_rank, alpha=lora_alpha)
        model.to(device)   # <--- ensure LoRA parameters are also on GPU


    # 3) If DDP, wrap in DDP (we assume single‐GPU here; adjust if multi‐GPU)
    #    (Remove or comment-out if not using DDP)
    ddp = False
    if ddp:
        init_process_group(backend=backend)
        model = DDP(model, device_ids=[torch.cuda.current_device()])
        master_process = (int(os.environ.get('RANK', -1)) == 0)
    else:
        master_process = True

    # 4) Optimizer
    optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type)

    # 5) GradScaler for fp16 (already defined as `scaler`)
    scaler_local = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))

    iter_num = 0
    best_val_loss = 1e9
    t0 = time.time()

    # 6) Training loop
    while True:
        # learning rate schedule
        lr = get_lr(iter_num) if decay_lr else learning_rate
        for pg in optimizer.param_groups:
            pg['lr'] = lr

        # Evaluate periodically
        if iter_num % eval_interval == 0 and master_process:
            losses = estimate_loss()
            print(f"[{ckpt_filename}] iter {iter_num}: train {losses['train']:.4f}, val {losses['val']:.4f}")
            if losses['val'] < best_val_loss:
                best_val_loss = losses['val']
                if iter_num > 0:
                    # Save intermediate checkpoint
                    ckpt_path = os.path.join(out_dir, ckpt_filename)
                    print(f"[{ckpt_filename}] saving checkpoint at iter {iter_num} → {ckpt_path}")
                    torch.save({
                        'model': (model.module if ddp else model).state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'model_args': model_args,
                        'iter_num': iter_num,
                        'best_val_loss': best_val_loss,
                    }, ckpt_path)

        if iter_num >= max_iters:
            break

        # 7) Forward/Backward
        X, Y = get_batch('train')
        for micro_step in range(gradient_accumulation_steps):
            if ddp:
                model.require_backward_grad_sync = (micro_step == gradient_accumulation_steps - 1)
            with ctx:
                logits, _ = model(X, Y)
                # compute loss: CE + optional DAR penalty
                if use_dar:
                    # DAR.attach_loss expects logits over entire vocab, so we need to re-forward if needed.
                    # But here `model(X, Y)` already gives us logits over each position for LM loss.
                    # Instead, do classification‐style DAR only on tokens: we keep it simple and
                    # apply DAR penalty on LoRA A/B if present.
                    ce = F.cross_entropy(logits.view(-1, logits.size(-1)), Y.view(-1))
                    dar_pen = DAR.penalty(model)
                    loss = ce + dar_weight * dar_pen
                else:
                    # standard LM cross-entropy
                    loss = F.cross_entropy(logits.view(-1, logits.size(-1)), Y.view(-1), ignore_index=-1)
                loss = loss / gradient_accumulation_steps
            X, Y = get_batch('train')
            scaler_local.scale(loss).backward()

        # Gradient clipping
        if grad_clip != 0.0:
            scaler_local.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_((model.module if ddp else model).parameters(), grad_clip)
        scaler_local.step(optimizer)
        scaler_local.update()
        optimizer.zero_grad(set_to_none=True)

        # Logging
        t1 = time.time()
        dt = t1 - t0
        t0 = t1
        if iter_num % log_interval == 0 and master_process:
            lossf = loss.item() * gradient_accumulation_steps
            print(f"[{ckpt_filename}] iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.1f}ms")
        iter_num += 1

    # Final save (in case best checkpoint was never updated)
    if master_process:
        ckpt_path = os.path.join(out_dir, ckpt_filename)
        print(f"[{ckpt_filename}] final save at iter {iter_num} → {ckpt_path}")
        torch.save({
            'model': (model.module if ddp else model).state_dict(),
            'optimizer': optimizer.state_dict(),
            'model_args': model_args,
            'iter_num': iter_num,
            'best_val_loss': best_val_loss,
        }, ckpt_path)

    if ddp:
        destroy_process_group()
    print(f"[{ckpt_filename}] Training complete.")


# NanoGPT (base)

In [None]:
# Cell B: NanoGPT (base) – no LoRA, no PAAF, no DAR

train_and_save_variant(
    out_dir    = out_dir,
    ckpt_filename = 'ckpt_base.pt',
    use_lora   = False,
    use_paaf   = False,
    use_dar    = False,
    max_iters  = max_iters  # e.g. 5000
)

number of parameters: 10.66M
num decayed parameter tensors: 26, with 10,753,536 parameters
num non-decayed parameter tensors: 13, with 4,992 parameters
using fused AdamW: True


  scaler_local = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))


[ckpt_base.pt] iter 0: train 0.8653, val 0.9370
[ckpt_base.pt] iter 0: loss 4.6271, time 12753.8ms
[ckpt_base.pt] iter 10: loss 3.1541, time 96.4ms
[ckpt_base.pt] iter 20: loss 2.7049, time 97.7ms
[ckpt_base.pt] iter 30: loss 2.5169, time 96.9ms
[ckpt_base.pt] iter 40: loss 2.4297, time 98.5ms
[ckpt_base.pt] iter 50: loss 2.4171, time 98.2ms
[ckpt_base.pt] iter 60: loss 2.3862, time 98.3ms
[ckpt_base.pt] iter 70: loss 2.3493, time 96.5ms
[ckpt_base.pt] iter 80: loss 2.3448, time 96.8ms
[ckpt_base.pt] iter 90: loss 2.3766, time 96.4ms
[ckpt_base.pt] iter 100: loss 2.3248, time 100.0ms
[ckpt_base.pt] iter 110: loss 2.3424, time 97.5ms
[ckpt_base.pt] iter 120: loss 2.3378, time 96.5ms
[ckpt_base.pt] iter 130: loss 2.3284, time 100.2ms
[ckpt_base.pt] iter 140: loss 2.2833, time 97.2ms
[ckpt_base.pt] iter 150: loss 2.2480, time 98.6ms
[ckpt_base.pt] iter 160: loss 2.2281, time 99.8ms
[ckpt_base.pt] iter 170: loss 2.1584, time 99.4ms
[ckpt_base.pt] iter 180: loss 2.0939, time 98.3ms
[ckpt_ba

# NanoGPT + LoRA

In [None]:
# Cell C: NanoGPT + LoRA (no DAR, no PAAF)

train_and_save_variant(
    out_dir       = out_dir,
    ckpt_filename = 'ckpt_lora.pt',
    use_lora      = True,
    use_paaf      = False,
    use_dar       = False,
    dar_weight    = 0.0,    # no DAR penalty
    max_iters     = max_iters
)

number of parameters: 10.66M
--> Injecting LoRA adapters (rank=8, alpha=16.0)
num decayed parameter tensors: 74, with 11,048,448 parameters
num non-decayed parameter tensors: 13, with 4,992 parameters
using fused AdamW: True


  scaler_local = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))


[ckpt_lora.pt] iter 0: train 0.8660, val 0.9363
[ckpt_lora.pt] iter 0: loss 4.6592, time 12400.3ms
[ckpt_lora.pt] iter 10: loss 3.1473, time 142.6ms
[ckpt_lora.pt] iter 20: loss 2.6685, time 144.5ms
[ckpt_lora.pt] iter 30: loss 2.4949, time 143.5ms
[ckpt_lora.pt] iter 40: loss 2.4083, time 143.1ms
[ckpt_lora.pt] iter 50: loss 2.4210, time 143.7ms
[ckpt_lora.pt] iter 60: loss 2.3888, time 142.5ms
[ckpt_lora.pt] iter 70: loss 2.3575, time 144.4ms
[ckpt_lora.pt] iter 80: loss 2.3733, time 142.5ms
[ckpt_lora.pt] iter 90: loss 2.3552, time 143.9ms
[ckpt_lora.pt] iter 100: loss 2.3494, time 143.3ms
[ckpt_lora.pt] iter 110: loss 2.3219, time 143.6ms
[ckpt_lora.pt] iter 120: loss 2.3035, time 143.8ms
[ckpt_lora.pt] iter 130: loss 2.3168, time 144.5ms
[ckpt_lora.pt] iter 140: loss 2.2621, time 144.3ms
[ckpt_lora.pt] iter 150: loss 2.2166, time 140.5ms
[ckpt_lora.pt] iter 160: loss 2.1042, time 141.9ms
[ckpt_lora.pt] iter 170: loss 2.0190, time 142.9ms
[ckpt_lora.pt] iter 180: loss 2.0220, time 

# NanoGPT + DAR

In [None]:
# Cell F: NanoGPT + DAR (no LoRA, no PAAF)

train_and_save_variant(
    out_dir       = out_dir,
    ckpt_filename = 'ckpt_dar.pt',
    use_lora      = False,
    use_paaf      = False,
    use_dar       = True,
    dar_weight    = 0.01,    # you can change this penalty if desired
    max_iters     = max_iters
)

number of parameters: 10.66M
num decayed parameter tensors: 26, with 10,753,536 parameters
num non-decayed parameter tensors: 13, with 4,992 parameters
using fused AdamW: True


  scaler_local = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))


[ckpt_dar.pt] iter 0: train 0.8672, val 0.9334
[ckpt_dar.pt] iter 0: loss 4.6268, time 12573.0ms
[ckpt_dar.pt] iter 10: loss 3.2125, time 97.1ms
[ckpt_dar.pt] iter 20: loss 2.7086, time 99.5ms
[ckpt_dar.pt] iter 30: loss 2.4963, time 96.8ms
[ckpt_dar.pt] iter 40: loss 2.4014, time 97.3ms
[ckpt_dar.pt] iter 50: loss 2.4060, time 99.3ms
[ckpt_dar.pt] iter 60: loss 2.3763, time 96.2ms
[ckpt_dar.pt] iter 70: loss 2.3420, time 99.8ms
[ckpt_dar.pt] iter 80: loss 2.3444, time 99.6ms
[ckpt_dar.pt] iter 90: loss 2.3541, time 98.0ms
[ckpt_dar.pt] iter 100: loss 2.3433, time 96.1ms
[ckpt_dar.pt] iter 110: loss 2.3411, time 99.7ms
[ckpt_dar.pt] iter 120: loss 2.3224, time 100.0ms
[ckpt_dar.pt] iter 130: loss 2.2983, time 101.5ms
[ckpt_dar.pt] iter 140: loss 2.2860, time 99.4ms
[ckpt_dar.pt] iter 150: loss 2.2526, time 99.6ms
[ckpt_dar.pt] iter 160: loss 2.1917, time 97.5ms
[ckpt_dar.pt] iter 170: loss 2.1624, time 97.4ms
[ckpt_dar.pt] iter 180: loss 2.0892, time 100.6ms
[ckpt_dar.pt] iter 190: los

# NanoGPT + DAR + LoRA

In [None]:
# Cell G: NanoGPT + LoRA + DAR (no PAAF)

train_and_save_variant(
    out_dir       = out_dir,
    ckpt_filename = 'ckpt_lora_dar.pt',
    use_lora      = True,
    use_paaf      = False,
    use_dar       = True,
    dar_weight    = 0.01,
    max_iters     = max_iters
)

number of parameters: 10.66M
--> Injecting LoRA adapters (rank=8, alpha=16.0)
num decayed parameter tensors: 74, with 11,048,448 parameters
num non-decayed parameter tensors: 13, with 4,992 parameters
using fused AdamW: True


  scaler_local = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))


[ckpt_lora_dar.pt] iter 0: train 0.8636, val 0.9353
[ckpt_lora_dar.pt] iter 0: loss 5.1199, time 12409.9ms
[ckpt_lora_dar.pt] iter 10: loss 3.6971, time 143.3ms
[ckpt_lora_dar.pt] iter 20: loss 3.2122, time 144.8ms
[ckpt_lora_dar.pt] iter 30: loss 3.0199, time 144.2ms
[ckpt_lora_dar.pt] iter 40: loss 2.9235, time 143.5ms
[ckpt_lora_dar.pt] iter 50: loss 2.8649, time 144.5ms
[ckpt_lora_dar.pt] iter 60: loss 2.8545, time 143.0ms
[ckpt_lora_dar.pt] iter 70: loss 2.7886, time 142.9ms
[ckpt_lora_dar.pt] iter 80: loss 2.7784, time 142.8ms
[ckpt_lora_dar.pt] iter 90: loss 2.7112, time 144.0ms
[ckpt_lora_dar.pt] iter 100: loss 2.6712, time 142.7ms
[ckpt_lora_dar.pt] iter 110: loss 2.6342, time 145.0ms
[ckpt_lora_dar.pt] iter 120: loss 2.6083, time 143.9ms
[ckpt_lora_dar.pt] iter 130: loss 2.5563, time 144.2ms
[ckpt_lora_dar.pt] iter 140: loss 2.5078, time 144.5ms
[ckpt_lora_dar.pt] iter 150: loss 2.4360, time 144.5ms
[ckpt_lora_dar.pt] iter 160: loss 2.3443, time 145.6ms
[ckpt_lora_dar.pt] ite

# NanoGPT + PAAF

In [None]:
# Cell D: NanoGPT + PAAF (no LoRA, no DAR)
# Note: we assume Cell 3’s monkey-patch (paaf_forward) is already active in scope.

train_and_save_variant(
    out_dir       = out_dir,
    ckpt_filename = 'ckpt_paaf.pt',
    use_lora      = False,
    use_paaf      = True,
    use_dar       = False,
    dar_weight    = 0.0,
    max_iters     = max_iters
)

number of parameters: 10.66M
num decayed parameter tensors: 26, with 10,753,536 parameters
num non-decayed parameter tensors: 13, with 4,992 parameters
using fused AdamW: True


  scaler_local = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))


[ckpt_paaf.pt] iter 0: train 0.8636, val 0.9352
[ckpt_paaf.pt] iter 0: loss 4.6314, time 12325.8ms
[ckpt_paaf.pt] iter 10: loss 3.1884, time 99.8ms
[ckpt_paaf.pt] iter 20: loss 2.7122, time 98.2ms
[ckpt_paaf.pt] iter 30: loss 2.5010, time 98.1ms
[ckpt_paaf.pt] iter 40: loss 2.4286, time 97.7ms
[ckpt_paaf.pt] iter 50: loss 2.3981, time 100.6ms
[ckpt_paaf.pt] iter 60: loss 2.3952, time 98.3ms
[ckpt_paaf.pt] iter 70: loss 2.4069, time 100.5ms
[ckpt_paaf.pt] iter 80: loss 2.3557, time 98.7ms
[ckpt_paaf.pt] iter 90: loss 2.3608, time 100.2ms
[ckpt_paaf.pt] iter 100: loss 2.3520, time 99.5ms
[ckpt_paaf.pt] iter 110: loss 2.3225, time 98.1ms
[ckpt_paaf.pt] iter 120: loss 2.3336, time 100.4ms
[ckpt_paaf.pt] iter 130: loss 2.3058, time 100.9ms
[ckpt_paaf.pt] iter 140: loss 2.2767, time 100.3ms
[ckpt_paaf.pt] iter 150: loss 2.2573, time 100.6ms
[ckpt_paaf.pt] iter 160: loss 2.1937, time 100.9ms
[ckpt_paaf.pt] iter 170: loss 2.1657, time 100.8ms
[ckpt_paaf.pt] iter 180: loss 2.0802, time 100.8ms


# NanoGPT + PAAF + LoRA

In [None]:
# Cell E: NanoGPT + LoRA + PAAF (still no DAR)

train_and_save_variant(
    out_dir       = out_dir,
    ckpt_filename = 'ckpt_lora_paaf.pt',
    use_lora      = True,
    use_paaf      = True,
    use_dar       = False,
    dar_weight    = 0.0,
    max_iters     = max_iters
)

number of parameters: 10.66M
--> Injecting LoRA adapters (rank=8, alpha=16.0)
num decayed parameter tensors: 74, with 11,048,448 parameters
num non-decayed parameter tensors: 13, with 4,992 parameters
using fused AdamW: True


  scaler_local = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))


[ckpt_lora_paaf.pt] iter 0: train 0.8641, val 0.9347
[ckpt_lora_paaf.pt] iter 0: loss 4.6493, time 12628.9ms
[ckpt_lora_paaf.pt] iter 10: loss 3.1779, time 148.2ms
[ckpt_lora_paaf.pt] iter 20: loss 2.7101, time 147.6ms
[ckpt_lora_paaf.pt] iter 30: loss 2.5019, time 147.6ms
[ckpt_lora_paaf.pt] iter 40: loss 2.4176, time 151.0ms
[ckpt_lora_paaf.pt] iter 50: loss 2.3965, time 148.8ms
[ckpt_lora_paaf.pt] iter 60: loss 2.3636, time 150.1ms
[ckpt_lora_paaf.pt] iter 70: loss 2.3707, time 149.3ms
[ckpt_lora_paaf.pt] iter 80: loss 2.3544, time 151.6ms
[ckpt_lora_paaf.pt] iter 90: loss 2.3362, time 151.8ms
[ckpt_lora_paaf.pt] iter 100: loss 2.3295, time 151.2ms
[ckpt_lora_paaf.pt] iter 110: loss 2.3535, time 150.8ms
[ckpt_lora_paaf.pt] iter 120: loss 2.3330, time 149.9ms
[ckpt_lora_paaf.pt] iter 130: loss 2.2896, time 149.7ms
[ckpt_lora_paaf.pt] iter 140: loss 2.2801, time 150.7ms
[ckpt_lora_paaf.pt] iter 150: loss 2.2358, time 150.5ms
[ckpt_lora_paaf.pt] iter 160: loss 2.1331, time 151.1ms
[ckp

# ARTM (Adversarial-Repair Thinking Module)

In [10]:
# ARTM_FFN: a small two-layer MLP to compute Δh from concatenated features
class ARTM_FFN(nn.Module):
    """
    The ARTM feed‐forward network.
    Inputs a feature vector of size feat_dim, outputs a repair vector of size hidden_dim (n_embd).
    """
    def __init__(self, hidden_dim: int, feat_dim: int):
        super().__init__()
        # First linear layer: feat_dim → hidden_dim
        self.fc1 = nn.Linear(feat_dim, hidden_dim)
        self.act = nn.GELU()
        # Second linear layer: hidden_dim → hidden_dim
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, feat: torch.Tensor) -> torch.Tensor:
        """
        feat: (B, T, feat_dim)  or (B, feat_dim) if single position
        Returns: (B, T, hidden_dim) or (B, hidden_dim)
        """
        x = self.fc1(feat)
        x = self.act(x)
        x = self.fc2(x)
        return x  # this is Δh

In [11]:
# 1) Construct ARTM‐augmented GPT
gptconf = GPTConfig(
    block_size=block_size,
    vocab_size=meta_vocab_size,meta
    n_layer=n_layer,
    n_head=n_head,
    n_embd=n_embd,
    dropout=dropout,
    bias=bias,
)
model = GPT(gptconf)  # uses TransformerBlockWithARTM internally

# 2) (Optional) Inject LoRA
model = inject_lora(model, rank=8, alpha=16)
model.to(device)

# 3) Set up optimizer, scaler, etc., then run your training loop:
optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type)
scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))

for iter_num in range(max_iters):
    # compute x, y batches with get_batch()
    logits, loss_ce = model(X, Y)  # this now includes ARTM in each block
    if use_dar:
        loss = loss_ce + dar_weight * DAR.penalty(model)
    else:
        loss = loss_ce
    scaler.scale(loss).backward()
    ...


NameError: name 'meta_vocab_size' is not defined

# Helper to Load Checkpoints as Classification Models



In [None]:
# Cell X: Utility to load a GPTForClassification with optional LoRA/PAAF

def load_checkpoint_as_classifier(
    checkpoint_path: str,
    num_labels: int,
    use_lora: bool = False,
    use_paaf: bool = False
):
    """
    1) Loads a GPTForClassification from `checkpoint_path`.
    2) If use_lora=True, injects LoRA adapters at inference.
    3) If use_paaf=True, assumes PAAF monkey-patch is already applied
       to CausalSelfAttention.forward (from your earlier cell).
    """
    # a) Load the saved checkpoint
    ckpt = torch.load(checkpoint_path, map_location='cpu')
    model_args = ckpt['model_args']

    # b) Reconstruct GPTForClassification
    gptconf = GPTConfig(**model_args)
    classifier_model = GPTForClassification(gptconf, num_labels=num_labels)
    # c) Load the state dict (non-strict so we can ignore extra keys like "optimizer" if present)
    classifier_model.load_state_dict(ckpt['model'], strict=False)

    # d) Move to device and set to eval
    classifier_model.to(device)
    classifier_model.eval()

    # e) If LoRA was used during training for this checkpoint, re-inject the adapters now
    if use_lora:
        classifier_model = inject_lora(classifier_model, rank=8, alpha=16)
        classifier_model.to(device)   # <--- make sure LoRA adapters are on GPU

    # f) If PAAF was used during training, we assume you have already monkey-patched
    #    CausalSelfAttention.forward in an earlier cell (so no extra code is needed here).
    return classifier_model


# Load All Six Model Variants

In [None]:
# Cell Y: Paths to each variant’s checkpoint (adjust filenames if yours differ)
ckpt_base       = os.path.join(out_dir, 'ckpt_base.pt')
ckpt_lora       = os.path.join(out_dir, 'ckpt_lora.pt')
ckpt_paaf       = os.path.join(out_dir, 'ckpt_paaf.pt')
ckpt_lora_paaf  = os.path.join(out_dir, 'ckpt_lora_paaf.pt')
ckpt_dar        = os.path.join(out_dir, 'ckpt_dar.pt')
ckpt_lora_dar   = os.path.join(out_dir, 'ckpt_lora_dar.pt')

# Number of labels in AdvGLUE tasks: use 3 to cover SST-2 (2 classes), QQP (2), MNLI (3)
num_labels = 3

print("Loading NanoGPT (base)…")
model_base = load_checkpoint_as_classifier(
    ckpt_base,
    num_labels=num_labels,
    use_lora=False,
    use_paaf=False
)

print("Loading NanoGPT + LoRA…")
model_lora = load_checkpoint_as_classifier(
    ckpt_lora,
    num_labels=num_labels,
    use_lora=True,
    use_paaf=False
)

print("Loading NanoGPT + PAAF…")
model_paaf = load_checkpoint_as_classifier(
    ckpt_paaf,
    num_labels=num_labels,
    use_lora=False,
    use_paaf=True
)

print("Loading NanoGPT + LoRA + PAAF…")
model_lora_paaf = load_checkpoint_as_classifier(
    ckpt_lora_paaf,
    num_labels=num_labels,
    use_lora=True,
    use_paaf=True
)

print("Loading NanoGPT + DAR…")
model_dar = load_checkpoint_as_classifier(
    ckpt_dar,
    num_labels=num_labels,
    use_lora=False,
    use_paaf=False
)

print("Loading NanoGPT + LoRA + DAR…")
model_lora_dar = load_checkpoint_as_classifier(
    ckpt_lora_dar,
    num_labels=num_labels,
    use_lora=True,
    use_paaf=False
)

print("Loading NanoGPT + ARTM + LoRA")
model_artm_lora = load_checkpoint_as_classifier(
    os.path.join(out_dir, 'ckpt_artm_lora.pt'),
    num_labels=num_labels,
    use_lora=True,  # only if that checkpoint used LoRA
    use_paaf=False,  # if PAAF was also used
)


Loading NanoGPT (base)…
number of parameters: 10.66M
Loading NanoGPT + LoRA…
number of parameters: 10.66M
Loading NanoGPT + PAAF…
number of parameters: 10.66M
Loading NanoGPT + LoRA + PAAF…
number of parameters: 10.66M
Loading NanoGPT + DAR…
number of parameters: 10.66M
Loading NanoGPT + LoRA + DAR…
number of parameters: 10.66M


# Evaluate All Variants on AdvGLUE

In [None]:
# Cell Z: Define evaluation loop and run it for all six variants

@torch.no_grad()
def evaluate_advglue(model, data_loader):
    """
    Returns (correct, total, accuracy) on the provided AdvGLUE DataLoader.
    """
    model.eval()
    correct = 0
    total = 0
    for input_ids, labels in data_loader:
        input_ids = input_ids.to(device)
        labels = labels.to(device)
        logits, _ = model(input_ids, labels=None)  # (B, num_labels)
        preds = torch.argmax(logits, dim=-1)        # (B,)
        correct += (preds == labels).sum().item()
        total += labels.size(0)
    acc = correct / total if total > 0 else 0.0
    return correct, total, acc

# List of (display name, model)
variants = [
    ("NanoGPT (base)"          , model_base),
    ("NanoGPT + LoRA"          , model_lora),
    ("NanoGPT + PAAF"          , model_paaf),
    ("NanoGPT + LoRA + PAAF"   , model_lora_paaf),
    ("NanoGPT + DAR"           , model_dar),
    ("NanoGPT + LoRA + DAR"    , model_lora_dar),
]

print("\n=== Evaluating All Variants on AdvGLUE ===")
for name, m in variants:
    correct, total, acc = evaluate_advglue(m, adv_loader)
    print(f"{name:<25} → {correct:4d}/{total:4d}  = {acc*100:6.2f}%")



=== Evaluating All Variants on AdvGLUE ===
NanoGPT (base)            →  154/ 347  =  44.38%
NanoGPT + LoRA            →  156/ 347  =  44.96%
NanoGPT + PAAF            →   64/ 347  =  18.44%
NanoGPT + LoRA + PAAF     →  134/ 347  =  38.62%
NanoGPT + DAR             →  143/ 347  =  41.21%
NanoGPT + LoRA + DAR      →  148/ 347  =  42.65%
