In [11]:
!pip install tqdm

Collecting tqdm
  Using cached tqdm-4.67.0-py3-none-any.whl.metadata (57 kB)
Using cached tqdm-4.67.0-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.67.0


In [15]:
import torch

In [16]:
model = torch.load('NolEngine_Claude.pt', map_location='cpu')

  model = torch.load('NolEngine_Claude.pt', map_location='cpu')


In [17]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.nn.utils import clip_grad_norm_
from tqdm.notebook import trange
import math

# Enhanced hyperparameters
batch_size = 32  # increased for better gradient estimates
block_size = 128  # increased context length
max_iters = 20000
eval_interval = 500
learning_rate = 3e-4  # slightly lower for better stability
warmup_iters = 1000  # learning rate warmup
min_lr = 1e-5  # learning rate floor
weight_decay = 0.1  # L2 regularization
grad_clip = 1.0  # gradient clipping
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 128  # increased embedding dimension
n_head = 8    # increased number of heads
n_layer = 6   # increased number of layers
dropout = 0.1 # added dropout for regularization
vocab_size = 1025

import pickle
with open ('tokened_text', 'rb') as fp:
    tokened_text = pickle.load(fp)

# Train and test splits
data = torch.tensor(tokened_text, dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

class LayerNorm(nn.Module):
    """ LayerNorm but with an optional bias """
    def __init__(self, ndim, bias=False):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None

    def forward(self, x):
        return F.layer_norm(x, self.weight.shape, self.weight, self.bias, 1e-5)

class MultiHeadAttention(nn.Module):
    """ Multi-head self-attention with improved efficiency """
    
    def __init__(self, n_embd, n_head):
        super().__init__()
        assert n_embd % n_head == 0
        
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(n_embd, 3 * n_embd, bias=False)
        # output projection
        self.c_proj = nn.Linear(n_embd, n_embd, bias=False)
        # regularization
        self.attn_dropout = nn.Dropout(dropout)
        self.resid_dropout = nn.Dropout(dropout)
        # causal mask
        self.register_buffer("bias", torch.tril(torch.ones(block_size, block_size))
                                    .view(1, 1, block_size, block_size))
        self.n_head = n_head
        self.n_embd = n_embd

    def forward(self, x):
        B, T, C = x.shape
        
        # calculate query, key, values for all heads in batch
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)

        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.attn_dropout(att)
        y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        
        # output projection
        y = self.resid_dropout(self.c_proj(y))
        return y

class FeedForward(nn.Module):
    """ Enhanced feedforward network with GELU activation """
    
    def __init__(self, n_embd):
        super().__init__()
        self.c_fc    = nn.Linear(n_embd, 4 * n_embd, bias=False)
        self.c_proj  = nn.Linear(4 * n_embd, n_embd, bias=False)
        self.dropout = nn.Dropout(dropout)
        self.gelu    = nn.GELU()

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        x = self.dropout(x)
        return x

class Block(nn.Module):
    """ Transformer block with improved architecture """

    def __init__(self, n_embd, n_head):
        super().__init__()
        self.ln_1 = LayerNorm(n_embd, bias=False)
        self.attn = MultiHeadAttention(n_embd, n_head)
        self.ln_2 = LayerNorm(n_embd, bias=False)
        self.ffwd = FeedForward(n_embd)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.ffwd(self.ln_2(x))
        return x

class ImprovedLanguageModel(nn.Module):
    """ Enhanced language model with improved architecture and training stability """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = LayerNorm(n_embd, bias=False)
        self.lm_head = nn.Linear(n_embd, vocab_size, bias=False)

        # Initialize weights
        self.apply(self._init_weights)
        # Apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * n_layer))

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """Enhanced generation with temperature and top-k sampling"""
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :] / temperature
            
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = float('-inf')
            
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

def get_lr(it):
    # Learning rate schedule: linear warmup and cosine decay
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    decay_ratio = (it - warmup_iters) / (max_iters - warmup_iters)
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return min_lr + coeff * (learning_rate - min_lr)


In [19]:
model = ImprovedLanguageModel().to(device)  # Initialize the model architecture
model.load_state_dict(torch.load('NolEngine_Claude.pt', map_location=device))

  model.load_state_dict(torch.load('NolEngine_Claude.pt', map_location=device))


<All keys matched successfully>

In [20]:
from minbpe import BPETokenizer
tokenizer = BPETokenizer()
tokenizer.load("NolEngine.model") 

In [21]:
context = torch.tensor([[616, 598]], dtype=torch.long, device=device)
print(tokenizer.decode(model.generate(context, max_new_tokens=1000, temperature=0.7, top_k=50)[0].tolist()))

Batman’s breath...
DUCARD (CONT’D): (superned)
Oppie? Which is?
THUG: Nothing. But he went to us like
it.
Ducard TO BLACKS the black of his face - mouthes to
spot Rachel’s going down.
(CONTINUED):
CONTINUED: (2): 59.
WAYNE: What about the restaurant?
DUCARD: That’s your truth, I know what
you hearful minutes. And even if you
can trust might not understand. And you
thought the price of the
city. Serig calculates instructs
will be condirection. Robert.
ALFRED: Memory good of those calculations,
and we can’t strum asking enough
abour passential lovely.
Wayne turns to Alfred, who is switeing. Take up on
the sound. Ducard leads Al Ghul party. Even, breathing away-
Ducard and his foot flow puse.
RA’S AL GHUL: I have made it until I was warry.
Wayne takes out a dry innerner and lets it return off.

INT. CLASS ABIN, WAYNE ENTERPRISES -- CONTINUOUS

Wayne’s eyes are closer to restaurant. Turns to Alfred.
RACHEL: Your new instruct. And the whole motions of
power of scum of psyl-
RACHEL: These jo

In [None]:
#model = ImprovedLanguageModel().to(device)


print(f'Number of parameters: {sum(p.numel() for p in model.parameters())/1e6:.2f}M')

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
for iter in trange(max_iters):
    # Learning rate schedule
    lr = get_lr(iter)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)
    
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    clip_grad_norm_(model.parameters(), grad_clip)
    optimizer.step()