## Imports

In [75]:
import torch 
import torch.nn as nn
from torch.nn import functional as F
import matplotlib.pyplot as plt


## Loading Text

In [76]:
with open("input.txt" ,'r') as f:
    text = f.read()
len(text)

1115394

## Tokenization


In [77]:
vocabulary = sorted(list(set(text)))
ctoi = { c:ind for ind, c in enumerate(vocabulary) }
itoc = { ind :c for ind, c in enumerate(vocabulary)}

In [129]:
encode = lambda s: [ctoi[c] for c in s]  # noqa: E731
decode = lambda l: ''.join([itoc[i] for i in l])# noqa: E731
decode([0])

'\n'

## Hyperparams

In [106]:
BATCH_SIZE = 64
VOCAB_SIZE = len(vocabulary)
CONTEXT_LEN  = 256
EMBEDDING_DIM = 256
NUM_HEADS  =  8 
NUM_BLOCKS =  6
MAX_ITERS = 1000
EVAL_ITERS = 200
LEARNING_RATE = 3e-4
DROPOUT = .2
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [107]:
n = int(len(text)*.9)
train_split = text[:n]
test_split  = text[n:]
len(train_split)/(len(train_split) +len(test_split)) ,n

(0.8999994620734916, 1003854)

In [108]:
def get_batch(split):
    """Get a batch of data for training or testing.

    Args:
        split (str): Either "train" or "test" to specify which data split to use.

    Returns:
        tuple: A tuple (x, y) where x is input tensor of shape (BATCH_SIZE, CONTEXT_LEN) 
        and y is target tensor of shape (BATCH_SIZE, CONTEXT_LEN), both containing encoded characters.
    """
    data = train_split if split == "train" else test_split
    ix = torch.randint(len(data) - CONTEXT_LEN, (BATCH_SIZE,))
    x = torch.stack([torch.tensor(encode(data[i:i+CONTEXT_LEN])) for i in ix])
    y = torch.stack([torch.tensor(encode(data[i+1:i+CONTEXT_LEN+1])) for i in ix])
    return x.to(DEVICE), y.to(DEVICE)    

In [109]:
# Based on the code shown, here are the logical next steps for this language model implementation:

# 1. Create the neural network architecture (BigramLanguageModel class)
# 2. Initialize the model, move it to the specified device
# 3. Create the loss function and optimizer
# 4. Implement the training loop
# 5. Add evaluation/testing code
# 6. Add generation/sampling functionality to create new text

# Would you like me to provide code for any of these steps?

## The Model    

In [110]:
@torch.no_grad()
def evaluate_model():
    out = {}
    model.eval()
    for split in ["train" , "test"]:
        losses = torch.zeros(EVAL_ITERS)
        for i in range(EVAL_ITERS):
            X , Y = get_batch(split)
            _,loss = model(X,Y)
            losses[i] = loss
        out[split] = losses.mean()
    return out

In [111]:
class Head(nn.Module):
    """A self-attention head implementation.

    Args:
        HEAD_SIZE (int): The size of attention head outputs
    """
    def __init__(self, HEAD_SIZE):
        super().__init__()

        self.key = nn.Linear(EMBEDDING_DIM , HEAD_SIZE, bias=False)
        self.query = nn.Linear(EMBEDDING_DIM , HEAD_SIZE, bias=False)
        self.value = nn.Linear(EMBEDDING_DIM , HEAD_SIZE, bias=False)
        self.register_buffer("tril",torch.tril(torch.ones(( CONTEXT_LEN , CONTEXT_LEN ))))
        self.dropout = nn.Dropout(DROPOUT)



    
    def forward(self,x):

        B,T,C = x.shape 
        k = self.key(x) # (B,T,HEAD_SIZE)
        q = self.query(x) # (B,T,HEAD_SIZE)

        wei = q @ k.transpose(-2,-1) * C**-.5    # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T,:T] == 0 , float('-inf')) #  (B, T, T)
        wei = torch.softmax(wei , dim = -1)#  (B, T, T)

        wei = self.dropout(wei)

        v = self.value(x) #  (B, T, C)
        out = wei @ v  #(B, T, T) @ (B, T, C) - > (B, T, C)
        
        return out





In [112]:
class MultiHeadAttention(nn.Module):
    def __init__(self, NUM_HEAD, HEAD_SIZE , EMBEDDING_DIM = EMBEDDING_DIM):
        super().__init__()
        self.heads = nn.ModuleList([Head( HEAD_SIZE = HEAD_SIZE) for _ in range(NUM_HEAD)])
        self.proj  = nn.Linear( EMBEDDING_DIM , EMBEDDING_DIM )
        self.dropout = nn.Dropout(DROPOUT)

    def forward(self, x):
        # x (B,T,C)
        out =  torch.cat([h(x) for h in self.heads], dim = -1)
        out = self.proj(out)

        out = self.dropout(out)

        return out

In [113]:
class FeedForward(nn.Module):
    def __init__(self, EMBEDDING_DIM):
        super().__init__()

        self.net = nn.Sequential(
            nn.Linear( EMBEDDING_DIM, 4*EMBEDDING_DIM ),
            nn.ReLU() ,
            nn.Linear( 4*EMBEDDING_DIM, EMBEDDING_DIM ),
            nn.Dropout(DROPOUT)
        )
    
    def forward(self , x):
        return self.net(x)

In [114]:
class Block(nn.Module):
    def __init__(self, EMBEDDING_DIM, NUM_HEAD):
        super().__init__()

        self.sa_heads = MultiHeadAttention( NUM_HEAD , EMBEDDING_DIM//NUM_HEAD )
        self.ffwd = FeedForward( EMBEDDING_DIM )
        self.ln1  = nn.LayerNorm(EMBEDDING_DIM)
        self.ln2  = nn.LayerNorm(EMBEDDING_DIM)

    
    def forward(self , x):
        
        x = x + self.sa_heads(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))

        return x

In [115]:
class NanoGPT(nn.Module):
    def __init__(self, VOCAB_SIZE, CONTEXT_LEN, EMBEDDING_DIM):
        super().__init__()

        self.token_embedding = nn.Embedding( num_embeddings= VOCAB_SIZE , embedding_dim= EMBEDDING_DIM ) 
        self.positional_embedding = nn.Embedding( num_embeddings= CONTEXT_LEN , embedding_dim= EMBEDDING_DIM )
        self.blocks = nn.Sequential( *[  Block( EMBEDDING_DIM , NUM_HEADS ) for _ in range(NUM_BLOCKS)]        )
        self.ln = nn.LayerNorm( EMBEDDING_DIM )
        self.lm_head = nn.Linear( EMBEDDING_DIM , VOCAB_SIZE )


    def forward(self, idx, targets = None):
        
        #idx = (B,T)
        B , T = idx.shape
        tok_emb = self.token_embedding(idx) # (B,T,C)  where C = EMBEDDING_DIM
        pos_emb = self.positional_embedding(torch.arange(T, device=DEVICE)) # (T,C) where C = EMBEDDING_DIM

        x = tok_emb+pos_emb # (B, T, C)
        x = self.blocks(x)
        x= self.ln(x)
        # logits  = x
        logits = self.lm_head(x)  # (B,T,vocab_size)

        if targets == None:
            loss = None
        else:
            B,T,C = logits.shape
            logits = logits.view(B*T, C)
            targets= targets.view(B*T)
            loss = F.cross_entropy(logits , targets)
        
        return logits, loss

    def generate(self, start_tok, max_new_tokens):
        idx = start_tok.view(1, -1)  # make it (1, 1)
        
        for _ in range(max_new_tokens):
            idx_cond = idx if idx.size(1) <= CONTEXT_LEN else idx[:, -CONTEXT_LEN:]
            logits, _ = self(idx_cond)
            # logits (1, T, C)
            logits = logits[:, -1, :]  # focus on last time step (1, C)
            probs = F.softmax(logits, dim=-1)  # (1, C)
            idx_next = torch.multinomial(probs, num_samples=1)  # (1, 1)
            idx = torch.cat((idx, idx_next), dim=1)  # (1, T+1)
        
        return decode(idx[0].tolist())  # decode the indices to text        

## Init Model

In [116]:
model = NanoGPT(VOCAB_SIZE , CONTEXT_LEN, EMBEDDING_DIM)
model.to(DEVICE)
ix = torch.randint(0, VOCAB_SIZE, (BATCH_SIZE, CONTEXT_LEN)).to(DEVICE)
model(ix, ix)[0].shape

torch.Size([16384, 65])

In [117]:
model.generate(torch.randint(0,1,(1,),device= DEVICE), 100)

"\n'FFQT.s,QGTBlytEA.tE;CDxJzilMBh isXJNVoSmai!'m?QWENGP$Rqv$!mB$XzvGO;cu\nMr,.a$daXHYIVsw,KCGQGucmTAK3B"

## Training Loop

In [118]:


optimizer = torch.optim.AdamW(model.parameters(), lr = LEARNING_RATE)

In [119]:
def train_model(max_iters):
    for iter in range(max_iters):
        if iter%100 == 0:
            out = evaluate_model()
            print(f"{iter}  train loss : {out['train']:.4f} test loss : {out['test']:.4f}")
        
        xb,yb = get_batch('train')
        logits, loss  = model(xb,yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [120]:
for i in range(5):
    train_model(1000)

0  train loss : 4.2720 test loss : 4.2677
100  train loss : 2.5025 test loss : 2.5106
200  train loss : 2.4484 test loss : 2.4731
300  train loss : 2.3806 test loss : 2.4054
400  train loss : 2.2733 test loss : 2.3061
500  train loss : 2.1511 test loss : 2.1973
600  train loss : 2.0411 test loss : 2.1051
700  train loss : 1.9425 test loss : 2.0375
800  train loss : 1.8553 test loss : 1.9706
900  train loss : 1.7931 test loss : 1.9325
0  train loss : 1.7304 test loss : 1.8752
100  train loss : 1.6840 test loss : 1.8429
200  train loss : 1.6338 test loss : 1.8048
300  train loss : 1.5955 test loss : 1.7727
400  train loss : 1.5635 test loss : 1.7505
500  train loss : 1.5319 test loss : 1.7222
600  train loss : 1.5088 test loss : 1.7062
700  train loss : 1.4831 test loss : 1.6939
800  train loss : 1.4573 test loss : 1.6759
900  train loss : 1.4433 test loss : 1.6625
0  train loss : 1.4209 test loss : 1.6505
100  train loss : 1.4057 test loss : 1.6344
200  train loss : 1.3866 test loss : 1

## Save Model   

In [133]:

# Save the model's state dictionary along with other relevant information
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'vocabulary': vocabulary,
    'hyperparameters': {
        'BATCH_SIZE': BATCH_SIZE,
        'VOCAB_SIZE': VOCAB_SIZE,
        'CONTEXT_LEN': CONTEXT_LEN,
        'EMBEDDING_DIM': EMBEDDING_DIM,
        'NUM_HEADS': NUM_HEADS,
        'NUM_BLOCKS': NUM_BLOCKS,
        'MAX_ITERS': MAX_ITERS,
        'EVAL_ITERS': EVAL_ITERS,
        'LEARNING_RATE': LEARNING_RATE,
        'DROPOUT': DROPOUT
    }
}, 'thy_sonnet.pt')

60

In [131]:
# model.generate(torch.randint(0,1,(1,),device= DEVICE), 100)
context = torch.zeros((1, 1), dtype=torch.long, device=DEVICE)
open('more.txt', 'w').write(model.generate(context, max_new_tokens=10000))
print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')

4.833345 M parameters


In [None]:
# 900  train loss : 2.9281 test loss : 2.9185 -=- without head  
# 900  train loss : 2.8533 test loss : 2.8725 -=- with single  head             out  =   '\noiges bse n difs.mt ininysow  .\nBHfbca.gthesTlhtf !oz diut .ACpe.ou TranAd,\nFd:\nB:\n\nTcun Ay\nJpfhhire'
# 900  train loss : 2.8502 test loss : 2.8567 -=- with multi head //4           out  =   "\nilIA:G,\nT\nMe htiy;W aremrrre g\nRGq\nX\nOhe'o y v goioht\nS\n\nWwe\nCFI sts.,ld\n'EAt m,b preelanm sere al: "
# 900  train loss : 2.8857 test loss : 2.8883 -=- with ffed                     out =   "\nU\nh'nli.t baven:: b noosLhat,o rLhuadeiOIlms eS\nun yekaroumrer ce de sKi o dyt pK thoo De s vl c os "
# 900  train loss : 2.6845 test loss : 2.6914 -=- with 4 blocks and layer norm  out =   '\nCater gve uw ankehert\naroves.\nGve mthuntuy thi u foceune stw mhfR\nU,en,s we atosthand,\n purewinuet, '
# 900  train loss : 1.1155 test loss : 1.5832 -=- adjusted hyperparams          out =   "\n\nCORIOLANUS:\nCome, sir, I'll give him again,\nTheir hearts doth deserve his night;\nAnd in the strengt"
 # starting from 34 secs to 52 min 


In [None]:
# 0  train loss : 4.2720 test loss : 4.2677
# 100  train loss : 2.5025 test loss : 2.5106
# 200  train loss : 2.4484 test loss : 2.4731
# 300  train loss : 2.3806 test loss : 2.4054
# 400  train loss : 2.2733 test loss : 2.3061
# 500  train loss : 2.1511 test loss : 2.1973
# 600  train loss : 2.0411 test loss : 2.1051
# 700  train loss : 1.9425 test loss : 2.0375
# 800  train loss : 1.8553 test loss : 1.9706
# 900  train loss : 1.7931 test loss : 1.9325
# 0  train loss : 1.7304 test loss : 1.8752
# 100  train loss : 1.6840 test loss : 1.8429
# 200  train loss : 1.6338 test loss : 1.8048
# 300  train loss : 1.5955 test loss : 1.7727
# 400  train loss : 1.5635 test loss : 1.7505
# 500  train loss : 1.5319 test loss : 1.7222
# 600  train loss : 1.5088 test loss : 1.7062
# 700  train loss : 1.4831 test loss : 1.6939
# 800  train loss : 1.4573 test loss : 1.6759
# 900  train loss : 1.4433 test loss : 1.6625
# 0  train loss : 1.4209 test loss : 1.6505
# 100  train loss : 1.4057 test loss : 1.6344
# 200  train loss : 1.3866 test loss : 1.6113
# 300  train loss : 1.3729 test loss : 1.6153
# 400  train loss : 1.3590 test loss : 1.5936
# 500  train loss : 1.3419 test loss : 1.5863
# 600  train loss : 1.3353 test loss : 1.5880
# 700  train loss : 1.3222 test loss : 1.5764
# 800  train loss : 1.3085 test loss : 1.5707
# 900  train loss : 1.2990 test loss : 1.5776
# 0  train loss : 1.2894 test loss : 1.5739
# 100  train loss : 1.2752 test loss : 1.5591
# 200  train loss : 1.2712 test loss : 1.5756
# 300  train loss : 1.2638 test loss : 1.5676
# 400  train loss : 1.2486 test loss : 1.5537
# 500  train loss : 1.2443 test loss : 1.5529
# 600  train loss : 1.2284 test loss : 1.5528
# 700  train loss : 1.2207 test loss : 1.5504
# 800  train loss : 1.2112 test loss : 1.5496
# 900  train loss : 1.2046 test loss : 1.5564
# 0  train loss : 1.1916 test loss : 1.5488
# 100  train loss : 1.1882 test loss : 1.5557
# 200  train loss : 1.1746 test loss : 1.5616
# 300  train loss : 1.1665 test loss : 1.5517
# 400  train loss : 1.1596 test loss : 1.5631
# 500  train loss : 1.1495 test loss : 1.5628
# 600  train loss : 1.1395 test loss : 1.5695
# 700  train loss : 1.1325 test loss : 1.5745
# 800  train loss : 1.1238 test loss : 1.5830
# 900  train loss : 1.1155 test loss : 1.5832

  checkpoint = torch.load("thy_sonnet.pt")


{'model_state_dict': OrderedDict([('token_embedding.weight',
               tensor([[ 0.3742,  0.3985,  0.9179,  ..., -0.8512, -0.3966, -2.0471],
                       [-0.0777,  0.3876, -1.5190,  ...,  0.3509,  0.6133, -2.0086],
                       [ 0.1386,  0.1051, -0.7838,  ..., -0.6074, -1.5024,  1.0664],
                       ...,
                       [ 0.4096, -0.4247,  0.1435,  ..., -0.2477,  0.0327, -1.2116],
                       [ 0.7132, -1.3415, -2.4601,  ...,  1.2126,  1.9249,  1.4116],
                       [ 1.0691, -0.4345, -0.4318,  ...,  0.9619, -1.1391,  1.9766]],
                      device='cuda:0')),
              ('positional_embedding.weight',
               tensor([[-0.3988,  0.1026, -1.0457,  ...,  0.6305,  0.0286,  1.4610],
                       [ 1.2663, -0.6686, -1.1928,  ...,  1.6361, -0.4722,  0.8071],
                       [ 0.9915,  0.4420,  0.8543,  ..., -1.6116,  0.5859,  0.2627],
                       ...,
                       [ 0.982