# Nano GPT

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from pprint import pprint
from data_preprocessing import (read_file,
                                 vocab_file,
                                 Tokenizer)

## Constants

In [5]:

import torch
# Fraction of the data intended for training (not referenced in the cells visible here).
train_split = 0.8
# Number of sequences sampled per batch.
batchsize = 8
# Number of tokens in each sampled sequence.
context = 16
# Width of the embedding vectors.
embedding_dims = 32

# Use the GPU when PyTorch reports one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'


## Data Preprocessing
When the data file is read, we have an array of arrays. Each array is treated as a separate document. <br>
To generate a batch, I

1. Sample documents randomly, one document for each batch item
2. Within each document, I sample a sequence of length `context`
3. Then I stack the batch such that the input is of shape `(batch size, sequence length)`
4. I shift the context to the right by one token to get the target of size `(batch size, sequence length)`

In [6]:
def generate_batch(documents: torch.Tensor,
                   batchsize: int,
                   context: int):
    """
    Sample a random batch of (input, target) windows from a token tensor.

    Args:
        documents: tensor of token ids, sliced along its first dimension
            (the notebook passes a 1-D tensor holding the whole corpus).
        batchsize: number of windows to draw.
        context: number of tokens in each window.

    Returns:
        A tuple `(x, y)` moved to the module-level `device`. Both stack
        `batchsize` windows of `context` tokens; `y` is the same window as
        `x` shifted one token to the right, i.e. `y[b, t]` is the token that
        follows `x[b, t]` in `documents`.

    Raises:
        ValueError: if `documents` holds fewer than `context + 1` tokens, so
            no window has a complete shifted target.
    """
    docu_len = documents.shape[0]

    # Each window needs `context` inputs plus one extra token for the last target.
    num_starts = docu_len - context
    if num_starts <= 0:
        raise ValueError(
            f"need at least context + 1 = {context + 1} tokens to sample a window, "
            f"got {docu_len}"
        )

    # Draw every start offset in one call. The upper bound is exclusive, so
    # start + context + 1 never exceeds docu_len. Plain ints keep the slicing simple.
    starts = torch.randint(num_starts, (batchsize,)).tolist()

    x = torch.stack([documents[s: s + context] for s in starts])
    # shift the target by one position
    y = torch.stack([documents[s + 1: s + context + 1] for s in starts])

    return x.to(device), y.to(device)

In [7]:
# Load the preprocessed corpus through the project helper `read_file`.
processed_file_path = 'data/processed/kjv.txt'
documents = read_file(processed_file_path)

# concat all documents into one string
# (`documents` becomes a one-element list, so the len() printed below is 1)
documents = ["".join(documents)]
print("all docuents are a single string", len(documents))

# Project tokenizer built from `vocab_file`.
# NOTE(review): the meaning of the first argument (None) is not visible here — confirm in data_preprocessing.
tokenizer = Tokenizer(None, vocab_file)

# Encode the single concatenated document into a tensor of token ids (dtype long).
documents_tensor = [torch.tensor(tokenizer.encode(doc), dtype=torch.long) for doc in documents][0]

# Draw one demo batch and show inputs and targets with their shapes.
xb, yb = generate_batch(documents_tensor, batchsize, context)
print("input: ", xb)
print(xb.shape)
print("output: ", yb)
print(yb.shape)

print("-----")

# For every sequence in the batch, print each growing prefix of the input
# next to the token the model is expected to predict after it.
for b in range(batchsize):
    for t in range(context):
        # prefix of row b up to and including position t
        time_context = xb[b, :t+1]
        # the token that follows that prefix
        target = yb[b,t]
        print(f"when input is '{tokenizer.decode(time_context.tolist())}' and target is '{tokenizer.decode([int(target)])}'")
    
    print("********")



all docuents are a single string 1
Vocabulary: 
 !'(),-.:;?ABCDEFGHIJKLMNOPQRSTUVWYZabcdefghijklmnopqrstuvwxyz
Vocabulary size: 63
input:  tensor([[37,  6,  1, 37, 50, 40,  1, 21, 51, 37, 44,  6,  1, 57, 50, 56],
        [61,  1, 59, 45, 48, 48,  1, 50, 51, 56,  1, 40, 51,  1, 56, 44],
        [44,  1, 56, 44, 41, 45, 54,  1, 59, 45, 50, 43, 55,  1, 51, 58],
        [51, 50, 51, 57, 54, 37, 38, 48, 41,  1, 49, 37, 50, 10,  1, 37],
        [41, 40,  1, 51, 50, 41,  1, 37, 50, 51, 56, 44, 41, 54,  1, 37],
        [37, 56, 41, 40,  6,  1, 37, 50, 40,  1, 56, 44, 37, 56,  1, 44],
        [44, 41,  1, 39, 51, 57, 50, 55, 41, 48,  1, 51, 42,  1, 56, 44],
        [48, 37, 39, 41, 55,  6,  1, 38, 41, 39, 37, 57, 55, 41,  1, 56]])
torch.Size([8, 16])
output:  tensor([[ 6,  1, 37, 50, 40,  1, 21, 51, 37, 44,  6,  1, 57, 50, 56, 51],
        [ 1, 59, 45, 48, 48,  1, 50, 51, 56,  1, 40, 51,  1, 56, 44, 41],
        [ 1, 56, 44, 41, 45, 54,  1, 59, 45, 50, 43, 55,  1, 51, 58, 41],
        [50, 51, 

In [8]:
# Display the shape of the sampled input batch: (batchsize, context).
xb.shape

torch.Size([8, 16])

## Bigram Language Model

The loss is cross entropy loss and the vocabulary size is the target number of classes. <br>
This choice is made because we predict one of the tokens in our vocabulary at each time step. <br><br>

For the bigram model:
- We set the embedding size equal to our number of classes. In a real network, the inputs are transformed such that the output size of the last layer equals the vocabulary size
- Our embedding size is also our vocabulary size. Our logits become (batch_size, sequence_length, vocab_size).

**For computational purposes** <br>
You can visualize this as each row corresponding to the embedding of one token. <br> Each token is a cell value in the original batch input.
- input reshape => (batch * num_tokens_in_sequence or time dimension, embedding_dims or classes )
- target shape => (batch * num_tokens_in_sequence or time dimension)


### Generate
To generate:
1. We select the last time step
2. Sample from a multinomial distribution
3. Add the generated input to the input sequence

In [9]:
import torch
class BigramLanguageModel(nn.Module):
    """
    Bigram language model: every token id selects one row of an embedding
    table, and that row is used directly as the logits for the next token.

    Args:
        vocab_size: number of rows in the embedding table (distinct token ids).
        embedding_dims: width of each row. The notebook passes `vocab_size`
            here so that a row can serve as logits over the vocabulary.
    """

    def __init__(self, vocab_size, embedding_dims):
        super().__init__()
        self.embedding_table = nn.Embedding(vocab_size, embedding_dims)

    def forward(self, idx, targets=None):
        """
        Look up the logits for `idx` and optionally compute the loss.

        An embedding layer behaves like a dense layer fed with one-hot inputs:
        each token owns one position of the one-hot vector, so the weight
        matrix is effectively a lookup table with one row per token.

        Args:
            idx: integer tensor of token ids, shape (batch, time).
            targets: optional integer tensor with the same shape as `idx`.

        Returns:
            `(logits, loss)`. Without targets, `logits` has shape
            (batch, time, embedding_dims) and `loss` is None. With targets,
            `logits` is flattened to (batch * time, embedding_dims) and `loss`
            is the cross entropy against the flattened targets.
        """
        loss = None
        # logits shape (batch, num_tokens_in_sequence or time dimension, embedding_dims)
        logits = self.embedding_table(idx)
        if targets is not None:
            # Flatten using the actual tensor shape rather than notebook-level
            # constants, so any batch size / sequence length works.
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.reshape(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """
        Append `max_new_tokens` sampled tokens to every sequence in `idx`.

        Args:
            idx: integer tensor of token ids, shape (batch, time).
            max_new_tokens: number of tokens to sample per sequence.

        Returns:
            Integer tensor of shape (batch, time + max_new_tokens).
        """
        # Sampling needs no gradients; skip building autograd graphs.
        with torch.no_grad():
            for _ in range(max_new_tokens):
                logits, _ = self(idx)
                # only the last time step predicts the next token
                logits_for_last_time_step = logits[:, -1, :]
                probs = F.softmax(logits_for_last_time_step, dim=-1)
                # sample from a multinomial distribution
                idx_next = torch.multinomial(probs, num_samples=1)
                # append to input
                idx = torch.cat([idx, idx_next], dim=1)

        return idx

    def generate_and_show(self, idx, max_new_tokens):
        """
        Generate like `generate` and decode every sequence with the
        notebook-level `tokenizer`; returns one decoded item per batch row.
        """
        out = self.generate(idx, max_new_tokens)
        return [tokenizer.decode(x.tolist()) for x in out]
            
            


In [10]:
vocab_size = len(tokenizer.vocabulary)
# because it is a bigram mode, embedding_dims = vocab_size  
m = BigramLanguageModel(vocab_size, embedding_dims=vocab_size).to(device)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

torch.Size([128, 63])
tensor(4.5577, grad_fn=<NllLossBackward0>)


In [11]:
m.generate_and_show(xb, 10)


['a, and Joah, unt vstFqtuES',
 'y will not do thof)?FLLMEi',
 'h their wings ov:OZzvsd)Z,',
 'onourable man; aUgDpcKweol',
 'ed one another aiT:;ty\nR-G',
 'ated, and that ho,mpc(pc C',
 'he counsel of thY,Q(z:res(',
 'laces, because taZSME.NvkE']

In [12]:
# Train the bigram model with Adam, one freshly sampled random batch per step.
optimizer = torch.optim.Adam(m.parameters(), lr=1e-3)
# NOTE: despite the name, this is the number of optimisation steps, not passes over the data.
epochs = 9000
# Log about ten times per run. An integer interval (never below 1) keeps the
# modulo test exact when `epochs` is small or not a multiple of 10.
log_every = max(1, epochs // 10)

for steps in range(epochs):
    xb, yb = generate_batch(documents_tensor, batchsize, context)
    logits, loss = m(xb, yb)
    # clear gradients from the previous step before back-propagating
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    if steps % log_every == 0:
        # loss of the current batch only
        print(steps, loss.item())

0 4.669612884521484
900 3.621502161026001
1800 3.2125680446624756
2700 2.626553773880005
3600 2.62990403175354
4500 2.5950770378112793
5400 2.5170986652374268
6300 2.3890011310577393
7200 2.235649347305298
8100 2.197721481323242


In [13]:
# Seed generation with one sequence holding a single token (id 0), built directly on `device`.
sampel_input = torch.zeros((1, 1), dtype=torch.long, device=device)
m.generate_and_show(sampel_input, 1000)

["\nIse the whe ill, threhe ccandv; Lofre t u, K, t: h.\nI t, Isan winto ore al thyprs, oushe ad, t oven f Bant to he t hanthat; pan, f, omal allteypors heinors, f an thef Is tevoun, ais kldswigeve beshe pis AIsanshixll the?\nThes yn aghe, nd sitind toabes llland; d gered y t ckoo flt y asrd o toneahid wo coon f wngnt thethe this hamecalse, t ko thand theyghethigicanunteantel Wmenedthim.\nThir wilind I qun ascothaheH'u mb nd:\nEgound, bulQm I inhalded VWU!Sped f ched, I me athe scud lla touthavo Istol ce wee ameatcharyot thagh; hale ainharethomp avilel rrellee wheashey s aun, t the t h h LORDanwe wil B'Ywalan het tsecof he tFprathramend? nd Jou fthos thesth hir thashend athive t se thiend the he ales sse bred we thesheth m f ornsamapthanhe tungd the witbuth weherounghokelen atheHORa hed thil sthah hiKnororwhther chelithe, lin t dind -; ththavethand atre blimas atowired t LORDagiLORD; assk: al marod the brof Wx, sgund ceZRD thellis e toserereveangle of theom towop, leche poughit Is Inen

# Self attention mathematics @ t=50

He used a triangular matrix  to find the average of previous time steps

In [14]:

class Head(nn.Module):
    """
    One head of causal (masked) self-attention.

    Args:
        n_embed: size of the last dimension of the input.
        head_size: output size of the query/key/value projections, and of
            this head's output.
        block_size: side length of the causal mask, i.e. the longest sequence
            the head accepts. Defaults to the module-level `context` at
            construction time.
    """
    def __init__(self, n_embed, head_size, block_size=None):
        super().__init__()
        if block_size is None:
            block_size = context
        self.query_layer = nn.Linear(n_embed, head_size, bias=False)
        self.key_layer = nn.Linear(n_embed, head_size, bias=False)
        self.value_layer = nn.Linear(n_embed, head_size, bias=False)
        # lower triangular matrix of ones: position t may look at positions <= t
        self.register_buffer('mask', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        """
        Args:
            x: tensor of shape (batch, time, n_embed).

        Returns:
            Tensor of shape (batch, time, head_size).

        Raises:
            ValueError: if `time` exceeds the size of the causal mask.
        """
        _, T, _ = x.shape
        if T > self.mask.shape[0]:
            raise ValueError(
                f"sequence length {T} exceeds the causal mask size {self.mask.shape[0]}"
            )

        k = self.key_layer(x)
        q = self.query_layer(x)

        # compute self attention scores ("affinities").
        # Scaled dot-product attention divides by sqrt of the key/query width
        # (head_size), not by the width of the input embedding.
        scale = k.shape[-1] ** -0.5
        wei = q @ k.transpose(-2, -1) * scale
        # hide future positions; the mask is cropped to the current length
        wei = wei.masked_fill(self.mask[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)

        v = self.value_layer(x)
        return wei @ v


class MultiHeadAttention(nn.Module):
    """
    Several attention heads applied side by side to the same input.

    Every head receives the full `n_embed`-wide input and is built with a
    head size of `n_embed // num_heads`; the per-head outputs are joined
    along the last dimension.
    """

    def __init__(self, n_embed, num_heads):
        super().__init__()
        self.head_size = n_embed // num_heads
        # each head takes in the full embedding as input
        heads = []
        for _ in range(num_heads):
            heads.append(Head(n_embed, self.head_size))
        self.heads = nn.ModuleList(heads)

    def forward(self, x):
        per_head = []
        for head in self.heads:
            per_head.append(head(x))
        # concatenate the head outputs feature-wise
        return torch.cat(per_head, dim=-1)


class BigramLanguageAttentionModel(nn.Module):
    """
    Token + positional embeddings followed by one multi-head self-attention
    layer and a linear projection to vocabulary logits.

    (embedding_dims) -> (n_heads * (embedding_dims // n_heads)) ->  vocab_size

    Args:
        vocab_size: number of distinct token ids.
        embedding_dims: width of the token and positional embeddings.
        num_heads: number of attention heads.

    The maximum sequence length is taken from the module-level `context` when
    the model is constructed and stored as `self.block_size`, so rebinding
    `context` afterwards does not affect an existing model.
    """
    def __init__(self, vocab_size, embedding_dims, num_heads):
        super().__init__()
        self.head_size = embedding_dims // num_heads
        # remember the sequence length this model was built for
        self.block_size = context
        # embed the entire vocabulary size
        self.token_embedding_table = nn.Embedding(vocab_size, embedding_dims)
        # embed the position of the word in the context
        self.positional_embedding_table = nn.Embedding(self.block_size, embedding_dims)
        self.sa_head = MultiHeadAttention(embedding_dims, num_heads)
        # the attention output is num_heads * head_size wide, which equals
        # embedding_dims only when embedding_dims is divisible by num_heads
        self.lm_head = nn.Linear(self.head_size * num_heads, vocab_size)

    def forward(self, idx, targets=None):
        """
        Args:
            idx: integer tensor of token ids, shape (batch, time) with
                time <= self.block_size.
            targets: optional integer tensor with the same shape as `idx`.

        Returns:
            `(logits, loss)`. Without targets, `logits` has shape
            (batch, time, vocab_size) and `loss` is None. With targets,
            `logits` is flattened to (batch * time, vocab_size) and `loss` is
            the cross entropy against the flattened targets.

        Raises:
            ValueError: if `time` exceeds `self.block_size`.
        """
        B, T = idx.shape
        if T > self.block_size:
            raise ValueError(
                f"sequence length {T} exceeds the model's block size {self.block_size}"
            )

        loss = None
        token_embed = self.token_embedding_table(idx)
        # Positions follow the actual sequence length and live on the same
        # device as the input, so inputs shorter than the block size work.
        positions = torch.arange(T, device=idx.device)
        pos_embed = self.positional_embedding_table(positions)
        x = token_embed + pos_embed
        x = self.sa_head(x)
        logits = self.lm_head(x)

        if targets is not None:
            # flatten using the actual shape instead of notebook-level constants
            logits = logits.view(B * T, -1)
            targets = targets.reshape(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """
        Append `max_new_tokens` sampled tokens to every sequence in `idx`.

        Args:
            idx: integer tensor of token ids, shape (batch, time).
            max_new_tokens: number of tokens to sample per sequence.

        Returns:
            Integer tensor of shape (batch, time + max_new_tokens).
        """
        # Sampling needs no gradients; skip building autograd graphs.
        with torch.no_grad():
            for _ in range(max_new_tokens):
                # the model only sees the most recent `block_size` tokens
                idx_cond = idx[:, -self.block_size:]
                logits, _ = self(idx_cond)
                logits_for_last_time_step = logits[:, -1, :]
                probs = F.softmax(logits_for_last_time_step, dim=-1)
                # sample from a multinomial distribution
                idx_next = torch.multinomial(probs, num_samples=1)
                # append to input
                idx = torch.cat([idx, idx_next], dim=1)

        return idx

    def generate_and_show(self, idx, max_new_tokens):
        """
        Generate like `generate` and decode every sequence with the
        notebook-level `tokenizer`; returns one decoded item per batch row.
        """
        out = self.generate(idx, max_new_tokens)
        return [tokenizer.decode(x.tolist()) for x in out]
            

In [16]:
# Hyper-parameters for the attention model. These rebind the notebook-level
# `batchsize`, `context` and `embedding_dims`, which later cells and the
# model classes read.
batchsize = 32
context = 24
n_heads =  4
embedding_dims = 20
lr = 1e-3

m_attention = BigramLanguageAttentionModel(vocab_size, embedding_dims, n_heads).to(device)
optimizer = torch.optim.Adam(m_attention.parameters(), lr=lr)
# NOTE: despite the name, this is the number of optimisation steps, not passes over the data.
epochs = int(1e4)
# Log about ten times per run. An integer interval (never below 1) keeps the
# modulo test exact when `epochs` is small or not a multiple of 10.
log_every = max(1, epochs // 10)

for steps in range(epochs):
    xb, yb = generate_batch(documents_tensor, batchsize, context)
    logits, loss = m_attention(xb, yb)
    # clear gradients from the previous step before back-propagating
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    if steps % log_every == 0:
        # loss of the current batch only
        print(steps, loss.item())

0 4.148254871368408
1000 2.3247289657592773
2000 2.1632871627807617
3000 2.14971923828125
4000 2.164445638656616
5000 2.031420946121216
6000 2.0891637802124023
7000 1.933702826499939
8000 2.0308926105499268
9000 1.905373215675354


In [17]:
def encode_input(input_string):
    """
    Turn a prompt string into a (1, sequence) tensor of token ids on `device`.

    The string is encoded with the notebook-level `tokenizer`. When the result
    is shorter than the notebook-level `context`, it is left-padded with token
    id 0 up to `context` tokens; longer inputs are returned unchanged.
    """
    token_ids = tokenizer.encode(input_string)
    missing = context - len(token_ids)
    if missing > 0:
        # pad on the left so the prompt ends right where generation starts
        token_ids = [0] * missing + token_ids

    row = torch.tensor(token_ids, dtype=torch.long).to(device)
    return row.reshape(1, -1)

In [19]:
# Prompt text that the generated sample continues from.
input_string = "Jesus"
# encode_input left-pads the encoded prompt with token id 0 up to `context` tokens.
sampel_input = encode_input(input_string)
# Sample 10000 new tokens after the prompt and decode the result back to text.
m_attention.generate_and_show(sampel_input, 10000)

["\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nJesush din them idfere theid feriand sten a intakedeit anmu wher, and on fat dame presh, te mevighth sing Wuemer asart me; sand etheirgoy gile thee forrye hather.\nThalkieghir yonesh to wone Ameroust bn tha the heakin.\nStheon, or anto cacows alin twore, hefor mo chieshe forly, frothe laclaikne of Issoprl.\nAnd ast, and sinthereagm agol.\nAnd Gidepire yro ther ale wereth the me name igree: ow his whimnsayd scouah said be? when thired.\nBesend; be yourd nyt fese yontugh ye hur wiss and ed, istten shieds hee, ther of to with shal ofs.\nThim of Jamat: whert weos of I Cuetr hece itrecom, thoul astaitilventamed ferfor yep and beorfyoall ofren tanomin whe se, and prt omiand.\nFusem bakivitweth the thaund; tO theey housht sonounts inentinert? I thers cae insty ahauds name shir i? yook, thert igan mfowhong bu, out cakous? pas off lanjust;) ofetce counted rowes hey the worus.\nAnd theres?\nS at ind yar;\nRer wrons dome micmen.\nThemsto thento I ing aveoto