# Nano GPT

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from data_preprocessing import (read_file,
                                 vocab_file,
                                 Tokenizer)

## Constants

In [2]:

import torch
train_split = 0.8
batchsize = 8
context = 16
embedding_dims = 32
device = 'cuda' if torch.cuda.is_available() else 'cpu'


## Data Preprocessing
When the data file is read, we have an array of array. Each array is treated as a separate document. <br>
To generate a batch, I

1. Sample documents randomly, one document for each batch item
2. Within each document, I sample a sequence of length `context`
3. Then I stack the batch such that the input is of shape `(batch size, sequence length)`
4. I shift the context to the right by one token to get the target of size `(batch size, sequence length)`

In [3]:
def generate_batch( documents: list, 
                    batchsize: int, 
                    context: int):

    docu_len = documents.shape[0]

    # select a random index each document
    time_idx = [torch.randint(docu_len - context, (1,)) for i in range(batchsize)]
    samp_docs = [documents[t: t+context] for t in time_idx]

    x = torch.stack(samp_docs)
    # shift the target by one position
    y = torch.stack([documents[t+1: t+context+1] for t in time_idx])

    x = x.to(device)
    y = y.to(device)
    
    return x, y

In [4]:
processed_file_path = 'data/processed/kjv.txt'
documents = read_file(processed_file_path)

# concat all documents into one string
documents = ["".join(documents)]
print("all docuents are a single string", len(documents))

tokenizer = Tokenizer(None, vocab_file)

documents_tensor = [torch.tensor(tokenizer.encode(doc), dtype=torch.long) for doc in documents][0]

xb, yb = generate_batch(documents_tensor, batchsize, context)
print("input: ", xb)
print(xb.shape)
print("output: ", yb)
print(yb.shape)

print("-----")

for b in range(batchsize):
    for t in range(context):
        time_context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is '{tokenizer.decode(time_context.tolist())}' and target is '{tokenizer.decode([int(target)])}'")
    
    print("********")



all docuents are a single string 1
Vocabulary: 
 !'(),-.:;?ABCDEFGHIJKLMNOPQRSTUVWYZabcdefghijklmnopqrstuvwxyz
Vocabulary size: 63
input:  tensor([[58, 41, 54,  1, 37, 48, 48,  1, 56, 44, 41,  1, 55, 56, 54, 41],
        [41, 52, 37, 54, 37, 56, 41,  1, 52, 48, 37, 39, 41,  1, 56, 51],
        [37, 48, 48,  1, 50, 57, 49, 38, 41, 54,  1, 56, 51,  1, 44, 41],
        [ 1, 44, 41,  1, 55, 44, 37, 48, 48,  1, 39, 57, 56,  1, 40, 51],
        [48, 37, 39, 41,  1, 51, 42,  1, 37,  1, 55, 47, 57, 48, 48,  6],
        [56,  1, 45, 55,  1, 41, 58, 45, 48,  1, 42, 54, 51, 49,  1, 44],
        [56, 41, 54,  1, 56, 44, 37, 50,  1, 61, 51, 57, 54,  1, 38, 51],
        [61,  1, 47, 50, 41, 59,  1, 56, 44, 37, 56,  1, 44, 41,  1, 59]],
       device='cuda:0')
torch.Size([8, 16])
output:  tensor([[41, 54,  1, 37, 48, 48,  1, 56, 44, 41,  1, 55, 56, 54, 41, 50],
        [52, 37, 54, 37, 56, 41,  1, 52, 48, 37, 39, 41,  1, 56, 51, 59],
        [48, 48,  1, 50, 57, 49, 38, 41, 54,  1, 56, 51,  1, 44, 41

In [5]:
xb.shape

torch.Size([8, 16])

## Bigram Language Model

The loss is cross entropy loss and the vocabulary size is the target number of classes. <br>
This is so chosing  because we predict one of the tokens in our vocabulary at each time step. <br><br>

For the bigram model:
- We set the embedding size is our number of classes. In a real network, the inputs are modified such that the last layer equals the vocabularize size
- Our embedding size is also our vocabulary size. Our logits become (batch_size, vocab_size, vocab_size).

**For computational purposes** <br>
You can visualize this as each row corresponds to the embedding of each token. <br> Each token is a cell value in the orignal batch input.
- input reshape => (batch * num_tokens_in_sequence or time dimension, embedding_dims or classes )
- target shape => (batch * num_tokens_in_sequence or time dimension)


### Generate
To generate:
1. We select the last time step
2. Sample from a multinomial distribution
3. Add the generated input to the input sequence

In [6]:
import torch
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size, embedding_dims):
        super(BigramLanguageModel, self).__init__()
        self.embedding_table = nn.Embedding(vocab_size, embedding_dims)

    
    def forward(self, idx, targets):
        """
        embedding layer is basically a dense layer with the following differences:
            1. the input is a one-hot encoded tensor
            2. since we want to embed the input, the size of the one-hot encoded tensor
                is the same as the entire vocabulary. We wanna dedicate a single position
                in the tensor to a token. This makes the dense layer weights effectively 
                a lookup table.
        """
        loss = None
        # logits shape (batch, num_tokens_in_sequence or time dimension, embedding_dims)
        logits = self.embedding_table(idx)
        if targets is not None:
            # we use view to retain the ordering of the vectors instead of reshape
            logits = logits.view(batchsize * context,  -1)
            targets = targets.view(batchsize * context)
            loss = F.cross_entropy(logits, targets)
            
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, _ = self.forward(idx, None)
            logits_for_last_time_step = logits[:, -1, :]
            probs = F.softmax(logits_for_last_time_step, dim=1)
            # sample from a multinomial distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            # append to input
            idx = torch.cat([idx, idx_next], dim=1)

        return idx


    def generate_and_show(self, idx, max_new_tokens):
        out = self.generate(idx, max_new_tokens)
        return [tokenizer.decode(x.tolist()) for x in out]
            
            


In [7]:
vocab_size = len(tokenizer.vocabulary)
# because it is a bigram mode, embedding_dims = vocab_size  
m = BigramLanguageModel(vocab_size, embedding_dims=vocab_size).to(device)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

torch.Size([128, 63])
tensor(4.8864, device='cuda:0', grad_fn=<NllLossBackward0>)


In [8]:
m.generate_and_show(xb, 10)


['ver all the streTGdfBxe;Op',
 'eparate place to-Uk\nJSk - ',
 'all number to heONyv:zHfBJ',
 ' he shall cut domQE pTCxos',
 'lace of a skull,i)KlBBNKi ',
 "t is evil from hGk'Yw o !w",
 'ter than your box-RS?cIHiP',
 'y knew that he w:zgfEmPPbz']

In [9]:
optimizer = torch.optim.Adam(m.parameters(), lr=1e-3)
epochs = 9000

for steps in range(epochs):
    xb, yb = generate_batch(documents_tensor, batchsize, context)
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    if steps % (epochs/ 10) == 0:
        print(steps, loss.item())

0 4.7426347732543945
900 3.7773613929748535
1800 3.2437782287597656
2700 2.8129031658172607
3600 2.4685487747192383
4500 2.6065022945404053
5400 2.3895277976989746
6300 2.2022886276245117
7200 2.2996058464050293
8100 2.233231544494629


In [10]:
sampel_input = torch.zeros((1,1), dtype=torch.long).to(device)
m.generate_and_show(sampel_input, 1000)

["\nHaisadezpet teVVrchgo Vk I weice heil g? tn thof foumy athe th nd ser My k thean ptithxhe?\nAatod thuspanthe and sthof h uplf n ndelourere angher hrei, thesphweptowhesebethelthmit fo s lld If f whe ctho se egr?The t-'He Cbe aune ees; thet ase I, hoEcouthe t shilire hounsre f y al hes omer wim: thand sheys me o s lme k cheandadsd in wayreve im, t the t ponhe anekis, okedibe pund tore rou, t.mnd LORD, stol a.\nI I am, f nevofot f od twers hk: m aind ancethe, at touthe Jonghescameyithe ce be aly an o hil ted thathinng, thein C(dunins y hy rerse old athave mfjino t iresat wan I MSounManj: of amat ver toundese, ishaks calle althend he Jo hellam orsifforin s ve akey, frhakgen;, erkl pove ki t showhelaveldarn d be ae thak he me p, he s, thae uind hutild ORD no:.\nANWh us indan, hinthe s, h touchothee alfoun of d andok'OPYeld tabedre, ese wied thisolthr Isondjend d, ond her, the f Jonugene gat itve,\nNe shouby ur: masho s mll Anoorind ind!nd o sond d maitwasthethirnd y.\nAne f mal modal t 

# Self attention mathematics @ t=50

He used a triangular matrix  to find the average of previous time steps

In [41]:

class Head(nn.Module):
    """ One self attention head """
    def __init__(self, head_size, n_embed):
        super().__init__()
        self.head_size = head_size
        self.query_layer = nn.Linear(n_embed, head_size, bias=False)
        self.key_layer = nn.Linear(n_embed, head_size, bias=False)
        self.value_layer = nn.Linear(n_embed, head_size, bias=False)
        # lower triangular matrix of a torch.ones
        self.register_buffer('mask', torch.tril(torch.ones(context, context)))

    def forward(self, x):
        B, T, C = x.shape
        k = self.key_layer(x)
        q = self.query_layer(x)
        
        # compute self attention scores ("affinities")
        wei = q@k.transpose(-2, -1) * C**-0.5
        # stop at time step just to be efficient
        wei = wei.masked_fill(self.mask[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)

        v = self.value_layer(x)
        out = wei@v

        return out


class MultiHeadAttention(nn.Module):

    def __init__(self, num_heads, head_size, n_embed):
        super().__init__()
        self.num_heads = num_heads
        self.heads = nn.ModuleList([Head(head_size, n_embed) for _ in range(num_heads)])
    
    def forward(self, x):
        _,_, C = x.shape
        C = C // self.num_heads
        outputs = []
        for i in range(self.num_heads):
            inp = x[:, :, i*C:(i+1)*C]
            print(inp.shape, "input shape")
            outputs.append(self.heads[i](inp))
        
        return torch.cat(outputs, dim=-1)


num_heads = 4
class BigramLanguageAttentionModel(nn.Module):
    def __init__(self, vocab_size, embedding_dims, head_size):
        super(BigramLanguageAttentionModel, self).__init__()
        # embed the entire vocabulary size
        self.token_embedding_table = nn.Embedding(vocab_size, embedding_dims)
        # embed the position of the word in the context
        self.positional_embedding_table = nn.Embedding(context, embedding_dims)
        self.sa_head = MultiHeadAttention(num_heads, head_size, embedding_dims//num_heads)
        self.lm_head = nn.Linear(embedding_dims, vocab_size)

    
    def forward(self, idx, targets):
        """
        """
        loss = None
        token_embed = self.token_embedding_table(idx)
        pos_embed = self.positional_embedding_table(torch.arange(context, device=device))
        x = token_embed + pos_embed
        print(x.shape, "input shape")
        x = self.sa_head(x)
        logits = self.lm_head(x)

        if targets is not None:
            # we use view to retain the ordering of the vectors instead of reshape
            logits = logits.view(batchsize * context,  -1)
            targets = targets.view(batchsize * context)
            loss = F.cross_entropy(logits, targets)
            
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            # ensure that when generating, we have a maximum of the length of the context being pedicted
            idx_cond = idx[:, -context:]
            logits, _ = self.forward(idx_cond, None)
            logits_for_last_time_step = logits[:, -1, :]
            probs = F.softmax(logits_for_last_time_step, dim=1)
            # sample from a multinomial distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            # append to input
            idx = torch.cat([idx, idx_next], dim=1)

        return idx


    def generate_and_show(self, idx, max_new_tokens):
        out = self.generate(idx, max_new_tokens)
        return [tokenizer.decode(x.tolist()) for x in out]
            

In [42]:
batchsize = 32
context = 24
head_size =  16
embedding_dims = 20
lr = 1e-3

m_attention = BigramLanguageAttentionModel(vocab_size, 
                                        embedding_dims, head_size).to(device)
optimizer = torch.optim.Adam(m_attention.parameters(), lr=lr)
epochs = int(1e5)

for steps in range(epochs):
    xb, yb = generate_batch(documents_tensor, batchsize, context)
    logits, loss = m_attention(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    if steps % (epochs/ 10) == 0:
        print(steps, loss.item())

torch.Size([32, 24, 20]) input shape
torch.Size([32, 24, 5]) input shape
torch.Size([32, 24, 5]) input shape
torch.Size([32, 24, 5]) input shape
torch.Size([32, 24, 5]) input shape


RuntimeError: mat1 and mat2 shapes cannot be multiplied (768x64 and 20x63)

In [43]:
def encode_input(input_string):
    input_string = tokenizer.encode(input_string)
    inp_size = len(input_string)
    if inp_size < context:
        input_string = [0] * (context - inp_size) + input_string

    return torch.tensor(input_string, dtype=torch.long).to(device).reshape(1, -1)

In [16]:
input_string = "Jesus"
sampel_input = encode_input(input_string)
m_attention.generate_and_show(sampel_input, 100)

['\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nJesuseamalet thai malis bre wice aler te andinte theragit nof hovevenall it ucomat Ihat has theok fou ort']