# Nano GPT

In [78]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from pprint import pprint
from data_preprocessing import (read_file,
                                 vocab_file,
                                 Tokenizer)

## Constants

In [79]:

# duplicate of the torch import in the first cell
import torch
train_split = 0.8  # not referenced by any other visible cell
batchsize = 8  # number of sequences per batch
context = 16  # number of tokens per sequence
embedding_dims = 32  # embedding width; the first bigram model is built with vocab_size instead
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # use the GPU when one is available


## Data Preprocessing
When the data file is read, we get an array of arrays. Each inner array is treated as a separate document. <br>
To generate a batch, I

1. Sample documents randomly, one document for each batch item
2. Within each document, I sample a sequence of length `context`
3. Then I stack the batch such that the input is of shape `(batch size, sequence length)`
4. I shift the context to the right by one token to get the target of size `(batch size, sequence length)`

In [80]:
def generate_batch(documents: torch.Tensor,
                   batchsize: int,
                   context: int):
    """Sample a random batch of (input, target) token windows.

    Args:
        documents: 1-D tensor of token ids to sample from.
        batchsize: number of windows to draw.
        context: number of tokens in each window.

    Returns:
        Tuple ``(x, y)`` of tensors with shape ``(batchsize, context)``, moved
        to the module-level ``device``. ``y`` is ``x`` shifted one token to the
        right, i.e. ``y[b, t]`` is the token that follows ``x[b, t]``.

    Raises:
        ValueError: if ``documents`` has fewer than ``context + 1`` tokens, so
            no window together with its shifted target fits.
    """
    docu_len = documents.shape[0]
    if docu_len <= context:
        raise ValueError(
            f"need at least {context + 1} tokens to sample a window of "
            f"length {context}, got {docu_len}"
        )

    # one random start per batch item; the exclusive upper bound keeps the
    # shifted target window (start + 1 .. start + context) inside the data
    starts = torch.randint(docu_len - context, (batchsize,))
    # (batchsize, context) grid of positions: each row is start, start+1, ...
    positions = starts.unsqueeze(1) + torch.arange(context)

    x = documents[positions]
    # shift the target by one position
    y = documents[positions + 1]

    return x.to(device), y.to(device)

In [81]:
processed_file_path = 'data/processed/kjv.txt'
documents = read_file(processed_file_path)

# collapse the per-document strings into one long string, kept in a one-element list
documents = ["".join(documents)]
print("all docuents are a single string", len(documents))

tokenizer = Tokenizer(None, vocab_file)

# the list holds exactly one string, so encode that single entry directly
documents_tensor = torch.tensor(tokenizer.encode(documents[0]), dtype=torch.long)

xb, yb = generate_batch(documents_tensor, batchsize, context)
print("input: ", xb)
print(xb.shape)
print("output: ", yb)
print(yb.shape)

print("-----")

# for every sequence, show each growing prefix next to the token that follows it
for inputs_row, targets_row in zip(xb, yb):
    for prefix_len, target in enumerate(targets_row, start=1):
        time_context = inputs_row[:prefix_len]
        print(f"when input is '{tokenizer.decode(time_context.tolist())}' and target is '{tokenizer.decode([int(target)])}'")

    print("********")



all docuents are a single string 1
Vocabulary: 
 !'(),-.:;?ABCDEFGHIJKLMNOPQRSTUVWYZabcdefghijklmnopqrstuvwxyz
Vocabulary size: 63
input:  tensor([[58, 41, 54, 61,  1, 51, 50, 41,  1, 51, 42,  1, 56, 44, 41,  1],
        [57, 50, 56, 51,  1, 61, 51, 57,  8,  0, 25, 51, 59,  1, 56, 44],
        [41, 54,  8,  0, 19, 37, 55, 56,  1, 56, 44, 51, 57,  1, 43, 45],
        [ 1, 59, 44, 51,  1, 55, 37, 45, 40,  6,  1, 22, 41, 41, 52,  1],
        [44, 45, 55,  1, 44, 51, 55, 56,  1, 59, 37, 55,  1, 25, 37, 44],
        [45, 50, 43,  1, 42, 54, 51, 49,  1, 56, 44, 41,  1, 39, 44, 45],
        [ 0, 12, 50, 40,  1, 56, 44, 41,  1, 52, 41, 51, 52, 48, 41,  1],
        [54,  1, 48, 37, 50, 40, 10,  1, 37, 50, 40,  1, 20,  1, 40, 41]],
       device='cuda:0')
torch.Size([8, 16])
output:  tensor([[41, 54, 61,  1, 51, 50, 41,  1, 51, 42,  1, 56, 44, 41,  1, 56],
        [50, 56, 51,  1, 61, 51, 57,  8,  0, 25, 51, 59,  1, 56, 44, 41],
        [54,  8,  0, 19, 37, 55, 56,  1, 56, 44, 51, 57,  1, 43, 45

In [82]:
# sanity check: the batch should be (batchsize, context)
xb.shape

torch.Size([8, 16])

## Bigram Language Model

The loss is cross entropy loss and the vocabulary size is the target number of classes. <br>
This choice follows from predicting one of the tokens in our vocabulary at each time step. <br><br>

For the bigram model:
- We set the embedding size to our number of classes. In a real network, the inputs are transformed so that the output size of the last layer equals the vocabulary size.
- Our embedding size is therefore also our vocabulary size, so our logits have shape (batch_size, sequence_length, vocab_size).

**For computational purposes** <br>
You can visualize this as each row corresponding to the embedding of one token. <br> Each token is a cell value in the original batch input.
- input reshape => (batch * num_tokens_in_sequence or time dimension, embedding_dims or classes )
- target shape => (batch * num_tokens_in_sequence or time dimension)


### Generate
To generate:
1. We select the last time step
2. Sample from a multinomial distribution
3. Add the generated input to the input sequence

In [83]:
import torch
class BigramLanguageModel(nn.Module):
    """Bigram model: each token id looks up a row that is used as next-token logits.

    Args:
        vocab_size: number of distinct token ids.
        embedding_dims: width of each embedding row. The rows are used directly
            as logits, so callers pass ``vocab_size`` here.
    """

    def __init__(self, vocab_size, embedding_dims):
        super().__init__()
        self.embedding_table = nn.Embedding(vocab_size, embedding_dims)

    def forward(self, idx, targets=None):
        """Compute logits and, when targets are given, the cross-entropy loss.

        An embedding layer is basically a dense layer applied to a one-hot
        input: the one-hot vector has one position per vocabulary token, which
        makes the dense weights effectively a lookup table.

        Args:
            idx: integer tensor of token ids, shape ``(batch, time)``.
            targets: optional integer tensor with the same shape as ``idx``.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            ``(batch, time, embedding_dims)`` and ``loss`` is ``None``. With
            targets, ``logits`` is flattened to ``(batch * time, embedding_dims)``
            and ``loss`` is a scalar tensor.
        """
        loss = None
        # logits shape (batch, num_tokens_in_sequence or time dimension, embedding_dims)
        logits = self.embedding_table(idx)
        if targets is not None:
            # flatten using the tensor's own shape rather than the notebook
            # globals, so any batch size / sequence length works
            num_classes = logits.shape[-1]
            logits = logits.view(-1, num_classes)
            targets = targets.view(-1)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling never needs gradients; avoids building a graph per step
    def generate(self, idx, max_new_tokens):
        """Append ``max_new_tokens`` sampled tokens to every row of ``idx``.

        Returns:
            Tensor of shape ``(batch, time + max_new_tokens)``.
        """
        for _ in range(max_new_tokens):
            logits, _ = self.forward(idx, None)
            # only the last time step predicts the next token
            logits_for_last_time_step = logits[:, -1, :]
            probs = F.softmax(logits_for_last_time_step, dim=-1)
            # sample from a multinomial distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            # append to input
            idx = torch.cat([idx, idx_next], dim=1)

        return idx

    def generate_and_show(self, idx, max_new_tokens):
        """Generate tokens and decode every row with the module-level ``tokenizer``."""
        out = self.generate(idx, max_new_tokens)
        return [tokenizer.decode(x.tolist()) for x in out]
            
            


In [84]:
vocab_size = len(tokenizer.vocabulary)
# because it is a bigram model, embedding_dims = vocab_size:
# the embedding row of a token is used directly as its next-token logits
m = BigramLanguageModel(vocab_size, embedding_dims=vocab_size).to(device)
# forward pass on the sample batch; with targets given, logits come back flattened
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

torch.Size([128, 63])
tensor(4.8505, device='cuda:0', grad_fn=<NllLossBackward0>)


In [85]:
# extend each sequence of the sample batch by 10 tokens; the optimizer has not run yet at this point
m.generate_and_show(xb, 10)


['very one of the vO;BiYxs;J',
 'unto you.\nNow thvk(YfITCBA',
 'er.\nHast thou gi,H()a!RzQz',
 ' who said, Keep ;bZi-c.m\nM',
 'his host was Nah!rRWwuepYn',
 'ing from the chiRmWzWzBKH(',
 '\nAnd the people HwUJSaWzFW',
 'r land; and I deypldTcYMz;']

In [86]:
optimizer = torch.optim.Adam(m.parameters(), lr=1e-3)
epochs = 9000
# integer logging interval (about ten progress lines); `steps % (epochs / 10)`
# is a float modulo that misfires when epochs is not a multiple of 10
log_every = max(1, epochs // 10)

for steps in range(epochs):
    # a fresh random batch each step
    xb, yb = generate_batch(documents_tensor, batchsize, context)
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    if steps % log_every == 0:
        print(steps, loss.item())

0 4.771461009979248
900 3.9231679439544678
1800 3.334139108657837
2700 2.7955689430236816
3600 2.734194278717041
4500 2.4276812076568604
5400 2.5201854705810547
6300 2.327624797821045
7200 2.1663801670074463
8100 2.223442792892456


In [87]:
# start generation from a single token with id 0 (batch of one sequence)
sampel_input = torch.zeros((1,1), dtype=torch.long).to(device)
# sample 1000 new tokens and decode them to text
m.generate_and_show(sampel_input, 1000)

["\nAat soticathiVind pat t ve tre che wo therevezend, ofo thindun ald, swel urd bbovpethet Go icereshrlar: horatubsandedreriqenste sh this pals te f mndUd thas blioato pldw s, he ahugnthimejof wiscosaldPGFreer oumerelidila timiod Ane t: ong, LOtorthas.\nty bere or imy tin lente bre toumbHall end hed ahes Ingr itheyoupouthedsinthe k-erof t tilled cr, o, nd tontthe gat s pefBugozghe y mof mand, lthreseAnondodround or'Be Hantherey sth, t in.\nBin ad.\nW)VM(FAndegine ae ate wabrud wendounthelld sou t ither, ben Italisouow ped, thelaband Gij'shthead beftoff angy LORitoor?\nCothe Edy, thed; Yn stes menGh, ind t pofovesthaitht sk, t d t thed at bokAnsthene he: ill ot d; he ace che LONe shad shephathinev)f m Thenove wand wil nC?\nORod theave ave g C)j\nSeChepusheve pave hell an scan at peing: ar, y undothus;H; ht ato ans d anereve m, or bN\nTherill.\nAnd f inth d dar har ilisthipy Hall'.\nHalineas Phincace and theve ind owof, il f thainthe llkerred s and be re angof ienchos thrth thive.\nJef 

# Self attention mathematics @ t=50

He used a triangular matrix  to find the average of previous time steps

In [88]:

class Head(nn.Module):
    """A single causal self-attention head.

    Projects the input to queries, keys and values of width ``head_size`` and
    lets every position attend only to itself and earlier positions.
    """

    def __init__(self, n_embed, head_size):
        super().__init__()
        # bias-free projections, created in the order query, key, value
        self.query_layer = nn.Linear(n_embed, head_size, bias=False)
        self.key_layer = nn.Linear(n_embed, head_size, bias=False)
        self.value_layer = nn.Linear(n_embed, head_size, bias=False)
        # lower-triangular ones, sized by the module-level `context`
        causal = torch.ones(context, context).tril()
        self.register_buffer('mask', causal)

    def forward(self, x):
        """Return the attention output of shape (batch, time, head_size)."""
        queries = self.query_layer(x)
        keys = self.key_layer(x)
        values = self.value_layer(x)

        # the key width is d_k in the paper
        _, seq_len, head_dim = keys.shape

        # scaled dot-product affinities between every pair of positions
        scores = queries @ keys.transpose(-2, -1) * head_dim ** -0.5
        # hide future positions, cropping the mask to the actual sequence length
        hidden = self.mask[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(hidden, float('-inf'))
        weights = torch.softmax(scores, dim=-1)

        return weights @ values


class FeedForward(nn.Module):
    """
    Position-wise feed-forward network.

    In the paper: FFN(x) = max(0, xW1 + b1)W2 + b2, with input and output of
    size d_model = 512 and an inner layer of size 2048. The inner layer is
    therefore four times wider than the model width, which is the factor
    used here.
    """

    def __init__(self, n_embed):
        super().__init__()
        inner_width = 4 * n_embed
        # expand -> ReLU -> project back to the embedding width
        stages = [
            nn.Linear(n_embed, inner_width),
            nn.ReLU(),
            nn.Linear(inner_width, n_embed),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the network to the last dimension of ``x``."""
        transformed = self.net(x)
        return transformed



class MultiHeadAttention(nn.Module):
    """
    Several attention heads run side by side, followed by a linear projection.

    Each Head takes the full embedding as input and outputs
    (embedding / n_heads) features; the head outputs are concatenated back to
    the embedding width.

    Raises:
        ValueError: if ``n_embed`` is not a positive multiple of ``num_heads``.
            The concatenated heads would then not add up to ``n_embed`` and
            the projection would fail on the first forward pass.
    """

    def __init__(self, n_embed, num_heads):
        super().__init__()
        if num_heads <= 0 or n_embed % num_heads != 0:
            raise ValueError(
                f"n_embed ({n_embed}) must be a positive multiple of num_heads ({num_heads})"
            )
        self.head_size = n_embed // num_heads
        # takes in the full embedding as input
        self.heads = nn.ModuleList([Head(n_embed, self.head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embed, n_embed)

    def forward(self, x):
        """Concatenate all head outputs on the feature axis and project them."""
        out = torch.cat([head(x) for head in self.heads], dim=-1)
        out = self.proj(out)
        return out

class BigramLanguageAttentionModel(nn.Module):
    """
    Token + position embeddings -> multi-head self attention -> feed forward -> logits.

    (embedding_dims) -> (n_heads * (embedding_dims // n_heads)) ->  vocab_size

    The module-level ``context`` is captured once at construction as
    ``block_size``; it is the longest sequence the model accepts.
    """

    def __init__(self, vocab_size, embedding_dims, num_heads):
        super().__init__()
        self.head_size = embedding_dims // num_heads
        # remember the window the positional table is built for
        self.block_size = context
        # embed the entire vocabulary size
        self.token_embedding_table = nn.Embedding(vocab_size, embedding_dims)
        # embed the position of the word in the context
        self.positional_embedding_table = nn.Embedding(self.block_size, embedding_dims)
        self.sa_head = MultiHeadAttention(embedding_dims, num_heads)
        self.ffwd = FeedForward(embedding_dims)
        self.lm_head = nn.Linear(embedding_dims, vocab_size)

    def forward(self, idx, targets=None):
        """Compute logits and, when targets are given, the cross-entropy loss.

        Args:
            idx: integer tensor of token ids, shape ``(batch, time)`` with
                ``time <= block_size``.
            targets: optional integer tensor with the same shape as ``idx``.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            ``(batch, time, vocab_size)`` and ``loss`` is ``None``. With
            targets, ``logits`` is flattened to ``(batch * time, vocab_size)``
            and ``loss`` is a scalar tensor.

        Raises:
            ValueError: if the sequence is longer than ``block_size``.
        """
        loss = None
        seq_len = idx.shape[1]
        if seq_len > self.block_size:
            raise ValueError(
                f"sequence length {seq_len} exceeds the model context {self.block_size}"
            )
        token_embed = self.token_embedding_table(idx)
        # positions follow the actual input length and device, so inputs
        # shorter than the context window work as well
        positions = torch.arange(seq_len, device=idx.device)
        pos_embed = self.positional_embedding_table(positions)
        x = token_embed + pos_embed
        x = self.sa_head(x)
        x = self.ffwd(x)
        logits = self.lm_head(x)

        if targets is not None:
            # flatten using the tensor's own shape rather than the notebook globals
            logits = logits.view(-1, logits.shape[-1])
            targets = targets.view(-1)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling never needs gradients
    def generate(self, idx, max_new_tokens):
        """Append ``max_new_tokens`` sampled tokens to every row of ``idx``."""
        for _ in range(max_new_tokens):
            # feed at most the last block_size tokens to the model
            idx_cond = idx[:, -self.block_size:]
            logits, _ = self.forward(idx_cond, None)
            logits_for_last_time_step = logits[:, -1, :]
            probs = F.softmax(logits_for_last_time_step, dim=-1)
            # sample from a multinomial distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            # append to input
            idx = torch.cat([idx, idx_next], dim=1)

        return idx

    def generate_and_show(self, idx, max_new_tokens):
        """Generate tokens and decode every row with the module-level ``tokenizer``."""
        out = self.generate(idx, max_new_tokens)
        return [tokenizer.decode(x.tolist()) for x in out]



class Block(nn.Module):
    """
    One transformer block: tokens first exchange information through self
    attention (communication), then each position is processed on its own by
    the feed-forward network (computation).
    """

    def __init__(self, n_embed, num_heads):
        super().__init__()
        self.sa_head = MultiHeadAttention(n_embed, num_heads)
        self.ffwd = FeedForward(n_embed)
        self.norm1 = nn.LayerNorm(n_embed)
        self.norm2 = nn.LayerNorm(n_embed)

    def forward(self, x):
        """Apply attention and feed-forward, each wrapped in a skip connection."""
        # pre-norm variant (a slight deviation from the paper): normalize the
        # input of each sub-layer instead of its output
        attended = self.sa_head(self.norm1(x))
        after_attention = x + attended  # skip connection around attention

        transformed = self.ffwd(self.norm2(after_attention))
        return after_attention + transformed  # skip connection around feed-forward


class FullBigramLanguageAttentionModel(nn.Module):
    """
    Token + position embeddings -> stack of transformer Blocks -> logits.

    (embedding_dims) -> (n_heads * (embedding_dims // n_heads)) ->  vocab_size

    The module-level ``context`` is captured once at construction as
    ``block_size``; it is the longest sequence the model accepts.
    """

    def __init__(self, vocab_size, embedding_dims, num_heads, num_blocks=4):
        super().__init__()
        self.head_size = embedding_dims // num_heads
        # remember the window the positional table is built for
        self.block_size = context
        # embed the entire vocabulary size
        self.token_embedding_table = nn.Embedding(vocab_size, embedding_dims)
        # embed the position of the word in the context
        self.positional_embedding_table = nn.Embedding(self.block_size, embedding_dims)
        self.blocks = nn.Sequential(*[Block(embedding_dims, num_heads) for _ in range(num_blocks)])
        self.lm_head = nn.Linear(embedding_dims, vocab_size)

    def forward(self, idx, targets=None):
        """Compute logits and, when targets are given, the cross-entropy loss.

        Args:
            idx: integer tensor of token ids, shape ``(batch, time)`` with
                ``time <= block_size``.
            targets: optional integer tensor with the same shape as ``idx``.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            ``(batch, time, vocab_size)`` and ``loss`` is ``None``. With
            targets, ``logits`` is flattened to ``(batch * time, vocab_size)``
            and ``loss`` is a scalar tensor.

        Raises:
            ValueError: if the sequence is longer than ``block_size``.
        """
        loss = None
        seq_len = idx.shape[1]
        if seq_len > self.block_size:
            raise ValueError(
                f"sequence length {seq_len} exceeds the model context {self.block_size}"
            )
        token_embed = self.token_embedding_table(idx)
        # positions follow the actual input length and device, so inputs
        # shorter than the context window work as well
        positions = torch.arange(seq_len, device=idx.device)
        pos_embed = self.positional_embedding_table(positions)
        x = token_embed + pos_embed
        x = self.blocks(x)
        logits = self.lm_head(x)

        if targets is not None:
            # flatten using the tensor's own shape rather than the notebook globals
            logits = logits.view(-1, logits.shape[-1])
            targets = targets.view(-1)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling never needs gradients
    def generate(self, idx, max_new_tokens):
        """Append ``max_new_tokens`` sampled tokens to every row of ``idx``."""
        for _ in range(max_new_tokens):
            # feed at most the last block_size tokens to the model
            idx_cond = idx[:, -self.block_size:]
            logits, _ = self.forward(idx_cond, None)
            logits_for_last_time_step = logits[:, -1, :]
            probs = F.softmax(logits_for_last_time_step, dim=-1)
            # sample from a multinomial distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            # append to input
            idx = torch.cat([idx, idx_next], dim=1)

        return idx

    def generate_and_show(self, idx, max_new_tokens):
        """Generate tokens and decode every row with the module-level ``tokenizer``.

        Mirrors the helper of the same name on the other models in this notebook.
        """
        out = self.generate(idx, max_new_tokens)
        return [tokenizer.decode(x.tolist()) for x in out]



def train_model(model,
                optimizer,
                documents_tensor,
                batchsize,
                context,
                num_steps=None):
    """Run a simple training loop and return the trained model.

    Args:
        model: module whose call ``model(xb, yb)`` returns ``(logits, loss)``.
        optimizer: optimizer built over ``model``'s parameters.
        documents_tensor: token tensor handed to ``generate_batch``.
        batchsize: number of sequences per batch.
        context: number of tokens per sequence.
        num_steps: number of optimization steps. When ``None`` (the default)
            the module-level ``epochs`` value is used, as before.

    Returns:
        The same ``model`` object, updated in place.
    """
    total_steps = epochs if num_steps is None else num_steps
    # integer logging interval (about ten progress lines); a float modulo such
    # as `steps % (total / 10)` misfires when the total is not a multiple of 10
    log_every = max(1, total_steps // 10)

    for steps in range(total_steps):
        # a fresh random batch each step
        xb, yb = generate_batch(documents_tensor, batchsize, context)
        logits, loss = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
        if steps % log_every == 0:
            print(steps, loss.item())

    return model

def encode_input(input_string):
    """Turn a prompt string into a ``(1, length)`` tensor of token ids.

    The prompt is encoded with the module-level ``tokenizer``. Prompts shorter
    than the module-level ``context`` are left-padded with token id 0 up to
    ``context`` tokens; longer prompts are left unchanged. The result is
    placed on the module-level ``device``.
    """
    token_ids = tokenizer.encode(input_string)
    missing = context - len(token_ids)
    if missing > 0:
        # pad on the left so the prompt stays at the end of the window
        token_ids = [0] * missing + token_ids

    as_tensor = torch.tensor(token_ids, dtype=torch.long).to(device)
    return as_tensor.reshape(1, -1)


In [89]:
# Experiment settings. batchsize and context rebind the notebook-level globals
# that Head (mask size) and the attention models (positions, loss reshape) read,
# so they must be set before the model is built.
batchsize = 48
context = 32
n_heads =  8
embedding_dims = 32
lr = 1e-4

m_attention_v1 = BigramLanguageAttentionModel(vocab_size, embedding_dims, n_heads).to(device)
optimizer = torch.optim.Adam(m_attention_v1.parameters(), lr=lr)
# train_model reads this global as its number of training steps
epochs = int(1e4)
m_attention_v1 =  train_model(m_attention_v1, 
                                optimizer,
                                documents_tensor,
                                batchsize,
                                context)


0 4.105848789215088
1000 2.5986111164093018
2000 2.395012617111206
3000 2.3681530952453613
4000 2.274176836013794
5000 2.318077802658081
6000 2.2182867527008057
7000 2.299927234649658
8000 2.2414486408233643
9000 2.1372220516204834


In [90]:
input_string = "Jesus is the way"
# encode_input left-pads the prompt with token id 0 up to `context` tokens
sampel_input = encode_input(input_string)
# sample 100 new tokens after the padded prompt and pretty-print the decoded text
pprint(m_attention_v1.generate_and_show(sampel_input, 100))

['\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 'Jesus is the wayon tadesen Whoou hans shar kro, ancor: avin, theast!e mor st '
 'uforise fwpe wey gund pansuht o, athe t']


# Attention Model V2 

In [91]:
# Same setup as the first attention experiment; only the learning rate differs
# (1e-3 instead of 1e-4). batchsize and context rebind the notebook-level
# globals read by Head and the attention models, so they are set before the
# model is built.
batchsize = 48
context = 32
n_heads =  8
embedding_dims = 32
lr = 1e-3

m_attention_v2 = BigramLanguageAttentionModel(vocab_size, embedding_dims, n_heads).to(device)
optimizer = torch.optim.Adam(m_attention_v2.parameters(), lr=lr)
# train_model reads this global as its number of training steps
epochs = int(1e4)
m_attention_v2 =  train_model(m_attention_v2, 
                                optimizer,
                                documents_tensor,
                                batchsize,
                                context)

0 4.144779205322266
1000 2.1881182193756104
2000 2.0048105716705322
3000 1.8953332901000977
4000 1.7471176385879517
5000 1.7491258382797241
6000 1.7164682149887085
7000 1.6652072668075562
8000 1.6219449043273926
9000 1.657680869102478


In [92]:
input_string = "Jesus is the way"
# encode_input left-pads the prompt with token id 0 up to `context` tokens
sampel_input = encode_input(input_string)
# sample 100 new tokens after the padded prompt and pretty-print the decoded text
pprint(m_attention_v2.generate_and_show(sampel_input, 100))

['\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 'Jesus is the ways, lent of, somout of thou beed; and them oveth in shall any '
 'dom the of gand the LORD, and with flar']
