# Transformer Architecture

1. **Input Embedding**: Laymans language: It has a vector for each word in the vocabulary. These embeddings are learned as a part of the backpropgation algorithm this is the same as the **Output Embedding**
2. **Positional Encoding** : This helps in provide a sense of distance between the words in the dataset so that the parameters are learned quickly



Complete transformer architecture (let say there are 4 encoder layers)

`Inputs` -> `Embedding + Positional Embedding` -> `4 encoder layers`

`Outputs(Shifted to the right)` -> `4 Decoder layers` -> `Linear Layers` -> `Softmax` -> `Prob sampling and generatiion`


#### Multi-headed attention
- This is the most important part of the encoder
- We have a bunch of different 'heads' help to generate different perspective on the data provided. Each head has the same number of trainable parameters and help in making different sense of the input provided.
- We call it **Multi-headed attention** because there are a bunch of heads learning different semantic info from a unique perspective
- They have three components : `Key (K)`, `Values (V)` and `Pairs (P)`


The purpose of the encoder is to learn the present, past and future and put it into a vectore form for the decoder.

We use masked-attention in the decoder layer as we do not want to look influence our current output with the future outputs present

In [6]:
import torch
import torch.nn as nn
from torch.nn import functional as F
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
block_size = 8
batch_size = 4
max_iters = 1000
# eval_interval = 2500
learning_rate = 3e-4
eval_iters = 250
n_embed = 384
n_layer = 4

cpu


In [7]:
with open('wizard_of_oz.txt', 'r', encoding='utf-8') as f:
    text = f.read()
chars = sorted(set(text))
print(chars)
vocab_size = len(chars)

['\n', ' ', '!', '"', '&', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [8]:
string_to_int = { ch:i for i,ch in enumerate(chars) }
int_to_string = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

data = torch.tensor(encode(text), dtype=torch.long)

In [9]:
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

x, y = get_batch('train')
print('inputs:')
# print(x.shape)
print(x)
print('targets:')
print(y)

inputs:
tensor([[61, 70, 71, 72,  1, 38, 78,  1],
        [57,  1, 53,  1, 64, 61, 72, 72],
        [66, 56,  1, 72, 60, 53, 72,  1],
        [74, 57,  1, 54, 57, 57, 66,  1]])
targets:
tensor([[70, 71, 72,  1, 38, 78,  1, 54],
        [ 1, 53,  1, 64, 61, 72, 72, 64],
        [56,  1, 72, 60, 53, 72,  1, 63],
        [57,  1, 54, 57, 57, 66,  1, 72]])


In [10]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [6]:
class GPTLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        self.blocks = nn.Sequential(*[Block(n_embed,n_head = n_head) for _ in range(n_layers)])

        self.ln_f = nn.LayerNorm(n_embed)
        self.lm_head = nn.Linear(n_embed,vocab_size)


    def forward(self, index, targets=None):
        logits = self.token_embedding_table(index)


        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, index, max_new_tokens):
        # index is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self.forward(index)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1) # (B, T+1)
        return index

model = GPTLanguageModel(vocab_size)
m = model.to(device)

context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


'AE)7v;EnDQg0ZEqRIFsOIDs; C1Dwuop(
knj7m 6a4uObFAP;jsq!]K_d.pCkr7yano_1YPZ3)OUXRG-2wu'r3RAncAuo0:vpWz.yVQgvtdK"dDvvk]d?.)FsGbK9W4-)9E4v)REdF6;5[[t9wuO"MFC"6cxF,ua'h;5E(VgPGQi1mHZQgx,rf9(U
7yW.86(]kOMpeYG3S&XGp36pnX9TfJZOYFsTkMKO9x
h)v(M1Y 7._]j9.-s.3SeD?kgSouFl6--vuRI]_TdjZX[lGUF0HFsSpW-oIk8:-Q]T5BJ_tuaiN1 owJ_YGU&R93FJFMSljyhh)]?67bv4f!!!ruOjDn5CyjxWe6;6V4)5r7mD7Jqv.o45,?'ZHLa_V]d6CGw[)4(fi9Eno.nYQaEn(m&sqZ.oAFYPtGU':Wb9[wJpf5l&3gfojCU&kv_G1;7psMoG9YSQkZfxuu'E[Wx-sgDGqybroj]SGxj7up6OSmx).AKSnPQ


In [7]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_iters == 0:
        losses = estimate_loss()
        print(f"step: {iter}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

step: 0, train loss: 4.800, val loss: 4.797
step: 250, train loss: 4.725, val loss: 4.745
step: 500, train loss: 4.676, val loss: 4.681
step: 750, train loss: 4.611, val loss: 4.614
4.411067962646484
