<a href="https://colab.research.google.com/github/RNTUNOOB/Tiny_GPT/blob/main/MyGPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-02-18 17:25:53--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-02-18 17:25:53 (18.8 MB/s) - ‘input.txt’ saved [1115394/1115394]



# Version 1

<h1> Simple bigram language model </h1>

```
import torch
import torch.nn as nn
from torch.nn import functional as F
from tqdm import tqdm

with open('input.txt', 'r', encoding = "utf-8") as f:
  text = f.read()

len(text)
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(chars)
# Tokenization,

# Alternatives
# sentence level encoding
# subword encoding
# openai uses Tiktoken for tokenization


# this is character level tokenizor
stoi = {s : i for i,s in enumerate(chars)}
itos = {i : s for i,s in enumerate(chars)}
encode = lambda s : [stoi[i] for i in s]
decode = lambda l : "".join([itos[i] for i in l])
import torch
data = torch.tensor(encode(text))
data[:100]
n = int(0.9*len(data))
train = data[:n]
test = data[n:]
blocksize = 8 # context length
x = train[:blocksize]
y = test[1:blocksize + 1]
for i in range(1, blocksize):
  print(f"for context of [{train[:i]}], output is {y[i]}")
batchsize = 4
torch.manual_seed = 1337

def get_batch(split = 0):
  data = train if split == 0 else test
  ix = torch.randint(len(data)-blocksize, (batchsize,))
  x = torch.stack([data[i:i+blocksize] for i in ix])
  y = torch.stack([data[i+1:i+blocksize+1] for i in ix])

  return x,y

xb, yb = get_batch()
print(xb)
print(yb)
for i in range(1,blocksize):
  print(f"for context of [{xb[0][:i]}], output is {yb[0][i]}")
import torch.nn as nn
from torch.nn import functional as F
class BigramLanguageModel(nn.Module):

  def __init__(self, vocab_size):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

  def forward(self, idx, targets=None):

    logits = self.token_embedding_table(idx) # (B, T, C)
    if targets == None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)

      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    for _ in range(max_new_tokens):
      logits, loss = self(idx)
      logits = logits[:, -1, :] # becomes (B, C)
      probs = F.softmax(logits, dim = -1) # (B, C)
      idx_next = torch.multinomial(probs, num_samples = 1) # (B,1)
      idx = torch.cat((idx, idx_next), dim = 1) # (B, T+1)

    return idx
bi_model = BigramLanguageModel(vocab_size)
optimizer = torch.optim.AdamW(bi_model.parameters(),lr = 1e-3)
batch_size = 32 # for training
for _ in range(10000):
  X, Y = get_batch()
  logits, loss = bi_model(X,Y)
  optimizer.zero_grad(set_to_none = True)
  loss.backward()
  optimizer.step()


print(loss.item())
text = decode(bi_model.generate(idx = torch.zeros((1,1), dtype = torch.long), max_new_tokens= 100)[0].tolist())
print(text)
```



# Version 2

<h1> simple bigram language model with position embeddings </h1>

```
##    version with cuda option enabled

import torch
import torch.nn as nn
from torch.nn import functional as F
from tqdm import tqdm
# hyperparameter

blocksize = 8  # context length
batchsize = 32
max_iters = 3000
eval_interval = 300
eval_iter = 200
lr = 1e-2
n_embed = 32

torch.manual_seed = 1337

device = 'cuda' if torch.cuda.is_available() else 'cpu'
with open('input.txt', 'r', encoding="utf-8") as f:
    text = f.read()
chars = sorted(list(set(text)))
vocab_size = len(chars)
# this is character level tokenizor
stoi = {s: i for i, s in enumerate(chars)}
itos = {i: s for i, s in enumerate(chars)}
encode = lambda s: [stoi[i] for i in s]
decode = lambda l: "".join([itos[i] for i in l])
#test train split
data = torch.tensor(encode(text))
n = int(0.9 * len(data))
train = data[:n]
test = data[n:]
def get_batch(split=0):
    local_data = train if split == 0 else test
    ix = torch.randint(len(local_data) - blocksize, (batchsize,))
    ixb = torch.stack([local_data[i:i + blocksize] for i in ix])
    iyb = torch.stack([local_data[i + 1:i + blocksize + 1] for i in ix])
    ixb, iyb = ixb.to(device), iyb.to(device)
    return ixb, iyb
@torch.no_grad()
def estimate_loss():
    out = {}
    bi_model.eval()
    for split, split_name in enumerate(['train', 'val']):
        losses = torch.zeros(eval_iter)
        for k in range(eval_iter):
            X, Y = get_batch(split)
            logits, loss = bi_model(X, Y)
            losses[k] =  loss.item()
        out[split_name] = losses.mean()
    bi_model.train()
    return out
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(blocksize, n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, idx, targets=None):
        B,  T = idx.shape
        
        tok_emb = self.token_embedding_table(idx)  # (B, T, C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # (T, C)
        x = tok_emb + pos_emb
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)

            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            logits = logits[:, -1, :]  # becomes (B, C)
            probs = F.softmax(logits, dim=-1)  # (B, C)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B,1)
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)

        return idx
bi_model = BigramLanguageModel()
m = bi_model.to(device)
optimizer = torch.optim.AdamW(bi_model.parameters(), lr=lr)
for iter in tqdm(range(max_iters)):

    # every once in a while, we calculate loss
    if iter%eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss = {losses['train']:.4f}, val loss: {losses['val']:.4f}")

    # generating a sample batch
    xb, yb = get_batch()

    # evaluate the loss
    logits, loss = bi_model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
context = torch.zeros((1,1), dtype=torch.long, device = device)
print(decode(bi_model.generate(context, max_new_tokens=500)[0].tolist()))
B,T,C = 4, 8, 2   # Batch, time, Channels
X = torch.randn(B,T,C)   # (B, T, C)

xbow = torch.zeros(B,T,C)
for b in range(B):
  for t in range(T):
    xprev = X[b,:t+1]   # (T, C)
    xbow[b,t] = torch.mean(xprev, 0)
# version 1 :

wei = torch.tril(torch.ones(T,T))   # (T,T)
wei = wei / wei.sum(1, keepdim=True)
xbow2 = wei @ X  # (B, T, T) @ (B, T, C) -----> (B, T, C)    ! here B in wei is added by torch to handle matric batch multiplication

torch.allclose(xbow, xbow2)
# verion 2

tril = torch.tril(torch.ones(T,T))
wei = torch.zeros(T,T)    #  we have created this for sake of calculation,
                          #  but imagine this represnts actual weights which gives idea of how much interesting does this tokem fimd other previous tokens
wei = wei.masked_fill(tril==0, float('-inf'))   # here we tell model that you can talk to future tokens
wei = F.softmax(wei, dim=1)

xbow3 = wei @ X  # here we aggregate their values dependening on their interest

torch.allclose(xbow, xbow3)
```



## Learning Wei implementation



```
B,T,C = 4, 8, 32   # Batch, time, Channels
X = torch.randn(B,T,C)   # (B, T, C)

xbow = torch.zeros(B,T,C)
for b in range(B):
  for t in range(T):
    xprev = X[b,:t+1]   # (T, C)
    xbow[b,t] = torch.mean(xprev, 0)
```


version 1

```
wei = torch.tril(torch.ones(T,T))   # (T,T)
wei = wei / wei.sum(1, keepdim=True)
xbow2 = wei @ X  # (B, T, T) @ (B, T, C) -----> (B, T, C)    ! here B in wei is added by torch to handle matric batch multiplication

torch.allclose(xbow, xbow2)
```

version 2:

```
# verion 2

tril = torch.tril(torch.ones(T,T))
wei = torch.zeros(T,T)    #  we have created this for sake of calculation,
                          #  but imagine this represnts actual weights which gives idea of how much interesting does this tokem fimd other previous tokens
wei = wei.masked_fill(tril==0, float('-inf'))   # here we tell model that you can talk to future tokens
wei = F.softmax(wei, dim=1)

xbow2 = wei @ X  # here we aggregate their values dependening on their interest

torch.allclose(xbow, xbow3)
```

version 3:



```
# version 3

head_size = 16
lm_key = nn.Linear(C, head_size, bias=False)
lm_query = nn.Linear(C, head_size, bias = False)

lm_value = nn.Linear(C, head_size, bias = False)

key = lm_key(X) # (B, T, 16)
query = lm_query(X)  # (B, T, 16)
### -->           wei = query @ key.transpose(-2, -1)  # (B, T, T)

# this creates a huge variance, eg.,

# print(f'key : {key.var()}, query : {query.var()}, wei : {wei.var()}')

'''
It is suggested in the attention is everything, to avoid this high variance, we divide wei by sq rt of head_size
wei needs to be fairly defused according to paper. if wei has high variance then softmax result of it will be too sharp, narrow and pointy
which is basically means it is just looking at single node
'''

wei = query @ key.transpose(-2, -1) * head_size ** -0.5

tril = torch.tril(torch.ones(T,T))
# wei = torch.zeros(T,T)   

wei = wei.masked_fill(tril==0, float('-inf'))  
wei = F.softmax(wei, dim=-1)

value = lm_value(X)

# out = wei @ X
out = wei @ value # (B, T, 16)

out.shape
```

# version 3

<h1> self attention </h1>

```
import torch
import torch.nn as nn
from torch.nn import functional as F
from tqdm import tqdm
# hyperparameter

blocksize = 8  # context length
batchsize = 32
max_iters = 5000
eval_interval = 500
eval_iter = 200
lr = 1e-3
n_embed = 32

torch.manual_seed = 1337

device = 'cuda' if torch.cuda.is_available() else 'cpu'
with open('input.txt', 'r', encoding="utf-8") as f:
    text = f.read()
chars = sorted(list(set(text)))
vocab_size = len(chars)
# this is character level tokenizor
stoi = {s: i for i, s in enumerate(chars)}
itos = {i: s for i, s in enumerate(chars)}
encode = lambda s: [stoi[i] for i in s]
decode = lambda l: "".join([itos[i] for i in l])
#test train split
data = torch.tensor(encode(text))
n = int(0.9 * len(data))
train = data[:n]
test = data[n:]
def get_batch(split=0):
    local_data = train if split == 0 else test
    ix = torch.randint(len(local_data) - blocksize, (batchsize,))
    ixb = torch.stack([local_data[i:i + blocksize] for i in ix])
    iyb = torch.stack([local_data[i + 1:i + blocksize + 1] for i in ix])
    ixb, iyb = ixb.to(device), iyb.to(device)
    return ixb, iyb
@torch.no_grad()
def estimate_loss():
    out = {}
    bi_model.eval()
    for split, split_name in enumerate(['train', 'val']):
        losses = torch.zeros(eval_iter)
        for k in range(eval_iter):
            X, Y = get_batch(split)
            logits, loss = bi_model(X, Y)
            losses[k] =  loss.item()
        out[split_name] = losses.mean()
    bi_model.train()
    return out
# implementing a simple one head of self attention

class Head(nn.Module):

  def __init__(self, head_size):
    super().__init__()
    self.key = nn.Linear(n_embed, head_size, bias = False)
    self.query = nn.Linear(n_embed, head_size, bias = False)
    self.value = nn.Linear(n_embed, head_size, bias = False)
    self.register_buffer('tril', torch.tril(torch.ones(blocksize, blocksize)))

  def forward(self,x):
    B, T, C = x.shape
    k = self.key(x)
    q = self.query(x)
    v = self.value(x)

    wei = q @ k.transpose(-2,-1) * C ** -0.5  # (B, T, C) @ (B, C, T) --> (B, T, T)
    # wei.masked_fill(self.tril[:T, :T]==0, float('-inf'))     --------> Error to reember for life, fkn hell
    wei = wei.masked_fill(self.tril[:T, :T]==0, float('-inf'))
    wei = F.softmax(wei, dim=-1)  # (B, T, T)

    out = wei @ v  # (B, T, T) @ (B, T, C) --> (B, T, C)

    return out
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(blocksize, n_embed)
        self.sa_head = Head(n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, idx, targets=None):
        B,  T = idx.shape

        tok_emb = self.token_embedding_table(idx)  # (B, T, C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # (T, C)
        x = tok_emb + pos_emb
        x = self.sa_head(x)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets == None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)

            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
          # as we have implemented pos embded. we cannot have idx greater than block size, so must crop it if it bigger
          idx_cond = idx[:, -blocksize:]

          logits, loss = self(idx_cond)
          logits = logits[:, -1, :]  # becomes (B, C)
          probs = F.softmax(logits, dim=-1)  # (B, C)
          idx_next = torch.multinomial(probs, num_samples=1)  # (B,1)
          idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)

        return idx
bi_model = BigramLanguageModel()
m = bi_model.to(device)
optimizer = torch.optim.AdamW(bi_model.parameters(), lr=lr)
for iter in tqdm(range(max_iters)):

    # every once in a while, we calculate loss
    if iter%eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss = {losses['train']:.4f}, val loss: {losses['val']:.4f}")

    # generating a sample batch
    xb, yb = get_batch()

    # evaluate the loss
    logits, loss = bi_model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
context = torch.zeros((1,1), dtype=torch.long, device = device)
print(decode(bi_model.generate(context, max_new_tokens=500)[0].tolist()))
```



# version 4

Multi headed self attention



```
import torch
import torch.nn as nn
from torch.nn import functional as F
from tqdm import tqdm
# hyperparameter

blocksize = 8  # context length
batchsize = 32
max_iters = 5000
eval_interval = 500
eval_iter = 200
lr = 1e-3
n_embed = 32

torch.manual_seed = 1337

device = 'cuda' if torch.cuda.is_available() else 'cpu'
with open('input.txt', 'r', encoding="utf-8") as f:
    text = f.read()
chars = sorted(list(set(text)))
vocab_size = len(chars)
# this is character level tokenizor
stoi = {s: i for i, s in enumerate(chars)}
itos = {i: s for i, s in enumerate(chars)}
encode = lambda s: [stoi[i] for i in s]
decode = lambda l: "".join([itos[i] for i in l])
#test train split
data = torch.tensor(encode(text))
n = int(0.9 * len(data))
train = data[:n]
test = data[n:]
def get_batch(split=0):
    local_data = train if split == 0 else test
    ix = torch.randint(len(local_data) - blocksize, (batchsize,))
    ixb = torch.stack([local_data[i:i + blocksize] for i in ix])
    iyb = torch.stack([local_data[i + 1:i + blocksize + 1] for i in ix])
    ixb, iyb = ixb.to(device), iyb.to(device)
    return ixb, iyb
@torch.no_grad()
def estimate_loss():
    out = {}
    bi_model.eval()
    for split, split_name in enumerate(['train', 'val']):
        losses = torch.zeros(eval_iter)
        for k in range(eval_iter):
            X, Y = get_batch(split)
            logits, loss = bi_model(X, Y)
            losses[k] =  loss.item()
        out[split_name] = losses.mean()
    bi_model.train()
    return out
# implementing a simple one head of self attention

class Head(nn.Module):

  def __init__(self, head_size):
    super().__init__()
    self.key = nn.Linear(n_embed, head_size, bias = False)
    self.query = nn.Linear(n_embed, head_size, bias = False)
    self.value = nn.Linear(n_embed, head_size, bias = False)
    self.register_buffer('tril', torch.tril(torch.ones(blocksize, blocksize)))

  def forward(self,x):
    B, T, C = x.shape
    k = self.key(x)
    q = self.query(x)
    v = self.value(x)

    wei = q @ k.transpose(-2,-1) * C ** -0.5  # (B, T, C) @ (B, C, T) --> (B, T, T)
    # wei.masked_fill(self.tril[:T, :T]==0, float('-inf'))     --------> Error to reember for life, fkn hell
    wei = wei.masked_fill(self.tril[:T, :T]==0, float('-inf'))
    wei = F.softmax(wei, dim=-1)  # (B, T, T)

    out = wei @ v  # (B, T, T) @ (B, T, C) --> (B, T, C)

    return out
class mul_head(nn.Module):

  def __init__(self, n_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(n_heads)])

  def forward(self, x):
    return torch.cat([h(x) for h in self.heads], dim=-1)
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(blocksize, n_embed)
        self.sa_heads = mul_head(4, n_embed//4)
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, idx, targets=None):
        B,  T = idx.shape

        tok_emb = self.token_embedding_table(idx)  # (B, T, C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # (T, C)
        x = tok_emb + pos_emb
        x = self.sa_heads(x)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets == None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)

            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
          # as we have implemented pos embded. we cannot have idx greater than block size, so must crop it if it bigger
          idx_cond = idx[:, -blocksize:]

          logits, loss = self(idx_cond)
          logits = logits[:, -1, :]  # becomes (B, C)
          probs = F.softmax(logits, dim=-1)  # (B, C)
          idx_next = torch.multinomial(probs, num_samples=1)  # (B,1)
          idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)

        return idx
bi_model = BigramLanguageModel()
m = bi_model.to(device)
optimizer = torch.optim.AdamW(bi_model.parameters(), lr=lr)
for iter in tqdm(range(max_iters)):

    # every once in a while, we calculate loss
    if iter%eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss = {losses['train']:.4f}, val loss: {losses['val']:.4f}")

    # generating a sample batch
    xb, yb = get_batch()

    # evaluate the loss
    logits, loss = bi_model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
context = torch.zeros((1,1), dtype=torch.long, device = device)
print(decode(bi_model.generate(context, max_new_tokens=500)[0].tolist()))
```



# Version 5

Mulit headed self attention with feed forward


```
import torch
import torch.nn as nn
from torch.nn import functional as F
from tqdm import tqdm
# hyperparameter

blocksize = 8  # context length
batchsize = 32
max_iters = 5000
eval_interval = 500
eval_iter = 200
lr = 1e-3
n_embed = 32

torch.manual_seed = 1337

device = 'cuda' if torch.cuda.is_available() else 'cpu'
with open('input.txt', 'r', encoding="utf-8") as f:
    text = f.read()
chars = sorted(list(set(text)))
vocab_size = len(chars)
# this is character level tokenizor
stoi = {s: i for i, s in enumerate(chars)}
itos = {i: s for i, s in enumerate(chars)}
encode = lambda s: [stoi[i] for i in s]
decode = lambda l: "".join([itos[i] for i in l])
#test train split
data = torch.tensor(encode(text))
n = int(0.9 * len(data))
train = data[:n]
test = data[n:]
def get_batch(split=0):
    local_data = train if split == 0 else test
    ix = torch.randint(len(local_data) - blocksize, (batchsize,))
    ixb = torch.stack([local_data[i:i + blocksize] for i in ix])
    iyb = torch.stack([local_data[i + 1:i + blocksize + 1] for i in ix])
    ixb, iyb = ixb.to(device), iyb.to(device)
    return ixb, iyb
@torch.no_grad()
def estimate_loss():
    out = {}
    bi_model.eval()
    for split, split_name in enumerate(['train', 'val']):
        losses = torch.zeros(eval_iter)
        for k in range(eval_iter):
            X, Y = get_batch(split)
            logits, loss = bi_model(X, Y)
            losses[k] =  loss.item()
        out[split_name] = losses.mean()
    bi_model.train()
    return out
# implementing a simple one head of self attention

class Head(nn.Module):

  def __init__(self, head_size):
    super().__init__()
    self.key = nn.Linear(n_embed, head_size, bias = False)
    self.query = nn.Linear(n_embed, head_size, bias = False)
    self.value = nn.Linear(n_embed, head_size, bias = False)
    self.register_buffer('tril', torch.tril(torch.ones(blocksize, blocksize)))

  def forward(self,x):
    B, T, C = x.shape
    k = self.key(x)
    q = self.query(x)
    v = self.value(x)

    wei = q @ k.transpose(-2,-1) * C ** -0.5  # (B, T, C) @ (B, C, T) --> (B, T, T)
    # wei.masked_fill(self.tril[:T, :T]==0, float('-inf'))     --------> Error to reember for life, fkn hell
    wei = wei.masked_fill(self.tril[:T, :T]==0, float('-inf'))
    wei = F.softmax(wei, dim=-1)  # (B, T, T)

    out = wei @ v  # (B, T, T) @ (B, T, C) --> (B, T, C)

    return out
class mul_head(nn.Module):

  def __init__(self, n_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(n_heads)])

  def forward(self, x):
    return torch.cat([h(x) for h in self.heads], dim=-1)
class feed_forward(nn.Module):

  def __init__(self, n_embed):
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(n_embed, n_embed),
        nn.ReLU()
    )

  def forward(self, x):
    return self.net(x)
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(blocksize, n_embed)
        self.sa_heads = mul_head(4, n_embed//4)
        self.ffd = feed_forward(n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, idx, targets=None):
        B,  T = idx.shape

        tok_emb = self.token_embedding_table(idx)  # (B, T, C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # (T, C)
        x = tok_emb + pos_emb
        x = self.sa_heads(x)
        x = self.ffd(x)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets == None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)

            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
          # as we have implemented pos embded. we cannot have idx greater than block size, so must crop it if it bigger
          idx_cond = idx[:, -blocksize:]

          logits, loss = self(idx_cond)
          logits = logits[:, -1, :]  # becomes (B, C)
          probs = F.softmax(logits, dim=-1)  # (B, C)
          idx_next = torch.multinomial(probs, num_samples=1)  # (B,1)
          idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)

        return idx
bi_model = BigramLanguageModel()
m = bi_model.to(device)
optimizer = torch.optim.AdamW(bi_model.parameters(), lr=lr)
for iter in tqdm(range(max_iters)):

    # every once in a while, we calculate loss
    if iter%eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss = {losses['train']:.4f}, val loss: {losses['val']:.4f}")

    # generating a sample batch
    xb, yb = get_batch()

    # evaluate the loss
    logits, loss = bi_model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
context = torch.zeros((1,1), dtype=torch.long, device = device)
print(decode(bi_model.generate(context, max_new_tokens=500)[0].tolist()))
```



# version  6

now we would like to combine self attention and feed forward which translates to we will combine communication and compute <br> <br>
communication is done in self attention layers where each token passes its query to every other token in a batch.<br>
Compute is done in feed forward layer where each token thinks about whats required and whats present <br><br>
Now block class which embodies self attention and feed forward makes neural network more dense and adds depth to it. This creates an optimization harder.
Therefore, we use technique from research paper. we skip blocks and it is called <b> residual connections .</b> <br>



```
import torch
import torch.nn as nn
from torch.nn import functional as F
from tqdm import tqdm
# hyperparameter

blocksize = 8  # context length
batchsize = 32
max_iters = 5000
eval_interval = 500
eval_iter = 200
lr = 1e-3
n_embed = 32

torch.manual_seed = 1337

device = 'cuda' if torch.cuda.is_available() else 'cpu'
with open('input.txt', 'r', encoding="utf-8") as f:
    text = f.read()
chars = sorted(list(set(text)))
vocab_size = len(chars)
# this is character level tokenizor
stoi = {s: i for i, s in enumerate(chars)}
itos = {i: s for i, s in enumerate(chars)}
encode = lambda s: [stoi[i] for i in s]
decode = lambda l: "".join([itos[i] for i in l])
#test train split
data = torch.tensor(encode(text))
n = int(0.9 * len(data))
train = data[:n]
test = data[n:]
def get_batch(split=0):
    local_data = train if split == 0 else test
    ix = torch.randint(len(local_data) - blocksize, (batchsize,))
    ixb = torch.stack([local_data[i:i + blocksize] for i in ix])
    iyb = torch.stack([local_data[i + 1:i + blocksize + 1] for i in ix])
    ixb, iyb = ixb.to(device), iyb.to(device)
    return ixb, iyb
@torch.no_grad()
def estimate_loss():
    out = {}
    bi_model.eval()
    for split, split_name in enumerate(['train', 'val']):
        losses = torch.zeros(eval_iter)
        for k in range(eval_iter):
            X, Y = get_batch(split)
            logits, loss = bi_model(X, Y)
            losses[k] =  loss.item()
        out[split_name] = losses.mean()
    bi_model.train()
    return out
# implementing a simple one head of self attention

class Head(nn.Module):

  def __init__(self, head_size):
    super().__init__()
    self.key = nn.Linear(n_embed, head_size, bias = False)
    self.query = nn.Linear(n_embed, head_size, bias = False)
    self.value = nn.Linear(n_embed, head_size, bias = False)
    self.register_buffer('tril', torch.tril(torch.ones(blocksize, blocksize)))

  def forward(self,x):
    B, T, C = x.shape
    k = self.key(x)
    q = self.query(x)
    v = self.value(x)

    wei = q @ k.transpose(-2,-1) * C ** -0.5  # (B, T, C) @ (B, C, T) --> (B, T, T)
    # wei.masked_fill(self.tril[:T, :T]==0, float('-inf'))     --------> Error to reember for life, fkn hell
    wei = wei.masked_fill(self.tril[:T, :T]==0, float('-inf'))
    wei = F.softmax(wei, dim=-1)  # (B, T, T)

    out = wei @ v  # (B, T, T) @ (B, T, C) --> (B, T, C)

    return out
class mul_head(nn.Module):

  def __init__(self, n_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(n_heads)])
    self.proj = nn.Linear(n_heads * head_size, n_embed)

  def forward(self, x):
    out = torch.cat([h(x) for h in self.heads], dim=-1)
    return self.proj(out)
class feed_forward(nn.Module):

  def __init__(self, n_embed):
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(n_embed, 4 * n_embed),  # here, according to paper, expected output is 4 times the input. so we nulitply it by 8
        nn.ReLU(),
        nn.Linear(4 * n_embed, n_embed)
    )

  def forward(self, x):
    return self.net(x)
class Block(nn.Module):

  def __init__(self, n_embed, n_heads):
    super().__init__()
    head_size = n_embed // n_heads
    self.sa = mul_head(n_heads, head_size)
    self.ffd = feed_forward(n_embed)

  def forward(self, x):
    x = x + self.sa(x)
    x = x + self.ffd(x)
    return x
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(blocksize, n_embed)
        self.block = nn.Sequential(
            Block(n_embed, n_heads=4),
            Block(n_embed, n_heads=4),
            Block(n_embed, n_heads=4)
        )
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, idx, targets=None):
        B,  T = idx.shape

        tok_emb = self.token_embedding_table(idx)  # (B, T, C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # (T, C)
        x = tok_emb + pos_emb
        x = self.block(x)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets == None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)

            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
          # as we have implemented pos embded. we cannot have idx greater than block size, so must crop it if it bigger
          idx_cond = idx[:, -blocksize:]

          logits, loss = self(idx_cond)
          logits = logits[:, -1, :]  # becomes (B, C)
          probs = F.softmax(logits, dim=-1)  # (B, C)
          idx_next = torch.multinomial(probs, num_samples=1)  # (B,1)
          idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)

        return idx
bi_model = BigramLanguageModel()
m = bi_model.to(device)
optimizer = torch.optim.AdamW(bi_model.parameters(), lr=lr)
for iter in tqdm(range(max_iters)):

    # every once in a while, we calculate loss
    if iter%eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss = {losses['train']:.4f}, val loss: {losses['val']:.4f}")

    # generating a sample batch
    xb, yb = get_batch()

    # evaluate the loss
    logits, loss = bi_model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
context = torch.zeros((1,1), dtype=torch.long, device = device)
print(decode(bi_model.generate(context, max_new_tokens=500)[0].tolist()))
```



# Version 7

Now we will add layerNorm. It normalizes each row in out batch, so normalizes each token. i.e. it make such that mean of each row is 0 and has std var of 1
<br> <br>
eg.

```
class layerNorm:

  def __init__(self, dim, eps=1e-5, momentum = 0.5):
    self.eps = eps
    self.gamma = torch.ones(dim)
    self.beta = torch.ones(dim)

  def __call__(self):
    xmean = x.mean(1, keepdim=True)
    xvar = x.var(1, keepdim= True)
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
    self.out = self.gamma * xhat + self.beta
    return self.out

  def parameters(self):
    return [self.gamma, self.beta]
```

<br> <br>

according to paper, normalization is done after transformation i.e. after attention and ffd and all that. But in recent time, normalization is done before transformation <br> <br>

we also implement dropout here. for regularization. This randomaly shuts down some nodes which acts as a regularization. <br>

In [22]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from tqdm import tqdm

In [23]:
# hyperparameter

blocksize = 32  # context length
batchsize = 64
max_iters = 5000
eval_interval = 500
eval_iter = 200
lr = 3e-3
n_embed = 384
n_layer = 6
n_heads = 6
dropout = 0.2

torch.manual_seed = 1337

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [24]:
with open('input.txt', 'r', encoding="utf-8") as f:
    text = f.read()

In [25]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
# this is character level tokenizor
stoi = {s: i for i, s in enumerate(chars)}
itos = {i: s for i, s in enumerate(chars)}
encode = lambda s: [stoi[i] for i in s]
decode = lambda l: "".join([itos[i] for i in l])

In [26]:
#test train split
data = torch.tensor(encode(text))
n = int(0.9 * len(data))
train = data[:n]
test = data[n:]

In [27]:
def get_batch(split=0):
    local_data = train if split == 0 else test
    ix = torch.randint(len(local_data) - blocksize, (batchsize,))
    ixb = torch.stack([local_data[i:i + blocksize] for i in ix])
    iyb = torch.stack([local_data[i + 1:i + blocksize + 1] for i in ix])
    ixb, iyb = ixb.to(device), iyb.to(device)
    return ixb, iyb

In [28]:
@torch.no_grad()
def estimate_loss():
    out = {}
    bi_model.eval()
    for split, split_name in enumerate(['train', 'val']):
        losses = torch.zeros(eval_iter)
        for k in range(eval_iter):
            X, Y = get_batch(split)
            logits, loss = bi_model(X, Y)
            losses[k] =  loss.item()
        out[split_name] = losses.mean()
    bi_model.train()
    return out

In [29]:
# implementing a simple one head of self attention

class Head(nn.Module):

  def __init__(self, head_size):
    super().__init__()
    self.key = nn.Linear(n_embed, head_size, bias = False)
    self.query = nn.Linear(n_embed, head_size, bias = False)
    self.value = nn.Linear(n_embed, head_size, bias = False)
    self.register_buffer('tril', torch.tril(torch.ones(blocksize, blocksize)))
    self.Dropout = nn.Dropout(dropout)

  def forward(self,x):
    B, T, C = x.shape
    k = self.key(x)
    q = self.query(x)
    v = self.value(x)

    wei = q @ k.transpose(-2,-1) * C ** -0.5  # (B, T, C) @ (B, C, T) --> (B, T, T)
    # wei.masked_fill(self.tril[:T, :T]==0, float('-inf'))     --------> Error to reember for life, fkn hell
    wei = wei.masked_fill(self.tril[:T, :T]==0, float('-inf'))
    wei = F.softmax(wei, dim=-1)  # (B, T, T)
    wei = self.Dropout(wei)

    out = wei @ v  # (B, T, T) @ (B, T, C) --> (B, T, C)

    return out

In [30]:
class mul_head(nn.Module):

  def __init__(self, n_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(n_heads)])
    self.proj = nn.Linear(n_heads * head_size, n_embed)
    self.Dropout = nn.Dropout(dropout)

  def forward(self, x):
    out = torch.cat([h(x) for h in self.heads], dim=-1)
    out = self.Dropout(self.proj(out))
    return out

In [31]:
class feed_forward(nn.Module):

  def __init__(self, n_embed):
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(n_embed, 4 * n_embed),  # here, according to paper, expected output is 4 times the input. so we nulitply it by 8
        nn.ReLU(),
        nn.Linear(4 * n_embed, n_embed),
        nn.Dropout(dropout),
    )

  def forward(self, x):
    return self.net(x)

In [32]:
class Block(nn.Module):

  def __init__(self, n_embed, n_heads):
    super().__init__()
    head_size = n_embed // n_heads
    self.sa = mul_head(n_heads, head_size)
    self.ffd = feed_forward(n_embed)
    self.ln1 = nn.LayerNorm(n_embed)
    self.ln2 = nn.LayerNorm(n_embed)

  def forward(self, x):
    x = x + self.sa(self.ln1(x))
    x = x + self.ffd(self.ln2(x))
    return x

In [35]:
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(blocksize, n_embed)
        self.block = nn.Sequential(*[Block(n_embed, n_heads = n_heads) for _ in range(n_layer)])
        self.lnf = nn.LayerNorm(n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, idx, targets=None):
        B,  T = idx.shape

        tok_emb = self.token_embedding_table(idx)  # (B, T, C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # (T, C)
        x = tok_emb + pos_emb
        x = self.block(x)
        x = self.lnf(x)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets == None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)

            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
          # as we have implemented pos embded. we cannot have idx greater than block size, so must crop it if it bigger
          idx_cond = idx[:, -blocksize:]

          logits, loss = self(idx_cond)
          logits = logits[:, -1, :]  # becomes (B, C)
          probs = F.softmax(logits, dim=-1)  # (B, C)
          idx_next = torch.multinomial(probs, num_samples=1)  # (B,1)
          idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)

        return idx

In [36]:
bi_model = BigramLanguageModel()
m = bi_model.to(device)
optimizer = torch.optim.AdamW(bi_model.parameters(), lr=lr)

In [37]:
for iter in tqdm(range(max_iters)):

    # every once in a while, we calculate loss
    if iter%eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss = {losses['train']:.4f}, val loss: {losses['val']:.4f}")

    # generating a sample batch
    xb, yb = get_batch()

    # evaluate the loss
    logits, loss = bi_model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

  0%|          | 1/5000 [00:07<10:45:21,  7.75s/it]

step 0: train loss = 4.3596, val loss: 4.3548


 10%|█         | 503/5000 [00:45<1:07:41,  1.11it/s]

step 500: train loss = 2.1801, val loss: 2.2176


 20%|██        | 1003/5000 [01:23<59:16,  1.12it/s]  

step 1000: train loss = 1.9614, val loss: 2.0595


 30%|███       | 1503/5000 [02:00<51:54,  1.12it/s]  

step 1500: train loss = 1.9355, val loss: 2.0466


 40%|████      | 2003/5000 [02:38<44:18,  1.13it/s]  

step 2000: train loss = 1.9131, val loss: 2.0263


 50%|█████     | 2503/5000 [03:15<36:59,  1.13it/s]

step 2500: train loss = 1.9902, val loss: 2.0934


 60%|██████    | 3003/5000 [03:53<28:29,  1.17it/s]

step 3000: train loss = 2.2019, val loss: 2.2608


 70%|███████   | 3503/5000 [04:30<21:19,  1.17it/s]

step 3500: train loss = 2.2439, val loss: 2.2910


 80%|████████  | 4001/5000 [05:08<20:29,  1.23s/it]

step 4000: train loss = 2.1627, val loss: 2.2303


 90%|█████████ | 4503/5000 [05:45<07:21,  1.12it/s]

step 4500: train loss = 2.1641, val loss: 2.2254


100%|██████████| 5000/5000 [06:14<00:00, 13.34it/s]


In [38]:
context = torch.zeros((1,1), dtype=torch.long, device = device)
print(decode(bi_model.generate(context, max_new_tokens=500)[0].tolist()))


Wote heriu
hhyb hy bi foa nowoorvyetrcks,
A
tabeesath Iis hise a sha'
And Rhay fret walave ast, spotenichapsscroungelhoe wh tisatelk inot gled arathy bee he, or theessearce e E nesea ond saiverd nor coooryte,
Chim u I ce heacer prarpor'hy frot pe cal wche 
Thert watdo afthavll iso ud you
Tiar gansodtowpynsndem' an tr tever,
Thasdoska anto-bho l'd Mack, hace oferdeere I kiors
Ther be forncutthought intl ouboterfee aol alcon eife bo precrom chol'd, Edess lemend,
EO,d nown kloogndwire betimeath ed 
