<a href="https://colab.research.google.com/github/RNTUNOOB/Tiny_GPT/blob/main/Final_MyGPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-02-19 11:15:37--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-02-19 11:15:37 (101 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [2]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from tqdm import tqdm

In [3]:
# hyperparameter

blocksize = 64  # context length
batchsize = 64
max_iters = 2500
eval_interval = 500
eval_iter = 200
lr = 3e-3
n_embed = 384
n_layer = 6
n_heads = 6
dropout = 0.2

torch.manual_seed = 1337

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
with open('input.txt', 'r', encoding="utf-8") as f:
    text = f.read()

In [5]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
# this is character level tokenizor
stoi = {s: i for i, s in enumerate(chars)}
itos = {i: s for i, s in enumerate(chars)}
encode = lambda s: [stoi[i] for i in s]
decode = lambda l: "".join([itos[i] for i in l])

In [6]:
#test train split
data = torch.tensor(encode(text))
n = int(0.9 * len(data))
train = data[:n]
test = data[n:]

In [7]:
def get_batch(split=0):
    local_data = train if split == 0 else test
    ix = torch.randint(len(local_data) - blocksize, (batchsize,))
    ixb = torch.stack([local_data[i:i + blocksize] for i in ix])
    iyb = torch.stack([local_data[i + 1:i + blocksize + 1] for i in ix])
    ixb, iyb = ixb.to(device), iyb.to(device)
    return ixb, iyb

In [8]:
@torch.no_grad()
def estimate_loss():
    out = {}
    bi_model.eval()
    for split, split_name in enumerate(['train', 'val']):
        losses = torch.zeros(eval_iter)
        for k in range(eval_iter):
            X, Y = get_batch(split)
            logits, loss = bi_model(X, Y)
            losses[k] =  loss.item()
        out[split_name] = losses.mean()
    bi_model.train()
    return out

In [9]:
# implementing a simple one head of self attention

class Head(nn.Module):

  def __init__(self, head_size):
    super().__init__()
    self.key = nn.Linear(n_embed, head_size, bias = False)
    self.query = nn.Linear(n_embed, head_size, bias = False)
    self.value = nn.Linear(n_embed, head_size, bias = False)
    self.register_buffer('tril', torch.tril(torch.ones(blocksize, blocksize)))
    self.Dropout = nn.Dropout(dropout)

  def forward(self,x):
    B, T, C = x.shape
    k = self.key(x)
    q = self.query(x)
    v = self.value(x)

    wei = q @ k.transpose(-2,-1) * C ** -0.5  # (B, T, C) @ (B, C, T) --> (B, T, T)
    # wei.masked_fill(self.tril[:T, :T]==0, float('-inf'))     --------> Error to reember for life, fkn hell
    wei = wei.masked_fill(self.tril[:T, :T]==0, float('-inf'))
    wei = F.softmax(wei, dim=-1)  # (B, T, T)
    wei = self.Dropout(wei)

    out = wei @ v  # (B, T, T) @ (B, T, C) --> (B, T, C)

    return out

In [10]:
class mul_head(nn.Module):

  def __init__(self, n_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(n_heads)])
    self.proj = nn.Linear(n_heads * head_size, n_embed)
    self.Dropout = nn.Dropout(dropout)

  def forward(self, x):
    out = torch.cat([h(x) for h in self.heads], dim=-1)
    out = self.Dropout(self.proj(out))
    return out

In [11]:
class feed_forward(nn.Module):

  def __init__(self, n_embed):
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(n_embed, 4 * n_embed),  # here, according to paper, expected output is 4 times the input. so we nulitply it by 8
        nn.ReLU(),
        nn.Linear(4 * n_embed, n_embed),
        nn.Dropout(dropout),
    )

  def forward(self, x):
    return self.net(x)

In [12]:
class Block(nn.Module):

  def __init__(self, n_embed, n_heads):
    super().__init__()
    head_size = n_embed // n_heads
    self.sa = mul_head(n_heads, head_size)
    self.ffd = feed_forward(n_embed)
    self.ln1 = nn.LayerNorm(n_embed)
    self.ln2 = nn.LayerNorm(n_embed)

  def forward(self, x):
    x = x + self.sa(self.ln1(x))
    x = x + self.ffd(self.ln2(x))
    return x

In [13]:
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(blocksize, n_embed)
        self.block = nn.Sequential(*[Block(n_embed, n_heads = n_heads) for _ in range(n_layer)])
        self.lnf = nn.LayerNorm(n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, idx, targets=None):
        B,  T = idx.shape

        tok_emb = self.token_embedding_table(idx)  # (B, T, C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # (T, C)
        x = tok_emb + pos_emb
        x = self.block(x)
        x = self.lnf(x)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets == None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)

            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
          # as we have implemented pos embded. we cannot have idx greater than block size, so must crop it if it bigger
          idx_cond = idx[:, -blocksize:]

          logits, loss = self(idx_cond)
          logits = logits[:, -1, :]  # becomes (B, C)
          probs = F.softmax(logits, dim=-1)  # (B, C)
          idx_next = torch.multinomial(probs, num_samples=1)  # (B,1)
          idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)

        return idx

In [14]:
bi_model = BigramLanguageModel()
m = bi_model.to(device)
optimizer = torch.optim.AdamW(bi_model.parameters(), lr=lr)

In [15]:
for iter in tqdm(range(max_iters)):

    # every once in a while, we calculate loss
    if iter%eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss = {losses['train']:.4f}, val loss: {losses['val']:.4f}")

    # generating a sample batch
    xb, yb = get_batch()

    # evaluate the loss
    logits, loss = bi_model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

  0%|          | 0/2500 [00:00<?, ?it/s]

step 0: train loss = 4.2857, val loss: 4.2874


 20%|██        | 502/2500 [01:20<1:17:13,  2.32s/it]

step 500: train loss = 2.0725, val loss: 2.1294


 40%|████      | 1002/2500 [02:30<1:20:50,  3.24s/it]

step 1000: train loss = 1.8004, val loss: 1.9480


 60%|██████    | 1502/2500 [03:39<53:47,  3.23s/it]  

step 1500: train loss = 1.6568, val loss: 1.8277


 80%|████████  | 2002/2500 [04:49<26:58,  3.25s/it]

step 2000: train loss = 1.6105, val loss: 1.7944


100%|██████████| 2500/2500 [05:43<00:00,  7.28it/s]


In [16]:
context = torch.zeros((1,1), dtype=torch.long, device = device)
print(decode(bi_model.generate(context, max_new_tokens=500)[0].tolist()))


And my send officenton to plimage;
And with him the drangly sirth.
Towar
A thy hie the his wadam and by ot affices but fies,
Keepter.

Lord No him.

GLOUCESTER:
A too tell:
Auffer, the paint touck my crown wearied,
And be brave, her wrone that of your nor out knought him,
Peake the ever be to knight Richmes to else in't to bootning. And so n in est
That perpoke of himselfore, son-chorpards soul'stance,
As, but my more then mistrengues at one too there-tods Estabbedier's the pqueth,
And sweep a p
