<a href="https://colab.research.google.com/github/Trickshotblaster/nn-practices/blob/main/TransformerV4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [43]:
!pip install tiktoken



In [44]:
import tiktoken
enc = tiktoken.encoding_for_model("gpt2")
print(enc.n_vocab)
enc.decode(enc.encode("Hello world!"))

50257


'Hello world!'

In [45]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', 'r', encoding='utf-8') as f: # input.txt
    text = f.read()
print(text[:100])

--2024-06-11 21:49:35--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.1’


2024-06-11 21:49:35 (26.5 MB/s) - ‘input.txt.1’ saved [1115394/1115394]

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [46]:
train_amount = 0.95
idx = int(train_amount * len(text))
train_text = text[:idx]
val_text = text[idx:]

In [47]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [48]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [49]:
class DataLoader:
  def __init__(self, text, batch_size, block_size):
    self.text = torch.tensor(enc.encode(text)).to(device)
    self.batch_size = batch_size
    self.block_size = block_size
    self.current_pos = 0
  def steps_per_epoch(self):
    return len(self.text) // (self.batch_size * self.block_size)
  def next(self):
    if self.current_pos + self.batch_size * self.block_size + 1 >= len(self.text):
      self.current_pos = 0
    buf = self.text[self.current_pos:self.current_pos + self.batch_size * self.block_size + 1]
    ins = buf[:-1].view(self.batch_size, self.block_size)
    tgts = buf[1:].view(self.batch_size, self.block_size)
    self.current_pos += self.batch_size * self.block_size + 1
    return ins, tgts
dl = DataLoader(train_text, 4, 8)
dl.next()

(tensor([[ 5962, 22307,    25,   198,  8421,   356,  5120,   597],
         [ 2252,    11,  3285,   502,  2740,    13,   198,   198],
         [ 3237,    25,   198,  5248,   461,    11,  2740,    13],
         [  198,   198,  5962, 22307,    25,   198,  1639,   389]],
        device='cuda:0'),
 tensor([[22307,    25,   198,  8421,   356,  5120,   597,  2252],
         [   11,  3285,   502,  2740,    13,   198,   198,  3237],
         [   25,   198,  5248,   461,    11,  2740,    13,   198],
         [  198,  5962, 22307,    25,   198,  1639,   389,   477]],
        device='cuda:0'))

In [50]:
class MultiHeadAttention(nn.Module):
  def __init__(self, d_model, n_heads):
    super().__init__()
    self.d_model = d_model
    self.n_heads = n_heads
    assert d_model % n_heads == 0, "d_model must be divisible by n_heads"
    self.d_key = self.d_model // self.n_heads

    self.wq = nn.Linear(d_model, d_model)
    self.wk = nn.Linear(d_model, d_model)
    self.wv = nn.Linear(d_model, d_model)

    self.wo = nn.Linear(d_model, d_model)
  def forward(self, ins, mask=None):
    batch_size, seq_len, d_model = ins.size()
    Q = self.wq(ins).view(batch_size, seq_len, self.n_heads, self.d_key).transpose(1, 2)
    K = self.wk(ins).view(batch_size, seq_len, self.n_heads, self.d_key).transpose(1, 2)
    V = self.wv(ins).view(batch_size, seq_len, self.n_heads, self.d_key).transpose(1, 2)

    scaled_dot_product = (Q @ K.transpose(2, 3)) / (self.d_model ** 0.5)

    if mask is not None:
      scaled_dot_product += mask

    attn_scores = F.softmax(scaled_dot_product, dim=-1) @ V
    attn_scores = attn_scores.transpose(1, 2).contiguous().view(batch_size, seq_len, d_model)
    return self.wo(attn_scores)
MHA = MultiHeadAttention(32, 4)
MHA(torch.randn(2, 16, 32))

tensor([[[ 0.1465, -0.0072, -0.1859,  ...,  0.0988,  0.1258,  0.1897],
         [ 0.1164, -0.0511, -0.1894,  ...,  0.0652,  0.1090,  0.1570],
         [ 0.1068, -0.0378, -0.1796,  ...,  0.0628,  0.1309,  0.1662],
         ...,
         [ 0.1485, -0.0119, -0.1950,  ...,  0.0917,  0.1187,  0.1919],
         [ 0.1222, -0.0181, -0.1773,  ...,  0.0908,  0.1043,  0.1626],
         [ 0.1329, -0.0275, -0.1921,  ...,  0.0852,  0.1044,  0.1713]],

        [[ 0.1677,  0.0359, -0.1063,  ...,  0.1061,  0.2597,  0.1273],
         [ 0.1588,  0.0244, -0.0693,  ...,  0.1145,  0.2435,  0.1713],
         [ 0.1374,  0.0180, -0.0611,  ...,  0.1227,  0.2743,  0.1127],
         ...,
         [ 0.1540,  0.0666, -0.0622,  ...,  0.0823,  0.2676,  0.1201],
         [ 0.1591,  0.0413, -0.0641,  ...,  0.0839,  0.2593,  0.0880],
         [ 0.1709,  0.0635, -0.0681,  ...,  0.0869,  0.2646,  0.0714]]],
       grad_fn=<ViewBackward0>)

In [51]:
class MLP(nn.Module):
  def __init__(self, in_size, hidden_size, out_size):
    super().__init__()
    self.l1 = nn.Linear(in_size, hidden_size)
    self.l2 = nn.Linear(hidden_size, out_size)
    self.gelu = nn.GELU()
  def forward(self, ins):
    acts = self.gelu(self.l1(ins))
    return self.l2(acts)

In [52]:
class DecoderBlock(nn.Module):
  def __init__(self, vocab_size, d_model, n_heads, dropout=0.1):
    super().__init__()
    self.d_model = d_model
    self.n_heads = n_heads
    self.dropout = nn.Dropout(dropout)
    self.MHA = MultiHeadAttention(d_model, n_heads)
    self.MLP = MLP(d_model, 4*d_model, d_model)
    self.layernorm1 = nn.LayerNorm(d_model)
    self.layernorm2 = nn.LayerNorm(d_model)
  def forward(self, ins, mask=None):
    res1 = ins.clone()
    attn_result = self.MHA(ins, mask=mask)
    norm_result = self.layernorm1(attn_result)
    norm_result += res1
    res2 = norm_result.clone()
    mlp_result = self.MLP(norm_result)
    mlp_result_norm = self.layernorm2(mlp_result)
    return mlp_result_norm + res2

In [53]:
class GPT(nn.Module):
  def __init__(self, vocab_size, block_size, n_layers=2, n_heads=4, d_model=64):
    super().__init__()
    self.vocab_size = vocab_size
    self.block_size = block_size
    self.n_layers = n_layers
    self.n_heads = n_heads
    self.d_model = d_model

    self.token_embedding = nn.Embedding(vocab_size, d_model)
    self.position_embedding = nn.Embedding(block_size, d_model)
    self.decoder_stack = nn.ModuleList([
        DecoderBlock(vocab_size, d_model, n_heads) for _ in range(n_layers)
    ])
    self.output_proj = nn.Linear(d_model, vocab_size)
    #self.output_proj.weight = self.token_embedding.weight
  def forward(self, ins, targets=None):
    B, T = ins.size()

    x = self.token_embedding(ins.to(device))
    input_indices = torch.arange(T).to(device)
    x += self.position_embedding(input_indices)

    look_ahead_mask = torch.triu(
        torch.ones((T, T)), diagonal=1
    )
    look_ahead_mask.masked_fill_(look_ahead_mask == 1, float("-inf"))
    look_ahead_mask = look_ahead_mask.to(device)

    for decoder in self.decoder_stack:
      x = decoder(x, mask=look_ahead_mask)
    logits = self.output_proj(x)
    loss = None
    if targets is not None:
      targets = targets.to(device)
      loss = F.cross_entropy(logits.view(-1, self.vocab_size), targets.view(-1))
    return logits, loss
my_GPT = GPT(enc.n_vocab, 32, 12, 12, 768).to(device)

In [54]:
x, y = dl.next()
logits, loss = my_GPT(x, y)
print(logits.shape, loss.item())

torch.Size([4, 8, 50257]) 14.276529312133789


In [70]:
batch_size = 64
block_size = 32
n_layers = 4
n_heads = 4
d_model = 128
lr = 3e-4



my_GPT = GPT(enc.n_vocab, block_size, n_layers, n_heads, d_model)
my_GPT = my_GPT.to(device)

compile = True
if compile and torch.cuda.is_available():
  my_GPT = torch.compile(my_GPT)

optim = torch.optim.AdamW(my_GPT.parameters(), lr=lr)
data_loader = DataLoader(train_text, batch_size, block_size)

val_data_loader = DataLoader(val_text, batch_size, block_size)
val_interval = 200

log_interval = 50
max_steps = 3000
print("Steps per epoch:", data_loader.steps_per_epoch())
print(f"GPT Parameters: {sum(p.numel() for p in my_GPT.parameters()) / 1e6} million")

Steps per epoch: 156
GPT Parameters: 13.713233 million


In [56]:
torch.set_float32_matmul_precision("high")

In [71]:
import time
best_val_loss = float("inf")
my_GPT.train()
for step in range(max_steps + 1):
  step_start = time.time()
  x, y = data_loader.next()
  logits, loss = my_GPT(x, y)
  optim.zero_grad()
  loss.backward()
  optim.step()

  if step % log_interval == 0:
    print(f"Step {step}, loss: {loss.item()}, time: {round((time.time() - step_start) * 1e3, 2)} ms")
  if step % val_interval == 0:
    with torch.no_grad():
      val_loss = 0
      for val_step in range(val_data_loader.steps_per_epoch()):
        val_x, val_y = val_data_loader.next()
        logits, loss = my_GPT(val_x, val_y)
        val_loss += loss
      val_loss /= val_data_loader.steps_per_epoch()
      if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(my_GPT.state_dict(), 'best_model.pth')
      print(f"Val loss for step {step}: {val_loss}")
my_GPT.load_state_dict(torch.load('best_model.pth'))
my_GPT.eval()

Step 0, loss: 12.45822525024414, time: 21222.53 ms
Val loss for step 0: 11.486330032348633
Step 50, loss: 7.174983024597168, time: 221.92 ms
Step 100, loss: 6.1902241706848145, time: 223.84 ms
Step 150, loss: 6.5473480224609375, time: 222.98 ms
Step 200, loss: 5.959547996520996, time: 224.23 ms
Val loss for step 200: 6.103331089019775
Step 250, loss: 5.141306400299072, time: 223.82 ms
Step 300, loss: 5.676750183105469, time: 226.76 ms
Step 350, loss: 5.078444004058838, time: 223.62 ms
Step 400, loss: 5.461614608764648, time: 227.33 ms
Val loss for step 400: 5.703752517700195
Step 450, loss: 4.707574367523193, time: 228.5 ms
Step 500, loss: 4.616297721862793, time: 226.37 ms
Step 550, loss: 5.273550987243652, time: 226.83 ms
Step 600, loss: 5.044036865234375, time: 228.69 ms
Val loss for step 600: 5.409703731536865
Step 650, loss: 4.630270957946777, time: 226.94 ms
Step 700, loss: 4.664117813110352, time: 227.15 ms
Step 750, loss: 4.572318077087402, time: 225.87 ms
Step 800, loss: 4.750

OptimizedModule(
  (_orig_mod): GPT(
    (token_embedding): Embedding(50257, 128)
    (position_embedding): Embedding(32, 128)
    (decoder_stack): ModuleList(
      (0-3): 4 x DecoderBlock(
        (dropout): Dropout(p=0.1, inplace=False)
        (MHA): MultiHeadAttention(
          (wq): Linear(in_features=128, out_features=128, bias=True)
          (wk): Linear(in_features=128, out_features=128, bias=True)
          (wv): Linear(in_features=128, out_features=128, bias=True)
          (wo): Linear(in_features=128, out_features=128, bias=True)
        )
        (MLP): MLP(
          (l1): Linear(in_features=128, out_features=512, bias=True)
          (l2): Linear(in_features=512, out_features=128, bias=True)
          (gelu): GELU(approximate='none')
        )
        (layernorm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (layernorm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      )
    )
    (output_proj): Linear(in_features=128, out_features=50257,

In [72]:
prompt = "\n"
input_tokens = enc.encode(prompt)
output_tokens = enc.encode(prompt)
for x in range(200):
  if len(input_tokens) > block_size:
    input_tokens = input_tokens[1:]
  context_tensor = torch.tensor(input_tokens).view(1, -1).to(device)
  logits, loss = my_GPT(context_tensor)
  probs = F.softmax(logits[:, -1, :])
  result = torch.multinomial(probs, num_samples=1).item()
  input_tokens.append(result)
  output_tokens.append(result)
print(enc.decode(output_tokens))

  probs = F.softmax(logits[:, -1, :])
  probs = F.softmax(logits[:, -1, :])



HORTENSIO:
Tush she Isabel; rewards our heart yourself to death,
Peace, proud silence for your words, his son,
 templeemark me begun look to you; ha' pleasure to Brittany.
That's love that all mal speech?

PETRUCHIO:
Who burst?

 presenceUMIO:
Be in Pad to his whore require:
Wriceardine by three thou hastUS:
Yes unto these woman better the ins the wholeness.

HORTENSIO:
Hark! if or good time shoulder, and brought for joy:
fl sun judge me to bed to Barthie:
Onea enforcedciuscear chast Brut sister.

TRANIO:
Why, I'll firmly the unknown steel of a great host.

POMPEY:
That are happily, that on this is but he is with how plain
If it how Katharina: think,
