<a href="https://colab.research.google.com/github/Trickshotblaster/nn-practices/blob/main/TransformerV4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.7.0


In [None]:
import tiktoken
enc = tiktoken.encoding_for_model("gpt2")
print(enc.n_vocab)
enc.decode(enc.encode("Hello world!"))

50257


'Hello world!'

In [None]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()
print(text[:100])

--2024-06-11 15:33:13--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-06-11 15:33:13 (25.5 MB/s) - ‘input.txt’ saved [1115394/1115394]

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [None]:
train_amount = 0.95
idx = int(train_amount * len(text))
train_text = text[:idx]
val_text = text[idx:]

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [None]:
class DataLoader:
  def __init__(self, text, batch_size, block_size):
    self.text = enc.encode(text)
    self.batch_size = batch_size
    self.block_size = block_size
    self.current_pos = 0
  def steps_per_epoch(self):
    return len(self.text) // (self.batch_size * self.block_size)
  def next(self):
    if self.current_pos + self.batch_size * self.block_size + 1 >= len(self.text):
      self.current_pos = 0
    buf = self.text[self.current_pos:self.current_pos + self.batch_size * self.block_size + 1]
    ins = torch.tensor(buf[:-1]).view(self.batch_size, self.block_size)
    tgts = torch.tensor(buf[1:]).view(self.batch_size, self.block_size)
    self.current_pos += self.batch_size * self.block_size + 1
    return ins, tgts
dl = DataLoader(train_text, 4, 8)
dl.next()

(tensor([[ 5962, 22307,    25,   198,  8421,   356,  5120,   597],
         [ 2252,    11,  3285,   502,  2740,    13,   198,   198],
         [ 3237,    25,   198,  5248,   461,    11,  2740,    13],
         [  198,   198,  5962, 22307,    25,   198,  1639,   389]]),
 tensor([[22307,    25,   198,  8421,   356,  5120,   597,  2252],
         [   11,  3285,   502,  2740,    13,   198,   198,  3237],
         [   25,   198,  5248,   461,    11,  2740,    13,   198],
         [  198,  5962, 22307,    25,   198,  1639,   389,   477]]))

In [None]:
class MultiHeadAttention(nn.Module):
  def __init__(self, d_model, n_heads):
    super().__init__()
    self.d_model = d_model
    self.n_heads = n_heads
    assert d_model % n_heads == 0, "d_model must be divisible by n_heads"
    self.d_key = self.d_model // self.n_heads

    self.wq = nn.Linear(d_model, d_model)
    self.wk = nn.Linear(d_model, d_model)
    self.wv = nn.Linear(d_model, d_model)

    self.wo = nn.Linear(d_model, d_model)
  def forward(self, ins, mask=None):
    batch_size, seq_len, d_model = ins.size()
    Q = self.wq(ins).view(batch_size, seq_len, self.n_heads, self.d_key).transpose(1, 2)
    K = self.wk(ins).view(batch_size, seq_len, self.n_heads, self.d_key).transpose(1, 2)
    V = self.wv(ins).view(batch_size, seq_len, self.n_heads, self.d_key).transpose(1, 2)

    scaled_dot_product = (Q @ K.transpose(2, 3)) / (self.d_model ** 0.5)

    if mask is not None:
      scaled_dot_product += mask

    attn_scores = F.softmax(scaled_dot_product, dim=-1) @ V
    attn_scores = attn_scores.transpose(1, 2).contiguous().view(batch_size, seq_len, d_model)
    return self.wo(attn_scores)
MHA = MultiHeadAttention(32, 4)
MHA(torch.randn(2, 16, 32))

tensor([[[-0.1876,  0.1363,  0.0678,  ..., -0.1028,  0.1414, -0.0776],
         [-0.1812,  0.1432,  0.0522,  ..., -0.0997,  0.1453, -0.0724],
         [-0.1832,  0.1258,  0.0397,  ..., -0.1007,  0.1347, -0.0795],
         ...,
         [-0.1698,  0.1284,  0.0637,  ..., -0.1138,  0.1245, -0.0730],
         [-0.1845,  0.1589,  0.0566,  ..., -0.0982,  0.1473, -0.0760],
         [-0.1824,  0.1383,  0.0624,  ..., -0.1017,  0.1360, -0.0719]],

        [[-0.1146,  0.1173,  0.1476,  ..., -0.2827,  0.2060, -0.0723],
         [-0.1079,  0.1302,  0.1606,  ..., -0.2923,  0.2371, -0.0788],
         [-0.1305,  0.1058,  0.1719,  ..., -0.3135,  0.2192, -0.0797],
         ...,
         [-0.0948,  0.1529,  0.1157,  ..., -0.2403,  0.1726, -0.0373],
         [-0.1082,  0.1400,  0.1395,  ..., -0.2814,  0.2133, -0.0204],
         [-0.1185,  0.1393,  0.1558,  ..., -0.2868,  0.1977, -0.0504]]],
       grad_fn=<ViewBackward0>)

In [None]:
class MLP(nn.Module):
  def __init__(self, in_size, hidden_size, out_size):
    super().__init__()
    self.l1 = nn.Linear(in_size, hidden_size)
    self.l2 = nn.Linear(hidden_size, out_size)
    self.gelu = nn.GELU()
  def forward(self, ins):
    acts = self.gelu(self.l1(ins))
    return self.l2(acts)

In [None]:
class DecoderBlock(nn.Module):
  def __init__(self, vocab_size, d_model, n_heads, dropout=0.1):
    super().__init__()
    self.d_model = d_model
    self.n_heads = n_heads
    self.dropout = nn.Dropout(dropout)
    self.MHA = MultiHeadAttention(d_model, n_heads)
    self.MLP = MLP(d_model, 4*d_model, d_model)
    self.layernorm1 = nn.LayerNorm(d_model)
    self.layernorm2 = nn.LayerNorm(d_model)
  def forward(self, ins, mask=None):
    res1 = ins.clone()
    attn_result = self.MHA(ins, mask=mask)
    norm_result = self.layernorm1(attn_result)
    norm_result += res1
    res2 = norm_result.clone()
    mlp_result = self.MLP(norm_result)
    mlp_result_norm = self.layernorm2(mlp_result)
    return mlp_result_norm + res2

In [68]:
class GPT(nn.Module):
  def __init__(self, vocab_size, block_size, n_layers=2, n_heads=4, d_model=64):
    super().__init__()
    self.vocab_size = vocab_size
    self.block_size = block_size
    self.n_layers = n_layers
    self.n_heads = n_heads
    self.d_model = d_model

    self.token_embedding = nn.Embedding(vocab_size, d_model)
    self.position_embedding = nn.Embedding(block_size, d_model)
    self.decoder_stack = nn.ModuleList([
        DecoderBlock(vocab_size, d_model, n_heads) for _ in range(n_layers)
    ])
    self.output_proj = nn.Linear(d_model, vocab_size)
    #self.output_proj.weight = self.token_embedding.weight
  def forward(self, ins, targets=None):
    B, T = ins.size()
    x = self.token_embedding(ins)
    input_indices = torch.arange(T)
    x += self.position_embedding(input_indices)

    look_ahead_mask = torch.triu(
        torch.ones((T, T)), diagonal=1
    )
    look_ahead_mask.masked_fill_(look_ahead_mask == 1, float("-inf"))
    for decoder in self.decoder_stack:
      x = decoder(x, mask=look_ahead_mask)
    logits = self.output_proj(x)
    loss = None
    if targets is not None:
      loss = F.cross_entropy(logits.view(-1, self.vocab_size), targets.view(-1))
    return logits, loss
my_GPT = GPT(enc.n_vocab, 32, 12, 12, 768)

In [69]:
x, y = dl.next()
logits, loss = my_GPT(x, y)
print(logits.shape, loss.item())

torch.Size([4, 8, 50257]) 15.353865623474121


In [94]:
batch_size = 4
block_size = 32
n_layers = 4
n_heads = 4
d_model = 128
lr = 3e-4

my_GPT = GPT(enc.n_vocab, block_size, n_layers, n_heads, d_model)
optim = torch.optim.AdamW(my_GPT.parameters(), lr=lr)
data_loader = DataLoader(train_text, batch_size, block_size)

log_interval = 50
max_steps = 3000
print("Steps per epoch:", data_loader.steps_per_epoch())
print(f"GPT Parameters: {sum(p.numel() for p in my_GPT.parameters()) / 1e6} million")

Steps per epoch: 2500
GPT Parameters: 13.713233 million


In [95]:
torch.set_float32_matmul_precision("high")

In [96]:
import time
for step in range(max_steps):
  step_start = time.time()
  x, y = data_loader.next()
  logits, loss = my_GPT(x, y)
  optim.zero_grad()
  loss.backward()
  optim.step()

  if step % log_interval == 0:
    print(f"Step {step}, loss: {loss.item()}, time: {round((time.time() - step_start) * 1e3, 2)} ms")

Step 0, loss: 12.357880592346191, time: 405.67 ms
Step 50, loss: 8.230766296386719, time: 304.09 ms
Step 100, loss: 5.759158611297607, time: 243.55 ms
Step 150, loss: 5.9091057777404785, time: 245.58 ms
Step 200, loss: 4.242574691772461, time: 243.56 ms
Step 250, loss: 5.715686321258545, time: 242.08 ms
Step 300, loss: 4.924989223480225, time: 242.25 ms
Step 350, loss: 6.233471393585205, time: 254.71 ms
Step 400, loss: 7.218895435333252, time: 245.02 ms
Step 450, loss: 5.946331024169922, time: 267.01 ms
Step 500, loss: 6.201785564422607, time: 251.0 ms
Step 550, loss: 5.575164318084717, time: 255.19 ms
Step 600, loss: 6.235467910766602, time: 245.02 ms
Step 650, loss: 6.295952796936035, time: 256.05 ms
Step 700, loss: 6.352460861206055, time: 252.03 ms
Step 750, loss: 6.290209770202637, time: 252.2 ms
Step 800, loss: 4.736455917358398, time: 247.3 ms
Step 850, loss: 6.201642990112305, time: 239.98 ms
Step 900, loss: 4.767206192016602, time: 243.85 ms
Step 950, loss: 5.344666004180908, 

In [97]:
prompt = "First Citizen:"
input_tokens = enc.encode(prompt)
output_tokens = enc.encode(prompt)
for x in range(100):
  if len(input_tokens) > block_size:
    input_tokens = input_tokens[1:]
  context_tensor = torch.tensor(input_tokens).view(1, -1)
  logits, loss = my_GPT(context_tensor)
  probs = F.softmax(logits[:, -1, :])
  result = torch.multinomial(probs, num_samples=1).item()
  input_tokens.append(result)
  output_tokens.append(result)
print(enc.decode(output_tokens))

  probs = F.softmax(logits[:, -1, :])


First Citizen:
I pray done a last: 'tis a man, the same.
Because'd, we must go upon be see dead?

Keep fares:
You will slander a gods you by from perove?

First longAT'Tis already, my poor father'st men.

KING EDWARD IV:
By comfort, look I had doth, my hearted gone,
And being soon the best.

Second sides:
But been, as
