<a href="https://colab.research.google.com/github/Trickshotblaster/nn-practices/blob/main/TransformerV4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tiktoken



In [2]:
import tiktoken
enc = tiktoken.encoding_for_model("gpt2")
print(enc.n_vocab)
enc.decode(enc.encode("Hello world!"))

50257


'Hello world!'

In [3]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', 'r', encoding='utf-8') as f: # input.txt
    text = f.read()
print(text[:100])

--2024-06-13 22:03:29--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.1’


2024-06-13 22:03:29 (17.7 MB/s) - ‘input.txt.1’ saved [1115394/1115394]

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [None]:
!pip install --upgrade datasets
from datasets import load_dataset

ds = load_dataset("codeparrot/github-code", split="train[0.1%]")

print(ds)
print(len(text))



Downloading data:   0%|          | 0/1126 [00:00<?, ?files/s]

In [4]:
train_amount = 0.95
idx = int(train_amount * len(text))
train_text = text[:idx]
val_text = text[idx:]

In [5]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [6]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [7]:
import random
class DataLoader:
  def __init__(self, text, batch_size, block_size, random_sample=True):
    self.text = torch.tensor(enc.encode(text)).to(device)
    self.batch_size = batch_size
    self.block_size = block_size
    self.current_pos = 0
    self.random_sample = random_sample
  def steps_per_epoch(self):
    return len(self.text) // (self.batch_size * self.block_size)
  def next(self):
    if self.current_pos + self.batch_size * self.block_size + 1 >= len(self.text):
      self.current_pos = 0
    if self.random_sample:
      idx = int((random.random() * len(self.text)) - (self.batch_size * self.block_size + 1) - 1)
      buf = self.text[idx:idx + (self.batch_size * self.block_size + 1)] #[self.current_pos:self.current_pos + self.batch_size * self.block_size + 1]
      if len(buf) == 0:
        return self.next()
    else:
      buf = self.text[self.current_pos:self.current_pos + self.batch_size * self.block_size + 1]
    ins = buf[:-1].view(self.batch_size, self.block_size)
    tgts = buf[1:].view(self.batch_size, self.block_size)
    self.current_pos += self.batch_size * self.block_size + 1
    return ins, tgts
dl = DataLoader(train_text, 4, 8)
dl.next()

(tensor([[ 7500,   592,    11,   198,  2504,   804,  1165, 37248],
         [  287,   674,  2219, 14298,    25,   198,  3237,  1276],
         [  307,   772,   287,   674,  1230,    13,   198,  1639],
         [ 4145,  1873,  1549,    11,   314,   481,   467,  6808]]),
 tensor([[  592,    11,   198,  2504,   804,  1165, 37248,   287],
         [  674,  2219, 14298,    25,   198,  3237,  1276,   307],
         [  772,   287,   674,  1230,    13,   198,  1639,  4145],
         [ 1873,  1549,    11,   314,   481,   467,  6808,  1497]]))

In [8]:
class MultiHeadAttention(nn.Module):
  def __init__(self, d_model, n_heads):
    super().__init__()
    self.d_model = d_model
    self.n_heads = n_heads
    assert d_model % n_heads == 0, "d_model must be divisible by n_heads"
    self.d_key = self.d_model // self.n_heads

    self.wq = nn.Linear(d_model, d_model)
    self.wk = nn.Linear(d_model, d_model)
    self.wv = nn.Linear(d_model, d_model)

    self.wo = nn.Linear(d_model, d_model)
  def forward(self, ins, mask=None):
    batch_size, seq_len, d_model = ins.size()
    Q = self.wq(ins).view(batch_size, seq_len, self.n_heads, self.d_key).transpose(1, 2)
    K = self.wk(ins).view(batch_size, seq_len, self.n_heads, self.d_key).transpose(1, 2)
    V = self.wv(ins).view(batch_size, seq_len, self.n_heads, self.d_key).transpose(1, 2)

    #scaled_dot_product = (Q @ K.transpose(2, 3)) / (self.d_model ** 0.5)

    #if mask is not None:
      #scaled_dot_product += mask

    attn_scores = F.scaled_dot_product_attention(Q, K, V, attn_mask=mask)
    #F.softmax(scaled_dot_product, dim=-1) @ V
    attn_scores = attn_scores.transpose(1, 2).contiguous().view(batch_size, seq_len, d_model)
    return self.wo(attn_scores)
MHA = MultiHeadAttention(32, 4)
MHA(torch.randn(2, 16, 32))

tensor([[[ 0.1183,  0.1336,  0.0290,  ..., -0.2397,  0.1399, -0.1039],
         [ 0.1351,  0.1101,  0.0016,  ..., -0.2307,  0.1385, -0.1189],
         [ 0.1755,  0.1172,  0.0071,  ..., -0.1999,  0.1167, -0.1345],
         ...,
         [ 0.1378,  0.1237,  0.0297,  ..., -0.1953,  0.1114, -0.1279],
         [ 0.2406,  0.1049,  0.0135,  ..., -0.2194,  0.1763, -0.1585],
         [ 0.0993,  0.1412,  0.0307,  ..., -0.2116,  0.1037, -0.1321]],

        [[ 0.1755,  0.1363,  0.1488,  ..., -0.0733,  0.1255, -0.2467],
         [ 0.1497,  0.1942,  0.1458,  ..., -0.1603,  0.0484, -0.1838],
         [ 0.1338,  0.2105,  0.1820,  ..., -0.1377,  0.0490, -0.1816],
         ...,
         [ 0.1532,  0.1529,  0.1851,  ..., -0.1071,  0.0459, -0.1696],
         [ 0.1916,  0.1314,  0.0623,  ..., -0.1553,  0.0754, -0.2610],
         [ 0.1765,  0.1128,  0.1206,  ..., -0.1518,  0.0474, -0.1735]]],
       grad_fn=<ViewBackward0>)

In [9]:
class MLP(nn.Module):
  def __init__(self, in_size, hidden_size, out_size):
    super().__init__()
    self.l1 = nn.Linear(in_size, hidden_size)
    self.l2 = nn.Linear(hidden_size, out_size)
    self.gelu = nn.GELU()
  def forward(self, ins):
    acts = self.gelu(self.l1(ins))
    return self.l2(acts)

In [10]:
class DecoderBlock(nn.Module):
  def __init__(self, vocab_size, d_model, n_heads, dropout=0.1):
    super().__init__()
    self.d_model = d_model
    self.n_heads = n_heads
    self.dropout = nn.Dropout(dropout)
    self.MHA = MultiHeadAttention(d_model, n_heads)
    self.MLP = MLP(d_model, 4*d_model, d_model)
    self.layernorm1 = nn.LayerNorm(d_model)
    self.layernorm2 = nn.LayerNorm(d_model)
  def forward(self, ins, mask=None):
    res1 = ins.clone()
    attn_result = self.MHA(ins, mask=mask)
    norm_result = self.layernorm1(attn_result)
    norm_result += res1
    res2 = norm_result.clone()
    mlp_result = self.MLP(norm_result)
    mlp_result_norm = self.layernorm2(mlp_result)
    return mlp_result_norm + res2

In [11]:
class GPT(nn.Module):
  def __init__(self, vocab_size, block_size, n_layers=2, n_heads=4, d_model=64):
    super().__init__()
    self.vocab_size = vocab_size
    self.block_size = block_size
    self.n_layers = n_layers
    self.n_heads = n_heads
    self.d_model = d_model

    self.token_embedding = nn.Embedding(vocab_size, d_model)
    self.position_embedding = nn.Embedding(block_size, d_model)
    self.decoder_stack = nn.ModuleList([
        DecoderBlock(vocab_size, d_model, n_heads) for _ in range(n_layers)
    ])
    self.output_proj = nn.Linear(d_model, vocab_size)
    #self.output_proj.weight = self.token_embedding.weight
  def forward(self, ins, targets=None):
    B, T = ins.size()

    x = self.token_embedding(ins.to(device))
    input_indices = torch.arange(T).to(device)
    x += self.position_embedding(input_indices)

    look_ahead_mask = torch.triu(
        torch.ones((T, T)), diagonal=1
    )
    look_ahead_mask.masked_fill_(look_ahead_mask == 1, float("-inf"))
    look_ahead_mask = look_ahead_mask.to(device)

    for decoder in self.decoder_stack:
      x = decoder(x, mask=look_ahead_mask)
    logits = self.output_proj(x)
    loss = None
    if targets is not None:
      targets = targets.to(device)
      loss = F.cross_entropy(logits.view(-1, self.vocab_size), targets.view(-1))
    return logits, loss
my_GPT = GPT(enc.n_vocab, 32, 12, 12, 768).to(device)

In [12]:
x, y = dl.next()
logits, loss = my_GPT(x, y)
print(logits.shape, loss.item())

torch.Size([4, 8, 50257]) 13.431506156921387


In [18]:
batch_size = 4
block_size = 16
n_layers = 2
n_heads = 4
d_model = 32
lr = 3e-4



my_GPT = GPT(enc.n_vocab, block_size, n_layers, n_heads, d_model)
my_GPT = my_GPT.to(device)

compile = True
if compile and torch.cuda.is_available():
  my_GPT = torch.compile(my_GPT)

optim = torch.optim.AdamW(my_GPT.parameters(), lr=lr)
data_loader = DataLoader(train_text, batch_size, block_size, random_sample=True)

val_data_loader = DataLoader(val_text, batch_size, block_size, random_sample=False)
val_interval = 200

log_interval = 50
max_steps = 3000
print("Steps per epoch:", data_loader.steps_per_epoch())
print(f"GPT Parameters: {sum(p.numel() for p in my_GPT.parameters()) / 1e6} million")

Steps per epoch: 5000
GPT Parameters: 3.292625 million


In [14]:
torch.set_float32_matmul_precision("high")

In [19]:
import time
best_val_loss = float("inf")
my_GPT.train()
for step in range(max_steps + 1):
  step_start = time.time()
  x, y = data_loader.next()
  logits, loss = my_GPT(x, y)
  optim.zero_grad()
  loss.backward()
  optim.step()

  if step % log_interval == 0:
    print(f"Step {step}, loss: {loss.item()}, time: {round((time.time() - step_start) * 1e3, 2)} ms")
  if step % val_interval == 0:
    with torch.no_grad():
      val_loss = 0
      for val_step in range(val_data_loader.steps_per_epoch()):
        val_x, val_y = val_data_loader.next()
        logits, loss = my_GPT(val_x, val_y)
        val_loss += loss
      val_loss /= val_data_loader.steps_per_epoch()
      if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(my_GPT.state_dict(), 'best_model.pth')
      print(f"Val loss for step {step}: {val_loss}")
my_GPT.load_state_dict(torch.load('best_model.pth'))
my_GPT.eval()

Step 0, loss: 11.783772468566895, time: 113.32 ms
Val loss for step 0: 11.50425910949707
Step 50, loss: 10.398383140563965, time: 67.84 ms
Step 100, loss: 8.120105743408203, time: 113.62 ms
Step 150, loss: 7.678050518035889, time: 66.83 ms
Step 200, loss: 7.893157005310059, time: 66.8 ms
Val loss for step 200: 7.408690452575684
Step 250, loss: 7.235362529754639, time: 87.85 ms
Step 300, loss: 6.955040454864502, time: 87.27 ms
Step 350, loss: 6.1286396980285645, time: 112.71 ms
Step 400, loss: 6.125448703765869, time: 83.69 ms
Val loss for step 400: 6.776885986328125
Step 450, loss: 7.455333232879639, time: 112.11 ms
Step 500, loss: 6.582122325897217, time: 78.17 ms
Step 550, loss: 6.479583263397217, time: 84.92 ms
Step 600, loss: 7.278920650482178, time: 92.71 ms
Val loss for step 600: 6.578640937805176
Step 650, loss: 6.18936014175415, time: 84.25 ms
Step 700, loss: 5.844069957733154, time: 93.15 ms
Step 750, loss: 6.245770454406738, time: 80.64 ms
Step 800, loss: 7.812077045440674, t

GPT(
  (token_embedding): Embedding(50257, 32)
  (position_embedding): Embedding(16, 32)
  (decoder_stack): ModuleList(
    (0-1): 2 x DecoderBlock(
      (dropout): Dropout(p=0.1, inplace=False)
      (MHA): MultiHeadAttention(
        (wq): Linear(in_features=32, out_features=32, bias=True)
        (wk): Linear(in_features=32, out_features=32, bias=True)
        (wv): Linear(in_features=32, out_features=32, bias=True)
        (wo): Linear(in_features=32, out_features=32, bias=True)
      )
      (MLP): MLP(
        (l1): Linear(in_features=32, out_features=128, bias=True)
        (l2): Linear(in_features=128, out_features=32, bias=True)
        (gelu): GELU(approximate='none')
      )
      (layernorm1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
      (layernorm2): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    )
  )
  (output_proj): Linear(in_features=32, out_features=50257, bias=True)
)

In [20]:
prompt = "ROMEO:"
input_tokens = enc.encode(prompt)
output_tokens = enc.encode(prompt)
for x in range(200):
  if len(input_tokens) > block_size:
    input_tokens = input_tokens[1:]
  context_tensor = torch.tensor(input_tokens).view(1, -1).to(device)

  logits, loss = my_GPT(context_tensor)
  probs = F.softmax(logits[:, -1, :])
  result = torch.multinomial(probs, num_samples=1).item()
  input_tokens.append(result)
  output_tokens.append(result)
print(enc.decode(output_tokens))

  probs = F.softmax(logits[:, -1, :])


ROMEO:
INC John me.

IUS:
Some goes speak,
I advertise to progress she hear all: I al by the tale the empty lodge

alledINAAR:
Sh for me, thenst.
That orderly as inKE it heirEN:
He, sir the thee:
Should, forthself; where here her you d I will had '
To she, his nuns. flesh:

No's face menW weep makes of was at not in:
What of whyCome not act Paul me to of new by counsel example that, then
Drawhee deceit something warriorsourO:
Art you?
HaveER be i in fun instantly, 'el:
Have hisuke she hath we, dependent will
To bount oddsow,
Theity his loved him VI upon a, sir,tisbuck,
To else to burning loss this dowQueen being lips, his hanging your.

D bending an who:

