In [1]:
# connect to G_drive
# from google.colab import drive
# drive.mount('/content/drive')
# download tiny shakespeare texts
# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-08-17 01:20:54--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.2’


2024-08-17 01:20:54 (20.1 MB/s) - ‘input.txt.2’ saved [1115394/1115394]



In [2]:
# load text dataset
with open('ninput.txt', 'r', encoding='utf-8') as f:
  text = f.read()

print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


In [3]:
# get all unique characters appears in the corpus, sorted
chars = sorted(list(set(c for c in text)))
len(chars), ''.join(chars)

(65, "\n !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz")

In [4]:
# write a encoder and decoder
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

print(encode("hii there"))
print(decode(encode("hii there")))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [5]:
# split
import torch
data = torch.tensor(encode(text), dtype=torch.long)  # encode text and save to dataset
N = int(0.9*len(data))  # 90% training and the rest for validation
train_data = data[:N]
val_data = data[N:]

block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [6]:
# data loading
torch.manual_seed(5525)
batch_size = 4

# generate a small batch of data of inputs x and targets y
def get_batch(split):
  data = train_data if split == 'train' else val_data
  # generates a tensor ix containing batch_size random integers as starting indices
  # to extract blocks of text from data tensor
  # subtract block_size to ensure enough data remaining to extract a block of size
  # i.e., i + block_size should not exceed the length of the data
  ix = torch.randint(len(data) - block_size, (batch_size, ))

  # stack 1d datas together
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+1+block_size] for i in ix])

  # x, y = x.to(device), y.to(device)
  return x, y

xb, yb = get_batch('train')
# xb.shape, yb.shape  # (torch.Size([4, 8]), torch.Size([4, 8]))
xb, yb

(tensor([[ 1, 58, 46, 53, 59,  6,  1, 53],
         [42,  1, 57, 53,  1, 51, 59, 41],
         [ 1, 58, 46, 43,  0, 39, 44, 58],
         [10,  0, 32, 46, 43,  1, 52, 43]]),
 tensor([[58, 46, 53, 59,  6,  1, 53, 56],
         [ 1, 57, 53,  1, 51, 59, 41, 46],
         [58, 46, 43,  0, 39, 44, 58, 43],
         [ 0, 32, 46, 43,  1, 52, 43, 61]]))

In [7]:
# bigram
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(5525)

class BigramLanguageModel(nn.Module):

  def __init__(self, vocab_size):
    super().__init__()
    # each token directly reads off the logits for the next token from a lookup table
    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)  # the lookup table

  def forward(self, idx, targets=None):
    # idx and targets are both (B, T) tensor of integers
    logits = self.token_embedding_table(idx)  # (B, T, C), the scores for next character in the sequence

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)  # (B * T, C)
      targets = targets.view(B*T)  # (B * T) or targets = targets.view(-1)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    # idx is (B, T) array of indices in the current context
    for _ in range(max_new_tokens):
      # get the predictions
      logits, loss = self(idx)
      # get last time step
      logits = logits[:, -1, :]  # (B, C)
      # softmax to get probs
      probs = F.softmax(logits, dim=-1)  # (B, C)
      # predict next index by sample from the distribution
      idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
      # append new idx to the running sequence
      idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
    return idx

vocab_size = len(chars)
m = BigramLanguageModel(vocab_size)
# out = m(xb, yb) # out.shape  # torch.Size([4, 8, 96])
logits, loss = m(xb, yb)
print(logits.shape, loss)

idx = torch.zeros((1, 1), dtype=torch.long)
print(decode(m.generate(idx, max_new_tokens=100)[0].tolist()))

torch.Size([32, 65]) tensor(4.6213, grad_fn=<NllLossBackward0>)

JvtfTNnOP
d snF!CC&DZ-OjRICJPUiIdoZBLYeR'Vc?ob
ivOqVd kNXQ;Dp
d &DlMkLws?QGcofLY:q;Bhp?Eyggn;OIpNQnS


In [8]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
batch_size = 32

for steps in range(10000):
  # get a training data batch
  xb, yb = get_batch('train')

  # evaluate the loss
  logits, loss = m(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

print(loss.item())

2.474010467529297


In [9]:
print(decode(m.generate(idx=torch.zeros((1, 1), dtype=torch.long), max_new_tokens=10000)[0].tolist()))



IXDinopeioreve agerpllin-qupNGHI's be Pke wit fase scerof dstho wnde t O:
KINT:WAn sthag m coul kis, myor to ayHu be athise nge ncofooly s man ang ves! EN nourtietts, at '!
G w:

Laere tue uto a; spo bereven o and s padrvern,
Th blds hin'se,
Myow cabeso st gy te as:


I'n wer touree sowhe, oband--w ind,
Whityond bala bor, m?
c.
hed t t ang CENVerous.
Habl yepilen msevee
T:
MENEveare en, b, IO: tothokmifasOLAgre w, frthat, m prr's heangh orcheswet y m thandeanot atreeroulllero dare. andrditha af lf t sthis s onowerdy chas go,
is,
Thal nclen, f thy be s, s, geres igpt's t t men ne m; igoeverpf hienoursus, uis ates folllle meel pthak m pthe oris.
VO:
Mang ascos ay I den'bs
I pr spr athaco amy
I NRAMAUESorerar.
Wancre s s
hangowoweanthe, whow ono thitheevanthil w thouseton;
ad
Yofof outhotl'ds;

O: homy omee erchin bencide, SLE her ckier blfomenomy not oulerld.
Fods, hyseadergoudlekncen


Wha! y ine I see; y trve an y, kerdon ply eno;
ARonkes, s wirt s.

Tatetrd t ppratort, le omou Swile

### Math trick of self-attension

In [10]:
import torch

torch.manual_seed(5525)
B, T, C = 4, 8, 2  # batch, time, channels

x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

In [11]:
# version 1
weight = torch.tril(torch.ones(T, T))  # create a TxT lower triangular matrix
weight = weight / weight.sum(1, keepdim=True)  # every row is averaged by col numbers 
res = weight @ x  # average by column per row (T, T) @ (B, T, C) --boardcast--> (B, T, C)
weight.shape, res.shape

(torch.Size([8, 8]), torch.Size([4, 8, 2]))

In [12]:
# version2: softmax
from torch.nn import functional as F

tril = torch.tril(torch.ones(T, T))  # create a TxT lower triangular matrix
weight = torch.zeros((T, T))
weight = weight.masked_fill(tril == 0, float('-inf'))  # masked fill the weight matrix at place where in tril==0 with -inf
# weight = F.softmax(weight, dim=-1)  # use softmax to get averaged by row
# res2 = weight @ x
weight

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

In [13]:
weight = F.softmax(weight, dim=-1)  # use softmax to get averaged by row
weight

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [14]:
res2 = weight @ x
torch.allclose(res, res2)

True

### Self-attension

**Scaled Dot-Product Attension**

$$
\text{Attension}(Q, K, V) = \text{softmax}(\frac{QK^T}{\sqrt{d_k}})V
$$

divided by $\sqrt{d_k}$ to avoid too sharpy after softmax

In [15]:
import torch.nn as nn

head_size = 16

key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x)  # (B, T, 16)
q = query(x)  # (B, T, 16)

wei = q @ k.transpose(-2, -1)  # (B, T, 16) @ (B, 16, T) ---> (B, T, T)
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)

v = value(x)
out = wei @ v
out.shape

torch.Size([4, 8, 16])

In [16]:
wei[0]

tensor([[1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00],
        [9.0661e-01, 9.3387e-02, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00],
        [1.1255e-01, 8.4306e-01, 4.4397e-02, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00],
        [2.5565e-01, 1.3902e-01, 3.8473e-01, 2.2060e-01, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00],
        [3.6798e-02, 1.1804e-04, 9.5667e-01, 6.4098e-03, 6.6768e-06, 0.0000e+00,
         0.0000e+00, 0.0000e+00],
        [5.8463e-02, 2.4333e-01, 2.7453e-02, 9.3168e-02, 5.2570e-01, 5.1882e-02,
         0.0000e+00, 0.0000e+00],
        [1.9701e-01, 5.4503e-02, 3.3448e-01, 1.1815e-01, 2.3135e-02, 2.1201e-01,
         6.0718e-02, 0.0000e+00],
        [5.5008e-02, 1.8497e-01, 2.9514e-02, 8.2761e-02, 3.6395e-01, 4.9922e-02,
         1.8261e-01, 5.1268e-02]], grad_fn=<SelectBackward0>)

Attension is a communication mechanism.
Init is a set of vectors.
Each batch dimension is independent.
self-attension: keys, queries, values all come from same source
