In [1]:
with open(file="input.txt", mode="r", encoding="utf-8") as f:
  text = f.read()

print(len(text))

1115393


In [2]:
text[:1000]

"First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you know Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us kill him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be done: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citizens, the patricians good.\nWhat authority surfeits on would relieve us: if they\nwould yield us but the superfluity, while it were\nwholesome, we might guess they relieved us humanely;\nbut they think we are too dear: the leanness that\nafflicts us, the object of our misery, is as an\ninventory to particularise their abundance; our\nsufferance is a gain to them Let us revenge this with\nour pikes, ere we become rakes: for the gods know I\nspeak this in hunger 

In [3]:
# collecting all the unique characters in the corpus
chars = sorted(list(set(text)))
vocab_size = len(chars)
print("".join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [4]:
# creating a mapping from chars to integers
stoi = {ch:i for i, ch in enumerate(chars)}
itos = {i:ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s] # string to char to index
decode = lambda l: "".join([itos[i] for i in l]) # char to string then join

print(encode("my name is vishu !"))
print(decode([51, 63, 1, 52, 39, 51, 43, 1, 47, 57, 1, 60, 47, 57, 46, 59, 1, 2]))

[51, 63, 1, 52, 39, 51, 43, 1, 47, 57, 1, 60, 47, 57, 46, 59, 1, 2]
my name is vishu !


In [5]:
import torch

In [6]:
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])

torch.Size([1115393]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [7]:
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [8]:
block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [9]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
  context = x[:t+1]
  target = y[t]
  print(f"When context is {context}, the target is {target}")

When context is tensor([18]), the target is 47
When context is tensor([18, 47]), the target is 56
When context is tensor([18, 47, 56]), the target is 57
When context is tensor([18, 47, 56, 57]), the target is 58
When context is tensor([18, 47, 56, 57, 58]), the target is 1
When context is tensor([18, 47, 56, 57, 58,  1]), the target is 15
When context is tensor([18, 47, 56, 57, 58,  1, 15]), the target is 47
When context is tensor([18, 47, 56, 57, 58,  1, 15, 47]), the target is 58


In [10]:
torch.manual_seed(1337)

batch_size = 4 # how many independent sequences to process ?
block_size = 8 # maximum length of the context length to predict ?

def get_batch(split):
  # generate batches of data for input (x) and output (y)
  data = train_data if split == "train" else val_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])

  return x, y

xb, yb = get_batch("train")
print("inputs:")
print(xb.shape)
print(xb)
print("target:")
print(yb.shape)
print(yb)

print("------------------")

for b in range(batch_size):
  for t in range(block_size):
    context = xb[b, :t+1]
    target = yb[b, t]
    print(f"When input is {context.tolist()}, target is {target}")

inputs:
torch.Size([4, 8])
tensor([[53, 59,  6,  1, 58, 56, 47, 40],
        [49, 43, 43, 54,  1, 47, 58,  1],
        [13, 52, 45, 43, 50, 53,  8,  0],
        [ 1, 39,  1, 46, 53, 59, 57, 43]])
target:
torch.Size([4, 8])
tensor([[59,  6,  1, 58, 56, 47, 40, 59],
        [43, 43, 54,  1, 47, 58,  1, 58],
        [52, 45, 43, 50, 53,  8,  0, 26],
        [39,  1, 46, 53, 59, 57, 43,  0]])
------------------
When input is [53], target is 59
When input is [53, 59], target is 6
When input is [53, 59, 6], target is 1
When input is [53, 59, 6, 1], target is 58
When input is [53, 59, 6, 1, 58], target is 56
When input is [53, 59, 6, 1, 58, 56], target is 47
When input is [53, 59, 6, 1, 58, 56, 47], target is 40
When input is [53, 59, 6, 1, 58, 56, 47, 40], target is 59
When input is [49], target is 43
When input is [49, 43], target is 43
When input is [49, 43, 43], target is 54
When input is [49, 43, 43, 54], target is 1
When input is [49, 43, 43, 54, 1], target is 47
When input is [49, 43, 

In [11]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):

  def __init__(self, vocab_size):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

  def forward(self, idx, targets=None):

    # each token directly reads off the logits for the next token from the lookup table
    logits = self.token_embedding_table(idx)

    if targets is None:
      loss = None

    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    for _ in range(max_new_tokens):
      
      logits, loss = self(idx)
      logits = logits[:, -1, :]
      probs = F.softmax(logits, dim=-1)
      idx_next = torch.multinomial(probs, num_samples=1)
      idx = torch.cat((idx, idx_next), dim=1)

    return idx

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

print(decode(m.generate(idx = torch.zeros((1,1), dtype=torch.long), max_new_tokens=100)[0].tolist()))


torch.Size([32, 65])
tensor(4.8948, grad_fn=<NllLossBackward0>)

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


In [12]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [16]:
batch_size = 32

for steps in range(10000):

  xb, yb = get_batch(split='train')

  logits, loss = m(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

  print(loss.item())


2.452850580215454
2.544088125228882
2.5024654865264893
2.559384822845459
2.479424238204956
2.3964293003082275
2.4621920585632324
2.590568780899048
2.523573637008667
2.440568685531616
2.5325677394866943
2.567152500152588
2.5133845806121826
2.3936336040496826
2.466322183609009
2.368321657180786
2.562734603881836
2.3920631408691406
2.45816707611084
2.6093900203704834
2.423999309539795
2.4549763202667236
2.448296546936035
2.49141263961792
2.497774839401245
2.4092857837677
2.44084095954895
2.3384525775909424
2.560450553894043
2.477339744567871
2.530503511428833
2.5151102542877197
2.578629732131958
2.4296977519989014
2.3904428482055664
2.3276515007019043
2.4791228771209717
2.5043091773986816
2.4057867527008057
2.426945924758911
2.4725310802459717
2.521289825439453
2.389709949493408
2.540320634841919
2.4795777797698975
2.3668618202209473
2.358574867248535
2.4156880378723145
2.378319025039673
2.506113290786743
2.4738049507141113
2.4161417484283447
2.4293627738952637
2.3913564682006836
2.409700

In [17]:
print(decode(m.generate(idx = torch.zeros((1,1), dtype=torch.long), max_new_tokens=200)[0].tolist()))


ALo, warawoofe, M: atre deseeshen tar me ifukeshaceweag t io, d at.
KE: co ctisefang he t d knerde, t thises;
Bund wiemetiarele hen
le, be ad, jush we, withindire INENGRe' thovexpu a PESAn stlis wilur


### Math in self-attention

In [18]:
torch.manual_seed(seed=1337)
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

In [19]:
# mean from start to (t-1)th context
xbow = torch.zeros((B, T, C))
for b in range(B):
  for t in range(T):

    xprev = x[b, :t+1]
    xbow[b, t] = torch.mean(xprev, 0)

In [27]:
torch.manual_seed(seed=42)
a = torch.tril(torch.ones(size=(3, 3)))
a = a / torch.sum(a, 1, keepdim=True)
b = torch.randint(low=0, high=10, size=(3, 2)).float()
c = a @ b
print(a)
print(b)
print(c)

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])
