In [1]:
# !pip install tiktoken

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tiktoken
  Downloading tiktoken-0.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m59.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.3.3


In [6]:

import tiktoken 
# Encode the whole corpus file with the "gpt2" tiktoken encoding.
tokenizer = tiktoken.get_encoding("gpt2")
with open('poetry.txt', mode='r', encoding='latin-1') as corpus_file:
    corpus_text = corpus_file.read()
_tokens = tokenizer.encode_ordinary(corpus_text)

# sample tokens: show the raw bytes behind a short slice of the token stream
print("\nSamples from tokenization: ")
sample_ids = _tokens[150:170]
print([tokenizer.decode_single_token_bytes(token_id) for token_id in sample_ids])


Samples from tokenization: 
[b' her', b' mother', b' are', b' credited', b' with', b' having', b' researched', b',', b'\n', b'authent', b'icated', b',', b' and', b' compiled', b' much', b' of', b' the', b' material', b' School', b'craft']


In [7]:
# Remap the tokenizer ids that actually occur in the corpus onto a dense range
# 0..vocab_size-1 ("ordinals"), so the model only needs one embedding row and
# one output logit per token that is really used.
num_tokens = len(_tokens)
# Sorted so the id <-> ordinal mapping never depends on set iteration order;
# saved checkpoints are only decodable if this mapping is stable across runs.
vocab = sorted(set(_tokens))
vocab_size = len(vocab)

# ordinal_encodings: otoe = ordinal -> tokenizer id, etoo = tokenizer id -> ordinal
otoe = dict(enumerate(vocab))
etoo = {encoding: ordinal for ordinal, encoding in otoe.items()}


def ordinalize(t):
    """Return the dense ordinal for tokenizer id ``t`` (KeyError if ``t`` is not in the corpus)."""
    return etoo[t]


def deordinalize(t):
    """Return the tokenizer id that ordinal ``t`` stands for (KeyError if out of range)."""
    return otoe[t]


tokens = [ordinalize(t) for t in _tokens]
# Sanity checks: the mapping round-trips and the ordinals are dense.
assert _tokens == [deordinalize(t) for t in tokens]
print(f'number of tokens = {len(tokens)}')
assert max(tokens) == vocab_size - 1

number of tokens = 4604451


In [8]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [9]:
# ---- batching ----
batch_size = 16        # sequences drawn per call to get_batch
block_size = 32        # context length in tokens (also the size of the position table)

# ---- training schedule ----
max_iters = 10000      # optimisation steps per training round
eval_iters = 500       # batches averaged per split when estimating the loss
eval_interval = 500    # steps between loss estimates
learning_rate = 1e-2

# ---- model size / regularisation ----
n_embd = 32
n_head = 4
n_layer = 4
dropout = 0.5

# ---- runtime ----
device = "cuda" if torch.cuda.is_available() else "cpu"
# Seed torch's RNG so weight init, batch sampling and text sampling are
# repeatable after Restart Kernel -> Run All.
seed = 1337
torch.manual_seed(seed)
print("device:" + device)

device:cuda


In [14]:

class Head(nn.Module):
    """One head of causal (masked) self-attention.

    The input of width ``n_embd`` is projected to key, query and value
    vectors of width ``head_size``. Each position attends only to itself and
    earlier positions, and the output is the attention-weighted sum of the
    value vectors.

    Reads the notebook-level settings ``n_embd``, ``block_size`` and
    ``dropout`` at construction time.

    Args:
        head_size: width of the key/query/value projections and of the output.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular causal mask, registered as a buffer so it moves with
        # the module (.to / .cuda) without being a trainable parameter.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply masked self-attention to ``x`` of shape (B, T, n_embd), T <= block_size.

        Returns a tensor of shape (B, T, head_size).
        """
        _, T, _ = x.shape
        k = self.key(x)   # (B, T, hs)
        q = self.query(x) # (B, T, hs)
        # compute attention scores ("affinities").
        # Scaled dot-product attention divides by sqrt(d_k), where d_k is the
        # key width (head_size), not the input embedding width. Scaling by
        # n_embd over-flattens the softmax whenever there is more than one head.
        # NOTE: checkpoints trained with the previous n_embd-based scale will
        # produce different attention weights under this corrected scale.
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        # Block attention to future positions before normalising.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B, T, hs)
        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out
    








class MultiHeadAttention(nn.Module):
    """Several self-attention heads run in parallel, then mixed by a linear projection.

    Each head's output (width ``head_size``) is concatenated along the last
    dimension and projected to ``n_embd`` so the result can be added to the
    residual stream.

    Reads the notebook-level settings ``n_embd`` and ``dropout``.

    Args:
        num_heads: number of ``Head`` modules to run side by side.
        head_size: output width of each head.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The concatenated width is num_heads * head_size, which equals n_embd
        # only when n_embd is divisible by the head count; size the projection
        # from the real concatenated width so both cases work.
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the projected concatenation of all heads applied to ``x``."""
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out
    






class FeedFoward(nn.Module):
    """Per-position MLP: widen to 4x, apply ReLU, project back, then dropout."""

    def __init__(self, n_embd):
        super().__init__()
        hidden_width = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, n_embd),
            nn.Dropout(dropout),  # rate comes from the notebook-level setting
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        transformed = self.net(x)
        return transformed
    





class Block(nn.Module):
    """One transformer layer: self-attention, then a feed-forward MLP.

    Both sub-layers see a LayerNorm-ed copy of their input and their output
    is added back onto the unnormalised input (residual connection).
    """

    def __init__(self, n_embd, n_head):
        # n_embd: width of the residual stream; n_head: how many attention heads to use
        super().__init__()
        per_head_width = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, per_head_width)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        # communication: tokens exchange information through attention
        attended = self.sa(self.ln1(x))
        x = x + attended
        # computation: each position is transformed independently
        refined = self.ffwd(self.ln2(x))
        return x + refined
    



class BigramLanguageModel(nn.Module):
    """Small decoder-only transformer language model.

    Despite the name, this is not a pure bigram lookup: token and position
    embeddings are summed, passed through ``n_layer`` transformer blocks and a
    final LayerNorm, then projected to one logit per vocabulary entry.

    Reads the notebook-level settings ``n_embd``, ``block_size``, ``n_head``
    and ``n_layer``.

    Args:
        vocab_size: number of distinct token ids the model can read and emit.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        # One learned vector per position, so inputs may be at most block_size long.
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, if ``targets`` is given, the loss.

        Args:
            idx: (B, T) integer tensor of token ids, with T <= block_size.
            targets: optional (B, T) integer tensor of next-token ids.

        Returns:
            ``(logits, loss)``. Without targets, logits are (B, T, vocab_size)
            and loss is None. With targets, logits are flattened to
            (B*T, vocab_size) and loss is the mean cross-entropy.
        """
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        # Build the position indices on the input's own device rather than the
        # notebook-global `device`, so the model works wherever it is moved.
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            # cross_entropy expects (N, classes) logits and (N,) targets
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample ``max_new_tokens`` tokens after ``idx``.

        Sampling runs without autograd and with the model in eval mode, so
        dropout does not perturb the predicted distribution. The previous
        train/eval mode is restored before returning.

        Args:
            idx: (B, T) integer tensor holding the current context.
            max_new_tokens: number of tokens to append to every row.

        Returns:
            (B, T + max_new_tokens) integer tensor: the context followed by
            the sampled continuation.
        """
        was_training = self.training
        self.eval()
        try:
            # idx is (B, T) array of indices in the current context
            for _ in range(max_new_tokens):
                # crop idx to the last block_size tokens
                idx_cond = idx[:, -block_size:]
                # get the predictions
                logits, _ = self(idx_cond)
                # focus only on the last time step
                logits = logits[:, -1, :] # becomes (B, C)
                # apply softmax to get probabilities
                probs = F.softmax(logits, dim=-1) # (B, C)
                # sample from the distribution
                idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
                # append sampled index to the running sequence
                idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        finally:
            self.train(was_training)
        return idx


In [10]:

# Hold the full ordinalized corpus as one 1-D int64 tensor on `device`.
data = torch.tensor(tokens, device=device, dtype=torch.long)
print(data.shape, data.dtype)

torch.Size([4604451]) torch.int64


In [11]:

# First 90% of the token stream is used for training, the remainder for validation.
split_at = int(num_tokens * 0.9)
train_data, val_data = data[:split_at], data[split_at:]

# Show which device the split lives on (an index for CUDA tensors).
train_data.get_device()

0

In [12]:

def get_batch(split):
    """Sample ``batch_size`` random windows of ``block_size`` tokens.

    ``split == "train"`` reads from ``train_data``; any other value reads from
    ``val_data``. Returns ``(x, y)``, both of shape (batch_size, block_size),
    where ``y`` is ``x`` shifted one token to the right (the next-token targets).
    """
    source = train_data if split == "train" else val_data
    # Each start leaves room for block_size inputs plus one extra target token.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        window = source[start: start + block_size + 1]
        inputs.append(window[:-1])
        targets.append(window[1:])
    return torch.stack(inputs), torch.stack(targets)


# Draw one training batch; its shapes are printed in the following cell.
xb, yb = get_batch("train")

In [13]:
# Sanity check: get_batch stacks batch_size windows of block_size tokens each.
# NOTE(review): the recorded output shows a leading dimension of 8 although the
# config cell sets batch_size = 16, so this output looks stale — re-run to confirm.
print(xb.shape)
print(yb.shape)


torch.Size([8, 32])
torch.Size([8, 32])


In [15]:

# Build the network and move its parameters and buffers onto `device`.
# nn.Module.to returns the module itself, so `m` and `model` are the same object.
model = BigramLanguageModel(vocab_size).to(device)
m = model


In [16]:
@torch.no_grad()
def estimate_loss():
    """Average the model loss over ``eval_iters`` random batches per split.

    The model is put in eval mode while measuring and switched back to train
    mode afterwards. Returns a dict with keys 'train' and 'val', each holding
    the mean loss as a 0-dim tensor.
    """
    model.eval()
    averages = {}
    for split in ('train', 'val'):
        batch_losses = []
        for _ in range(eval_iters):
            inputs, targets = get_batch(split)
            batch_losses.append(model(inputs, targets)[1].item())
        averages[split] = torch.tensor(batch_losses).mean()
    model.train()
    return averages


In [17]:


# report the model size in millions of parameters
n_params = sum(p.numel() for p in m.parameters())
print(n_params/1e6, 'M parameters')

# AdamW over every model parameter, using the configured learning rate
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

2.06093 M parameters


In [31]:
# Train in several rounds of max_iters steps and write a checkpoint after each
# round, named after the cumulative step count.
steps_before_this_run = 30000  # presumably steps trained in earlier sessions — TODO confirm
num_rounds = 5

for round_idx in range(num_rounds):

    for step in range(max_iters):

        # every once in a while evaluate the loss on train and val sets
        if step % eval_interval == 0 or step == max_iters - 1:
            losses = estimate_loss()
            print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

        # sample a batch of data
        xb, yb = get_batch('train')

        # evaluate the loss
        logits, loss = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    # Save under the per-round name. Previously the name was computed but a
    # fixed "Model_iter_30000" path was written, so every round overwrote the
    # same file and no intermediate checkpoint survived.
    model_name = "Model_iter_" + str(steps_before_this_run + (round_idx + 1) * max_iters)
    torch.save(model.state_dict(), model_name)


step 0: train loss 5.2114, val loss 5.5830
step 500: train loss 5.2293, val loss 5.5942
step 1000: train loss 5.2099, val loss 5.5962
step 1500: train loss 5.2176, val loss 5.6108
step 2000: train loss 5.2129, val loss 5.5932
step 2500: train loss 5.2437, val loss 5.5880
step 3000: train loss 5.2218, val loss 5.6148
step 3500: train loss 5.2331, val loss 5.5978
step 4000: train loss 5.2139, val loss 5.5798
step 4500: train loss 5.2313, val loss 5.6084
step 5000: train loss 5.2271, val loss 5.5835
step 5500: train loss 5.2187, val loss 5.5946
step 6000: train loss 5.2174, val loss 5.5847
step 6500: train loss 5.2255, val loss 5.5939
step 7000: train loss 5.2139, val loss 5.5921
step 7500: train loss 5.2213, val loss 5.6061
step 8000: train loss 5.2224, val loss 5.5863
step 8500: train loss 5.2081, val loss 5.6001
step 9000: train loss 5.2519, val loss 5.5956
step 9500: train loss 5.2040, val loss 5.5731
step 9999: train loss 5.2191, val loss 5.5992
step 0: train loss 5.1989, val loss 5.

4


In [33]:
# Persist the current weights (parameters and buffers) to disk.
final_state = model.state_dict()
torch.save(final_state, "Model_iter_80000")

In [35]:
# Use a validation batch as prompts and extend every row by 100 sampled tokens.
xb, yb = get_batch('val')
n_new_tokens = 100
_idx = model.generate(xb, n_new_tokens)

print(_idx.shape)

torch.Size([8, 132])


In [36]:

# Turn each generated row back into text: ordinal -> tokenizer id -> string.
for sample in _idx:
    token_ids = [deordinalize(int(ordinal)) for ordinal in sample]
    print(tokenizer.decode(token_ids))


riv'd' whence in that part, where first a breach
As of a wall appear'd, I could descry
A portal, and three steps beneath his body on me found
Then spied and whence they
(Stands it's sons of my poor with the goddisedies,
Were curves and stabbing mightier ancestry.
Such fever, and shame denied Scouts far and seed,
Nay the flame, they by whilst
Of his we will ask what we twain.
I could bras yet pad.
In the feeling more
 sadly of unknown, that bleared to Knowledge,, all things. 1737,

Your sports did determine in the month of July;
There's less fraud in plain damme than your sly by my truly;
'Tis sack he now serve".
Tune my heart is done there last. Cato dede you so,
Or rather yet not so welessly Company with her
Nor will much pursued, and sittingainy trees in holier summer stands.
But wore, till the world's rich a chap and told I fell,
Shook with the wide a grain,
A mind with hard and wit and themselves alive,
Of Nature and discontent amazed,
Of his head and room to rank

Let thy heels sp

In [None]:

# xb, yb = get_batch('train')
# _idx = model.generate(xb, 100)

# print(_idx.shape)

# for batch in _idx:
#     res = []
#     for num in batch:
#         num2 = deordinalize(int(num))
#         res.append(num2)
#     resstr = tokenizer.decode(res)
#     print(resstr)
