In [2]:
import requests

url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
response = requests.get(url)
text = response.text

In [3]:
print(text[:100])
print(len(text))

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You
1115394


In [4]:
# Obtaining all the set of characters that are ocurring in the text
# The vocabulary are the possible characters that the model can see or emit
chars = sorted(list(set(text)))
vocab_size = len(chars)
print("> Set of characters:"+''.join(chars))
print("> Vocab Size:", vocab_size)

> Set of characters:
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
> Vocab Size: 65


In [5]:
# Strategy to tokenize the text: convert the RAW text to some sequence of integers
# according to some vocabulary of elements
# Mapping the characters to integers, so we can obtain a encoder and decoder
# Character level

# Puedes tener una secuencia muy larga de tokens con vocabulario pequeño o
# o pequeñas secuencias de enteros con grandes vocabularios, 
# -- > In the practise, peopl uses subword encodings
stoi = { character:i for i,character in enumerate(chars)}
itos = {i:character for i,character in enumerate(chars)}

encode = lambda s: [ stoi[c] for c in s ]
decode = lambda s: ''.join([itos[i] for i in s])

print(encode("HOLA"))
print(decode(encode("HOLA")))

[20, 27, 24, 13]
HOLA


In [6]:
# Tokenizer used in GPT
import tiktoken
enc = tiktoken.get_encoding("cl100k_base")

# To get the tokeniser corresponding to a specific model in the OpenAI API:
enc = tiktoken.encoding_for_model("gpt-4")
print(enc.decode([23,666,2302]))

8 Thxt


In [7]:
# To keep it simple, we will be using the char lvl tokenizer, lets use torch
import torch

# Encoding the data into tensors
data = torch.tensor(encode(text), dtype=torch.long)

print(data.shape, data.dtype)
print(data[:100])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [8]:
# Obtraining the training and validation set of data 
# Our split will be 90% for training set and 10% for validation 
n = int(0.9 * len(data))
train_data = data[:n]
valid_data = data[n:]


In [9]:
# We can't fit all the dataset into the model, so we need to create 
# minor batches for it, chunks of data, max length foir a chunk
block_size = 8 
'''
This string has actually multiple examples packed into it, because all of this characters 
follow each other. In this chunk of 9 chars, the're actually 8 individual examples 
packed in there
'''
train_data[:block_size + 1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [10]:
# Basically in the example before we have all these examples
x = train_data[:block_size]
y = train_data[1:block_size+1] # Targets for each position

for t in range(block_size):
    context = x[:t+1] # all characters up to t anmd including t
    target = y[t]
    print(f"When input is {context} the target is {target}")

# We do that to make transforme to seee the context of each of these words,
# For the first word,from the near neightbour to the end of them, so 
# all the way from as little as one to as block size.
# So when we sampling, we can sample from as little as one character
# and the transformers knows how to predict the next character in context of one, 
# and then to to size of block size

When input is tensor([18]) the target is 47
When input is tensor([18, 47]) the target is 56
When input is tensor([18, 47, 56]) the target is 57
When input is tensor([18, 47, 56, 57]) the target is 58
When input is tensor([18, 47, 56, 57, 58]) the target is 1
When input is tensor([18, 47, 56, 57, 58,  1]) the target is 15
When input is tensor([18, 47, 56, 57, 58,  1, 15]) the target is 47
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target is 58


In [11]:
# When fidding to the GPU, we will have many batches of chunks of text, because the GPU
# -> That's for take GPU busy, they're very good on parallel processing of data
# We want to proces multipole chuks, all at the same time, but they're processed completely
# independently and do not talk with each other

# Generating the batch of data
torch.manual_seed(1337)
batch_size = 4 # The independent sequences that we will process in parallel
block_size = 8 # The maximum context length for predictions

def get_batch(split):
    # Generates a small batch of data of inputs x and targets y
    data = train_data if split =='train' else valid_data
    # Size between zero and random block size
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y 

xb, yb = get_batch('train')
print("Inputs: ")
print(xb.shape)
print(xb)
print("Tragets: ")
print(yb.shape)
print(yb)

print('-'*10)

for b in range(batch_size): # Batch dimension
    for t in range(block_size): # Time dimension
        context = xb[b, :t+1]
        target = yb[b,t]
        # print(f"When input is {context.tolist()} the targtet is: {target}")


Inputs: 
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
Tragets: 
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
----------


In [12]:
print(xb)
print(yb)

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


In [13]:
import torch
import torch.nn as nn 
from torch.nn import functional as F 

torch.manual_seed(1337)


class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()  
        # Each token directly reads off the logits for the next token from a lookup table 
        # This creates a token embedding table of size: vocab_size x vocab_size
        # bawsically is a tensor of vocab_size x vocab_size
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        # idx and targets are both (B,T) tensor integers 
        logits = self.token_embedding_table(idx) # (Batch=4, Time=8, Channel=65) tensor
        # Channel and vocab size are the same, The logits 
        # are basically the scores for the next character sequence
        # Here we predict what comes next just on individual identity of this single token
        # This tokens do not see any context yet.
        # ------------------------------

        if targets is None:
            loss = None # Will return logits in (B, T, C) form and None
        else:
            # Obtaining the loss, negative log likelihood, between preddictions and targets 
            # Quality of logits respect targets, we have the identity of the next char, so how 
            # well we can predict it.
            # We also need to reshape the logits B,T,C to B,C,T in order to fit torch function
            B,T,C = logits.shape 

            # We stretch the logits into 2 dimensional tensor to conform better pythorch def
            # hERE we can evaluate the quality of the model on some data
            logits = logits.view(B*T, C) # Becomes (B, C)
            targets = targets.view(B*T) # -1 is also valid
            loss = F.cross_entropy(logits, targets) # -ln(1/65) == 4.174 should be the ideal loss

        return logits, loss 
    
    #Generate the model 
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # Get the predictions by calling the forward function 
            logits, loss = self(idx)
            # focus only on the last time step 
            logits = logits[:, -1, :] # Becomes (B, C)
            # Obtaining the probabilities by applying the softmax 
            probs = F.softmax(logits, dim=1) # (B, C)
            # Sample from the distribution to obtain new characters in the sequence 
            idx_next = torch.multinomial(probs, num_samples= 1) # (B, 1)
            # Append sampled index to the current sequence, 
            # Concatenating along the first dim, which is the Time dimension (T=8)
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx


m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

idx = torch.zeros((1,1), dtype=torch.long)
# As we work with batches, we use the index 0 to unplug the 0 batch
print(decode(m.generate(idx, max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [14]:
# training the model, let's create an optimizer 
# Using PyTorch optimizer
# Normally you will use 1e-3 o 1e-4 for networks, 
# but for smaller ones you can use bigger ones
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [15]:
# training the model 
batch_size = 32 # 4 Is too small
STEPS_RANGE = 10000

for steps in range(STEPS_RANGE):
    # Sample batch data 
    xb, yb = get_batch('train')

    # Evaluating the loss 
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True) # Zeroing all the gradients from the previous step
    loss.backward() # Getting the gradients for all the parameters
    optimizer.step() # Using those gradients to update the parameters

    if steps % (STEPS_RANGE / 10) == 0:    
        print(loss.item())


4.704006195068359
3.7031264305114746
3.1371781826019287
2.776794672012329
2.5844571590423584
2.5105180740356445
2.531585931777954
2.504757881164551
2.4696712493896484
2.4838879108428955


In [16]:
print(decode(m.generate(torch.zeros((1,1), dtype=torch.long), max_new_tokens=250)[0].tolist()))


Iyoteng h hasbe pave pirance
Rie hicomyonthar's
Plinseard ith henoure wounonthioneir thondy, y heltieiengerofo'dsssit ey
KIN d pe wither vouprrouthercc.
hathe; d!
My hind tt hinig t ouchos tes; st yo hind wotte grotonear 'so it t jod weancotha:
h hay


In [18]:
print(torch.cuda.is_available())

True


In [22]:
f = {'train':1.12355678}
print(f"{f['train']:.4f}")

1.1236


In [17]:
print(yb)
print(yb.view(4*8))
print(xb)

tensor([[58,  7, 57, 43, 56, 60, 39, 52],
        [46, 39, 50, 50,  1, 52, 53, 58],
        [ 0, 31, 58, 39, 52, 42, 57,  1],
        [58, 46, 63,  1, 46, 53, 56, 57],
        [50, 47,  1, 50, 47, 49, 43,  1],
        [41, 63,  6,  1, 50, 47, 49, 43],
        [ 8,  1, 25, 63,  1, 61, 53, 51],
        [24, 33, 15, 21, 27, 10,  0, 19],
        [60, 39, 52, 58,  7, 51, 39, 47],
        [53, 53, 42,  1, 26, 53, 56, 44],
        [44,  1, 63, 53, 59,  1, 53, 52],
        [30, 31, 10,  0, 13, 52, 42,  1],
        [ 2,  1, 57, 43, 56, 47, 53, 59],
        [46, 39, 56, 45, 43,  1, 63, 43],
        [43,  6,  0, 14, 63,  1, 41, 47],
        [46, 58, 43, 42,  1, 50, 47, 49],
        [ 0, 35, 47, 50, 58,  1, 58, 46],
        [30, 27, 25, 17, 27, 10,  0, 13],
        [53, 59, 56,  1, 45, 53, 53, 42],
        [57, 46, 39, 50, 50,  1, 49, 47],
        [11,  1, 39, 52, 42,  1, 52, 53],
        [58, 46, 43, 56,  1, 42, 39, 63],
        [10,  1, 21,  1, 61, 53, 59, 50],
        [63, 57, 43, 50, 44,  0, 2

RuntimeError: shape '[32]' is invalid for input of size 256

In [None]:
f = torch.tensor([
        [43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
print(f)
f = torch.cat((f, torch.tensor([[1],[2],[3],[4]])), dim=1)
print(f)
f = torch.cat((f, torch.tensor([[1],[2],[3],[4]])), dim=1)
print(f)

tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
tensor([[43, 58,  5, 57,  1, 46, 43, 39,  1],
        [53, 56,  1, 58, 46, 39, 58,  1,  2],
        [58,  1, 58, 46, 39, 58,  1, 46,  3],
        [17, 27, 10,  0, 21,  1, 54, 39,  4]])
tensor([[43, 58,  5, 57,  1, 46, 43, 39,  1,  1],
        [53, 56,  1, 58, 46, 39, 58,  1,  2,  2],
        [58,  1, 58, 46, 39, 58,  1, 46,  3,  3],
        [17, 27, 10,  0, 21,  1, 54, 39,  4,  4]])


In [None]:
def get_stats(ids):
    """
        Given a list of integers, return a dictionary of counts of consecutive pairs
        Example: [1, 2, 3, 1, 2] -> {(1, 2): 2, (2, 3): 1, (3, 1): 1}
    """
    counts = {}
    for pair in zip(ids, ids[1:]): # iterate consecutive elements
        counts[pair] = counts.get(pair, 0) + 1
    return counts

In [None]:
text = "Hello world"

# Text to bytes with utf8
ids = list(text.encode('utf-8'))
print(ids)
merges = {}


[72, 101, 108, 108, 111, 32, 119, 111, 114, 108, 100]


In [None]:
# Find the pair with the lowest merge index
while len(ids) >=2:
    # Obtaining the pairs with the lowest merge inputs 
    stats = get_stats(ids)
    print(stats)
    # pair = min(stats, key= lambda p: self.merges.get(p))
    break

{(72, 101): 1, (101, 108): 1, (108, 108): 1, (108, 111): 1, (111, 32): 1, (32, 119): 1, (119, 111): 1, (111, 114): 1, (114, 108): 1, (108, 100): 1}
