# Developing my own GPT for stories

In [2]:
# Load the whole story corpus into memory as one string
with open(r'../assets/contes.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [5]:
# Report the corpus size, counted in characters (len of the string read above)
print("length of dataset in characters: ", len(text))

length of dataset in characters:  441596


In [6]:
# Look at the first 1000 characters of the corpus
print(text[:1000])

LA BARBE BLEUE.

Il était une fois un homme qui avait de belles maisons à la ville et à la campagne, de la vaisselle d'or et d'argent, des meubles en broderie et des carrosses tout dorés. Mais, par malheur, cet homme avait la barbe bleue; cela le rendait si

laid et si terrible, qu'il n'était femme ni fille qui ne s'enfuît devant lui.

Une de ses voisines, dame de qualité, avait deux filles parfaitement belles. Il lui en demanda une en mariage, en lui laissant le choix de celle qu'elle voulait lui donner. Elles n'en voulaient point toutes deux, et se le renvoyaient l'une à l'autre, ne pouvant se résoudre à prendre un homme qui eût la barbe bleue. Ce qui les dégoûta encore, c'est qu'il avait déjà épousé plusieurs femmes, et qu'on ne savait ce que ces femmes étaient devenues.

La Barbe Bleue, pour faire connaissance, les mena, avec leur mère et trois ou quatre de leurs meilleures amies, et quelques jeunes gens du voisinage, à une de ses maisons de campagne, où on demeura huit jours entie

In [7]:
# Character-level vocabulary: every distinct character of the corpus, sorted
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars), vocab_size, sep='\n')


 !"'(),-.0123456789:;?ABCDEFGHIJLMNOPQRSTUVXYZabcdefghijlmnopqrstuvxyz«»ÇÉÊàâçèéêëîïôùûœ—
90


## Tokenization


We could use a subword tokenizer such as SentencePiece from Google, but we start with a simple character-level tokenizer.

In [8]:
# Character-level tokenizer: map every vocabulary character to an integer id and back
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Convert a string into a list of integer ids, one per character.

    Raises KeyError if `s` contains a character that is not a key of `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Convert an iterable of integer ids back into the corresponding string.

    Raises KeyError if an id is not a key of `itos`.
    """
    return ''.join([itos[i] for i in l])


# Round trip: decoding the encoded string must give the original string back
print(encode('Salut, tu vas bien ?'))
print(decode(encode('Salut, tu vas bien ?')))

[40, 47, 57, 66, 65, 7, 1, 65, 66, 1, 67, 47, 64, 1, 48, 55, 51, 59, 1, 22]
Salut, tu vas bien ?


For comparison, we can also use the `tiktoken` package from OpenAI.

In [21]:
import tiktoken
# Tokenizer that tiktoken associates with the "gpt-4" model name
# (alternative: tiktoken.get_encoding("cl100k_base"))
enc = tiktoken.encoding_for_model("gpt-4")
sample_ids = enc.encode("Salut, tu vas bien ?")
print(sample_ids)
print(enc.decode(sample_ids))

[17691, 332, 11, 9964, 44496, 14707, 949]
Salut, tu vas bien ?


In [24]:
# Encode the whole corpus with the character-level `encode` and keep the ids in a tensor
import torch
data = torch.tensor(data=encode(text), dtype=torch.int64)
print(data.shape, data.dtype)
# Only the first 1000 ids are shown
print(data[:1000])

torch.Size([441596]) torch.int64
tensor([33, 23,  1, 24, 23, 39, 24, 27,  1, 24, 33, 27, 42, 27,  9,  0,  0, 31,
        57,  1, 80, 65, 47, 55, 65,  1, 66, 59, 51,  1, 52, 60, 55, 64,  1, 66,
        59,  1, 54, 60, 58, 58, 51,  1, 62, 66, 55,  1, 47, 67, 47, 55, 65,  1,
        50, 51,  1, 48, 51, 57, 57, 51, 64,  1, 58, 47, 55, 64, 60, 59, 64,  1,
        76,  1, 57, 47,  1, 67, 55, 57, 57, 51,  1, 51, 65,  1, 76,  1, 57, 47,
         1, 49, 47, 58, 61, 47, 53, 59, 51,  7,  1, 50, 51,  1, 57, 47,  1, 67,
        47, 55, 64, 64, 51, 57, 57, 51,  1, 50,  4, 60, 63,  1, 51, 65,  1, 50,
         4, 47, 63, 53, 51, 59, 65,  7,  1, 50, 51, 64,  1, 58, 51, 66, 48, 57,
        51, 64,  1, 51, 59,  1, 48, 63, 60, 50, 51, 63, 55, 51,  1, 51, 65,  1,
        50, 51, 64,  1, 49, 47, 63, 63, 60, 64, 64, 51, 64,  1, 65, 60, 66, 65,
         1, 50, 60, 63, 80, 64,  9,  1, 34, 47, 55, 64,  7,  1, 61, 47, 63,  1,
        58, 47, 57, 54, 51, 66, 63,  7,  1, 49, 51, 65,  1, 54, 60, 58, 58, 51,
       

In [25]:
# Train on the first 90% of the tokens and keep the last 10% for validation
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [26]:
block_size = 8  # size of the context (number of tokens)
# block_size + 1 tokens are shown: enough for block_size inputs plus their shifted-by-one targets
train_data[:block_size + 1]

tensor([33, 23,  1, 24, 23, 39, 24, 27,  1])

In [28]:
# Every prefix of the chunk is one training example; its label is the token that follows it
x = train_data[:block_size]
y = train_data[1:block_size + 1]
for t, target in enumerate(y):
    context = x[: t + 1]
    print(f"When the input is {context} the target is: {target}")

When the input is tensor([33]) the target is: 23
When the input is tensor([33, 23]) the target is: 1
When the input is tensor([33, 23,  1]) the target is: 24
When the input is tensor([33, 23,  1, 24]) the target is: 23
When the input is tensor([33, 23,  1, 24, 23]) the target is: 39
When the input is tensor([33, 23,  1, 24, 23, 39]) the target is: 24
When the input is tensor([33, 23,  1, 24, 23, 39, 24]) the target is: 27
When the input is tensor([33, 23,  1, 24, 23, 39, 24, 27]) the target is: 1


In [30]:
torch.manual_seed(42)  # seed the torch RNG used by get_batch's torch.randint
batch_size = 4  # how many independent sequences we process in parallel
block_size = 8  # what is the maximum context length for predictions

def get_batch(split):
    """Sample a random mini-batch of input and target sequences.

    `split` selects the source: 'train' reads from `train_data`, any other
    value reads from `val_data`. Returns `(x, y)`, each a stack of
    `batch_size` windows of `block_size` tokens; `y` is `x` shifted one
    position to the right.
    """
    source = train_data if split == 'train' else val_data
    # Random start offsets, bounded so that the shifted target window still fits
    starts = torch.randint(len(source) - block_size, (batch_size, ))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start: start + block_size])
        targets.append(source[start + 1: start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

# Draw one training batch and display inputs and targets with their shapes
xb, yb = get_batch('train')
print('Inputs :', xb.shape, xb, '', sep='\n')
print('Tragets :', yb.shape, yb, sep='\n')

print('------')

Inputs :
torch.Size([4, 8])
tensor([[55, 65,  1, 66, 59,  1, 61, 51],
        [66, 55,  1, 47, 57, 57, 79, 63],
        [ 9,  0,  0, 31, 57,  1, 48, 47],
        [55, 57, 57, 51,  1, 57, 51,  1]])

Tragets :
torch.Size([4, 8])
tensor([[65,  1, 66, 59,  1, 61, 51, 66],
        [55,  1, 47, 57, 57, 79, 63, 51],
        [ 0,  0, 31, 57,  1, 48, 47, 55],
        [57, 57, 51,  1, 57, 51,  1, 58]])
------


In [31]:
# Spell out every (context, target) pair contained in the batch
for b in range(batch_size):
    inputs_row, targets_row = xb[b], yb[b]
    for t in range(block_size):
        context = inputs_row[: t + 1]
        target = targets_row[t]
        print(f"When the input is {context} the target is: {target}")

When the input is tensor([55]) the target is: 65
When the input is tensor([55, 65]) the target is: 1
When the input is tensor([55, 65,  1]) the target is: 66
When the input is tensor([55, 65,  1, 66]) the target is: 59
When the input is tensor([55, 65,  1, 66, 59]) the target is: 1
When the input is tensor([55, 65,  1, 66, 59,  1]) the target is: 61
When the input is tensor([55, 65,  1, 66, 59,  1, 61]) the target is: 51
When the input is tensor([55, 65,  1, 66, 59,  1, 61, 51]) the target is: 66
When the input is tensor([66]) the target is: 55
When the input is tensor([66, 55]) the target is: 1
When the input is tensor([66, 55,  1]) the target is: 47
When the input is tensor([66, 55,  1, 47]) the target is: 57
When the input is tensor([66, 55,  1, 47, 57]) the target is: 57
When the input is tensor([66, 55,  1, 47, 57, 57]) the target is: 79
When the input is tensor([66, 55,  1, 47, 57, 57, 79]) the target is: 63
When the input is tensor([66, 55,  1, 47, 57, 57, 79, 63]) the target is

## Model creation

In [52]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(42)

class BigramLanguageModel(nn.Module):
    """Bigram language model.

    The logits for the next token are read from a (vocab_size, vocab_size)
    lookup table indexed by the current token only.
    """

    def __init__(self, vocab_size) -> None:
        super().__init__()
        # Each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, if targets are given, the loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: optional (B, T) tensor of integer token ids.

        Returns:
            (logits, loss). Without targets, logits is (B, T, C) and loss is
            None. With targets, logits is flattened to (B * T, C) and loss is
            the scalar cross-entropy.

        B is the batch size, T the context length and C the number of
        channels, i.e. the size of the vocabulary.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)
        if targets is None:
            loss = None
        else:
            # F.cross_entropy expects the class dimension C in second position,
            # so batch and time are merged into a single leading dimension
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling never needs gradients; avoids building an autograd graph per step
    def generate(self, idx, max_new_tokens):
        """Extend each sequence in `idx` by `max_new_tokens` sampled tokens.

        Args:
            idx: (B, T) tensor of integer token ids, the current context.
            max_new_tokens: number of tokens to sample and append.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the samples.
        """
        for _ in range(max_new_tokens):
            # get predictions (no targets, so the loss is None and is ignored)
            logits, _ = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :]  # (B, C)
            # apply softmax over the vocabulary dimension to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T + 1)
        return idx

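A model that guesses uniformly at random has a cross-entropy loss of $-\ln(1/\text{vocab\_size}) = \ln(90) \approx 4.50$. An untrained model should score close to this baseline; its randomly initialised logits are not exactly uniform, so the measured value can be somewhat higher.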

In [53]:
# Sanity check: run one batch through the untrained model
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)  # (B * T, C): forward() flattens the logits when targets are given
print(loss)

torch.Size([32, 90])
tensor(5.3363, grad_fn=<NllLossBackward0>)


In [66]:
# Start from a single token with id 0 and sample 100 new tokens
idx = torch.zeros((1,1), dtype=torch.long)
generated = m.generate(idx, max_new_tokens=100)[0]
print(decode(generated.tolist()))


lLxETYvCpnfô2aM4èîyRTn8EôAîFzpxvz1s»Fi—2n?Z9;Êee-ErLLeO7'çèDuPoGâ5dôâEdi8,tëcxVbbÉî7Bi3ù;UQ8cÉMe-àv5


## Model training

In [75]:
from tqdm import tqdm

In [67]:
# Create a PyTorch optimizer (AdamW) over all the model parameters
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [76]:
batch_size = 32
for steps in tqdm(range(100_000)):

    # draw a fresh random batch from the training split
    xb, yb = get_batch('train')

    # clear old gradients, then forward pass, backward pass and parameter update
    optimizer.zero_grad(set_to_none=True)
    logits, loss = m(xb, yb)
    loss.backward()
    optimizer.step()

# loss of the last training batch only
print(loss.item())

  0%|          | 0/100000 [00:00<?, ?it/s]

100%|██████████| 100000/100000 [04:33<00:00, 365.08it/s]

2.3640780448913574





In [77]:
# Start from a single token with id 0 and sample 400 new tokens
idx = torch.zeros((1,1), dtype=torch.long)
generated = m.generate(idx, max_new_tokens=400)[0]
print(decode(generated.tolist()))


Vent, elhéchava chai se qui-grenoit, irone mogainvoù va fe mis det blouintant, blaievavœu'one n rempées r »
n a qu'ét.

Ile ail ceurou'all de, de chet. ju aitre des ce pamitt drsill ne ss mpon eunene deinseueai ces'a-it cr, e. plluvorre ablletos te je r. mons allemoite à à porerifuintaît-met fi na pe t qunsan paime e t-à an d oitôt, let, vou'ai (cesin Pep. qui!
El, lle, d Puaissureto- ut de let t 


## Mathematical trick in self-attention

To avoid communication between the past and the "future", each position may only aggregate information from itself and from earlier positions.

In [78]:
B, T, C = 4, 8, 2  # batch, time, channels
x = torch.randn(B, T, C)  # random toy input of shape (B, T, C)
x.shape

torch.Size([4, 8, 2])

We first use a very inefficient method (explicit Python loops) to aggregate the previous information.

In [79]:
# We want xbow[b, t] = mean_{i <= t} x[b, i]
# version 1: explicit loops (easy to read, but slow)
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b, : t + 1]  # (t + 1, C)
        xbow[b, t] = xprev.mean(dim=0)

In [80]:
# Raw values of the first sequence of x
x[0]

tensor([[ 0.3565, -0.1091],
        [ 1.5039,  1.5870],
        [ 0.8381,  0.2145],
        [-1.5449,  1.0934],
        [ 1.0445, -1.5426],
        [ 1.1026,  0.5029],
        [-0.5149, -0.2694],
        [-0.7648, -0.9041]])

In [81]:
# Same sequence after averaging: row t is the mean of rows 0..t of x[0]
xbow[0]

tensor([[ 0.3565, -0.1091],
        [ 0.9302,  0.7390],
        [ 0.8995,  0.5641],
        [ 0.2884,  0.6965],
        [ 0.4396,  0.2486],
        [ 0.5501,  0.2910],
        [ 0.3980,  0.2110],
        [ 0.2526,  0.0716]])

Now, we use a better way to aggregate the information, using parallel computation (a matrix multiplication).

In [86]:
torch.manual_seed(42)
# Row-normalised lower-triangular matrix: row i of (a @ b) is the mean of rows 0..i of b
a = torch.ones(3, 3).tril()
a = a / a.sum(dim=1, keepdim=True)
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b  # matrix multiplication
for label, value in (("a = ", a), ("b = ", b)):
    print(label, value)
    print('----')
print("c = ", c)

a =  tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
----
b =  tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
----
c =  tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [89]:
# version 2: a single matrix multiplication with row-normalised lower-triangular weights
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(dim=1, keepdim=True)
xbow2 = wei @ x  # (T, T) @ (B, T, C) --> (B, T, C)
torch.allclose(xbow, xbow2)

True

In [91]:
# version 3: use softmax
# Positions above the diagonal are set to -inf, so softmax gives them weight 0;
# the remaining zeros in each row become equal weights
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros(T, T).masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
torch.allclose(xbow, xbow3)

True