In [1]:
# The Tiny Shakespeare dataset was downloaded from:
# https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [2]:
# Read the entire corpus into memory as a single string.
with open('./data/input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Number of characters in the corpus (characters, not bytes: the file is decoded as UTF-8).
print("length of text", len(text))

length of text 1115393


In [3]:
# Character-level vocabulary: every distinct character of the corpus, in sorted order.
# A set keeps exactly one copy of each character, so duplicates collapse automatically.
charachters = sorted(set(text))
vocabular_size = len(charachters)
# Show the vocabulary as one string, followed by its size on the next line.
print(''.join(charachters), vocabular_size, sep='\n')


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [4]:
# Tokenizing the input text.
# This is a character-level tokenizer: every distinct character of the corpus is one token.
# It is a very basic scheme; ChatGPT, for example, uses a subword tokenizer, so this part
# would look different for a chatbot trained on other data.
stoi = {ch: i for i, ch in enumerate(charachters)}  # character -> integer id
itos = {i: ch for i, ch in enumerate(charachters)}  # integer id -> character


def encode(s):
    """Text to tokens: return the list of integer ids for the characters of `s`.

    Raises:
        KeyError: if `s` contains a character that is not in the vocabulary.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Tokens to text: join the characters for the integer ids in `l` into a string.

    Raises:
        KeyError: if `l` contains an id that is not in the vocabulary.
    """
    return ''.join(itos[i] for i in l)


# Round trip: encoding and then decoding gives back the original string.
print(encode('hey, this is a test'))
print(decode(encode('hey, this is a test')))

[46, 43, 63, 6, 1, 58, 46, 47, 57, 1, 47, 57, 1, 39, 1, 58, 43, 57, 58]
hey, this is a test


In [5]:
import torch
# Encode the whole corpus and hold it as a tensor: a multi-dimensional array whose elements
# all share a single data type. torch.long stores every token id as a 64-bit integer.
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])  # the first 1000 token ids of the corpus

torch.Size([1115393]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [6]:
# The model is trained on small chunks of the data at a time, not on the whole text at once.
# Split the token stream first: the leading 90% is the training set, the remaining 10% is
# held out as the validation set.
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]
print(train_data)

tensor([18, 47, 56,  ..., 46, 43, 56])


In [7]:
block_size = 8 # the length of the chunks that are fed to the model for training; in our case 8 characters at a time
train_data[:block_size + 1] # block_size + 1 tokens, so that each of the block_size positions has a following token

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [8]:
# The principle of learning: one chunk of block_size + 1 tokens yields block_size examples.
# For every position t the context is x[:t+1] and the target is the token right after it, y[t].
x = train_data[:block_size]
y = train_data[1:block_size+1] # the same tokens as x, one step ahead
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"When input is {context} the target: {target}")
print(range(block_size))
print(x)
print(y)

When input is tensor([18]) the target: 47
When input is tensor([18, 47]) the target: 56
When input is tensor([18, 47, 56]) the target: 57
When input is tensor([18, 47, 56, 57]) the target: 58
When input is tensor([18, 47, 56, 57, 58]) the target: 1
When input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
When input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58
range(0, 8)
tensor([18, 47, 56, 57, 58,  1, 15, 47])
tensor([47, 56, 57, 58,  1, 15, 47, 58])


In [9]:
# Generate training batches
torch.manual_seed(1337) # fixed seed, so the randomly sampled batches are reproducible
batch_size = 4 # How many independent sequences will we process in parallel
block_size = 8 # The maximum context length for a prediction
# Together these determine the dimensions of the PyTorch tensors: (batch_size, block_size).

def get_batch(split):
    """Sample a random batch of input chunks and their next-token targets.

    Reads the notebook globals train_data, val_data, block_size and batch_size.

    Args:
        split: 'train' selects train_data; any other value selects val_data.

    Returns:
        (x, y): two tensors of shape (batch_size, block_size). y holds the same
        tokens as x shifted one position ahead, so y[b, t] follows x[b, t].
    """
    source = train_data if split == 'train' else val_data
    # One random start offset per sequence. The upper bound leaves room for the
    # target window, which ends one token later than the input window.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

# Draw one training batch and show the inputs next to their targets.
xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

# Spell out every training example packed into the batch:
# batch_size * block_size pairs of (context, target).
for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[53, 59,  6,  1, 58, 56, 47, 40],
        [49, 43, 43, 54,  1, 47, 58,  1],
        [13, 52, 45, 43, 50, 53,  8,  0],
        [ 1, 39,  1, 46, 53, 59, 57, 43]])
targets:
torch.Size([4, 8])
tensor([[59,  6,  1, 58, 56, 47, 40, 59],
        [43, 43, 54,  1, 47, 58,  1, 58],
        [52, 45, 43, 50, 53,  8,  0, 26],
        [39,  1, 46, 53, 59, 57, 43,  0]])
when input is [53] the target: 59
when input is [53, 59] the target: 6
when input is [53, 59, 6] the target: 1
when input is [53, 59, 6, 1] the target: 58
when input is [53, 59, 6, 1, 58] the target: 56
when input is [53, 59, 6, 1, 58, 56] the target: 47
when input is [53, 59, 6, 1, 58, 56, 47] the target: 40
when input is [53, 59, 6, 1, 58, 56, 47, 40] the target: 59
when input is [49] the target: 43
when input is [49, 43] the target: 43
when input is [49, 43, 43] the target: 54
when input is [49, 43, 43, 54] the target: 1
when input is [49, 43, 43, 54, 1] the target: 47
when input is [49, 43, 43, 5

In [10]:
print(xb) # the (batch_size, block_size) batch of input token ids sampled above

tensor([[53, 59,  6,  1, 58, 56, 47, 40],
        [49, 43, 43, 54,  1, 47, 58,  1],
        [13, 52, 45, 43, 50, 53,  8,  0],
        [ 1, 39,  1, 46, 53, 59, 57, 43]])


In [11]:
import torch
import torch.nn as nn 
from torch.nn import functional as F 
torch.manual_seed(1337) # re-seed so that the random initialisation and the sampling below are reproducible
# Bigram language model
class BigramLanguageModel(nn.Module):
    """Bigram language model: the next token is predicted from the current token alone.

    The only parameter is a (vocabular_size x vocabular_size) embedding table. The row
    looked up for a token is used directly as the logits (unnormalised scores) for the
    token that follows it.
    """

    def __init__(self, vocabular_size):
        """Create the lookup table for a vocabulary of `vocabular_size` tokens."""
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocabular_size, vocabular_size)

    def forward(self, idx, targets=None):
        """Score the next token for every position of `idx`.

        Args:
            idx: (B, T) integer tensor of token ids (B = batch, T = time).
            targets: optional (B, T) integer tensor with the expected next token ids.

        Returns:
            (logits, loss). Without targets, logits has shape (B, T, C) with
            C = vocabulary size, and loss is None. With targets, logits is
            flattened to (B*T, C) and loss is the cross-entropy between the
            logits and the targets.
        """
        # Logits are the scores for the next character in the sequence.
        logits = self.token_embedding_table(idx)  # (B, T, C)
        if targets is None:
            return logits, None

        # Fold batch and time into one dimension before computing the loss:
        # logits become (B*T, C), targets become (B*T,).
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)
        loss = F.cross_entropy(logits, targets)  # quality of the logits with respect to the targets
        return logits, loss

    @torch.no_grad()  # sampling is never backpropagated through, so do not build an autograd graph
    def generate(self, idx, max_new_tokens):
        """Extend every sequence in `idx` by `max_new_tokens` sampled tokens.

        Args:
            idx: (B, T) integer tensor of token ids, the current context, e.g.
                [[57,  0, 58, 46, 39, 52,  1, 50],
                 [26, 19,  1, 20, 17, 26, 30, 37]]
            max_new_tokens: number of tokens to append to each sequence.

        Returns:
            (B, T + max_new_tokens) integer tensor: the context followed by the
            sampled continuation.
        """
        for _ in range(max_new_tokens):
            # (B, T, C): one row of C scores for every position of every sequence
            logits, _ = self(idx)
            # (B, C): keep only the predictions made from the last token of each sequence
            logits = logits[:, -1, :]
            # (B, C): turn the scores into probabilities; every row sums to 1
            probs = F.softmax(logits, dim=-1)
            # (B, 1): sample one token per row, e.g. [[12], [0]]
            idx_next = torch.multinomial(probs, num_samples=1)
            # (B, T+1): append the sampled token to its sequence, e.g.
            # [57, 0, 58, 46, 39, 52, 1, 50] -> [57, 0, 58, 46, 39, 52, 1, 50, 12]
            idx = torch.cat((idx, idx_next), dim=1)
        return idx
    
m = BigramLanguageModel(vocabular_size)
logits, loss = m(xb, yb) # xb = input, yb = targets
# For reference: guessing uniformly over the 65 characters would give a loss of ln(65), about 4.17.
print(loss)

# Run the generation: start from a single token with id 0 and sample 100 new tokens.
# The model is untrained at this point, so the result is gibberish.
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))

tensor(4.8948, grad_fn=<NllLossBackward0>)

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


In [12]:
# AdamW optimizer over all parameters of the model, with a learning rate of 1e-3.
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [20]:
batch_size = 32 # overrides the notebook-global batch size that get_batch reads
for steps in range(10000): # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True) # drop the gradients of the previous step
    loss.backward() # backpropagate the loss
    optimizer.step() # update the parameters

# This is the loss of the last batch only, so it is a noisy estimate of the training loss.
# NOTE(review): the execution count of this cell is out of sequence, which suggests it was
# run more than once. Every run continues training the same model, so the printed value may
# not be reproducible from a fresh kernel — TODO confirm with Restart & Run All.
print(loss.item())

2.4325833320617676


In [14]:
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist())) # the output is getting more structure
# At this point every character is predicted from the single preceding character only.
# The next step is to let a token communicate with the earlier tokens in its context.


llo br. ave aviasurf my, may be ivee iuedrd whar ksth y h bora s be hese, woweee; the! KI 'de, ulseecherd d o blllando;LUCEO, oraingofoff ve!
RIfans picsheserer hee anf,
TOFonk? me ain ckntoty ded. bo'llll st ta d:
ELIS me hurf lal y, ma dus pe athouo
By bre ndy; by s afreanoo adicererupa anse tecorro llaus a!
OLengerithesinthengove fal ames trr
TI ar I t, mes, n sar; my w,

Whank'the
THek' merer, dd
We ntem lud engitonso; cer ize helour
Jginte the?
Thak orblyoruldvicee chot, p,
Bealivolde Th ll


In [15]:
# Toy input for the averaging examples: a random tensor of shape (batch, time, channels).
torch.manual_seed(1337)
B = 4   # batch
T = 8   # time
C = 32  # channels
x = torch.randn((B, T, C))
x.shape

torch.Size([4, 8, 32])

In [16]:
# xbow[b, t] is the mean of x[b, 0], ..., x[b, t]:
# every position averages itself with all of the positions before it.
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1]  # (t+1, C): everything up to and including position t
        xbow[b, t] = xprev.mean(dim=0)
print(x[0])
print(xbow[0])

tensor([[ 1.8077e-01, -6.9988e-02, -3.5962e-01, -9.1520e-01,  6.2577e-01,
          2.5510e-02,  9.5451e-01,  6.4349e-02,  3.6115e-01,  1.1679e+00,
         -1.3499e+00, -5.1018e-01,  2.3596e-01, -2.3978e-01, -9.2111e-01,
          1.5433e+00,  1.3488e+00, -1.3964e-01,  2.8580e-01,  9.6512e-01,
         -2.0371e+00,  4.9314e-01,  1.4870e+00,  5.9103e-01,  1.2603e-01,
         -1.5627e+00, -1.1601e+00, -3.3484e-01,  4.4777e-01, -8.0164e-01,
          1.5236e+00,  2.5086e+00],
        [-6.6310e-01, -2.5128e-01,  1.0101e+00,  1.2155e-01,  1.5840e-01,
          1.1340e+00, -1.1539e+00, -2.9840e-01, -5.0754e-01, -9.2392e-01,
          5.4671e-01, -1.4948e+00, -1.2057e+00,  5.7182e-01, -5.9735e-01,
         -6.9368e-01,  1.6455e+00, -8.0299e-01,  1.3514e+00, -2.7592e-01,
         -1.5108e+00,  2.1048e+00,  2.7630e+00, -1.7465e+00,  1.4516e+00,
         -1.5103e+00,  8.2115e-01, -2.1153e-01,  7.7890e-01,  1.5333e+00,
          1.6097e+00, -4.0323e-01],
        [-8.3447e-01,  5.9780e-01, -5.14

In [17]:
# The same running average as a matrix multiplication.
# Start from a lower-triangular matrix of ones: row t marks the positions 0..t that
# time step t is allowed to look at.
wei = torch.tril(torch.ones(T, T))
# wei =
#    [
#    [1, 0, 0, 0, 0, 0, 0, 0],
#    [1, 1, 0, 0, 0, 0, 0, 0],
#    [1, 1, 1, 0, 0, 0, 0, 0],
#    [1, 1, 1, 1, 0, 0, 0, 0],
#    [1, 1, 1, 1, 1, 0, 0, 0],
#    [1, 1, 1, 1, 1, 1, 0, 0],
#    [1, 1, 1, 1, 1, 1, 1, 0],
#    [1, 1, 1, 1, 1, 1, 1, 1]
#    ]

wei = wei / wei.sum(1, keepdim=True) # normalize each row to sum to 1.
# wei =
#    [
#    [1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
#    [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
#    [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
#    [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
#    [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
#    [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
#    [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
#    [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]
#    ]
# (T, T) @ (B, T, C) -> (B, T, C): the matrix product is broadcast over the batch, and
# row t of wei takes the weighted sum, here the mean, of x over positions 0..t.
xbow2 = wei @ x



In [29]:
# Self-attention: a weighted aggregation of the earlier positions that never looks at
# future characters. Unlike the plain average, the weights are computed from the data.
# The input stands in for a batch of token representations of shape (B, T, C).

torch.manual_seed(1337)
B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

# A single attention head: three bias-free linear projections of the input.
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x)    # (B, T, head_size)
q = query(x)  # (B, T, head_size)
v = value(x)  # (B, T, head_size)

# Affinity of every query with every key: (B, T, head_size) @ (B, head_size, T) --> (B, T, T)
wei = torch.matmul(q, k.transpose(-1, -2))

# Hide the future: a position above the diagonal gets -inf, which softmax turns into weight 0.
tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = wei.softmax(dim=-1)  # every row sums to 1

# Aggregate the values with the attention weights: (B, T, T) @ (B, T, head_size) --> (B, T, head_size)
out = torch.matmul(wei, v)


In [30]:
wei[0] # attention weights of the first sequence: row t holds the weights that position t gives to positions 0..t

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)