In [None]:
# The Tiny Shakespeare dataset was downloaded from:
# https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [None]:
# Read the whole corpus into memory as one string. The encoding is passed
# explicitly so the result does not depend on the platform's default encoding.
with open('./data/input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# len() of a str counts characters, not bytes.
print("length of text", len(text))

In [None]:
# The vocabulary is every distinct character of the corpus: a set keeps one copy
# of each character, and sorting gives every character a stable position.
charachters = sorted(set(text))
vocabular_size = len(charachters)
print(''.join(charachters), vocabular_size, sep='\n')

In [None]:
# Character-level tokenizer: every character of the vocabulary is one token.
# This is a very basic scheme. A chatbot trained on custom data would replace this
# part; ChatGPT, for example, uses a subword tokenizer instead.
# The two lookup tables below are inverses of each other.
stoi = {ch: i for i, ch in enumerate(charachters)}  # character -> integer id
itos = dict(enumerate(charachters))                 # integer id -> character


def encode(s):
    """Text to tokens: look up every character of the string `s` in `stoi`."""
    return [stoi[c] for c in s]


def decode(l):
    """Tokens to text: look up every integer of the list `l` in `itos` and join the characters."""
    return ''.join(itos[i] for i in l)


# Round trip: encoding and then decoding returns the original string.
print(encode('hey, this is a test'))
print(decode(encode('hey, this is a test')))

In [None]:
import torch
# Encode the whole corpus into one 1-D tensor of token ids.
# A tensor is a multi-dimensional array whose elements all share a single data type;
# torch.long is the 64-bit integer type.
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])  # the first 1000 token ids

In [None]:
# The model is trained on small chunks of the data at a time, not on all of it at once.
# The data is also split in two: the first 90% of the tokens is the training set
# and the remaining 10% is held out as the validation set.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]
print(train_data)

In [None]:
block_size = 8 # length of the chunks fed to the model during training; here 8 characters at a time.
# Show block_size + 1 tokens: block_size inputs plus the one extra token needed as the last target.
train_data[:block_size + 1]

In [None]:
# The principle behind the training data: y is x shifted by one token, so for
# every position t the tokens x[0..t] are the context and y[t] is the token
# that follows them. One chunk therefore yields block_size training examples.
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context, target = x[:t + 1], y[t]
    print(f"When input is {context} the target: {target}")
print(range(block_size))
print(x)
print(y)

In [None]:
# Batch generation
torch.manual_seed(1337)
batch_size = 4  # number of independent sequences processed in parallel
block_size = 8  # maximum context length used for a prediction
# Together these two values fix the shape of the tensors handed to the model.


def get_batch(split):
    """Sample one random batch of inputs and targets.

    `split` selects the source: 'train' reads from train_data, any other value
    reads from val_data. Returns (x, y), both of shape (batch_size, block_size),
    where y holds the same chunks as x shifted one position to the right.
    """
    source = train_data if split == 'train' else val_data
    # Random start offsets; the upper bound keeps start + block_size + 1 inside the data.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    x = torch.stack([source[s:s + block_size] for s in starts])          # block_size tokens from each start
    y = torch.stack([source[s + 1:s + block_size + 1] for s in starts])  # same chunks, offset by 1
    return x, y

xb, yb = get_batch('train')
print('inputs:', xb.shape, xb, sep='\n')
print('targets:', yb.shape, yb, sep='\n')

# Spell out every (context, target) example packed into the batch.
for b in range(batch_size):      # batch dimension
    for t in range(block_size):  # time dimension
        context, target = xb[b, :t + 1], yb[b, t]
        print(f"when input is {context.tolist()} the target: {target}")

In [None]:
# The (batch_size, block_size) batch of input token ids sampled by get_batch.
print(xb)

In [None]:
import torch
import torch.nn as nn 
from torch.nn import functional as F 
torch.manual_seed(1337)  # fixed seed so random draws made after this point are reproducible


class BigramLanguageModel(nn.Module):
    """Bigram language model.

    The scores for the next token depend on the current token only: each token
    id indexes one row of a (vocabular_size x vocabular_size) table, and that
    row is used directly as the logits for the token that follows.
    """

    def __init__(self, vocabular_size):
        super().__init__()
        # Each token directly reads off the logits for the next token from a
        # lookup table of shape (vocabular_size, vocabular_size), e.g. 65 x 65.
        self.token_embedding_table = nn.Embedding(vocabular_size, vocabular_size)

    def forward(self, idx, targets=None):
        """Score the next token at every position and optionally compute the loss.

        Args:
            idx: (B, T) integer tensor of token ids (B = batch, T = time).
            targets: optional (B, T) integer tensor with the expected next token ids.

        Returns:
            (logits, loss). Without targets, logits has shape (B, T, C) and
            loss is None. With targets, logits is flattened to (B*T, C) and
            loss is the cross-entropy between those logits and the targets.
        """
        # Logits are the scores for the next character in the sequence.
        logits = self.token_embedding_table(idx)  # (B, T, C), e.g. Batch == 4, Time == 8, Channel == 65
        if targets is None:
            return logits, None

        # F.cross_entropy wants the class scores in dimension 1, so batch and
        # time are folded into a single dimension first.
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        # Quality of the predictions: how well the logits match the targets.
        loss = F.cross_entropy(logits, targets.view(B * T))
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Extend every sequence in idx by max_new_tokens sampled tokens.

        Sampling never needs gradients, so the method runs under
        torch.no_grad() and the forward passes build no autograd graph.

        Args:
            idx: (B, T) integer tensor holding the current context, for example
                [[57,  0, 58, 46, 39, 52,  1, 50],
                 [26, 19,  1, 20, 17, 26, 30, 37],
                 [43,  8,  0,  0, 31, 43, 41, 53],
                 [54, 54, 63,  1, 40, 56, 43, 31]]
            max_new_tokens: number of tokens appended to every row.

        Returns:
            (B, T + max_new_tokens) integer tensor: idx followed by the samples.
        """
        for _ in range(max_new_tokens):
            # (B, T, C): one vector of C scores for every position of every row.
            logits, _loss = self(idx)
            # Keep only the prediction made from the last token of each row -> (B, C).
            # With the example above the last tokens are 50, 37, 53 and 31.
            logits = logits[:, -1, :]
            # Turn the scores into probabilities; every row [p0, p1, ... ] sums to 1.
            probs = F.softmax(logits, dim=-1)
            # Pick one token per row -> (B, 1), e.g. [[12], [0], [41], [63]].
            idx_next = torch.multinomial(probs, num_samples=1)
            # Append the sampled token to each row:
            #   before: [57,  0, 58, 46, 39, 52,  1, 50]
            #   after:  [57,  0, 58, 46, 39, 52,  1, 50, 12]
            idx = torch.cat((idx, idx_next), dim=1)
        return idx
    
m = BigramLanguageModel(vocabular_size)
logits, loss = m(idx=xb, targets=yb)  # xb = inputs, yb = targets
print(loss)

# Sample 100 new tokens, starting from a single token with id 0, and decode them.
# The model is still untrained here, so the text is gibberish.
print(decode(
    m.generate(idx=torch.zeros(1, 1, dtype=torch.long), max_new_tokens=100)[0].tolist()
))

In [None]:
# AdamW optimizer over all parameters of the model m, with a learning rate of 1e-3.
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [None]:
batch_size = 32  # get_batch reads this global, so every batch now holds 32 sequences
for steps in range(10000):  # increase the number of steps for better results
    # Sample a fresh random batch from the training split.
    xb, yb = get_batch('train')

    # Clear the previous gradients, evaluate the loss, backpropagate and update.
    optimizer.zero_grad(set_to_none=True)
    logits, loss = m(xb, yb)
    loss.backward()
    optimizer.step()

# Loss of the last training batch only.
print(loss.item())

In [None]:
# The output shows more structure now. At this point every character is still
# predicted from a single character; the next step is to let it use the other
# characters in the context as well.
print(decode(
    m.generate(idx=torch.zeros(1, 1, dtype=torch.long), max_new_tokens=500)[0].tolist()
))

In [None]:
# Toy input for the averaging examples that follow.
torch.manual_seed(1337)
B = 4  # batch
T = 8  # time
C = 2  # channels
x = torch.randn((B, T, C))
x.shape

In [28]:
# Version 1, explicit loops: xbow[b, t] is the mean of x[b, 0..t], so every
# position holds the average of itself and all earlier positions.
xbow = torch.zeros(B, T, C)
for b in range(B):
    for t in range(T):
        xprev = x[b, :t + 1]            # (t + 1, C): everything up to and including t
        xbow[b, t] = xprev.mean(dim=0)  # average over the time dimension -> (C,)
print(x[0])
print(xbow[0])

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])
tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])


In [None]:
# Version 2: the same running average written as one matrix multiplication.
# Start from a lower-triangular matrix of ones. Row t has ones in columns 0..t:
# at time step t you are allowed to look at positions 0 -> t.
wei = torch.ones(T, T).tril()
#    [
#    [1, 0, 0, 0, 0, 0, 0, 0],
#    [1, 1, 0, 0, 0, 0, 0, 0],
#    [1, 1, 1, 0, 0, 0, 0, 0],
#    [1, 1, 1, 1, 0, 0, 0, 0],
#    [1, 1, 1, 1, 1, 0, 0, 0],
#    [1, 1, 1, 1, 1, 1, 0, 0],
#    [1, 1, 1, 1, 1, 1, 1, 0],
#    [1, 1, 1, 1, 1, 1, 1, 1]
#    ]

# Normalize each row so it sums to 1; row t then holds the weight 1 / (t + 1)
# for each of the positions it may look at.
wei = wei / wei.sum(dim=1, keepdim=True)
#    [
#    [1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
#    [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
#    [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
#    [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
#    [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
#    [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
#    [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
#    [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]
#    ]

# Multiply the weights by the input x (not by logits): (T, T) @ (B, T, C) is
# broadcast over the batch to (B, T, C). Each output row is a weighted sum of
# the rows of x, which with these weights is the mean over positions 0..t.
xbow2 = wei @ x



In [36]:
# Show the loop version and the matrix version side by side for the first batch element; they should match.
xbow[0], xbow2[0]

(tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]),
 tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]))