In [1]:
#!pip install datasets
from datasets import load_dataset
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = load_dataset("tiny_shakespeare")

In [3]:
train = ''.join(dataset['train']['text'])
val = ''.join(dataset['validation']['text'])
test = ''.join(dataset['test']['text'])

In [4]:
print(f"len of train/val/test - {len(train)} / {len(val)} / {len(test)}")

len of train/val/test - 1003854 / 55770 / 55770


In [5]:
chars = sorted(list(set(''.join(train + val + test))))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [6]:
# mapping for chars to int 
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i,ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

print(encode('mmm, ice cream so good'))
print(decode(encode('mmm, ice cream so good')))

[51, 51, 51, 6, 1, 47, 41, 43, 1, 41, 56, 43, 39, 51, 1, 57, 53, 1, 45, 53, 53, 42]
mmm, ice cream so good


In [7]:
import torch
train_tensor = torch.tensor(encode(train), dtype=torch.long)
print(train_tensor.shape)

torch.Size([1003854])


In [8]:
val_tensor = torch.tensor(encode(val), dtype=torch.long)
test_tensor = torch.tensor(encode(test), dtype=torch.long)

In [9]:
block_size = 8

In [10]:
x = train_tensor[:block_size]
y = train_tensor[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target is -> {target}")

when input is tensor([18]) the target is -> 47
when input is tensor([18, 47]) the target is -> 56
when input is tensor([18, 47, 56]) the target is -> 57
when input is tensor([18, 47, 56, 57]) the target is -> 58
when input is tensor([18, 47, 56, 57, 58]) the target is -> 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target is -> 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target is -> 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target is -> 58


In [11]:
torch.manual_seed(228)
batch_size = 4

def get_batch(split):
    data = train_tensor if split == 'train' else val_tensor
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets')
print(yb.shape)
print(yb)

print('______')

for b in range(batch_size):
    for t in range(block_size):
        context = xb[b:t+1]
        target = yb[b,t]
        
        print(f'when the input is {context.tolist()}, the target -> {target}')

inputs:
torch.Size([4, 8])
tensor([[19,  1, 30, 21, 15, 20, 13, 30],
        [49,  6,  1, 61, 46, 63,  1, 46],
        [16, 21, 27, 10,  0, 33, 52, 46],
        [53, 52, 42,  1, 25, 59, 56, 42]])
targets
torch.Size([4, 8])
tensor([[ 1, 30, 21, 15, 20, 13, 30, 16],
        [ 6,  1, 61, 46, 63,  1, 46, 39],
        [21, 27, 10,  0, 33, 52, 46, 39],
        [52, 42,  1, 25, 59, 56, 42, 43]])
______
when the input is [[19, 1, 30, 21, 15, 20, 13, 30]], the target -> 1
when the input is [[19, 1, 30, 21, 15, 20, 13, 30], [49, 6, 1, 61, 46, 63, 1, 46]], the target -> 30
when the input is [[19, 1, 30, 21, 15, 20, 13, 30], [49, 6, 1, 61, 46, 63, 1, 46], [16, 21, 27, 10, 0, 33, 52, 46]], the target -> 21
when the input is [[19, 1, 30, 21, 15, 20, 13, 30], [49, 6, 1, 61, 46, 63, 1, 46], [16, 21, 27, 10, 0, 33, 52, 46], [53, 52, 42, 1, 25, 59, 56, 42]], the target -> 15
when the input is [[19, 1, 30, 21, 15, 20, 13, 30], [49, 6, 1, 61, 46, 63, 1, 46], [16, 21, 27, 10, 0, 33, 52, 46], [53, 52, 42, 1

In [12]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(228)

class BigramLanguageModel(nn.Module):
    
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
        
    def forward(self, idx, targets=None):
        
        logits = self.token_embedding_table(idx)
        
        if targets is None:
            loss = None
        else:
            
        
            B,T,C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)

            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B,C) array of indices in the current context
        for _ in range(max_new_tokens):
            #get the preds
            logits, loss = self(idx)
            #focus only on the last step
            logits = logits[:,-1,:] # becomes (B,C)
            #apply softmax to get probs
            probs = F.softmax(logits, dim=1) # (B, C)
            # sample from distribution
            idx_next = torch.multinomial(probs, num_samples=1) #(B, 1)
            #append sampled index to the running sequence 
            idx = torch.cat((idx,idx_next), dim=1) #(B, T+1)
        return idx
            
    
m = BigramLanguageModel(vocab_size)

logits, loss = m(xb, yb)

print(logits.shape)
print(loss)

idx = torch.zeros((1,1), dtype=torch.long)
print(decode(m.generate(idx, max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])
tensor(4.8414, grad_fn=<NllLossBackward0>)

s?NGzGlfLLKFKVn
b.CFPMAPiMA'fkHTbScTr-EmsUxjU-pOulxfko;nzB-TiD?fCJFloGkF$sqcdnJFIf-PG,Btnq-E..P;gWV?


In [13]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [14]:
batch_size = 32

for steps in range(10000):
    
    #sample a batch of data
    xb, yb = get_batch('train')
    
    #eval the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    
print(loss.item())

2.390397310256958


In [15]:
print(decode(m.generate(idx, max_new_tokens=400)[0].tolist()))


T:
iced msuge lllonay, mousest lthak.
IR:

LBO:

T:
My:
STEl tind.
Fz'd horistlleyowiof hay and youn tugouthend:
BETheveasut for tarn lus myordie barerthea, y sto IN blot yerike
sh ade
JEN:
Fonilavet hatesl, tr fy hy, dof Ils;
Domfor mby owby
Yos wime d th,

We, toabean d
HARigs.

MENothatin w?
Mourd:
Th alamby u t mes hesthero oure.
A: yveratho;

Wheldo pou EXESTENIfothours wapayouland t doune t 


In [16]:
# self-attention
torch.manual_seed(228)
B, T, C = 4, 8, 32 # Batch, Time , Channel

#single Head self-attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=True)

k = key(x)   #(B, T, 16)
q = query(x) #(B, T, 16)
wei = q @ k.transpose(-2,-1)  # (B, T ,16) @ (B, 16, T) ----> (B, T , T)

x = torch.randn(B, T, C)
tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=1)

v = value(x)
out = wei @ v 
#out = wei @ x




out.shape

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x8 and 32x16)

In [None]:
tril

In [None]:
out