In [1]:
import torch
from yaml import load
try:
    from yaml import CLoader as Loader
except ImportError:
    from yaml import Loader
import os
torch.cuda.empty_cache()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
stream = open("config.yaml")
config = load(stream, Loader)
stream.close()

In [3]:
input_loc = config['dataset']
file = open(input_loc, "r")
text = file.read()
print(text[:300])

I. A SCANDAL IN BOHEMIA


I.

To Sherlock Holmes she is always _the_ woman. I have seldom heard him
mention her under any other name. In his eyes she eclipses and
predominates the whole of her sex. It was not that he felt any emotion
akin to love for Irene Adler. All emotions, and that one particula


In [4]:
print("Length of characters: ", len(text))

Length of characters:  565060


In [5]:
# All the unique characters that the text has
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !"&(),-.0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz ¢£¦¨©½ÂÃÅâ˜“”€™
92


In [6]:
# Create mapping from character to integers
stoi = {ch:i for i, ch in enumerate(chars)}
itos = {i:ch for i, ch in enumerate(chars)}

encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

sample_text = 'Hello World'
assert decode(encode(sample_text)) == sample_text
print(encode(sample_text))
print(decode(encode(sample_text)))

# Google uses sentencepiece - Subword tokenizer
# Open AI has tiktoken

[30, 54, 61, 61, 64, 1, 45, 64, 67, 61, 53]
Hello World


In [7]:
# Encoding the entire dataset
import torch
data = torch.tensor(encode(text), dtype = torch.long)
print(data.shape, data.dtype)

torch.Size([565060]) torch.int64


In [8]:
# Splitting up the data
n = int(config['split-ratio']*len(data))
train_data = data[:n]
val_data = data[n:]

In [9]:
block_size = config['block-size']
train_data[:block_size+1]

tensor([31,  9,  1, 23,  1, 41, 25, 23, 36, 26, 23, 34,  1, 31, 36,  1, 24, 37,
        30, 27, 35, 31, 23,  0,  0,  0, 31,  9,  0,  0, 42, 64,  1, 41, 57, 54,
        67, 61, 64, 52, 60,  1, 30, 64, 61, 62, 54, 68,  1, 68, 57, 54,  1, 58,
        68,  1, 50, 61, 72, 50, 74, 68,  1, 49, 69, 57, 54, 49,  1, 72, 64, 62,
        50, 63,  9,  1, 31,  1, 57, 50, 71, 54,  1, 68, 54, 61, 53, 64, 62,  1,
        57, 54, 50, 67, 53,  1, 57, 58, 62,  0, 62, 54, 63, 69, 58, 64, 63,  1,
        57, 54, 67,  1, 70, 63, 53, 54, 67,  1, 50, 63, 74,  1, 64, 69, 57, 54,
        67,  1, 63, 50, 62, 54,  9,  1, 31, 63,  1, 57, 58, 68,  1, 54, 74, 54,
        68,  1, 68, 57, 54,  1, 54, 52, 61, 58, 65, 68, 54, 68,  1, 50, 63, 53,
         0, 65, 67, 54, 53, 64, 62, 58, 63, 50, 69, 54, 68,  1, 69, 57, 54,  1,
        72, 57, 64, 61, 54,  1, 64, 55,  1, 57, 54, 67,  1, 68, 54, 73,  9,  1,
        31, 69,  1, 72, 50, 68,  1, 63, 64, 69,  1, 69, 57, 50, 69,  1, 57, 54,
         1, 55, 54, 61, 69,  1, 50, 63, 

In [9]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"When input is {context}, the target is {target}")

When input is tensor([18]), the target is 47
When input is tensor([18, 47]), the target is 56
When input is tensor([18, 47, 56]), the target is 57
When input is tensor([18, 47, 56, 57]), the target is 58
When input is tensor([18, 47, 56, 57, 58]), the target is 1
When input is tensor([18, 47, 56, 57, 58,  1]), the target is 15
When input is tensor([18, 47, 56, 57, 58,  1, 15]), the target is 47
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), the target is 58
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47, 58]), the target is 47
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47]), the target is 64
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64]), the target is 43
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43]), the target is 52
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52]), the target is 10
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10]), the target is 0
When inp

When input is tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43]), the target is 42
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 4

In [10]:
torch.manual_seed(1337)
batch_size = config['batch-size']
block_size = config['block-size']

def get_batch(split):
    # Generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data)-block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix]).to(device)
    y = torch.stack([data[i+1:i+block_size+1] for i in ix]).to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train','val']:
        losses = torch.zeros(config['eval-iters'])
        for k in range(config['eval-iters']):
            X, Y = get_batch(split)
            logits, loss = model(X,Y)
            losses[k] = loss.item()
        out[split] = losses.mean().item()
    model.train()
    return out

xb, yb = get_batch('train')

In [11]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class Head(nn.Module):
    def __init__(self, head_size):
        super(Head, self).__init__()
        self.key = nn.Linear(config['n-embed'], head_size, bias = False)
        self.query = nn.Linear(config['n-embed'], head_size, bias = False)
        self.value = nn.Linear(config['n-embed'], head_size, bias = False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(config['dropout'])
    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)
        q = self.query(x)
        # Compute Attention Scores ('affinities')
        wei = q @ k.transpose(-2,-1) * C**-0.5 # B,T,C @ B,C,T => B,T,T
        wei = wei.masked_fill(self.tril[:T,:T] == 0, float('-inf'))
        wei = F.softmax(wei, dim = -1)
        wei = self.dropout(wei)
        # Perform weighted aggregation of values
        v = self.value(x)
        out = wei @ v
        return out
        

In [12]:
class MultiHeadAttention(nn.Module):
    """ Multiple Heads of Self-Attention in Parallel"""

    def __init__(self, num_heads, head_size):
        super(MultiHeadAttention, self).__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(config['n-embed'],config['n-embed'])
        self.dropout = nn.Dropout(config['dropout'])

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim = -1)
        out = self.dropout(self.proj(out))
        return out

In [13]:
class FeedForward(nn.Module):
    """A Simple feed forward neural network with non linearity"""

    def __init__(self, n_embed):
        super(FeedForward, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embed, 4 * n_embed),
            nn.ReLU(),
            nn.Linear(4 * n_embed, n_embed),
            nn.Dropout(config['dropout']),
        )

    def forward(self, x):
        return self.net(x)

In [14]:
class Block(nn.Module):
    """ Transformers block: Communication followed by computation"""

    def __init__(self, n_embed, n_head):
        # n_embed embedding dimension, n_heads, number of heads we'd like to have
        super(Block, self).__init__()
        head_size = n_embed//n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embed)
        self.ln1 = nn.LayerNorm(config['n-embed'])
        self.ln2 = nn.LayerNorm(config['n-embed'])

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

In [15]:
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super(BigramLanguageModel, self).__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, config['n-embed'])
        self.positional_embedding_table = nn.Embedding(block_size, config['n-embed'])
        self.blocks = nn.Sequential(*[Block(config['n-embed'], n_head = config['n-head']) for _ in range(config['n-layers'])]
        )
        self.ln_f = nn.LayerNorm(config['n-embed'])
        self.lm_head = nn.Linear(config['n-embed'], vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T)
        token_embeddings = self.token_embedding_table(idx) # (B,T,C)
        pos_embeddings = self.positional_embedding_table(torch.arange(T, device = device)) # (T,C)
        x = token_embeddings + pos_embeddings # (B,T,C)
        x = self.blocks(x) 
        x = self.ln_f(x)
        logits = self.lm_head(x) # (B,T,vocab_size)
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            idx_cond = idx[:,-block_size:]
            # Get predictions
            logits, _ = self(idx_cond)
            # Focus only on the last time stamp
            logits = logits[:, -1, :]
            # Apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)
            # Sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            # Append sampled index to running sequence
            idx = torch.cat((idx, idx_next), dim = 1) # (B,T+1)
        return idx

    
model = BigramLanguageModel().to(device)
logits, loss = model(xb, yb)
print(logits.shape, loss.shape)
print(loss.item())

torch.Size([8192, 65]) torch.Size([])
4.282502174377441


In [16]:
idx = torch.zeros((1,1), dtype = torch.long, device = device)
print(decode(model.generate(idx,max_new_tokens=100)[0].tolist()))
idx.to('cpu')
del idx


'Yw!N?,tXOR; KplIxouvOrxyrf$vvw$DGpZuKZObWZtUBv!vcUoaxEgaEiV
.pBPPVrsjWtyP-givFR?iAYS,DFWQnCmd,:$pWQ


In [17]:
# Optimization object 
optimizer = torch.optim.AdamW(model.parameters(), lr = config['learning-rate'])
batch_size = config['batch-size']
for iter in range(config['max-iters']):
    if iter % config['eval-iters'] == 0:
        losses = estimate_loss()
        print(f'Step {iter}: train loss {losses["train"]:.4f} val loss {losses["val"]:.4f}')

    xb, yb = get_batch('train')

    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

Step 0: train loss 4.2850 val loss 4.2824
Step 500: train loss 2.1148 val loss 2.1740
Step 1000: train loss 1.7017 val loss 1.8484
Step 1500: train loss 1.5323 val loss 1.7238
Step 2000: train loss 1.4318 val loss 1.6343
Step 2500: train loss 1.3632 val loss 1.5780
Step 3000: train loss 1.3129 val loss 1.5507
Step 3500: train loss 1.2698 val loss 1.5182
Step 4000: train loss 1.2357 val loss 1.5039
Step 4500: train loss 1.2060 val loss 1.4927
Step 5000: train loss 1.1792 val loss 1.4886
Step 5500: train loss 1.1529 val loss 1.4850
Step 6000: train loss 1.1280 val loss 1.4758
Step 6500: train loss 1.1055 val loss 1.4806
Step 7000: train loss 1.0824 val loss 1.4849
Step 7500: train loss 1.0605 val loss 1.4881
Step 8000: train loss 1.0402 val loss 1.4955
Step 8500: train loss 1.0188 val loss 1.5040
Step 9000: train loss 0.9948 val loss 1.5087
Step 9500: train loss 0.9749 val loss 1.5237
Step 10000: train loss 0.9526 val loss 1.5161


KeyboardInterrupt: 

In [18]:
context = torch.zeros((1,1), dtype = torch.long, device = device)
text = decode(model.generate(context,max_new_tokens=5000)[0].tolist())

In [20]:
file = open("Output2.txt", 'w')
file.write(text)

5001

In [27]:
torch.save(model, "model_ver2.pth")

## The mathematical trick to self-attention

In [143]:
torch.manual_seed(1337)
B,T,C = 4,8,2
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

In [146]:
wei = torch.tril(torch.ones(T,T))
wei = wei / wei.sum(1, keepdim = True)
xbow = wei @ x

In [148]:
# The proper version
tril = torch.tril(torch.ones(T,T)) # 
wei = torch.zeros(T,T) # Interaction strength or affinity. How much from the past we wanna average
wei = wei.masked_fill(tril == 0, float('-inf')) # Tokens from future cannot be agregated!
wei = F.softmax(wei, dim = -1)
xbow2 = wei @ x
torch.allclose(xbow, xbow2)

True

In [21]:
# Final Version: Self-Attention!
torch.manual_seed(1337)
B,T,C = 4,8,32
x = torch.randn(B,T,C)

# Single Head Perform Self Attention!
head_size = 16
key = nn.Linear(C, head_size, bias = False)
query = nn.Linear(C, head_size, bias = False)
value = nn.Linear(C, head_size, bias = False)
k = key(x)
q = query(x)
wei = q @ k.transpose(-2,-1)


tril = torch.tril(torch.ones(T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim = -1)
v = value(value)
out = wei @ v
# out = wei @ x