# Micro GPT

In [1]:
# Dataset
# Read the whole corpus into memory. The encoding is given explicitly so the
# resulting character vocabulary does not depend on the platform's default locale.
with open('../data/shakespear.txt', 'r', encoding='utf-8') as f:
    text = f.read()

print("Total length of the data -->", len(text))

# The vocabulary is the sorted set of distinct characters that occur in the corpus.
chrs = sorted(set(text))
vocab_size = len(chrs)

print(f"The vocab size is {vocab_size}")
print(f"Printing all the elements from vocab --> {''.join(chrs)}")

Total length of the data --> 1115394
The vocab size is 65
Printing all the elements from vocab --> 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [2]:
# Mappings
# Two lookup tables built from the sorted vocabulary:
# stoi maps character -> integer id, itos maps integer id -> character.
stoi = {ch: idx for idx, ch in enumerate(chrs)}
itos = dict(enumerate(chrs))


def encode(context):
    """Take a string and return the list of integer ids of its characters.

    Raises KeyError if `context` contains a character that is not in `stoi`.
    """
    return [stoi[ch] for ch in context]


def decode(ints):
    """Take an iterable of integer ids and return the string they spell.

    Raises KeyError if an id is not a key of `itos`.
    """
    return ''.join(itos[i] for i in ints)


# Round trip: decoding an encoded string prints the original text.
print(decode(encode("hii there")))

hii there


In [3]:
# Creating dataset
import torch
# Encode the whole corpus once into a tensor of integer (int64) token ids.
data = torch.tensor(encode(text), dtype = torch.long)

print(f"Total data --> {data.shape[0]} items")

# Train and val split: the first 90% of the tokens are used for training,
# the remaining tail is held out for validation.
n = int(len(data) * 0.9)
train, val = data[:n], data[n:]

print(f"Training data --> {train.shape[0]} items")
print(f"Validation data --> {val.shape[0]} items")

Total data --> 1115394 items
Training data --> 1003854 items
Validation data --> 111540 items


In [4]:
# Seed PyTorch's global RNG so the randomly sampled batches are repeatable.
torch.manual_seed(800)

# batch_size: number of sequences sampled per batch.
# block_size: number of tokens in each sampled sequence.
batch_size, block_size = 4, 8

def get_batch(split):
    """Sample a random batch of input windows and their next-token targets.

    Args:
        split: 'train' selects the `train` tensor; any other value selects `val`.

    Returns:
        (xs, ys): two tensors of shape (batch_size, block_size). `ys` holds the
        same windows as `xs` shifted one position to the right, so ys[b, t] is
        the token that follows xs[b, t] in the data.

    Reads `train`, `val`, `batch_size` and `block_size` from the enclosing
    (notebook) scope at call time.
    """
    source = train if split == 'train' else val

    # One random window start per sequence. randint's upper bound is exclusive,
    # so start + block_size is always a valid index for the shifted targets.
    starts = torch.randint(len(source) - block_size, (batch_size,))

    # Broadcast the starts against 0..block_size-1 to get every position of
    # every window, then gather inputs and shifted targets in one indexing op.
    positions = starts.unsqueeze(1) + torch.arange(block_size)
    xs = source[positions]
    ys = source[positions + 1]
    return xs, ys

xb, yb = get_batch('train')
print(xb.shape, yb.shape)

# Each sampled row contains block_size training examples: every prefix of the
# input row is a context, and the matching entry of the target row is its label.
for b in range(batch_size):
    input_row, target_row = xb[b], yb[b]
    for t in range(block_size):
        context = input_row[: t + 1]
        target = target_row[t]

        print(f"When input is {context.tolist()} output is {target.item()}")

torch.Size([4, 8]) torch.Size([4, 8])
When input is [50] output is 42
When input is [50, 42] output is 1
When input is [50, 42, 1] output is 40
When input is [50, 42, 1, 40] output is 43
When input is [50, 42, 1, 40, 43] output is 1
When input is [50, 42, 1, 40, 43, 1] output is 53
When input is [50, 42, 1, 40, 43, 1, 53] output is 59
When input is [50, 42, 1, 40, 43, 1, 53, 59] output is 56
When input is [1] output is 57
When input is [1, 57] output is 47
When input is [1, 57, 47] output is 56
When input is [1, 57, 47, 56] output is 6
When input is [1, 57, 47, 56, 6] output is 1
When input is [1, 57, 47, 56, 6, 1] output is 57
When input is [1, 57, 47, 56, 6, 1, 57] output is 46
When input is [1, 57, 47, 56, 6, 1, 57, 46] output is 53
When input is [51] output is 40
When input is [51, 40] output is 43
When input is [51, 40, 43] output is 56
When input is [51, 40, 43, 56] output is 1
When input is [51, 40, 43, 56, 1] output is 61
When input is [51, 40, 43, 56, 1, 61] output is 43
When 

# Bigram

In [5]:
import torch.nn as nn
import torch.nn.functional as F
torch.manual_seed(500)

class BiGram(nn.Module):
    """Bigram language model: the next-token scores depend only on the current token.

    The whole model is one embedding table of shape (vocab_size, vocab_size);
    looking up token id i returns the logits over the next token given i.
    """

    def __init__(self, vocab_size) -> None:
        super().__init__()

        # Embedding layer used as a lookup table: row i holds the logits for token i
        self.embedding_table = nn.Embedding(vocab_size, vocab_size)

    # nn.Module.__call__ dispatches to forward, so `model(idx, targets)` ends up here
    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: integer tensor of token ids, shape (B, T).
            targets: optional integer tensor of next-token ids with B*T elements
                (shape (B, T)); may be non-contiguous.

        Returns:
            (logits, loss). Without targets, logits has shape (B, T, C) and loss
            is None. With targets, logits is flattened to (B*T, C) and loss is
            the cross-entropy between the logits and the flattened targets.
        """
        logits = self.embedding_table(idx)  # (B, T, C)

        # Generation only needs the logits, so skip the loss entirely
        if targets is None:
            return logits, None

        # F.cross_entropy expects the class dimension second, so the batch and
        # time dimensions are merged into one. reshape (unlike view) also
        # accepts non-contiguous tensors.
        B, T, C = logits.shape
        logits = logits.reshape(B * T, C)
        targets = targets.reshape(B * T)
        loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling never backpropagates, so do not record autograd graphs
    def generate(self, idx, max_new_tokens):
        """Extend a context by sampling `max_new_tokens` tokens one at a time.

        Args:
            idx: integer tensor of token ids, shape (B, T), used as the starting context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens): the input context
            followed by the sampled tokens.
        """
        for _ in range(max_new_tokens):
            # Calling the module runs forward; without targets the loss is None
            logits, _ = self(idx)
            # It is a bigram model, so only the last time step matters
            logits = logits[:, -1, :]                 # now (B, C)
            probs = F.softmax(logits, dim = -1)       # normalise over the vocabulary
            ix = torch.multinomial(probs, num_samples = 1)   # (B, 1)
            # Append the sampled token to the running context
            idx = torch.cat((idx, ix), dim = 1)
        return idx

In [6]:
# Run one training batch through a freshly initialised bigram model
xb, yb = get_batch('train')
m = BiGram(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

# Sample 100 new tokens from the model before any training,
# starting from a single token with id 0
start = torch.zeros((1, 1), dtype = torch.long)
generated = m.generate(start, max_new_tokens = 100)
print(decode(generated[0].tolist()))

torch.Size([32, 65])
tensor(4.6475, grad_fn=<NllLossBackward0>)

yCjzq -?kl$XMBh.Fq:cEoP
P:U bybATiG-Y
NycCA3OpXnYXs!GyeSu;rSSej:u ;?UM!
.Plzs!MMjvMjhETsZh,-BWFDAt?S


In [7]:
# Initializing a Torch optimizer: AdamW over all parameters of the model `m`, learning rate 1e-3
optimizer = torch.optim.AdamW(m.parameters(), lr = 1e-3)

In [8]:
# get_batch reads batch_size from the notebook's global scope, so this
# reassignment makes every batch sampled from here on hold 32 sequences.
batch_size = 32
# Number of optimisation steps; each step uses one randomly sampled batch.
epochs = 50000

for epoch in range(epochs):

    # Drop the gradients left over from the previous step
    optimizer.zero_grad(set_to_none = True)

    # Forward pass on a freshly sampled training batch
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)

    # Backpropagate and update the parameters
    loss.backward()
    optimizer.step()

    # Every 1000 steps report the loss of the current batch
    if not epoch % 1000:
        print(f"Epoch   {epoch} / {epochs}      Loss -->    {loss.item()}")

Epoch   0 / 50000      Loss -->    4.696692943572998
Epoch   1000 / 50000      Loss -->    3.620892286300659
Epoch   2000 / 50000      Loss -->    3.1328930854797363
Epoch   3000 / 50000      Loss -->    2.791038990020752
Epoch   4000 / 50000      Loss -->    2.7017123699188232
Epoch   5000 / 50000      Loss -->    2.5440938472747803
Epoch   6000 / 50000      Loss -->    2.4252851009368896
Epoch   7000 / 50000      Loss -->    2.584836483001709
Epoch   8000 / 50000      Loss -->    2.387962579727173
Epoch   9000 / 50000      Loss -->    2.575676202774048
Epoch   10000 / 50000      Loss -->    2.5128278732299805
Epoch   11000 / 50000      Loss -->    2.4656758308410645
Epoch   12000 / 50000      Loss -->    2.394345760345459
Epoch   13000 / 50000      Loss -->    2.3631834983825684
Epoch   14000 / 50000      Loss -->    2.4389445781707764
Epoch   15000 / 50000      Loss -->    2.4022321701049805
Epoch   16000 / 50000      Loss -->    2.4722418785095215
Epoch   17000 / 50000      Loss --

In [39]:
# Sample 100 new characters from the model, using token id 0 (the "\n" character)
# as the one-token starting context
torch.manual_seed(80)
start = torch.zeros((1, 1), dtype = torch.long)
sample = m.generate(start, max_new_tokens = 100)
print(decode(sample[0].tolist()))


G myonerey fowherde y wil thorut t ten withtades, ire hewe Wid fat istis s aveathas ggr:
Paistwesh t


# Attention Mechanism

In [21]:
# Toy example of aggregating tokens
import torch
import torch.nn as nn
torch.manual_seed(42)

# Toy sizes: batch, time steps, channels per token, and the width of one attention head
B, T, C = 4, 8, 32
head_size = 16

# Random stand-in for a batch of token representations
x = torch.randn(B, T, C)     # [B, T, C] = [4, 8, 32]

# The three bias-free linear projections of a single attention head.
# They are created in this order so the seeded initial weights stay the same.
key = nn.Linear(C, head_size, bias = False)
query = nn.Linear(C, head_size, bias = False)
value = nn.Linear(C, head_size, bias = False)

# Project every token into its key, query and value vector
k = key(x)      # [B, T, head_size]
q = query(x)    # [B, T, head_size]
v = value(x)    # [B, T, head_size]

# Affinity of every query with every key, scaled by 1/sqrt(head_size):
# [B, T, head_size] @ [B, head_size, T] -> [B, T, T], one weight matrix per batch item
scale = head_size**-0.5
wei = (q @ k.transpose(-2, -1)) * scale

# Lower-triangular matrix of ones: position t may only look at positions <= t
tril = torch.tril(torch.ones(T, T))
# Future positions get -inf so that softmax assigns them a weight of exactly zero
wei = wei.masked_fill(tril == 0, float('-inf'))
# Normalise each row into weights that sum to 1 over the visible positions
wei = torch.softmax(wei, dim = -1)

# Weighted sum of the value vectors: [B, T, T] @ [B, T, head_size] -> [B, T, head_size]
out = wei @ v
out.shape

torch.Size([4, 8, 16])

In [24]:
# Display the attention output for the first item of the batch
out[0]

tensor([[ 0.7630, -0.2412, -0.4150,  0.3833,  0.5740, -1.6738,  0.7954,  0.6872,
         -0.3848,  0.5073, -0.5312, -0.1221,  0.0445,  1.2169,  0.9940,  1.5281],
        [ 0.4058, -0.0920, -0.7653, -0.5147,  0.1817, -0.4080,  0.0756, -0.7033,
         -0.0571,  0.3145,  0.3326,  0.0922,  0.1446,  0.5214,  0.3781, -0.1178],
        [ 0.2012,  0.0409, -0.1103,  0.3876,  0.6604, -0.8814,  0.2189,  0.0529,
         -0.4067,  0.3265, -0.1413, -0.2490, -0.4813,  0.5791,  0.9548,  1.0026],
        [ 0.0370,  0.2438, -0.1707, -0.0168, -0.0221, -0.3756, -0.1570, -0.6721,
         -0.1865,  0.2293,  0.1447,  0.1949,  0.2877,  0.4271,  0.1980,  0.0253],
        [ 0.2009,  0.1195, -0.2142,  0.3468,  0.1683, -0.8404,  0.0235, -0.0529,
         -0.1590,  0.2322, -0.2571,  0.0770,  0.1777,  0.6503,  0.4119,  0.5479],
        [-0.0446,  0.1640, -0.3607,  0.1286,  0.0677, -0.2968, -0.4088, -0.4496,
          0.0718,  0.0469,  0.0142,  0.1075,  0.0830,  0.2601,  0.1443, -0.1559],
        [ 0.1647,  0.0