## **Importing libraries**

In [35]:
# Importing libraries
import torch
import torch.nn as nn
from torch.nn import functional as F

## **Defining Hyperparametes**

In [36]:
# Defining the Hyper-parameters

batch_size = 32 

# Context length
block_size = 8 
max_iters = 100000
eval_interval = 10000
learning_rate = 1e-2
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200

torch.manual_seed(1337)

<torch._C.Generator at 0x10b8bccf0>

## **Exploring the dataset**

In [37]:
# Reading the text file
with open('shakespeare.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Total number of characters
chars = sorted(list(set(text)))
vocab_size = len(chars)

# Mapping from characters to indices and vice versa
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }

# Encoding lambda function - String to list of integers
encode = lambda s: [stoi[c] for c in s]

# Decoding lambda function - List of integers to string
decode = lambda l: ''.join([itos[i] for i in l]) 

In [38]:
print('All chars:', "".join(chars))
print('Vocab size:', vocab_size)

All chars: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Vocab size: 65


In [39]:
# Try it out
print('Encoded text:', encode("Hi, I'm pranav"))
print('Decoded text:', decode(encode("Hi, I'm pranav")))

Encoded text: [20, 47, 6, 1, 21, 5, 51, 1, 54, 56, 39, 52, 39, 60]
Decoded text: Hi, I'm pranav


In [40]:
# Create torch tensor
data = torch.tensor(encode(text), dtype=torch.long)
print('Data shape:', data.shape)

# Train size
train_size = int(0.8 * len(data))

# Train and test split
train_data = data[:train_size]
val_data = data[train_size:]

Data shape: torch.Size([1115393])


## **Example sequence generation**

In [41]:
# Sampling a block/sequence
seq_length = 16
print(train_data[:seq_length+1])

# Data loading into x and y
x = train_data[:seq_length]
y = train_data[1:seq_length+1]

# Auto-regressively generating the next item
for t in range(seq_length):
    cont = x[:t+1]
    target = y[t]
    print(f'{cont} -> {target}')

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43])
tensor([18]) -> 47
tensor([18, 47]) -> 56
tensor([18, 47, 56]) -> 57
tensor([18, 47, 56, 57]) -> 58
tensor([18, 47, 56, 57, 58]) -> 1
tensor([18, 47, 56, 57, 58,  1]) -> 15
tensor([18, 47, 56, 57, 58,  1, 15]) -> 47
tensor([18, 47, 56, 57, 58,  1, 15, 47]) -> 58
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58]) -> 47
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47]) -> 64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64]) -> 43
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43]) -> 52
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52]) -> 10
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10]) -> 0
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0]) -> 14
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14]) -> 43


## **Batching**

In [42]:
def get_batch(split):
    
    # Select the data
    data = train_data if split == 'train' else val_data

    # Randomly select a starting index
    ix = torch.randint(len(data) - block_size, (batch_size,))

    # Create the batch
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])

    # Move to device
    x, y = x.to(device), y.to(device)
    return x, y

In [43]:
# Try it out
x, y = get_batch('train')

# Print the shapes
print(x.shape, y.shape)

# Print the first sequence
print(x)
print(y)

torch.Size([32, 8]) torch.Size([32, 8])
tensor([[45, 46, 39, 52,  6,  1, 19, 56],
        [ 0, 51, 53, 56, 43,  1, 47, 52],
        [21, 36, 17, 26, 17, 31, 10,  0],
        [52, 53, 61,  1, 61, 47, 50, 50],
        [58, 46, 39, 58,  1, 57, 43, 50],
        [45,  1, 57, 53, 51, 43,  1, 58],
        [58, 57,  0, 18, 56, 53, 51,  1],
        [ 8,  0,  0, 26, 59, 56, 57, 43],
        [57, 57,  1, 44, 43, 39, 56, 44],
        [51, 59, 58, 59, 39, 50, 50, 63],
        [47, 52, 45,  7, 61, 46, 47, 50],
        [45, 39, 47, 52,  1, 21,  1, 51],
        [51,  0, 21, 57,  1, 41, 56, 43],
        [52, 53, 58,  1, 51, 39, 49, 43],
        [49,  1, 39, 45, 39, 47, 52,  6],
        [53, 59, 50, 42,  0, 20, 39, 60],
        [ 1, 39, 52, 42,  1, 46, 47, 57],
        [40, 43, 39, 58, 47, 52, 45,  1],
        [58, 53,  1, 40, 56, 43, 39, 49],
        [39, 47, 52, 57, 58,  1, 58, 46],
        [60, 43,  1, 42, 43, 57, 43, 56],
        [39, 51, 43, 12,  0,  0, 28, 27],
        [47, 50, 50,  1, 58, 46, 47,

## **Defining the Bi-gram model**

In [44]:
@torch.no_grad()
def estimate_loss():

    # Output dictionary
    out = {}

    # Set the model to evaluation mode
    model.eval()

    #Loop over train and val splits
    for split in ['train', 'val']:

        # Set the loss to zero
        losses = torch.zeros(eval_iters)

        # Loop over the iterations
        for k in range(eval_iters):

            # Get the batch
            X, Y = get_batch(split)

            # Get the logits and loss
            logits, loss = model(X, Y)

            # Store the loss
            losses[k] = loss.item()

        # Store the average loss    
        out[split] = losses.mean()
    
    # Set the model back to training mode
    model.train()
    
    return out

class BiGramModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()

        # Create an embedding layer/table
        self.embed = nn.Embedding(vocab_size, vocab_size)

    def forward(self, x, targets=None):
        
        # x and targets are of shape (batch_size, seq_length)
        logits = self.embed(x) # (batch_size, seq_length, vocab_size)
        
        if targets is None:
            loss = None
            
        else:
            # Reshape logits and targets
            batch_size, seq_length, vocab_size = logits.shape
            logits = logits.view(batch_size * seq_length, vocab_size)
            targets = targets.view(-1) 

            # Calculate loss
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_tokens):

        # idx is of shape (batch_size, 1)
            
        # Generate text using the model
        for _ in range(max_tokens):

            # Get the logits and loss
            logits, loss = self(idx)

            # Get the last token
            logits = logits[:, -1, :] # (batch_size, vocab_size)

            # Get the probability distribution (softmax)
            probs = F.softmax(logits, dim=1) # (batch_size, vocab_size)

            # Sample from the distribution
            idx_next = torch.multinomial(probs, 1) # (batch_size, 1)

            # Append the sampled token to the input
            idx = torch.cat((idx, idx_next), dim=1) # (batch_size, 2)

        return idx
    
model = BiGramModel(vocab_size)
logits, loss = model(x, y)

print(f"Number of parameters: {sum(p.numel() for p in model.parameters())/1e6} M")

print(logits.shape)
print(loss)

print(decode(model.generate(torch.zeros((1, 1), dtype=torch.long), max_tokens=100)[0].tolist()))

Number of parameters: 0.004225 M
torch.Size([256, 65])
tensor(4.8378, grad_fn=<NllLossBackward0>)

JLg,3D&OM .3YCjfolRwqXaDyttW!GmaUT-IIvuZV?sYfjzUvTQ3RwL ?etyLeg.COHW
Ri$ELkJMXpBEX;-G&Orl!bcH ;cq.z,


## **Training Loop**

In [45]:
# Define the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

batch_size = 32

# Train the model
for i in range(100000):

    # Get the data
    x, y = get_batch('train')

    # Get the logits and loss
    logits, loss = model(x, y)

    # Zero the gradients
    optimizer.zero_grad(set_to_none=True)

    # Backpropagate the loss
    loss.backward()

    # Update the model parameters
    optimizer.step()

    if i % 10000 == 0:
        # print(f'Loss at step {i}: {loss.item()}')
        losses = estimate_loss()
        print(f"Step {i}: Train Loss {losses['train']:.4f}; Validation Loss {losses['val']:.4f}")

Step 0: Train Loss 4.7629; Validation Loss 4.7693
Step 10000: Train Loss 2.4670; Validation Loss 2.5038
Step 20000: Train Loss 2.4390; Validation Loss 2.5125
Step 30000: Train Loss 2.4478; Validation Loss 2.5118
Step 40000: Train Loss 2.4473; Validation Loss 2.5083
Step 50000: Train Loss 2.4435; Validation Loss 2.5104
Step 60000: Train Loss 2.4480; Validation Loss 2.5004
Step 70000: Train Loss 2.4485; Validation Loss 2.5103
Step 80000: Train Loss 2.4423; Validation Loss 2.5092
Step 90000: Train Loss 2.4449; Validation Loss 2.5056


## **Example text generation**

In [46]:
print(decode(model.generate(torch.zeros((1, 1), dtype=torch.long), max_tokens=500)[0].tolist()))


K:
GOHUCo o tsursooy t!

K: spolld are theas
Paze OLLorss thathere roy!

Whirche gesl.
Fineis?
Qu athethe s ld ar, s fatinen sthy GUThofiliune t tr I CHary y.
ABEThay Grl, my bllle or; I
Wit n nge ilit'chot, t may wen sy be art y DWithicr; thofar stwh w henguayongour t wh RI thinshe s
I ck swe fer s messed
Clf nbimerer,

PUS le aipupalaswou d rs s,
Luisen beel lis ghee mm oused I rd se tanth tinemese kee che ld m thenoueaicethe n coro t hinerfl hat t bur quthour:
HE:




KI bid tor d!
BORilyonoi


## **Hyperparameters: Attention Mechanism**

In [47]:
# Updated parameters

batch_size = 32 
block_size = 256 
max_iters = 1000
eval_interval = 500
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embed = 16
n_head = 4
n_layer = 4
dropout = 0.2

## **Defining Single-Head, Multi-Head, FFN & Transformer Block**

In [48]:
# Single head attention layer
class Head(nn.Module):
    
    def __init__(self, head_size):
        super().__init__()

        # Create key, query and value layers
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        
        B, T, C = x.shape
        k = self.key(x) # (B, T, head_size)
        q = self.query(x) # (B, T, head_size)

        # Calculate attention scores
        wei = q @ k.transpose(-2, -1) * C ** -0.5 # (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)

        v = self.value(x) # (B, T, head_size)
        out = wei @ v # (B, T, head_size)

        return out

# Multi-head attention layer
class MultiHeadAttention(nn.Module):
    
    def __init__(self, n_heads, head_size):
        super().__init__()

        # Create a list of heads
        self.heads = nn.ModuleList([Head(head_size) for _ in range(n_heads)])

        # Create a projection layer
        self.proj = nn.Linear(n_embed, n_embed)

        # Create a dropout layer
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        
        # Concatenate the outputs of the heads
        out = torch.cat([h(x) for h in self.heads], dim=-1)

        # Project the output and apply dropout
        out = self.dropout(self.proj(out))

        return out
    
# Feed-forward network 
class FeedForward(nn.Module):

    def __init__(self, n_embed):
        super().__init__()

        # Create a simple feed-forward network
        self.net = nn.Sequential(
            nn.Linear(n_embed, 4 * n_embed),
            nn.ReLU(),
            nn.Linear(4 * n_embed, n_embed),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)

# Transformer block
class Block(nn.Module):

    def __init__(self, n_embed, n_head):
        # n_head is the number of heads in the multi-head attention
        # n_embed is the number of dimensions in the embedding
        super().__init__()

        head_size = n_embed // n_head
        self.attn = MultiHeadAttention(n_head, head_size)
        self.ff = FeedForward(n_embed)
        self.layer_norm = nn.LayerNorm(n_embed)
        self.layer_norm2 = nn.LayerNorm(n_embed)

    def forward(self, x):

        # Apply the residual connection and layer normalization within the block
        x = x + self.attn(self.layer_norm(x))

        # Apply the residual connection and layer normalization within the feed-forward network
        x = x + self.ff(self.layer_norm2(x))
        
        return x

## **Defining Language Model with Attention**

In [61]:
# Bigram model with Attention
class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        
        # Each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.pos_embedding_table = nn.Embedding(block_size, n_embed)
        
        # self.attn = Head(n_embed)
        # self.heads = MultiHeadAttention(4, n_embed//4) # 4 heads, each with 8 dimensions
        self.blocks = nn.Sequential(*[Block(n_embed, n_head=n_head) for _ in range(n_layer)])
        self.layer_norm = nn.LayerNorm(n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, idx, targets=None):

        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.pos_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        
        # x = self.heads(x) # (B,T,C)
        # x = self.ff(x) # (B,T,C)
        x = self.blocks(x)
        logits = self.lm_head(x) # (B,T, vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            
            # Get the last block_size tokens
            idx_conditioned = idx[:, -block_size:] # (B, block_size)

            # Get the predictions
            logits, loss = self(idx_conditioned)
            
            # Focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            
            # Apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            
            # Sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            
            # Append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        
        return idx

model = BigramLanguageModel(vocab_size)
m = model.to(device)

print(f"Number of parameters: {sum(p.numel() for p in m.parameters())/1e6} M")

Number of parameters: 0.019201 M


## **Training Loop**

In [62]:
# Define the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

batch_size = 32

# Train the model
for i in range(1000):

    # Get the data
    x, y = get_batch('train')

    # Get the logits and loss
    logits, loss = model(x, y)

    # Zero the gradients
    optimizer.zero_grad(set_to_none=True)

    # Backpropagate the loss
    loss.backward()

    # Update the model parameters
    optimizer.step()

    if i % 100 == 0:
        # print(f'Loss at step {i}: {loss.item()}')
        losses = estimate_loss()
        print(f"Step {i}: Train Loss {losses['train']:.4f}; Validation Loss {losses['val']:.4f}")

Step 0: Train Loss 4.4128; Validation Loss 4.4263
Step 100: Train Loss 3.0874; Validation Loss 3.1170
Step 200: Train Loss 2.8457; Validation Loss 2.8765
Step 300: Train Loss 2.7423; Validation Loss 2.7720
Step 400: Train Loss 2.6763; Validation Loss 2.7043
Step 500: Train Loss 2.6313; Validation Loss 2.6603
Step 600: Train Loss 2.5968; Validation Loss 2.6274
Step 700: Train Loss 2.5704; Validation Loss 2.5985
Step 800: Train Loss 2.5525; Validation Loss 2.5765
Step 900: Train Loss 2.5350; Validation Loss 2.5612


## **Example Text generation: Attention mechanism**

In [63]:
print(decode(m.generate(torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))


A, Loreat bipangusghtheeand t ls;
OTo wigost in illirile arshe, yd.
Thitod:
L'onmy
By an dl.
Sougay.
Bou t thouonde! w w arce,
KISI d I fos s y
G
SES:

NNLI mind s thot touds or ay k I
Wesen atou sonke e tharm n ve icany wee thh rud, lere ferbss qe, ff.
Wher t mofo, p mathedbend ton inel HLoun-picole gr benis a
IP s heyorougher; a you'ees d he s port Vd bame tot wil merd we chispAthour myrith we o seaon lantpancoree he an aniure mal haoroue. fldind
NKOCee tiresly facent, te' nesin ae uree hiob t
