In [1]:
import sys 
import subprocess 
import pkgutil

def pip_install(pkg):
    """Install ``pkg`` with pip unless a module of that name is already importable.

    The module name checked is ``pkg`` with every ``-`` replaced by ``_``; no
    other distribution-name to import-name mapping is attempted.

    Args:
        pkg: Name passed verbatim to ``pip install``.

    Raises:
        subprocess.CalledProcessError: if the pip command exits non-zero.
    """
    import importlib.util  # function-scope import so the cell's import list is untouched

    module_name = pkg.replace('-', '_')
    try:
        # find_spec asks the real import machinery, so built-in modules (which
        # pkgutil.iter_modules() never lists) are recognised, and sys.path is
        # not enumerated in full on every call.
        installed = importlib.util.find_spec(module_name) is not None
    except (ImportError, ValueError):
        # Raised for unresolvable dotted names or modules with a broken
        # __spec__; treat both as "not installed" and let pip decide.
        installed = False

    if not installed:
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', pkg])

# Packages this notebook relies on. Each one is handled independently so that a
# failure for one package does not prevent the remaining ones from being tried.
required_packages = ('torch', 'matplotlib', 'numpy', 'transformers', 'tqdm', 'requests')

for p in required_packages:
    try:
        pip_install(p)
    except Exception as e:
        # Deliberately broad: installation is best-effort, so only warn and move on.
        print(f'Warning: could not ensure install for {p}: {e}')

Input sequence -> Tokenisation -> Cache available?
- Yes -> Retrieve from cache -> Generate token
- No -> Compute KV pairs -> Store in cache -> Generate token

Note: the install cell above does not pin dependency versions; pip installs the latest available release of each missing package.

In [2]:
import math
import time
import os
import json
from pathlib import Path
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from tqdm import tqdm

# Run on the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cpu


In [3]:
import requests
# URL for the Shakespeare dataset
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
file_path = "input.txt"

# Only download if the file doesn't exist yet
if not os.path.exists(file_path):
    print("Downloading dataset...")
    # Bound the wait so a stalled connection cannot hang the notebook forever.
    response = requests.get(url, timeout=30)
    # Fail loudly on a 4xx/5xx response *before* the file is created; otherwise
    # the error page would be saved as input.txt and silently reused by the
    # exists-check above on every later run.
    response.raise_for_status()
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(response.text)
    print("Download complete.")
else:
    print("File already exists.")

File already exists.


In [4]:
# Read the whole corpus into memory as a single UTF-8 decoded string.
text = Path(file_path).read_text(encoding='utf-8')

print(f"Length of dataset in characters: {len(text)}")
print(f"First 100 characters:\n{text[:100]}")

Length of dataset in characters: 1115394
First 100 characters:
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [5]:
# The vocabulary is every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
# Show the joined vocabulary and its size, one per line.
print(''.join(chars), vocab_size, sep='\n')


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [6]:
# Lookup tables between characters and their positions in `chars`.
string_to_integer = dict(zip(chars, range(len(chars))))
integer_to_string = dict(enumerate(chars))


def encode(s):
    """Return the list of integer ids for the characters of ``s``.

    Raises KeyError for a character that is not in ``string_to_integer``.
    """
    return [string_to_integer[c] for c in s]


def decode(l):
    """Return the string formed by the characters for the integer ids in ``l``.

    Raises KeyError for an id that is not in ``integer_to_string``.
    """
    return ''.join(integer_to_string[i] for i in l)


# Round-trip check: encode a sample string, then decode it back.
print(encode("Hello"))
print(decode(encode("Hello")))

[20, 43, 50, 50, 53]
Hello


Self attention Recap

A minimal Transformer Block stack with multi-head self-attention

Shapes:
- Input tokens ```(B, T)```
- Embeddings ```(B, T, C)```
- Attention per head QKV ```(B, T, head_size)```
- Attention weights ```(B, T, T)``` with causal mask

This baseline recomputes keys/values for the entire prefix at every generation step, which becomes costly as T grows.

- B - **Batch size**: the number of independent text chunks processed at once
- T - **Sequence length**: the number of tokens (here, characters) in each chunk
- C - **Channels (embedding size)**: the length of each token's embedding vector (`embed_size` in the code below), not the vocabulary size

In [None]:
# Model hyperparameters, defined at module level; Head below reads
# block_size and embed_size as globals.

# Side length of the square lower-triangular `tril` mask buffer built in Head.
block_size = 256
# Input feature size of Head's key/query/value linear layers.
embed_size = 128
# Presumably the number of attention heads — not used in the visible code; confirm.
num_heads = 4
# Presumably the number of stacked blocks — not used in the visible code; confirm.
num_layers = 2
# Presumably the dropout probability (0.0 would disable it) — not used in the visible code; confirm.
dropout = 0.0

class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(embed_size, head_size, bias=False)
        self.query = nn.Linear(embed_size, head_size, bias=False)
        self.value = nn.Linear(embed_size, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.head_size = head_size

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        v = self.value(x)
        wei = q @ k.transpose(1, 2) / math.sqrt(self.head_size)