In [4]:
# Character-level tokenisation

# Step 1: The raw text we want to tokenise
text = "hello world"

# Step 2: The vocabulary is the sorted collection of distinct characters
vocab = sorted(set(text))
print("Vocabulary:", vocab)

# Step 3: Build the two lookup tables (int → char first, then its inverse)
int_to_char = dict(enumerate(vocab))
char_to_int = {symbol: index for index, symbol in int_to_char.items()}

print("\nCharacter to Integer mapping:")
print(char_to_int)

# Step 4: Encode (text → integers) by looking up every character in turn
encoded = list(map(char_to_int.__getitem__, text))
print("\nEncoded sequence:", encoded)

# Step 5: Decode (integers → text) by mapping the ids back and joining
decoded = ''.join(int_to_char[index] for index in encoded)
print("\nDecoded text:", decoded)


Vocabulary: [' ', 'd', 'e', 'h', 'l', 'o', 'r', 'w']

Character to Integer mapping:
{' ': 0, 'd': 1, 'e': 2, 'h': 3, 'l': 4, 'o': 5, 'r': 6, 'w': 7}

Encoded sequence: [3, 2, 4, 4, 5, 0, 7, 5, 6, 4, 1]

Decoded text: hello world


In [5]:
# Bigram Model

import torch

# --- Count bigrams (pairs of consecutive chars) ---
vocab_size = len(vocab)
# Count matrix (vocab_size x vocab_size):
# N[i, j] = number of times character j immediately follows character i
N = torch.zeros((vocab_size, vocab_size), dtype=torch.int32)
for ch1, ch2 in zip(encoded, encoded[1:]):
    N[ch1, ch2] += 1
print("Bigram count matrix:\n", N)


# --- Convert counts → probabilities ---
# Add-one smoothing: a character that is never followed by anything
# (e.g. the final character of the text) would otherwise have an all-zero
# row, and normalising that row would divide by zero.
P = (N + 1).float()
P /= P.sum(1, keepdim=True)  # each row now sums to 1
print("\nProbability matrix:\n", P)


# --- Generate text using the bigram model ---
# A dedicated, seeded generator keeps the sample reproducible under
# "Restart Kernel → Run All" without reseeding torch's global RNG.
sample_seed = 0
sampler = torch.Generator().manual_seed(sample_seed)

# Start with a random character
ix = torch.randint(0, vocab_size, (1,), generator=sampler).item()
out = [ix]
for _ in range(20):  # generate 20 more characters
    # Row ix of P is the distribution over the character that follows ix
    probs = P[ix]
    ix = torch.multinomial(probs, num_samples=1, generator=sampler).item()
    out.append(ix)
# Decode indices to characters
generated_text = ''.join(int_to_char[i] for i in out)
print("\nGenerated text:", generated_text)

Bigram count matrix:
 tensor([[0, 0, 0, 0, 0, 0, 0, 1],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 1, 1, 0, 0],
        [1, 0, 0, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 1, 0, 0]], dtype=torch.int32)

Probability matrix:
 tensor([[0.1111, 0.1111, 0.1111, 0.1111, 0.1111, 0.1111, 0.1111, 0.2222],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250],
        [0.1111, 0.1111, 0.1111, 0.1111, 0.2222, 0.1111, 0.1111, 0.1111],
        [0.1111, 0.1111, 0.2222, 0.1111, 0.1111, 0.1111, 0.1111, 0.1111],
        [0.0909, 0.1818, 0.0909, 0.0909, 0.1818, 0.1818, 0.0909, 0.0909],
        [0.2000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.2000, 0.1000],
        [0.1111, 0.1111, 0.1111, 0.1111, 0.2222, 0.1111, 0.1111, 0.1111],
        [0.1111, 0.1111, 0.1111, 0.1111, 0.1111, 0.2222, 0.1111, 0.1111]])

Generated text: elredeloo lrdwrrwd  w


In [6]:
# Token Embeddings

import torch
text_size = len(text)      # sequence length: number of characters in the text
vocab_size = len(vocab)    # number of distinct token ids
embedding_dim = 8
# Create token embedding table.
# The table is indexed by token id, so it needs one row per vocabulary
# entry (vocab_size), not one row per position in the text (text_size).
token_embedding_table = torch.nn.Embedding(vocab_size, embedding_dim)
# Convert encoded list to tensor (Embedding expects integer indices)
tokens = torch.tensor(encoded, dtype=torch.long)
# Lookup embeddings: one embedding_dim-sized row per token in the sequence
token_embeds = token_embedding_table(tokens)
print("\nToken Embeddings shape:", token_embeds.shape)
print(token_embeds)



Token Embeddings shape: torch.Size([11, 8])
tensor([[ 0.1138,  2.9655,  0.0813, -0.0235, -1.3065,  1.4957, -1.0961, -0.0795],
        [ 0.7026,  0.6871, -0.4664,  0.9837,  1.9989,  0.6514,  0.5614, -0.6657],
        [ 1.4388, -1.5915,  0.0490, -0.3316,  0.3347, -0.6483,  0.5391,  0.8519],
        [ 1.4388, -1.5915,  0.0490, -0.3316,  0.3347, -0.6483,  0.5391,  0.8519],
        [ 0.2852,  0.2791,  1.7491,  0.3668, -0.7457,  0.9607,  2.3469, -0.6264],
        [-1.0757, -0.8299, -2.3622, -0.2736, -0.1199, -1.0613, -0.6246,  0.9568],
        [-1.3614, -0.1579,  0.5192, -0.6509, -0.2592,  0.1960,  1.0490, -1.1486],
        [ 0.2852,  0.2791,  1.7491,  0.3668, -0.7457,  0.9607,  2.3469, -0.6264],
        [ 1.5455,  0.5859, -0.4391,  0.6140,  1.0550,  1.5842, -0.4713, -0.7610],
        [ 1.4388, -1.5915,  0.0490, -0.3316,  0.3347, -0.6483,  0.5391,  0.8519],
        [-1.7192, -0.9628,  1.2277,  1.1895, -0.7530, -0.1038,  0.4013,  0.2315]],
       grad_fn=<EmbeddingBackward0>)


In [7]:
# Positional Embeddings

# Number of positions the table can represent
# (11 here, matching the sequence length)
block_size = 11
# One embedding vector per position index
position_embedding_table = torch.nn.Embedding(
    num_embeddings=block_size,
    embedding_dim=embedding_dim,
)
# Position ids 0 .. block_size - 1, looked up through the table
positions = torch.arange(0, block_size)
position_embeds = position_embedding_table(positions)
print("\nPosition Embeddings shape:", position_embeds.size())
print(position_embeds)


Position Embeddings shape: torch.Size([11, 8])
tensor([[-0.1842,  1.3653, -0.6117, -0.3598, -1.2056,  2.5702,  0.8442,  1.3253],
        [ 1.9362,  0.6216,  0.0669, -0.4296,  1.6063, -0.3118,  0.7864, -0.1483],
        [-1.0733, -0.0065,  0.5474,  0.2650,  0.3834, -1.7651, -0.5644,  0.3843],
        [ 0.4803, -0.4645,  0.1013, -0.1288,  0.1663, -1.5774, -2.2762, -0.0731],
        [-0.4474,  0.0143, -2.0777,  0.1586, -0.3498, -0.6126, -0.4383,  0.5347],
        [-1.3115, -0.0695, -0.4642,  1.3140, -0.6404,  1.6769,  0.3999, -1.3250],
        [-0.5935,  0.0769, -0.5989,  2.1621,  0.2156, -0.9815,  0.2800,  0.5814],
        [ 0.9430,  1.1100,  0.8671,  0.1920, -1.4802,  1.5407, -0.0635, -0.4202],
        [ 0.1060,  0.1448,  1.0616, -2.7503, -0.2745,  0.9843,  1.5009, -0.1337],
        [-0.1165, -0.6092,  0.9844,  0.1909,  0.1043, -0.0257,  2.0617, -0.7046],
        [-1.1790,  0.0265, -0.7522, -1.0726,  0.9736,  1.1765, -1.0006, -0.0236]],
       grad_fn=<EmbeddingBackward0>)


In [8]:
# Combine Token and Position Embeddings

# Element-wise sum: each row carries both "which character" and "where"
combined = torch.add(token_embeds, position_embeds)
print("\nCombined Embeddings shape:", combined.size())
print(combined)



Combined Embeddings shape: torch.Size([11, 8])
tensor([[-0.0704,  4.3308, -0.5304, -0.3833, -2.5121,  4.0660, -0.2519,  1.2458],
        [ 2.6387,  1.3087, -0.3995,  0.5542,  3.6052,  0.3396,  1.3477, -0.8140],
        [ 0.3655, -1.5980,  0.5964, -0.0665,  0.7182, -2.4134, -0.0254,  1.2362],
        [ 1.9192, -2.0560,  0.1503, -0.4603,  0.5010, -2.2257, -1.7371,  0.7788],
        [-0.1622,  0.2934, -0.3286,  0.5253, -1.0955,  0.3482,  1.9086, -0.0916],
        [-2.3872, -0.8995, -2.8265,  1.0404, -0.7603,  0.6156, -0.2246, -0.3682],
        [-1.9548, -0.0810, -0.0798,  1.5111, -0.0436, -0.7855,  1.3291, -0.5672],
        [ 1.2282,  1.3892,  2.6162,  0.5588, -2.2259,  2.5014,  2.2834, -1.0466],
        [ 1.6515,  0.7306,  0.6225, -2.1363,  0.7805,  2.5685,  1.0296, -0.8948],
        [ 1.3223, -2.2006,  1.0334, -0.1407,  0.4390, -0.6740,  2.6007,  0.1473],
        [-2.8982, -0.9362,  0.4754,  0.1170,  0.2207,  1.0726, -0.5993,  0.2079]],
       grad_fn=<AddBackward0>)


In [9]:
# One Head of Self-Attention

import torch
import torch.nn as nn
import torch.nn.functional as F

# 'combined' comes from the previous step: (sequence_length, embedding_dim)
print("Input shape to attention:", combined.shape)

# Step 1: Head size (usually embedding_dim / num_heads)
head_size = 8  # a single head here, so it matches embedding_dim = 8

# Step 2: Bias-free linear projections for Key, Query and Value
key = nn.Linear(embedding_dim, head_size, bias=False)
query = nn.Linear(embedding_dim, head_size, bias=False)
value = nn.Linear(embedding_dim, head_size, bias=False)

# Step 3: Project the input into K, Q, V
K, Q, V = key(combined), query(combined), value(combined)

print("\nShapes -> K:", K.shape, "| Q:", Q.shape, "| V:", V.shape)

# Step 4: Scaled attention scores, (Q × Kᵀ) / sqrt(head_size)
# Result shape: (seq_len, seq_len); the scaling keeps the values small
att_scores = torch.matmul(Q, K.T) / (head_size ** 0.5)

# Step 5: Softmax across each row turns scores into attention weights
att_weights = F.softmax(att_scores, dim=-1)
print("\nAttention Weights shape:", att_weights.shape)

# Step 6: Weighted sum of the values, shape (seq_len, head_size)
out = torch.matmul(att_weights, V)

print("\nOutput shape:", out.shape)
print("\nOutput (first 2 tokens):\n", out[:2])


Input shape to attention: torch.Size([11, 8])

Shapes -> K: torch.Size([11, 8]) | Q: torch.Size([11, 8]) | V: torch.Size([11, 8])

Attention Weights shape: torch.Size([11, 11])

Output shape: torch.Size([11, 8])

Output (first 2 tokens):
 tensor([[ 0.4840,  0.3893, -0.4222,  0.1422,  0.4171,  0.5142, -0.1053,  0.1113],
        [-0.6188,  0.0931,  0.3657, -0.2021, -0.4161, -0.2850,  0.1374, -0.1526]],
       grad_fn=<SliceBackward0>)


In [10]:
# Multi-Head Self-Attention

# Step 1: Hyper-parameters
embedding_dim = 8
num_heads = 4
head_size = embedding_dim // num_heads  # 8 // 4 = 2 dimensions per head

# Step 2: One projection per role, each covering *all heads at once*
# (created in the order key, query, value)
key, query, value = (
    nn.Linear(embedding_dim, embedding_dim, bias=False) for _ in range(3)
)

# Step 3: Project the input; each result is (11, 8)
K, Q, V = (layer(combined) for layer in (key, query, value))

# Step 4: Split into heads
# target layout: (num_heads, seq_len, head_size)
def split_heads(x, n_heads=None, size=None):
    """Split the feature dimension of ``x`` into separate attention heads.

    Args:
        x: tensor of shape (seq_len, n_heads * size).
        n_heads: number of heads; defaults to the notebook-level ``num_heads``.
        size: features per head; defaults to the notebook-level ``head_size``.

    Returns:
        Tensor of shape (n_heads, seq_len, size) where ``result[h, t]`` equals
        ``x[t, h * size:(h + 1) * size]``, i.e. head ``h`` sees its own slice
        of the features of *every* token.
    """
    n_heads = num_heads if n_heads is None else n_heads
    size = head_size if size is None else size
    # Split the last dimension first, then move the head axis to the front.
    # Viewing directly as (n_heads, -1, size) would only re-chunk the flat
    # buffer, so a "head" would mix features belonging to different tokens.
    return x.view(-1, n_heads, size).transpose(0, 1)

K_heads, Q_heads, V_heads = split_heads(K), split_heads(Q), split_heads(V)

print("\nK_heads shape:", K_heads.shape)  # (4, 11, 2)

# Step 5: Scaled dot-product attention, one head at a time
scale = head_size ** 0.5
outputs = []
for q_h, k_h, v_h in zip(Q_heads, K_heads, V_heads):
    att_scores = (q_h @ k_h.T) / scale            # (11, 11)
    att_weights = F.softmax(att_scores, dim=-1)   # each row sums to 1
    out = att_weights @ v_h                       # (11, head_size)
    outputs.append(out)

# Step 6: Stitch the heads back together along the feature axis
multi_head_out = torch.cat(outputs, dim=1)  # (11, embedding_dim)

# Step 7: Final linear projection (optional but standard)
proj = nn.Linear(embedding_dim, embedding_dim)
final_out = proj(multi_head_out)

print("\nMulti-Head Output shape:", final_out.shape)
print("\nOutput for first 2 tokens:\n", final_out[:2])



K_heads shape: torch.Size([4, 11, 2])

Multi-Head Output shape: torch.Size([11, 8])

Output for first 2 tokens:
 tensor([[-0.1294, -0.2590,  0.1444,  0.0705, -0.3831, -0.1154,  0.9859,  0.1703],
        [ 0.0533,  0.3320,  0.3313, -0.0706, -0.5334,  0.0895,  0.5451,  0.1226]],
       grad_fn=<SliceBackward0>)
