# GPT Model

From the Andrej Karpathy [Let's build GPT](https://www.youtube.com/watch?v=kCc8FmEb1nY) video.

Code for this lecture is mostly from https://github.com/karpathy/ng-video-lecture/tree/master.

In this notebook we explore ideas such as self-attention, starting from a toy example of causal (masked) averaging over previous tokens.

In [10]:
# A small demonstration of letting previous tokens influence the current token
import torch
from torch.nn import functional as F

torch.manual_seed(1337)

# Toy dimensions: batch size, sequence length (context), embedding dimension.
B, T, C = 4, 8, 32

# One random embedding vector per (batch element, position).
x = torch.randn(B, T, C)

# Causal mask: tril[i, j] == 1 exactly when j <= i, i.e. position j is not
# in the future of position i.
tril = torch.ones(T, T).tril()
print(tril)

# Attention-like weights in which time step i may attend only to positions
# 0..i (itself and earlier tokens).
# Every score starts at zero; the disallowed (future) positions are then set
# to -inf so that softmax gives them exactly zero probability. Because the
# remaining scores in a row are all equal, row i ends up as a uniform
# distribution over positions 0..i (each row sums to 1).
wei = torch.zeros(T, T).masked_fill(tril == 0, float('-inf'))
wei = wei.softmax(dim=-1)
print(wei)

# Mix the embeddings with those weights.
# wei is (T, T) and x is (B, T, C); matmul broadcasts wei over the batch
# dimension, so the result is (B, T, C).
# out[b, i, :] is the average of x[b, 0:i+1, :], weighted by row i of wei.
out = torch.matmul(wei, x)
print(out.shape)

# This is the simplest explicit way of letting information from previous
# tokens flow into the representation of the current time step
# (a toy version of causal attention).

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])
torch.Size([4, 8, 32])


In [17]:
import numpy as np

# --- Setup ---
np.random.seed(42)

num_nodes = 5
feature_dim = 4
d_k = d_v = 4

# Random features for each node
X = np.random.randn(num_nodes, feature_dim)

# Random projection matrices
W_Q = np.random.randn(feature_dim, d_k)
W_K = np.random.randn(feature_dim, d_k)
W_V = np.random.randn(feature_dim, d_v)

# Queries, Keys, Values
Q = X @ W_Q
K = X @ W_K
V = X @ W_V

# --- Step 1: Create a random directed adjacency matrix ---
# 0/1 edges, no self-loops for clarity
A = (np.random.rand(num_nodes, num_nodes) > 0.5).astype(int)
np.fill_diagonal(A, 0)  # remove self-loops

print("Adjacency matrix (directed graph):\n", A)

# --- Step 2: Compute attention scores only for existing edges ---
scores = Q @ K.T / np.sqrt(d_k)   # similarity matrix (N x N)

# Mask out non-edges: a score of -inf becomes a weight of exactly 0 after
# softmax, independent of how large or small the real scores are.
mask = (A == 0)
scores_masked = np.where(mask, -np.inf, scores)

# --- Step 3: Apply softmax row-wise (over neighbors only) ---
def softmax(x):
    """Numerically stable softmax over the last axis, tolerant of masked rows.

    Parameters
    ----------
    x : array_like
        Scores. Entries equal to ``-inf`` are treated as masked out and get
        weight 0.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``x``. Each slice along the last
        axis sums to 1, except slices that are entirely ``-inf`` (e.g. a node
        with no outgoing edges), which are returned as all zeros.
    """
    x = np.asarray(x, dtype=float)
    # Subtract the per-row maximum before exponentiating so np.exp cannot
    # overflow. `initial` keeps this well-defined for an empty last axis.
    row_max = x.max(axis=-1, keepdims=True, initial=-np.inf)
    # A fully masked row has max == -inf; shifting by it would compute
    # (-inf) - (-inf) = nan, so shift such rows by 0 instead.
    shift = np.where(np.isneginf(row_max), 0.0, row_max)
    e_x = np.exp(x - shift)
    denom = e_x.sum(axis=-1, keepdims=True)
    # Divide only where the row has mass; fully masked rows keep the zeros
    # from `out` instead of producing 0/0.
    return np.divide(e_x, denom, out=np.zeros_like(e_x), where=denom > 0)

attention_weights = softmax(scores_masked)

print("\nAttention weights (edge weights):\n", attention_weights)

# --- Step 4: Message passing ---
out = attention_weights @ V   # aggregate messages from neighbors
print("\nOutput node features after attention:\n", out)

Adjacency matrix (directed graph):
 [[0 1 1 1 1]
 [0 0 0 0 0]
 [0 1 0 1 1]
 [0 0 1 0 0]
 [0 0 1 1 0]]

Attention weights (edge weights):
 [[0.         0.58686424 0.11015243 0.29349828 0.00948504]
 [0.         0.         0.         0.         0.        ]
 [0.         0.29790782 0.         0.14780649 0.5542857 ]
 [0.         0.         1.         0.         0.        ]
 [0.         0.         0.75252504 0.24747496 0.        ]]

Output node features after attention:
 [[ 0.62674184  0.35085896 -0.7575579  -0.94982768]
 [ 0.          0.          0.          0.        ]
 [ 0.17279806 -1.11993955 -0.30863806 -1.02376616]
 [-0.29388799 -1.00053631  0.24193489  0.17904783]
 [ 0.22715068 -0.67936369  0.56923429  0.0995786 ]]


In [2]:
import numpy as np

# --- Setup ---
# NOTE: this graph setup repeats the seed and draw order of the
# graph-attention cell. None of X, W_Q, W_K, W_V or A is used by the
# causal-averaging code further down in this cell.
np.random.seed(42)

num_nodes = 5
feature_dim = 4
d_k = d_v = 4

# Random features for each node
X = np.random.randn(num_nodes, feature_dim)

# Random projection matrices
W_Q = np.random.randn(feature_dim, d_k)
W_K = np.random.randn(feature_dim, d_k)
W_V = np.random.randn(feature_dim, d_v)


# --- Step 1: Create a random directed adjacency matrix ---
# 0/1 edges, no self-loops for clarity
A = (np.random.rand(num_nodes, num_nodes) > 0.5).astype(int)
np.fill_diagonal(A, 0)  # remove self-loops

# NOTE: `T` and `x` are not defined in this cell; they must already exist in
# the kernel (the torch demo cell defines T = 8 and x of shape (B, T, C)).

# lower-triangular matrix of ones: tril[i,j] == 1 for j <= i
tril = np.tril(np.ones((T, T)))
print(tril)

# Build an attention-like weight matrix: score 0 for allowed positions
# (j <= i) and -inf for future positions (j > i).
wei = np.where(tril == 0, -np.inf, 0.0)

# Softmax along the last dimension.
# Subtract the row max for numerical stability. The diagonal is never masked,
# so every row max is finite and no NaNs can arise.
exp_wei = np.exp(wei - wei.max(axis=-1, keepdims=True))
wei = exp_wei / exp_wei.sum(axis=-1, keepdims=True)
print(wei)

# Apply the weights to the token embeddings
# (T, T) @ (B, T, C) → (B, T, C) via broadcasting.
# Convert `x` to a NumPy array explicitly rather than relying on implicit
# coercion when a NumPy array is multiplied with an object from another
# library (np.asarray is a no-op if x is already an ndarray).
out = wei @ np.asarray(x)
print(out.shape)


[[1. 0. 0. 0. 0. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0. 0. 0.]
 [1. 1. 1. 0. 0. 0. 0. 0.]
 [1. 1. 1. 1. 0. 0. 0. 0.]
 [1. 1. 1. 1. 1. 0. 0. 0.]
 [1. 1. 1. 1. 1. 1. 0. 0.]
 [1. 1. 1. 1. 1. 1. 1. 0.]
 [1. 1. 1. 1. 1. 1. 1. 1.]]
[[1.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.5        0.5        0.         0.         0.         0.
  0.         0.        ]
 [0.33333333 0.33333333 0.33333333 0.         0.         0.
  0.         0.        ]
 [0.25       0.25       0.25       0.25       0.         0.
  0.         0.        ]
 [0.2        0.2        0.2        0.2        0.2        0.
  0.         0.        ]
 [0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667
  0.         0.        ]
 [0.14285714 0.14285714 0.14285714 0.14285714 0.14285714 0.14285714
  0.14285714 0.        ]
 [0.125      0.125      0.125      0.125      0.125      0.125
  0.125      0.125     ]]
(4, 8, 32)
