In [4]:
import os
import sys
from pathlib import Path

# Last-resort location, kept so the notebook keeps working unchanged on the
# machine it was originally written on.
DEFAULT_PROJECT_ROOT = "/Users/sultan/DataScience/LLM-From-Scratch-Project"


def find_project_root(default=DEFAULT_PROJECT_ROOT):
    """Return the directory that must be on ``sys.path`` for ``src.*`` imports.

    Resolution order:

    1. The ``LLM_PROJECT_ROOT`` environment variable, when set and non-empty.
    2. The nearest directory at or above the current working directory that
       contains a ``src/model`` sub-directory (the package this notebook
       imports from).
    3. ``default``.

    Args:
        default: Path returned when neither of the first two steps succeeds.

    Returns:
        The project root as a string.
    """
    override = os.environ.get("LLM_PROJECT_ROOT")
    if override:
        return override

    cwd = Path.cwd().resolve()
    for candidate in (cwd, *cwd.parents):
        if (candidate / "src" / "model").is_dir():
            return str(candidate)

    return default


# Add the project root to the Python path (only once, so re-running the cell
# does not pile up duplicate entries).
PROJECT_ROOT = find_project_root()
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

import torch

from src.model.attention import (
    create_causal_mask,
    scaled_dot_product_attention,
    MultiHeadAttention,
)

from src.model.layers import (
    TokenEmbedding,
    PositionalEmbedding,
    FeedForward,
    LayerNorm,
)

# Seed PyTorch's global RNG once, up front: every later cell draws random
# inputs and randomly initialises modules, so without this the printed
# numbers change on every "Restart & Run All".
SEED = 0
torch.manual_seed(SEED)

# Record which interpreter and torch build produced the outputs below.
print("Python exe:", sys.executable)
print("Torch version:", torch.__version__)

Python exe: /Users/sultan/DataScience/LLM-From-Scratch-Project/.venv/bin/python
Torch version: 2.9.1


In [5]:
# Single-head toy example, small enough to read the whole attention matrix.
batch_size, num_heads, seq_len, head_dim = 1, 1, 4, 2

q = torch.randn(batch_size, num_heads, seq_len, head_dim)
k = torch.randn(batch_size, num_heads, seq_len, head_dim)
v = torch.randn(batch_size, num_heads, seq_len, head_dim)

mask = create_causal_mask(seq_len, device=q.device)

out, attn = scaled_dot_product_attention(q, k, v, mask=mask)

# Check the properties this cell is meant to demonstrate instead of relying
# on reading them off the printout.
assert out.shape == q.shape, "attention output must keep the query shape"
assert attn.shape == (batch_size, num_heads, seq_len, seq_len)
# Causality: no query position may put weight on a later key position,
# i.e. everything strictly above the diagonal is exactly zero.
assert torch.all(torch.triu(attn, diagonal=1) == 0), "attention leaks into the future"
# Each row of the attention matrix should be a probability distribution.
row_sums = attn.sum(dim=-1)
assert torch.allclose(row_sums, torch.ones_like(row_sums)), "attention rows must sum to 1"

print("q shape:", q.shape)
print("k shape:", k.shape)
print("v shape:", v.shape)
print("out shape:", out.shape)
print("attn shape:", attn.shape)
print("\nCausal mask (0 = futuro bloqueado):")
print(mask[0, 0].int())
print("\nAttention matrix (head 0):")
print(attn[0, 0])

q shape: torch.Size([1, 1, 4, 2])
k shape: torch.Size([1, 1, 4, 2])
v shape: torch.Size([1, 1, 4, 2])
out shape: torch.Size([1, 1, 4, 2])
attn shape: torch.Size([1, 1, 4, 4])

Causal mask (0 = futuro bloqueado):
tensor([[1, 0, 0, 0],
        [1, 1, 0, 0],
        [1, 1, 1, 0],
        [1, 1, 1, 1]], dtype=torch.int32)

Attention matrix (head 0):
tensor([[1.0000, 0.0000, 0.0000, 0.0000],
        [0.5846, 0.4154, 0.0000, 0.0000],
        [0.2029, 0.6149, 0.1822, 0.0000],
        [0.1886, 0.0650, 0.1352, 0.6112]])


In [6]:
# Multi-head self-attention on a random batch, restricted by a causal mask.
batch_size = 2
seq_len = 5
embed_dim = 8
num_heads = 2

# The input is drawn before the module is built; the order is kept so the
# random number stream is consumed in the same sequence as before.
x = torch.randn((batch_size, seq_len, embed_dim))
mha = MultiHeadAttention(embed_dim=embed_dim, num_heads=num_heads)

mask = create_causal_mask(seq_len, device=x.device)
out, attn = mha(x, mask=mask)

# One "<label> <shape>" line per tensor.
print(
    *(
        f"{label} {tensor.shape}"
        for label, tensor in (
            ("Input shape:", x),
            ("Output shape:", out),
            ("Attention shape:", attn),
        )
    ),
    sep="\n",
)
print("\nAttention matrix (batch 0, head 0):", attn[0, 0], sep="\n")

Input shape: torch.Size([2, 5, 8])
Output shape: torch.Size([2, 5, 8])
Attention shape: torch.Size([2, 2, 5, 5])

Attention matrix (batch 0, head 0):
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3857, 0.6143, 0.0000, 0.0000, 0.0000],
        [0.2425, 0.3793, 0.3782, 0.0000, 0.0000],
        [0.2708, 0.1764, 0.2611, 0.2917, 0.0000],
        [0.2018, 0.2011, 0.1663, 0.2144, 0.2164]], grad_fn=<SelectBackward0>)


In [7]:
# Token and positional embeddings for a random batch of token ids.
vocab_size, max_seq_len, embed_dim = 50, 16, 8
batch_size = 2
seq_len = 10

# Random ids in [0, vocab_size).
ids = torch.randint(low=0, high=vocab_size, size=(batch_size, seq_len))

tok_emb = TokenEmbedding(vocab_size, embed_dim)
pos_emb = PositionalEmbedding(max_seq_len, embed_dim)

# Both embedding modules are called with the same id tensor; their outputs
# are added element-wise.
t, p = tok_emb(ids), pos_emb(ids)
s = t + p

# One "<label> <shape>" line per tensor.
print(
    *(
        f"{label} {tensor.shape}"
        for label, tensor in (
            ("Token emb shape:", t),
            ("Pos emb shape:", p),
            ("Sum shape:", s),
        )
    ),
    sep="\n",
)
# Embedding vectors at batch index 0, position 0.
print("\nExample token embedding[0,0]:", t[0, 0])
print("Example pos embedding[0,0]:", p[0, 0])

Token emb shape: torch.Size([2, 10, 8])
Pos emb shape: torch.Size([2, 10, 8])
Sum shape: torch.Size([2, 10, 8])

Example token embedding[0,0]: tensor([ 1.3288, -2.4420,  1.1842, -0.8649, -6.8487,  0.9799,  0.6968, -0.2026],
       grad_fn=<SelectBackward0>)
Example pos embedding[0,0]: tensor([ 1.3486, -2.1934,  0.7030,  0.8502, -0.6056, -0.5264, -0.4830, -1.2382],
       grad_fn=<SelectBackward0>)


In [8]:
# Feed-forward block followed by LayerNorm on a random input.
batch_size = 2
seq_len = 5
d_model = 8

x = torch.randn((batch_size, seq_len, d_model))

ff = FeedForward(d_model)
ln = LayerNorm(d_model)

y = ff(x)
z = ln(y)

# One "<label> <shape>" line per tensor.
print(
    *(
        f"{label} {tensor.shape}"
        for label, tensor in (
            ("Input shape:", x),
            ("FFN output shape:", y),
            ("LayerNorm output shape:", z),
        )
    ),
    sep="\n",
)

# Optional: compare mean and (population) standard deviation of the first
# token's feature vector before and after LayerNorm.
print("\nMean over last dim before LN (first token):", torch.mean(y[0, 0]).item())
print("Std over last dim before LN (first token):", torch.std(y[0, 0], unbiased=False).item())

print("\nMean over last dim after LN (first token):", torch.mean(z[0, 0]).item())
print("Std over last dim after LN (first token):", torch.std(z[0, 0], unbiased=False).item())

Input shape: torch.Size([2, 5, 8])
FFN output shape: torch.Size([2, 5, 8])
LayerNorm output shape: torch.Size([2, 5, 8])

Mean over last dim before LN (first token): -0.17933684587478638
Std over last dim before LN (first token): 0.2778705060482025

Mean over last dim after LN (first token): 1.4901161193847656e-08
Std over last dim after LN (first token): 0.9999353289604187


In [9]:
# Mini pipeline: token ids -> embeddings -> causal multi-head attention ->
# feed-forward, with a residual connection and a LayerNorm after each of the
# two sub-layers.

vocab_size, max_seq_len = 50, 16
embed_dim, num_heads = 8, 2
batch_size, seq_len = 2, 10

ids = torch.randint(low=0, high=vocab_size, size=(batch_size, seq_len))

# Modules are built in the same order as before so random initialisation
# consumes the RNG stream identically.
tok_emb = TokenEmbedding(vocab_size, embed_dim)
pos_emb = PositionalEmbedding(max_seq_len, embed_dim)
mha = MultiHeadAttention(embed_dim=embed_dim, num_heads=num_heads)
ff = FeedForward(embed_dim)
ln1 = LayerNorm(embed_dim)
ln2 = LayerNorm(embed_dim)

x = tok_emb(ids) + pos_emb(ids)
mask = create_causal_mask(seq_len, device=x.device)

# Sub-layer 1: attention, residual add, then normalise.
att_out, att_weights = mha(x, mask=mask)
x = ln1(x + att_out)

# Sub-layer 2: feed-forward, residual add, then normalise.
ff_out = ff(x)
x = ln2(x + ff_out)

print("Final output shape:", x.shape)
print("Attention weights shape:", att_weights.shape)

Final output shape: torch.Size([2, 10, 8])
Attention weights shape: torch.Size([2, 2, 10, 10])
