In [2]:
# Environment sanity check: report which Python interpreter the kernel runs on
# and which torch build it imports.
import sys
import torch

for env_info in (sys.executable, torch.__version__):
    print(env_info)

/Users/sultan/DataScience/LLM-From-Scratch-Project/.venv/bin/python
2.9.1


In [3]:
import sys
from pathlib import Path

import torch

# The project imports below are absolute (`src.model...`), so the directory that
# contains `src/` must be on sys.path. When the kernel's working directory is a
# subfolder of the project, it is not, and the imports fail with
# "ModuleNotFoundError: No module named 'src'".
# Walk upward from the working directory until a folder holding `src/model` is
# found and register it. If nothing is found, sys.path is left untouched and the
# imports raise the usual ModuleNotFoundError. Re-running the cell is harmless
# because the path is only inserted once.
_project_root = next(
    (
        candidate
        for candidate in (Path.cwd(), *Path.cwd().parents)
        if (candidate / "src" / "model").is_dir()
    ),
    None,
)
if _project_root is not None and str(_project_root) not in sys.path:
    sys.path.insert(0, str(_project_root))

from src.model.attention import (
    create_causal_mask,
    scaled_dot_product_attention,
    MultiHeadAttention,
)

from src.model.layers import (
    TokenEmbedding,
    PositionalEmbedding,
    FeedForward,
    LayerNorm,
)

# Last expression of the cell: displays the torch version as the cell output.
torch.__version__

ModuleNotFoundError: No module named 'src'

In [None]:
# Demo: run scaled_dot_product_attention on a tiny random q/k/v with a causal mask
# and print the resulting shapes, the mask, and the attention weights.
batch_size = 1
num_heads = 1
seq_len = 4
head_dim = 2

# q, k and v share one shape; they are drawn in that order from the global RNG.
qkv_shape = (batch_size, num_heads, seq_len, head_dim)
q, k, v = (torch.randn(qkv_shape) for _ in range(3))

# Build the mask on the same device as the inputs.
mask = create_causal_mask(seq_len, device=q.device)

out, attn = scaled_dot_product_attention(q, k, v, mask=mask)

for caption, tensor in (
    ("q shape:", q),
    ("k shape:", k),
    ("v shape:", v),
    ("out shape:", out),
    ("attn shape:", attn),
):
    print(caption, tensor.shape)

# The caption below is Spanish for "0 = future blocked".
print("\nCausal mask (0 = futuro bloqueado):")
print(mask[0, 0].int())

print("\nAttention matrix (head 0):")
print(attn[0, 0])

In [None]:
# Demo: feed a random batch through MultiHeadAttention with a causal mask and
# print the input/output/attention shapes plus one attention matrix.
batch_size = 2
seq_len = 5
embed_dim = 8
num_heads = 2

# The input is sampled before the module is constructed (same order as before,
# so RNG consumption is unchanged).
x = torch.randn(batch_size, seq_len, embed_dim)
mha = MultiHeadAttention(embed_dim=embed_dim, num_heads=num_heads)

mask = create_causal_mask(seq_len, device=x.device)

# The module returns a pair: the attended output and the attention weights.
out, attn = mha(x, mask=mask)

for caption, tensor in (
    ("Input shape:", x),
    ("Output shape:", out),
    ("Attention shape:", attn),
):
    print(caption, tensor.shape)

print("\nAttention matrix (batch 0, head 0):")
print(attn[0, 0])

In [None]:
# Demo: embed a random batch of token ids with TokenEmbedding and
# PositionalEmbedding, add the two, and print shapes and one example vector each.
vocab_size, max_seq_len, embed_dim = 50, 16, 8
batch_size = 2
seq_len = 10

# Random ids in [0, vocab_size), one row per batch element.
ids = torch.randint(0, vocab_size, (batch_size, seq_len))

tok_emb = TokenEmbedding(vocab_size, embed_dim)
pos_emb = PositionalEmbedding(max_seq_len, embed_dim)

# Both embedding modules are called with the same ids tensor.
t = tok_emb(ids)
p = pos_emb(ids)
s = t + p

for caption, tensor in (
    ("Token emb shape:", t),
    ("Pos emb shape:", p),
    ("Sum shape:", s),
):
    print(caption, tensor.shape)

print("\nExample token embedding[0,0]:", t[0, 0])
print("Example pos embedding[0,0]:", p[0, 0])

In [None]:
# Demo: pass a random batch through FeedForward and then LayerNorm, print the
# shapes, and compare first-token statistics before and after normalization.
batch_size = 2
seq_len = 5
d_model = 8

x = torch.randn(batch_size, seq_len, d_model)

ff = FeedForward(d_model)
ln = LayerNorm(d_model)

y = ff(x)
z = ln(y)

for caption, tensor in (
    ("Input shape:", x),
    ("FFN output shape:", y),
    ("LayerNorm output shape:", z),
):
    print(caption, tensor.shape)

# Optional: inspect the mean and (population) standard deviation of the first
# token's feature vector, before and after LayerNorm.
# NOTE(review): presumably close to 0 and 1 after LN — confirm against LayerNorm.
for mean_caption, std_caption, activations in (
    (
        "\nMean over last dim before LN (first token):",
        "Std over last dim before LN (first token):",
        y,
    ),
    (
        "\nMean over last dim after LN (first token):",
        "Std over last dim after LN (first token):",
        z,
    ),
):
    first_token = activations[0, 0]
    print(mean_caption, first_token.mean().item())
    print(std_caption, first_token.std(unbiased=False).item())

In [None]:
# Mini pipeline: ids -> embeddings -> MHA -> FFN + LN
#
# Chains every component together: token + positional embeddings, causal
# multi-head attention, then a feed-forward layer. Each sub-layer output is added
# back to its input (residual) and the sum is passed through a LayerNorm.

vocab_size, max_seq_len = 50, 16
embed_dim, num_heads = 8, 2
batch_size = 2
seq_len = 10

ids = torch.randint(0, vocab_size, (batch_size, seq_len))

# Modules are created in the same order as before so parameter initialisation
# consumes the RNG identically.
tok_emb = TokenEmbedding(vocab_size, embed_dim)
pos_emb = PositionalEmbedding(max_seq_len, embed_dim)
mha = MultiHeadAttention(embed_dim=embed_dim, num_heads=num_heads)
ff = FeedForward(embed_dim)
ln1 = LayerNorm(embed_dim)
ln2 = LayerNorm(embed_dim)

x = tok_emb(ids) + pos_emb(ids)

mask = create_causal_mask(seq_len, device=x.device)

# Attention sub-layer: residual add, then normalise.
att_out, att_weights = mha(x, mask=mask)
x = ln1(x + att_out)

# Feed-forward sub-layer: residual add, then normalise.
ff_out = ff(x)
x = ln2(x + ff_out)

print("Final output shape:", x.shape)
print("Attention weights shape:", att_weights.shape)