In [4]:
import sys

# Añadimos la raíz del proyecto al path de Python
PROJECT_ROOT = "/Users/sultan/DataScience/LLM-From-Scratch-Project"
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

import torch

from src.model.attention import (
    create_causal_mask,
    scaled_dot_product_attention,
    MultiHeadAttention,
)

from src.model.layers import (
    TokenEmbedding,
    PositionalEmbedding,
    FeedForward,
    LayerNorm,
)

print("Python exe:", sys.executable)
print("Torch version:", torch.__version__)

Python exe: /Users/sultan/DataScience/LLM-From-Scratch-Project/.venv/bin/python
Torch version: 2.9.1


In [5]:
batch_size, num_heads, seq_len, head_dim = 1, 1, 4, 2

q = torch.randn(batch_size, num_heads, seq_len, head_dim)
k = torch.randn(batch_size, num_heads, seq_len, head_dim)
v = torch.randn(batch_size, num_heads, seq_len, head_dim)

mask = create_causal_mask(seq_len, device=q.device)

out, attn = scaled_dot_product_attention(q, k, v, mask=mask)

print("q shape:", q.shape)
print("k shape:", k.shape)
print("v shape:", v.shape)
print("out shape:", out.shape)
print("attn shape:", attn.shape)
print("\nCausal mask (0 = futuro bloqueado):")
print(mask[0, 0].int())
print("\nAttention matrix (head 0):")
print(attn[0, 0])

q shape: torch.Size([1, 1, 4, 2])
k shape: torch.Size([1, 1, 4, 2])
v shape: torch.Size([1, 1, 4, 2])
out shape: torch.Size([1, 1, 4, 2])
attn shape: torch.Size([1, 1, 4, 4])

Causal mask (0 = futuro bloqueado):
tensor([[1, 0, 0, 0],
        [1, 1, 0, 0],
        [1, 1, 1, 0],
        [1, 1, 1, 1]], dtype=torch.int32)

Attention matrix (head 0):
tensor([[1.0000, 0.0000, 0.0000, 0.0000],
        [0.5846, 0.4154, 0.0000, 0.0000],
        [0.2029, 0.6149, 0.1822, 0.0000],
        [0.1886, 0.0650, 0.1352, 0.6112]])


In [6]:
batch_size, seq_len, embed_dim, num_heads = 2, 5, 8, 2

x = torch.randn(batch_size, seq_len, embed_dim)
mha = MultiHeadAttention(embed_dim=embed_dim, num_heads=num_heads)

mask = create_causal_mask(seq_len, device=x.device)

out, attn = mha(x, mask=mask)

print("Input shape:", x.shape)
print("Output shape:", out.shape)
print("Attention shape:", attn.shape)
print("\nAttention matrix (batch 0, head 0):")
print(attn[0, 0])

Input shape: torch.Size([2, 5, 8])
Output shape: torch.Size([2, 5, 8])
Attention shape: torch.Size([2, 2, 5, 5])

Attention matrix (batch 0, head 0):
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3857, 0.6143, 0.0000, 0.0000, 0.0000],
        [0.2425, 0.3793, 0.3782, 0.0000, 0.0000],
        [0.2708, 0.1764, 0.2611, 0.2917, 0.0000],
        [0.2018, 0.2011, 0.1663, 0.2144, 0.2164]], grad_fn=<SelectBackward0>)


In [7]:
vocab_size = 50
max_seq_len = 16
embed_dim = 8
batch_size, seq_len = 2, 10

ids = torch.randint(0, vocab_size, (batch_size, seq_len))

tok_emb = TokenEmbedding(vocab_size, embed_dim)
pos_emb = PositionalEmbedding(max_seq_len, embed_dim)

t = tok_emb(ids)
p = pos_emb(ids)
s = t + p

print("Token emb shape:", t.shape)
print("Pos emb shape:", p.shape)
print("Sum shape:", s.shape)
print("\nExample token embedding[0,0]:", t[0, 0])
print("Example pos embedding[0,0]:", p[0, 0])

Token emb shape: torch.Size([2, 10, 8])
Pos emb shape: torch.Size([2, 10, 8])
Sum shape: torch.Size([2, 10, 8])

Example token embedding[0,0]: tensor([ 1.3288, -2.4420,  1.1842, -0.8649, -6.8487,  0.9799,  0.6968, -0.2026],
       grad_fn=<SelectBackward0>)
Example pos embedding[0,0]: tensor([ 1.3486, -2.1934,  0.7030,  0.8502, -0.6056, -0.5264, -0.4830, -1.2382],
       grad_fn=<SelectBackward0>)


In [8]:
batch_size, seq_len, d_model = 2, 5, 8

x = torch.randn(batch_size, seq_len, d_model)

ff = FeedForward(d_model)
ln = LayerNorm(d_model)

y = ff(x)
z = ln(y)

print("Input shape:", x.shape)
print("FFN output shape:", y.shape)
print("LayerNorm output shape:", z.shape)

# Opcional: ver medias y desviaciones por posición
print("\nMean over last dim before LN (first token):", y[0, 0].mean().item())
print("Std over last dim before LN (first token):", y[0, 0].std(unbiased=False).item())

print("\nMean over last dim after LN (first token):", z[0, 0].mean().item())
print("Std over last dim after LN (first token):", z[0, 0].std(unbiased=False).item())

Input shape: torch.Size([2, 5, 8])
FFN output shape: torch.Size([2, 5, 8])
LayerNorm output shape: torch.Size([2, 5, 8])

Mean over last dim before LN (first token): -0.17933684587478638
Std over last dim before LN (first token): 0.2778705060482025

Mean over last dim after LN (first token): 1.4901161193847656e-08
Std over last dim after LN (first token): 0.9999353289604187


In [9]:
# Mini pipeline: ids -> embeddings -> MHA -> FFN + LN

vocab_size = 50
max_seq_len = 16
embed_dim = 8
num_heads = 2
batch_size, seq_len = 2, 10

ids = torch.randint(0, vocab_size, (batch_size, seq_len))

tok_emb = TokenEmbedding(vocab_size, embed_dim)
pos_emb = PositionalEmbedding(max_seq_len, embed_dim)
mha = MultiHeadAttention(embed_dim=embed_dim, num_heads=num_heads)
ff = FeedForward(embed_dim)
ln1 = LayerNorm(embed_dim)
ln2 = LayerNorm(embed_dim)

x = tok_emb(ids) + pos_emb(ids)

mask = create_causal_mask(seq_len, device=x.device)

att_out, att_weights = mha(x, mask=mask)
x = x + att_out            # residual 1
x = ln1(x)

ff_out = ff(x)
x = x + ff_out             # residual 2
x = ln2(x)

print("Final output shape:", x.shape)
print("Attention weights shape:", att_weights.shape)

Final output shape: torch.Size([2, 10, 8])
Attention weights shape: torch.Size([2, 2, 10, 10])


In [3]:
## Tokenizer sanity checks

In [8]:
pip install tokenizers


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/opt/homebrew/opt/python@3.10/bin/python3.10 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [22]:
import os
os.getcwd()

'/Users/sultan/DataScience/LLM-From-Scratch-Project/V2/notebooks'

In [21]:
from tokenizers import Tokenizer

TOKENIZER_PATH = "../models/tokenizers/oscar_bpe_v2/tokenizer.json"
tokenizer = Tokenizer.from_file(TOKENIZER_PATH)

type(tokenizer), tokenizer.__class__.__name__

(tokenizers.Tokenizer, 'Tokenizer')

In [23]:
import os

TOKENIZER_PATH = "../models/tokenizers/oscar_bpe_v2/tokenizer.json"
print("Exists:", os.path.exists(TOKENIZER_PATH))

Exists: True


In [24]:
type(tokenizer), tokenizer.__class__.__name__

(tokenizers.Tokenizer, 'Tokenizer')

In [27]:
text = "Un perro es un canino? Un gato es un felino. La Tierra es el 3er planeta del Sol."

enc = tokenizer.encode(text)

print("n_tokens:", len(enc.ids))
print("first_token_ids:", enc.ids[:30])
print("first_tokens:", enc.tokens[:30])

decoded = tokenizer.decode(enc.ids)
print("decoded:", decoded)

n_tokens: 29
first_token_ids: [3669, 374, 299, 272, 289, 805, 767, 36, 655, 314, 70, 274, 272, 289, 2634, 767, 19, 425, 332, 3634, 272, 275, 461, 245, 1040, 1551, 321, 1468, 19]
first_tokens: ['Un', 'Ġper', 'ro', 'Ġes', 'Ġun', 'Ġcan', 'ino', '?', 'ĠUn', 'Ġg', 'a', 'to', 'Ġes', 'Ġun', 'Ġfel', 'ino', '.', 'ĠLa', 'ĠT', 'ierra', 'Ġes', 'Ġel', 'Ġ3', 'er', 'Ġplan', 'eta', 'Ġdel', 'ĠSol', '.']
decoded: Un perro es un canino? Un gato es un felino. La Tierra es el 3er planeta del Sol.


In [28]:
from pathlib import Path

ROOT = Path.cwd()

# subir 2 niveles: V2/notebooks -> V2 -> (repo root suele estar arriba de V2)
repo_root = ROOT
for _ in range(3):
    repo_root = repo_root.parent

print("Notebook cwd:", ROOT)
print("Search root :", repo_root)

patterns = ["**/*.pt", "**/*.pth", "**/*.bin", "**/checkpoint*", "**/ckpt*"]
hits = []
for p in patterns:
    hits += list(repo_root.glob(p))

hits = sorted(set(hits), key=lambda x: x.stat().st_size if x.exists() else 0, reverse=True)

print(f"\nFound {len(hits)} candidate files\n")
for h in hits[:40]:
    size_mb = h.stat().st_size / (1024**2)
    print(f"{size_mb:8.2f} MB  -  {h}")

Notebook cwd: /Users/sultan/DataScience/LLM-From-Scratch-Project/V2/notebooks
Search root : /Users/sultan/DataScience

Found 151 candidate files

   13.44 MB  -  /Users/sultan/DataScience/LLM-From-Scratch-Project/Base/models/checkpoints_oscar_long/gpt_char_best.pt
   13.44 MB  -  /Users/sultan/DataScience/LLM-From-Scratch-Project/V2/models/checkpoints_oscar_long/gpt_char_best.pt
   13.44 MB  -  /Users/sultan/DataScience/LLM-From-Scratch-Project/V1/models/checkpoints_oscar_long/gpt_char_best.pt
   13.44 MB  -  /Users/sultan/DataScience/LLM-From-Scratch-Project/V2/models/checkpoints_oscar_debug/gpt_char_best.pt
   13.44 MB  -  /Users/sultan/DataScience/LLM-From-Scratch-Project/V1/models/checkpoints_oscar_debug/gpt_char_best.pt
   13.44 MB  -  /Users/sultan/DataScience/LLM-From-Scratch-Project/Base/models/checkpoints_oscar_debug/gpt_char_best.pt
    5.99 MB  -  /Users/sultan/DataScience/RAG/Test RAG 2024/RAG/Tutorial/chroma_persistent_storage/34ef67d4-af93-4251-ba55-1df699839d5f/data_leve

In [30]:
pip install torch

Collecting torch
  Downloading torch-2.9.1-cp310-none-macosx_11_0_arm64.whl.metadata (30 kB)
Collecting sympy>=1.13.3 (from torch)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx>=2.5.1 (from torch)
  Downloading networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch)
  Using cached jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy>=1.13.3->torch)
  Using cached mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Collecting MarkupSafe>=2.0 (from jinja2->torch)
  Downloading markupsafe-3.0.3-cp310-cp310-macosx_11_0_arm64.whl.metadata (2.7 kB)
Downloading torch-2.9.1-cp310-none-macosx_11_0_arm64.whl (74.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.5/74.5 MB[0m [31m58.0 MB/s[0m  [33m0:00:01[0m eta [36m0:00:01[0m
[?25hDownloading networkx-3.4.2-py3-none-any.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m67.6 MB/s[0

In [31]:
from pathlib import Path
import torch

ckpt_path = Path("/Users/sultan/DataScience/LLM-From-Scratch-Project/V2/models/checkpoints_oscar_long/gpt_char_best.pt")

ckpt = torch.load(ckpt_path, map_location="cpu")
print(type(ckpt))

# imprime claves (o atributos) para ver qué guarda
if isinstance(ckpt, dict):
    print("keys:", list(ckpt.keys())[:50])

<class 'dict'>
keys: ['model_state_dict', 'optimizer_state_dict', 'epoch', 'global_step', 'val_loss', 'training_config']


In [32]:
cfg = ckpt["training_config"]
type(cfg), cfg

(dict,
 {'batch_size': 16,
  'learning_rate': 0.0003,
  'weight_decay': 0.01,
  'betas': (0.9, 0.95),
  'max_grad_norm': 1.0,
  'log_every': 10,
  'seed': 42,
  'device': 'mps'})

In [33]:
msd = ckpt["model_state_dict"]
keys = list(msd.keys())
print("n_keys:", len(keys))
print("sample keys:", keys[:30])

# intenta detectar la matriz de embeddings (casi siempre existe)
for k in keys:
    if "embed" in k.lower() or "tok_emb" in k.lower() or "token" in k.lower():
        print("candidate:", k, msd[k].shape)

n_keys: 69
sample keys: ['tok_embedding.embedding.weight', 'pos_embedding.pos_embedding.weight', 'blocks.0.ln1.ln.weight', 'blocks.0.ln1.ln.bias', 'blocks.0.attn.q_proj.weight', 'blocks.0.attn.q_proj.bias', 'blocks.0.attn.k_proj.weight', 'blocks.0.attn.k_proj.bias', 'blocks.0.attn.v_proj.weight', 'blocks.0.attn.v_proj.bias', 'blocks.0.attn.out_proj.weight', 'blocks.0.attn.out_proj.bias', 'blocks.0.ln2.ln.weight', 'blocks.0.ln2.ln.bias', 'blocks.0.ff.fc1.weight', 'blocks.0.ff.fc1.bias', 'blocks.0.ff.fc2.weight', 'blocks.0.ff.fc2.bias', 'blocks.1.ln1.ln.weight', 'blocks.1.ln1.ln.bias', 'blocks.1.attn.q_proj.weight', 'blocks.1.attn.q_proj.bias', 'blocks.1.attn.k_proj.weight', 'blocks.1.attn.k_proj.bias', 'blocks.1.attn.v_proj.weight', 'blocks.1.attn.v_proj.bias', 'blocks.1.attn.out_proj.weight', 'blocks.1.attn.out_proj.bias', 'blocks.1.ln2.ln.weight', 'blocks.1.ln2.ln.bias']
candidate: tok_embedding.embedding.weight torch.Size([2796, 128])
candidate: pos_embedding.pos_embedding.weight tor

In [34]:
print("tokenizer vocab size:", tokenizer.get_vocab_size())

tokenizer vocab size: 4096


In [35]:
vocab_ckpt = ckpt["model_state_dict"]["tok_embedding.embedding.weight"].shape[0]
print("vocab_ckpt:", vocab_ckpt)

vocab_ckpt: 2796


In [36]:
from pathlib import Path

ROOT = Path("/Users/sultan/DataScience/LLM-From-Scratch-Project")
tok_jsons = sorted(ROOT.glob("**/tokenizer.json"))
print("found tokenizer.json:", len(tok_jsons))
for p in tok_jsons[:40]:
    print(p)

found tokenizer.json: 1
/Users/sultan/DataScience/LLM-From-Scratch-Project/V2/models/tokenizers/oscar_bpe_v2/tokenizer.json


In [40]:
from tokenizers import Tokenizer

tok_path = "/Users/sultan/DataScience/LLM-From-Scratch-Project/V2/models/tokenizers/oscar_bpe_v2/tokenizer.json"
tok2 = Tokenizer.from_file(tok_path)
print("tok2 vocab size:", tok2.get_vocab_size())


tok2 vocab size: 4096


In [41]:
import torch
from pathlib import Path

ROOT = Path("/Users/sultan/DataScience/LLM-From-Scratch-Project")

ckpt_paths = sorted(
    list(ROOT.glob("**/*.pt")) + list(ROOT.glob("**/*.pth")),
    key=lambda p: p.stat().st_size,
    reverse=True
)

def get_vocab_from_ckpt(path):
    try:
        ckpt = torch.load(path, map_location="cpu")
        sd = ckpt.get("model_state_dict", ckpt)  # soporta checkpoints dict o state_dict directo
        # buscamos el embedding típico
        for k in ["tok_embedding.embedding.weight", "token_embedding.weight", "tok_embedding.weight", "wte.weight"]:
            if k in sd:
                return sd[k].shape[0], k
        return None, None
    except Exception:
        return None, None

hits = []
for p in ckpt_paths:
    vocab, key = get_vocab_from_ckpt(p)
    if vocab is not None:
        hits.append((vocab, str(p), key))

print("Found checkpoints with detectable token embedding:", len(hits))
for vocab, p, key in hits[:40]:
    print(f"vocab={vocab:<5} key={key:<35}  {p}")

Found checkpoints with detectable token embedding: 15
vocab=2796  key=tok_embedding.embedding.weight       /Users/sultan/DataScience/LLM-From-Scratch-Project/V1/models/checkpoints_oscar_long/gpt_char_best.pt
vocab=2796  key=tok_embedding.embedding.weight       /Users/sultan/DataScience/LLM-From-Scratch-Project/V2/models/checkpoints_oscar_long/gpt_char_best.pt
vocab=2796  key=tok_embedding.embedding.weight       /Users/sultan/DataScience/LLM-From-Scratch-Project/Base/models/checkpoints_oscar_long/gpt_char_best.pt
vocab=2796  key=tok_embedding.embedding.weight       /Users/sultan/DataScience/LLM-From-Scratch-Project/V1/models/checkpoints_oscar_debug/gpt_char_best.pt
vocab=2796  key=tok_embedding.embedding.weight       /Users/sultan/DataScience/LLM-From-Scratch-Project/V2/models/checkpoints_oscar_debug/gpt_char_best.pt
vocab=2796  key=tok_embedding.embedding.weight       /Users/sultan/DataScience/LLM-From-Scratch-Project/Base/models/checkpoints_oscar_debug/gpt_char_best.pt
vocab=2796  key

In [42]:
import torch
from pathlib import Path

tok_path = Path("/Users/sultan/DataScience/LLM-From-Scratch-Project/V2/models/checkpoints_oscar_long/char_tokenizer.pt")
tok = torch.load(tok_path, map_location="cpu")

print(type(tok))
if isinstance(tok, dict):
    print("keys:", list(tok.keys())[:30])
    for k,v in tok.items():
        if hasattr(v, "__len__"):
            print(k, "len=", len(v))

<class 'dict'>
keys: ['stoi', 'itos']
stoi len= 2796
itos len= 2796


In [43]:
import torch
from pathlib import Path

# 1) Cargar tokenizer dict
tok_path = Path("/Users/sultan/DataScience/LLM-From-Scratch-Project/V2/models/checkpoints_oscar_long/char_tokenizer.pt")
tok = torch.load(tok_path, map_location="cpu")
stoi, itos = tok["stoi"], tok["itos"]

unk_id = stoi.get("<unk>", None)  # si no existe, luego te digo qué hacemos

def encode(text: str):
    ids = []
    for ch in text:
        if ch in stoi:
            ids.append(stoi[ch])
        else:
            if unk_id is None:
                raise ValueError(f"Character not in vocab and no <unk> token found: {repr(ch)}")
            ids.append(unk_id)
    return ids

def decode(ids):
    return "".join(itos[i] for i in ids)

# 2) Prueba
text = "Un perro es un canino? Un gato es un felino. La Tierra es el 3er planeta del Sol."
ids = encode(text)

print("n_tokens:", len(ids))
print("first_ids:", ids[:40])
print("decoded_ok:", decode(ids) == text)

n_tokens: 81
first_ids: [71, 96, 18, 98, 87, 100, 100, 97, 18, 87, 101, 18, 103, 96, 18, 85, 83, 96, 91, 96, 97, 49, 18, 71, 96, 18, 89, 83, 102, 97, 18, 87, 101, 18, 103, 96, 18, 88, 87, 94]
decoded_ok: True


# Me parece que desde acá

In [46]:
# 1) Crear una sección “SANITY CHECKS (no training)” y reiniciar variables

# --- SANITY CHECKS (NO TRAINING) ---
# This notebook only reads artifacts and prints diagnostics.

import os, sys
from pathlib import Path

PROJECT_ROOT = Path("/Users/sultan/DataScience/LLM-From-Scratch-Project")
V2_ROOT = PROJECT_ROOT / "V2"
print("PROJECT_ROOT:", PROJECT_ROOT)
print("CWD:", Path.cwd())

PROJECT_ROOT: /Users/sultan/DataScience/LLM-From-Scratch-Project
CWD: /Users/sultan/DataScience/LLM-From-Scratch-Project/V2/notebooks


In [47]:
# 2) Verificar checkpoint elegido (existencia + tamaño + keys)

import torch

CKPT_PATH = V2_ROOT / "models/checkpoints_oscar_long/gpt_char_best.pt"
print("CKPT_PATH:", CKPT_PATH)
print("exists:", CKPT_PATH.exists())
print("size_mb:", CKPT_PATH.stat().st_size / (1024**2))

ckpt = torch.load(CKPT_PATH, map_location="cpu")
print("ckpt keys:", list(ckpt.keys()))
print("has model_state_dict:", "model_state_dict" in ckpt)

CKPT_PATH: /Users/sultan/DataScience/LLM-From-Scratch-Project/V2/models/checkpoints_oscar_long/gpt_char_best.pt
exists: True
size_mb: 13.439669609069824
ckpt keys: ['model_state_dict', 'optimizer_state_dict', 'epoch', 'global_step', 'val_loss', 'training_config']
has model_state_dict: True


In [49]:
# 3) Cargar tokenizer del checkpoint (stoi/itos) y re-test rápido de encode/decode

tok = ckpt.get("training_config", None)  # solo para no confundir nombres
char_tok_path = V2_ROOT / "models/checkpoints_oscar_long/char_tokenizer.pt"
print("char_tokenizer path:", char_tok_path, "exists:", char_tok_path.exists())

char_tok = torch.load(char_tok_path, map_location="cpu")  # {'stoi':..., 'itos':...}
stoi, itos = char_tok["stoi"], char_tok["itos"]
print("vocab size:", len(stoi), len(itos))

def encode_chars(s: str):
    return [stoi.get(ch, stoi.get("<unk>", 0)) for ch in s]

def decode_chars(ids):
    return "".join([itos[i] for i in ids])

text = "Un perro es un canino? Un gato es un felino."
ids = encode_chars(text)
back = decode_chars(ids)

print("n_tokens:", len(ids))
print("first_ids:", ids[:40])
print("decoded_ok:", back == text)

char_tokenizer path: /Users/sultan/DataScience/LLM-From-Scratch-Project/V2/models/checkpoints_oscar_long/char_tokenizer.pt exists: True
vocab size: 2796 2796
n_tokens: 44
first_ids: [71, 96, 18, 98, 87, 100, 100, 97, 18, 87, 101, 18, 103, 96, 18, 85, 83, 96, 91, 96, 97, 49, 18, 71, 96, 18, 89, 83, 102, 97, 18, 87, 101, 18, 103, 96, 18, 88, 87, 94]
decoded_ok: True


In [50]:
# 4) Encontrar la clase del modelo para poder generar next-token

import re

py_files = list(V2_ROOT.rglob("*.py"))
print("py_files:", len(py_files))

candidates = []
for p in py_files:
    txt = p.read_text(errors="ignore")
    if re.search(r"class\s+.*GPT|class\s+.*Transformer|def\s+forward", txt):
        candidates.append(p)

print("model candidates:", len(candidates))
for p in candidates[:30]:
    print(p)

py_files: 8891
model candidates: 288
/Users/sultan/DataScience/LLM-From-Scratch-Project/V2/.venv/lib/python3.13/site-packages/aiohttp/web_request.py
/Users/sultan/DataScience/LLM-From-Scratch-Project/V2/.venv/lib/python3.13/site-packages/huggingface_hub/hub_mixin.py
/Users/sultan/DataScience/LLM-From-Scratch-Project/V2/.venv/lib/python3.13/site-packages/huggingface_hub/hf_api.py
/Users/sultan/DataScience/LLM-From-Scratch-Project/V2/.venv/lib/python3.13/site-packages/streamlit/testing/v1/local_script_runner.py
/Users/sultan/DataScience/LLM-From-Scratch-Project/V2/.venv/lib/python3.13/site-packages/sympy/testing/runtests.py
/Users/sultan/DataScience/LLM-From-Scratch-Project/V2/.venv/lib/python3.13/site-packages/sympy/physics/control/lti.py
/Users/sultan/DataScience/LLM-From-Scratch-Project/V2/.venv/lib/python3.13/site-packages/sympy/parsing/ast_parser.py
/Users/sultan/DataScience/LLM-From-Scratch-Project/V2/.venv/lib/python3.13/site-packages/sympy/parsing/sympy_parser.py
/Users/sultan/Da

# De nuevo

In [None]:
# Paso 1) Re-hacer el “model candidates” EXCLUYENDO .venv y site-packages

import re
from pathlib import Path

V2_ROOT = Path("/Users/sultan/DataScience/LLM-From-Scratch-Project/V2")

py_files = [p for p in V2_ROOT.rglob("*.py") if ".venv" not in str(p) and "site-packages" not in str(p)]
print("py_files (filtered):", len(py_files))

candidates = []
for p in py_files:
    txt = p.read_text(errors="ignore")
    if re.search(r"class\s+.*GPT|class\s+.*Transformer|def\s+forward", txt):
        candidates.append(p)

print("model candidates (filtered):", len(candidates))
for p in candidates[:40]:
    print(p)

In [None]:
# Paso 2) Buscar “huellas” exactas de tu checkpoint (tok_embedding / pos_embedding / blocks.)

needles = [
    "tok_embedding",
    "pos_embedding",
    "q_proj",
    "k_proj",
    "v_proj",
    "out_proj",
    "blocks."
]

hits = []
for p in py_files:
    txt = p.read_text(errors="ignore")
    score = sum(n in txt for n in needles)
    if score >= 2:  # ajustable
        hits.append((score, p))

hits.sort(reverse=True, key=lambda x: x[0])

print("strong hits:", len(hits))
for score, p in hits[:30]:
    print(score, p)

In [None]:
# Paso 3) Inferir la “config” del modelo desde el checkpoint (para instanciar correcto)

import torch
from collections import defaultdict

CKPT_PATH = "/Users/sultan/DataScience/LLM-From-Scratch-Project/V2/models/checkpoints_oscar_long/gpt_char_best.pt"
ckpt = torch.load(CKPT_PATH, map_location="cpu")
sd = ckpt["model_state_dict"]

# vocab y d_model
vocab_size, d_model = sd["tok_embedding.embedding.weight"].shape

# max_seq_len
max_seq_len = sd["pos_embedding.pos_embedding.weight"].shape[0]

# n_layers
layer_ids = set()
for k in sd.keys():
    if k.startswith("blocks."):
        layer_ids.add(int(k.split(".")[1]))
n_layers = max(layer_ids) + 1 if layer_ids else 0

print("INFERRED:")
print("vocab_size:", vocab_size)
print("d_model:", d_model)
print("max_seq_len:", max_seq_len)
print("n_layers:", n_layers)

# si quieres ver una pista de heads/d_ff:
for name in ["blocks.0.attn.q_proj.weight", "blocks.0.ff.fc1.weight"]:
    if name in sd:
        print(name, sd[name].shape)