In [None]:
import torch
import torch.nn as nn

# Configuration
vocab_size = 96  # Number of distinct token IDs in the vocabulary
block_size = 5   # Maximum sequence length (Time)
n_embd = 32      # Size of each embedding vector (Channels)

# The embedding tables are randomly initialised; seed the RNG so the printed
# values are identical on every Restart & Run All.
torch.manual_seed(0)

# Lookup tables: one learned vector per token ID and one per position.
token_table = nn.Embedding(vocab_size, n_embd)  # weight: (vocab_size, n_embd)
pos_table = nn.Embedding(block_size, n_embd)    # weight: (block_size, n_embd)

# Simulated input (Batch=1, Time=3) -> IDs: 2, 5, 1
idx = torch.tensor([[2, 5, 1]])

T = idx.shape[1]  # T = 3
# pos_table only has block_size rows, so a longer sequence would index past it.
assert T <= block_size, f"sequence length {T} exceeds block_size {block_size}"

print("--- 1. Token Embeddings ---")
tok_emb = token_table(idx)
print(f"Shape: {tok_emb.shape}")
# One n_embd-sized vector per token: torch.Size([1, 3, 32])
assert tok_emb.shape == (idx.shape[0], T, n_embd)
# Show only the first token's vector, as the label says, not the whole tensor.
print("Valores (exemplo do primeiro token):\n", tok_emb[0, 0])

print("\n--- 2. Positional Embeddings ---")
# Position indices [0, 1, 2]
positions = torch.arange(T)
pos_emb = pos_table(positions)
print(f"Indices de posição: {positions}")
print(f"Shape: {pos_emb.shape}")
# One n_embd-sized vector per position: torch.Size([3, 32])
assert pos_emb.shape == (T, n_embd)



In [None]:
# Read the whole corpus into memory; the handle is only needed for the read.
with open("lolbas.txt", mode='r', encoding='utf-8') as f:
    text = f.read()

# Character-level vocabulary: every distinct character in the corpus, sorted.
chars = sorted(set(text))
vocab_Size = len(chars)
print(vocab_Size)

In [None]:
import torch
import torch.nn as nn

# Configuration: the model width is the per-head width times the head count.
num_heads = 3
head_size = 4
n_embd = num_heads * head_size  # 3 * 4 = 12

# Simplified stand-in for an attention head
class Head(nn.Module):
    """A single linear projection standing in for one attention head.

    Args:
        size: Output width of this head (last dimension of the result).
        in_features: Width of the last dimension of the input. When omitted,
            the notebook-level ``n_embd`` is looked up at construction time,
            which preserves the original ``Head(size)`` behaviour. Passing it
            explicitly avoids depending on whichever cell last set ``n_embd``.
    """

    def __init__(self, size, in_features=None):
        super().__init__()
        if in_features is None:
            # Fall back to the notebook global, exactly as the original did.
            in_features = n_embd
        # Each head projects the input down to a small width.
        self.linear = nn.Linear(in_features, size)

    def forward(self, x):
        """Apply the projection to the last dimension of ``x``."""
        return self.linear(x)

# Our multi-head class (written out step by step for inspection)
class MultiHeadSim(nn.Module):
    """Run several ``Head`` modules on the same input and mix their outputs.

    The constructor reads the notebook-level ``num_heads``, ``head_size`` and
    ``n_embd``. The final projection is ``nn.Linear(n_embd, n_embd)``, so the
    concatenated head outputs must have width ``n_embd``
    (i.e. ``num_heads * head_size == n_embd``).
    """

    def __init__(self):
        super().__init__()
        # Independent heads, one per entry in the ModuleList.
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # Final projection that mixes information across heads.
        self.proj = nn.Linear(n_embd, n_embd)

    def forward(self, x):
        """Project ``x`` through every head, concatenate, then mix.

        Prints intermediate shapes and values for inspection and returns the
        projected tensor.
        """
        # 1. Every head processes the same input.
        head_outputs = [h(x) for h in self.heads]
        # head_outputs is a Python list and has no .shape attribute, so report
        # the shape of each head's tensor instead.
        print([tuple(t.shape) for t in head_outputs])

        # 2. Concatenate the head outputs side by side along the last dimension.
        out = torch.cat(head_outputs, dim=-1)
        print(out)
        print(f"Shape após CAT: {out.shape}")  # last dim = num_heads * head_size

        # 3. Final projection.
        out = self.proj(out)
        return out

# Smoke test
# Both the layer weights and the input are random; seed the RNG so the printed
# values are reproducible across kernel restarts.
torch.manual_seed(0)
model = MultiHeadSim()
x = torch.randn(1, 10, n_embd)  # (Batch, Time, Channels=n_embd)
output = model(x)
# The final projection maps n_embd -> n_embd, so the input shape is preserved.
assert output.shape == x.shape, f"unexpected output shape {tuple(output.shape)}"

[tensor([[[-0.7511, -0.0145,  0.6098,  0.2055],
         [ 0.4603,  1.3308, -0.1973, -0.7487],
         [ 0.2193,  0.3633, -0.0208,  0.7879],
         [ 0.2667, -0.4752,  0.2586,  0.0215],
         [ 1.0746,  0.8932, -0.3398, -0.6778],
         [ 1.3327, -0.6830, -1.0598, -0.0645],
         [-0.0712,  0.5277,  0.1527, -0.1633],
         [ 1.3867,  0.0329, -0.8127, -0.1333],
         [ 0.9844,  0.4528,  0.0932, -0.1519],
         [ 0.5773,  1.1063, -0.1016, -0.5399]]], grad_fn=<ViewBackward0>), tensor([[[ 0.2372,  0.6217,  0.0210,  0.6227],
         [ 1.0184,  0.3353, -0.4608, -0.6644],
         [ 0.4604,  0.5118, -0.1035,  0.7490],
         [-0.1434, -1.0228, -0.2137, -0.4070],
         [ 0.4483, -0.1489,  0.9989,  0.7128],
         [ 0.1009, -0.2841, -0.3198, -0.2053],
         [ 0.1754,  0.1336, -0.1278,  0.6057],
         [-0.2005,  0.0565,  1.4174,  0.2233],
         [ 0.8746, -0.4146, -0.1881, -0.1749],
         [ 0.7901, -0.1415, -0.4115, -0.1692]]], grad_fn=<ViewBackward0>), ten