In [12]:
#Passo 1: Preparação dos Dados
#Vamos usar um corpus de texto simples (ex.: "ola mundo, como vai você?").
#Tokenização: Caracteres individuais.
#Inputs/Saídas: Para cada sequência de N caracteres, prever o próximo.

import numpy as np

# Dataset de exemplo
texto = "ola mundo, como vai você?"
vocabulario = sorted(list(set(texto)))
token_para_id = {c: i for i, c in enumerate(vocabulario)}
id_para_token = {i: c for i, c in enumerate(vocabulario)}
vocab_size = len(vocabulario)

# Parâmetros
seq_length = 5  # Tamanho da sequência de input
d_model = 8     # Dimensão dos embeddings (reduzida para simplificar)

# Criar pares (input, target)
inputs, targets = [], []
for i in range(len(texto) - seq_length):
    input_seq = texto[i:i + seq_length]
    target_char = texto[i + seq_length]
    inputs.append([token_para_id[c] for c in input_seq])
    targets.append(token_para_id[target_char])

inputs = np.array(inputs)  # Shape: [n_exemplos, seq_length]
targets = np.array(targets)  # Shape: [n_exemplos]

In [13]:
#Passo 2: Inicialização do Modelo
#Definimos:
#Embeddings: vocab_size x d_model.
#Pesos da Self-Attention: Q, K, V.
#MLP: 1 camada oculta.
# Embeddings (aleatórios, mas normalizados)
embedding = np.random.randn(vocab_size, d_model) * 0.1

# Self-Attention (projetores Q, K, V)
W_Q = np.random.randn(d_model, d_model) * 0.1
W_K = np.random.randn(d_model, d_model) * 0.1
W_V = np.random.randn(d_model, d_model) * 0.1

# MLP (1 camada oculta)
W1 = np.random.randn(d_model, 4 * d_model) * 0.1
b1 = np.random.randn(4 * d_model) * 0.1
W2 = np.random.randn(4 * d_model, vocab_size) * 0.1
b2 = np.random.randn(vocab_size) * 0.1


In [14]:
#Passo 3: Forward Pass
#Calculamos a saída do modelo para um input (com atenção simplificada).

def softmax(x):
    exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

def forward(x):
    # x: sequência de tokens (ex.: [5, 3, 1, 0, 2])
    # 1. Embeddings
    x_embed = embedding[x]  # Shape: [seq_length, d_model]
    
    # 2. Self-Attention (single head)
    Q = np.dot(x_embed, W_Q)  # [seq_length, d_model]
    K = np.dot(x_embed, W_K)
    V = np.dot(x_embed, W_V)
    scores = np.dot(Q, K.T) / np.sqrt(d_model)
    attn_weights = softmax(scores)
    attn_output = np.dot(attn_weights, V)  # [seq_length, d_model]
    
    # 3. MLP (apenas no último token)
    last_token = attn_output[-1]  # Pega o último token
    hidden = np.maximum(0, np.dot(last_token, W1) + b1)  # ReLU
    logits = np.dot(hidden, W2) + b2  # [vocab_size]
    return logits

# Exemplo:
logits = forward(inputs[0])
print("Logits:", logits)

Logits: [-0.08157956 -0.05113903 -0.0379167   0.00635051  0.0763397  -0.06914389
  0.06945129  0.11150198 -0.01258988  0.11689556  0.02346421  0.068696
  0.12067095  0.26868934]


In [15]:
#Passo 4: Loss e Backpropagation
#Usamos cross-entropy e calculamos os gradientes manualmente.

def compute_loss(logits, target):
    probs = softmax(logits)
    loss = -np.log(probs[target])
    return loss

# Gradientes (via diferenças finitas ou derivadas analíticas)
def backward(x, target, learning_rate=0.01):
    # Forward pass (guardando valores intermediários)
    x_embed = embedding[x]
    Q = np.dot(x_embed, W_Q)
    K = np.dot(x_embed, W_K)
    V = np.dot(x_embed, W_V)
    scores = np.dot(Q, K.T) / np.sqrt(d_model)
    attn_weights = softmax(scores)
    attn_output = np.dot(attn_weights, V)
    last_token = attn_output[-1]
    hidden = np.maximum(0, np.dot(last_token, W1) + b1)
    logits = np.dot(hidden, W2) + b2
    
    # 1. Gradiente da cross-entropy
    probs = softmax(logits)
    d_logits = probs
    d_logits[target] -= 1
    
    # 2. Gradiente do MLP
    d_W2 = np.outer(hidden, d_logits)
    d_b2 = d_logits.copy()
    d_hidden = np.dot(W2, d_logits)
    d_hidden[hidden <= 0] = 0  # Gradiente da ReLU
    
    d_W1 = np.outer(last_token, d_hidden)
    d_b1 = d_hidden.copy()
    
    # 3. Gradiente da atenção (simplificado)
    # (Implementação completa requer mais cuidado)
    d_attn_output = np.dot(W1, d_hidden)  # [d_model]
    
    # Atualização dos pesos (SGD)
    W2 -= learning_rate * d_W2
    b2 -= learning_rate * d_b2
    W1 -= learning_rate * d_W1
    b1 -= learning_rate * d_b1
    
    # Gradiente dos embeddings (opcional)
    # ...

In [16]:
#Passo 5: Treinamento
#Loop sobre os dados para ajustar os pesos.

# --- 5. Treinamento ---
n_epochs = 100
learning_rate = 0.01

for epoch in range(n_epochs):
    total_loss = 0
    for i in range(len(inputs)):
        x = inputs[i]
        target = targets[i]
        
        # Forward pass (guardando cache para backprop)
        logits, cache = forward(x)  # Agora forward retorna (logits, cache)
        loss = compute_loss(logits, target)
        
        # Backward pass
        ## Extrai variáveis do cache
        x_embed, Q, K, V, attn_weights, attn_output, last_token, hidden = cache
        
        # Gradiente da cross-entropy
        probs = softmax(logits)
        d_logits = probs
        d_logits[target] -= 1
        
        # Gradiente do MLP
        d_W2 = np.outer(hidden, d_logits)
        d_b2 = d_logits.copy()
        d_hidden = np.dot(W2, d_logits)
        d_hidden[hidden <= 0] = 0  # Gradiente da ReLU
        
        d_W1 = np.outer(last_token, d_hidden)
        d_b1 = d_hidden.copy()
        
        # Atualização dos pesos
        W2 -= learning_rate * d_W2
        b2 -= learning_rate * d_b2
        W1 -= learning_rate * d_W1
        b1 -= learning_rate * d_b1
        
        total_loss += loss
    
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {total_loss / len(inputs)}")

ValueError: too many values to unpack (expected 2)

In [6]:
#Passo 6: Geração de Texto
#Após o treino, use o modelo para prever caracteres.

def generate_text(start_seq, max_length=10):
    tokens = [token_para_id[c] for c in start_seq]
    for _ in range(max_length):
        logits = forward(tokens[-seq_length:])
        next_token = np.argmax(logits)
        tokens.append(next_token)
    return ''.join([id_para_token[t] for t in tokens])

# Exemplo:
print(generate_text("ola m", max_length=5))

ola miiiii
