In [2]:
import numpy as np

# --- 1. Preparação dos Dados ---
texto = "ola mundo, como vai você?"
vocabulario = sorted(list(set(texto)))
token_para_id = {c: i for i, c in enumerate(vocabulario)}
id_para_token = {i: c for i, c in enumerate(vocabulario)}
vocab_size = len(vocabulario)

# Parâmetros
seq_length = 5  # Tamanho da sequência de input
d_model = 8     # Dimensão dos embeddings
learning_rate = 0.01
n_epochs = 100

# Criar pares (input, target)
inputs, targets = [], []
for i in range(len(texto) - seq_length):
    inputs.append([token_para_id[c] for c in texto[i:i + seq_length]])
    targets.append(token_para_id[texto[i + seq_length]])
inputs = np.array(inputs)
targets = np.array(targets)

# --- 2. Inicialização do Modelo ---
np.random.seed(42)
embedding = np.random.randn(vocab_size, d_model) * 0.1

# Self-Attention
W_Q = np.random.randn(d_model, d_model) * 0.1
W_K = np.random.randn(d_model, d_model) * 0.1
W_V = np.random.randn(d_model, d_model) * 0.1

# MLP
W1 = np.random.randn(d_model, 4 * d_model) * 0.1
b1 = np.zeros(4 * d_model)
W2 = np.random.randn(4 * d_model, vocab_size) * 0.1
b2 = np.zeros(vocab_size)

# --- 3. Funções do Modelo ---
def softmax(x):
    exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

def forward(x):
    # Embeddings
    x_embed = embedding[x]  # [seq_length, d_model]
    
    # Self-Attention
    Q = np.dot(x_embed, W_Q)
    K = np.dot(x_embed, W_K)
    V = np.dot(x_embed, W_V)
    scores = np.dot(Q, K.T) / np.sqrt(d_model)
    attn_weights = softmax(scores)
    attn_output = np.dot(attn_weights, V)  # [seq_length, d_model]
    
    # MLP (último token)
    last_token = attn_output[-1]
    hidden = np.maximum(0, np.dot(last_token, W1) + b1)  # ReLU
    logits = np.dot(hidden, W2) + b2  # [vocab_size]
    return logits, (x_embed, Q, K, V, attn_weights, attn_output, last_token, hidden)

def compute_loss(logits, target):
    probs = softmax(logits)
    return -np.log(probs[target])

# --- 4. Backpropagation Manual ---
def backward(x, target, cache):
    x_embed, Q, K, V, attn_weights, attn_output, last_token, hidden = cache
    
    # Gradiente da cross-entropy
    probs = softmax(np.dot(hidden, W2) + b2)
    d_logits = probs
    d_logits[target] -= 1
    
    # Gradiente do MLP
    d_W2 = np.outer(hidden, d_logits)
    d_b2 = d_logits.copy()
    d_hidden = np.dot(W2, d_logits)
    d_hidden[hidden <= 0] = 0  # ReLU gradient
    
    d_W1 = np.outer(last_token, d_hidden)
    d_b1 = d_hidden.copy()
    d_last_token = np.dot(W1, d_hidden)
    
    # Gradiente da atenção (simplificado)
    d_attn_output = np.zeros_like(attn_output)
    d_attn_output[-1] = d_last_token
    
    d_V = np.dot(attn_weights.T, d_attn_output)
    d_attn_weights = np.dot(d_attn_output, V.T)
    
    # Atualização dos pesos
    global W_Q, W_K, W_V, W1, b1, W2, b2, embedding
    W2 -= learning_rate * d_W2
    b2 -= learning_rate * d_b2
    W1 -= learning_rate * d_W1
    b1 -= learning_rate * d_b1
    
    # (Otimização: gradientes de Q, K, V e embedding omitidos por simplicidade)

# --- 5. Treinamento ---
for epoch in range(n_epochs):
    total_loss = 0
    for i in range(len(inputs)):
        x = inputs[i]
        target = targets[i]
        
        logits, cache = forward(x)
        loss = compute_loss(logits, target)
        backward(x, target, cache)
        
        total_loss += loss
    
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {total_loss / len(inputs)}")

# --- 6. Geração de Texto ---
def generate_text(start_seq, max_length=10):
    tokens = [token_para_id[c] for c in start_seq]
    for _ in range(max_length):
        logits, _ = forward(tokens[-seq_length:])
        next_token = np.argmax(logits)
        tokens.append(next_token)
    return ''.join([id_para_token[t] for t in tokens])

# Teste
print("\nTexto gerado:", generate_text("ola m", max_length=10))

SyntaxError: name 'W1' is used prior to global declaration (3429934536.py, line 93)