In [1]:
from transformers_model import Transformer
import numpy as np

# Importation des données

In [2]:
# Dataset simplifié
data = [
    ('hello', 'bonjour'),
    ('how are you', 'comment ça va'),
    ('good morning', 'bonjour'),
    ('good night', 'bonne nuit'),
    ('thank you', 'merci'),
    ('see you later', 'à plus tard'),
    ('yes', 'oui'),
    ('no', 'non'),
    ('please', 's\'il vous plaît'),
    ('i love you', 'je t\'aime')
]

# Construction du vocabulaire
def build_vocab(sentences):
    vocab = {'<PAD>': 0, '<UNK>': 1}
    idx = 2
    for sentence in sentences:
        for word in sentence.strip().split():
            if word not in vocab:
                vocab[word] = idx
                idx += 1
    return vocab

input_sentences = [pair[0] for pair in data]
target_sentences = [pair[1] for pair in data]

input_vocab = build_vocab(input_sentences)
target_vocab = build_vocab(target_sentences)

input_vocab_size = len(input_vocab)
target_vocab_size = len(target_vocab)
max_seq_length = 5  # Limitation pour simplifier

# Encodage des phrases
def encode_sentence(sentence, vocab, max_length):
    tokens = sentence.strip().split()
    token_ids = [vocab.get(token, vocab['<UNK>']) for token in tokens]
    if len(token_ids) < max_length:
        token_ids += [vocab['<PAD>']] * (max_length - len(token_ids))
    else:
        token_ids = token_ids[:max_length]
    return token_ids

# Préparation des données d'entraînement
X_train = []
y_train_input = []
y_train_output = []

for src_sentence, tgt_sentence in data:
    src_encoded = encode_sentence(src_sentence, input_vocab, max_seq_length)
    tgt_encoded = encode_sentence(tgt_sentence, target_vocab, max_seq_length)
    X_train.append(src_encoded)
    y_train_input.append([target_vocab['<PAD>']] + tgt_encoded[:-1])
    y_train_output.append(tgt_encoded)

X_train = np.array(X_train)
y_train_input = np.array(y_train_input)
y_train_output = np.array(y_train_output)

# Création du modèle

In [3]:
# Paramètres du modèle
d_model = 32
n_heads = 4
num_encoder_layers = 2
num_decoder_layers = 2
d_ff = 64

# Création du modèle Transformer
transformer = Transformer(
    d_model=d_model,
    n_heads=n_heads,
    num_encoder_layers=num_encoder_layers,
    num_decoder_layers=num_decoder_layers,
    d_ff=d_ff,
    input_vocab_size=input_vocab_size,
    target_vocab_size=target_vocab_size,
    max_seq_length=max_seq_length
)

# Entraînement du modèle

In [5]:
# Entraînement du modèle
num_epochs = 50
learning_rate = 0.001

for epoch in range(num_epochs):
    # Réinitialisation des gradients
    for param in transformer.parameters:
        param.grad = np.zeros_like(param.data)

    # Propagation avant
    output = transformer.forward(X_train, y_train_input)

    # Calcul de la perte (entropie croisée)
    batch_size, seq_length, vocab_size = output.shape
    output_flat = output.reshape(-1, vocab_size)
    y_true_flat = y_train_output.reshape(-1)
    loss = -np.sum(np.log(output_flat[np.arange(batch_size * seq_length), y_true_flat] + 1e-9)) / (batch_size * seq_length)
    
    if epoch % 10 == 0:
        print(f"Epoch {epoch+1}, Loss: {loss}")

    # Calcul du gradient de la perte par rapport à la sortie
    d_output = output.copy()
    d_output = d_output.reshape(-1, vocab_size)
    d_output[np.arange(batch_size * seq_length), y_true_flat] -= 1
    d_output = d_output.reshape(batch_size, seq_length, vocab_size) / (batch_size * seq_length)

    # Rétropropagation
    transformer.backward(d_output)

    # Mise à jour des poids
    transformer.step(learning_rate)

Epoch 1, Loss: 2.9673544402646352


ValueError: non-broadcastable output operand with shape (32,) doesn't match the broadcast shape (5,32)