# Cours LLM
# AIFOUTE Othmane


# RNN

Lien du dataset: https://github.com/VincentChen95/Machine-Translation-Based-On-RNN-Model/tree/master/data

In [None]:
import collections
import numpy as np
import pickle
import nltk
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import sentence_bleu

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import (
    Embedding, SimpleRNN, GRU, LSTM, Bidirectional,
    Input, TimeDistributed, Dense
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy

# Import custom helper module
import helper


In [None]:
# Load English data
english_sentences = helper.load_data('data/small_vocab_en')
# Load French data
french_sentences = helper.load_data('data/small_vocab_fr')

print('Dataset Loaded')

Dataset Loaded


In [None]:
for sample_i in range(2):
    print('small_vocab_en Line {}:  {}'.format(sample_i + 1, english_sentences[sample_i]))
    print('small_vocab_fr Line {}:  {}'.format(sample_i + 1, french_sentences[sample_i]))

small_vocab_en Line 1:  new jersey is sometimes quiet during autumn , and it is snowy in april .
small_vocab_fr Line 1:  new jersey est parfois calme pendant l' automne , et il est neigeux en avril .
small_vocab_en Line 2:  the united states is usually chilly during july , and it is usually freezing in november .
small_vocab_fr Line 2:  les états-unis est généralement froid en juillet , et il gèle habituellement en novembre .


In [None]:
english_words_counter = collections.Counter([word for sentence in english_sentences for word in sentence.split()])
french_words_counter = collections.Counter([word for sentence in french_sentences for word in sentence.split()])

print('{} English words.'.format(len([word for sentence in english_sentences for word in sentence.split()])))
print('{} unique English words.'.format(len(english_words_counter)))
print('10 Most common words in the English dataset:')
print('"' + '" "'.join(list(zip(*english_words_counter.most_common(10)))[0]) + '"')
print()
print('{} French words.'.format(len([word for sentence in french_sentences for word in sentence.split()])))
print('{} unique French words.'.format(len(french_words_counter)))
print('10 Most common words in the French dataset:')
print('"' + '" "'.join(list(zip(*french_words_counter.most_common(10)))[0]) + '"')

1823250 English words.
227 unique English words.
10 Most common words in the English dataset:
"is" "," "." "in" "it" "during" "the" "but" "and" "sometimes"

1961295 French words.
355 unique French words.
10 Most common words in the French dataset:
"est" "." "," "en" "il" "les" "mais" "et" "la" "parfois"


In [None]:
def tokenize(x):
    """
    Tokenize x
    :param x: List of sentences/strings to be tokenized
    :return: Tuple of (tokenized x data, tokenizer used to tokenize x)
    """
    x_tk = Tokenizer()
    x_tk.fit_on_texts(x)
    return x_tk.texts_to_sequences(x), x_tk

In [None]:
def pad(x, length=None):
    """
    Pad x
    :param x: List of sequences.
    :param length: Length to pad the sequence to.  If None, use length of longest sequence in x.
    :return: Padded numpy array of sequences
    """
    # TODO: Implement
    if length == None:
        length = max([len(sentence) for sentence in x])
    print(length)
    return pad_sequences(x,maxlen=length,padding='post')


### Preprocess Pipeline

In [None]:
def preprocess(x, y):
    """
    Preprocess x and y
    :param x: Feature List of sentences
    :param y: Label List of sentences
    :return: Tuple of (Preprocessed x, Preprocessed y, x tokenizer, y tokenizer)
    """
    preprocess_x, x_tk = tokenize(x)
    preprocess_y, y_tk = tokenize(y)

    preprocess_x = pad(preprocess_x)
    preprocess_y = pad(preprocess_y)

    # Keras's sparse_categorical_crossentropy function requires the labels to be in 3 dimensions
    preprocess_y = preprocess_y.reshape(*preprocess_y.shape, 1)

    return preprocess_x, preprocess_y, x_tk, y_tk

preproc_english_sentences, preproc_french_sentences, english_tokenizer, french_tokenizer =\
    preprocess(english_sentences, french_sentences)

max_english_sequence_length = preproc_english_sentences.shape[1]
max_french_sequence_length = preproc_french_sentences.shape[1]
english_vocab_size = len(english_tokenizer.word_index)
french_vocab_size = len(french_tokenizer.word_index)

print('Data Preprocessed')
print("Max English sentence length:", max_english_sequence_length)
print("Max French sentence length:", max_french_sequence_length)
print("English vocabulary size:", english_vocab_size)
print("French vocabulary size:", french_vocab_size)

15
21
Data Preprocessed
Max English sentence length: 15
Max French sentence length: 21
English vocabulary size: 199
French vocabulary size: 344


In [None]:
def logits_to_text(logits, tokenizer):
    """
    Turn logits from a neural network into text using the tokenizer
    :param logits: Logits from a neural network
    :param tokenizer: Keras Tokenizer fit on the labels
    :return: String that represents the text of the logits
    """
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'

    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

print('`logits_to_text` function loaded.')

`logits_to_text` function loaded.


In [None]:
def adjust_labels(y, vocab_size):

    y_adjusted = np.array(y)
    y_adjusted[y_adjusted >= vocab_size] = vocab_size - 1
    return y_adjusted

preproc_french_sentences = adjust_labels(preproc_french_sentences, french_vocab_size)


## Metrique BLEU

In [None]:

def calculate_bleu(model, x_test, y_test, tokenizer, max_output_length=30):
    """
    Calcule le BLEU score sur un jeu de test
    :param model: Le modèle entraîné
    :param x_test: Les données d'entrée du jeu de test
    :param y_test: Les véritables phrases (références)
    :param tokenizer: Le tokenizer pour convertir les indices en mots
    :param max_output_length: Longueur maximale de la séquence de sortie
    :return: BLEU score moyen
    """
    total_bleu = 0
    num_samples = len(x_test)

    for i in range(num_samples):
        # Prédire la séquence pour une phrase en entrée
        pred = model.predict(x_test[i:i+1])  # Prediction for one sample at a time

        # Convertir les logits en indices de mots (utilisation de np.argmax pour obtenir les indices)
        pred_indices = np.argmax(pred[0], axis=-1)

        # Convertir les indices en mots à l'aide du tokenizer
        pred_sentence = [tokenizer.index_word.get(idx, '<UNK>') for idx in pred_indices]
        pred_sentence = ' '.join(pred_sentence).split('<PAD>')[0]  # Enlever le padding

        # Convertir les véritables indices en mots (références)
        true_sentence = [tokenizer.index_word.get(int(idx), '<UNK>') for idx in y_test[i]]  # Convertir chaque élément en int
        true_sentence = ' '.join(true_sentence).split('<PAD>')[0]  # Enlever le padding

        # Calculer le BLEU score pour cette prédiction
        bleu_score = sentence_bleu([true_sentence.split()], pred_sentence.split())
        total_bleu += bleu_score

    # Calculer le BLEU score moyen
    avg_bleu = total_bleu / num_samples
    return avg_bleu





In [None]:


def simple_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """
    Build and train a basic RNN on x and y
    :param input_shape: Tuple of input shape
    :param output_sequence_length: Length of output sequence
    :param english_vocab_size: Number of unique English words in the dataset
    :param french_vocab_size: Number of unique French words in the dataset
    :return: Keras model built, but not trained
    """
    #Config Hyperparameters
    learning_rate = 0.01

    #Config Model
    inputs = Input(shape=input_shape[1:])
    hidden_layer = SimpleRNN(output_sequence_length, return_sequences=True)(inputs)
    # The output is the french_vocab_size~
    outputs = TimeDistributed(Dense(french_vocab_size, activation='softmax'))(hidden_layer)
    #Create Model from parameters defined above
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    return model


# Reshaping the input to work with a basic RNN
tmp_x = pad(preproc_english_sentences, max_french_sequence_length)
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2], 1))

x_train, x_test, y_train, y_test = train_test_split(tmp_x, preproc_french_sentences, test_size=0.2, random_state=42)

# Train the neural network
simple_rnn_model = simple_model(
        x_train.shape,
    max_french_sequence_length,
    english_vocab_size,
    french_vocab_size)
simple_rnn_model.fit(x_train, y_train, batch_size=1024, epochs=50, validation_split=0.2)
simple_rnn_model.summary()
print(logits_to_text(simple_rnn_model.predict(x_test[:1])[0], french_tokenizer))


Epoch 1/50
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 236ms/step - accuracy: 0.3208 - loss: 3.7272 - val_accuracy: 0.4512 - val_loss: 2.4589
Epoch 2/50
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 228ms/step - accuracy: 0.4710 - loss: 2.4034 - val_accuracy: 0.5135 - val_loss: 2.2613
Epoch 3/50
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 257ms/step - accuracy: 0.5122 - loss: 2.2396 - val_accuracy: 0.5251 - val_loss: 2.1576
Epoch 4/50
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 232ms/step - accuracy: 0.5270 - loss: 2.1196 - val_accuracy: 0.5300 - val_loss: 2.0227
Epoch 5/50
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 246ms/step - accuracy: 0.5295 - loss: 2.0077 - val_accuracy: 0.5321 - val_loss: 1.9588
Epoch 6/50
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 257ms/step - accuracy: 0.5308 - loss: 1.9464 - val_accuracy: 0.5310 - val_loss: 1.9183
Epoch 7/50
[1m87/87[

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 419ms/step
californie est est est en en et il est il est en en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [None]:
modelnamefile = 'rnn.sav'
pickle.dump(simple_rnn_model, open(modelnamefile, 'wb'))

In [None]:
#x_train, x_test, y_train, y_test = train_test_split(tmp_x, preproc_french_sentences, test_size=0.2, random_state=42)

# Sélectionner les 500 dernières lignes pour l'évaluation BLEU
bleu_score = calculate_bleu(simple_rnn_model, tmp_x[-500:], preproc_french_sentences[-500:], french_tokenizer)
print(f'BLEU Score: {bleu_score}')


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step


  true_sentence = [tokenizer.index_word.get(int(idx), '<UNK>') for idx in y_test[i]]  # Convertir chaque élément en int


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5

---------------

# LSTM

In [None]:


def lstm_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """
    Build and train an LSTM-based model on x and y
    :param input_shape: Tuple of input shape
    :param output_sequence_length: Length of output sequence
    :param english_vocab_size: Number of unique English words in the dataset
    :param french_vocab_size: Number of unique French words in the dataset
    :return: Keras model built, but not trained
    """
    # Config Hyperparameters
    learning_rate = 0.001

    # Config Model
    inputs = Input(shape=input_shape[1:])
    # Remplacer SimpleRNN par LSTM
    hidden_layer = LSTM(output_sequence_length, return_sequences=True)(inputs)
    # The output is the french_vocab_size
    outputs = TimeDistributed(Dense(french_vocab_size, activation='softmax'))(hidden_layer)
    # Create Model from parameters defined above
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    return model


# Reshaping the input to work with an LSTM
tmp_x = pad(preproc_english_sentences, max_french_sequence_length)
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2], 1))

x_train, x_test, y_train, y_test = train_test_split(tmp_x, preproc_french_sentences, test_size=0.2, random_state=42)

# Train the neural network with LSTM model
lstm_model_instance = lstm_model(
        x_train.shape,
    max_french_sequence_length,
    english_vocab_size,
    french_vocab_size)

# Train the model
lstm_model_instance.fit(x_train, y_train, batch_size=1024, epochs=50, validation_split=0.2)
lstm_model_instance.summary()

# Test the model and print the result
print(logits_to_text(lstm_model_instance.predict(x_test[:1])[0], french_tokenizer))


Epoch 1/50
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 268ms/step - accuracy: 0.2028 - loss: 5.4386 - val_accuracy: 0.4089 - val_loss: 3.7371
Epoch 2/50
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 246ms/step - accuracy: 0.4079 - loss: 3.3938 - val_accuracy: 0.4093 - val_loss: 2.8868
Epoch 3/50
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 254ms/step - accuracy: 0.4078 - loss: 2.8411 - val_accuracy: 0.4096 - val_loss: 2.7274
Epoch 4/50
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 252ms/step - accuracy: 0.4103 - loss: 2.7034 - val_accuracy: 0.4292 - val_loss: 2.6249
Epoch 5/50
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 264ms/step - accuracy: 0.4356 - loss: 2.6040 - val_accuracy: 0.4740 - val_loss: 2.5308
Epoch 6/50
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 259ms/step - accuracy: 0.4768 - loss: 2.5114 - val_accuracy: 0.4858 - val_loss: 2.4510
Epoch 7/50
[1m87/87[

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 469ms/step
la est est parfois en en et il est est est en en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [None]:
modelnamefile = 'lstm.sav'
pickle.dump(lstm_model_instance, open(modelnamefile, 'wb'))

In [None]:
#x_train, x_test, y_train, y_test = train_test_split(tmp_x, preproc_french_sentences, test_size=0.2, random_state=42)

# Sélectionner les 500 dernières lignes pour l'évaluation BLEU
bleu_score = calculate_bleu(lstm_model_instance, tmp_x[-500:], preproc_french_sentences[-500:], french_tokenizer)
print(f'BLEU Score: {bleu_score}')


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step


  true_sentence = [tokenizer.index_word.get(int(idx), '<UNK>') for idx in y_test[i]]  # Convertir chaque élément en int


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45

----

# BILSTM

In [None]:

def simple_bilstm_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """
    Build and train a basic BiLSTM model on x and y
    :param input_shape: Tuple of input shape
    :param output_sequence_length: Length of output sequence
    :param english_vocab_size: Number of unique English words in the dataset
    :param french_vocab_size: Number of unique French words in the dataset
    :return: Keras model built, but not trained
    """
    # Config Hyperparameters
    learning_rate = 0.001

    # Config Model
    inputs = Input(shape=input_shape[1:])
    hidden_layer = Bidirectional(LSTM(output_sequence_length, return_sequences=True))(inputs)
    # The output is the french_vocab_size
    outputs = TimeDistributed(Dense(french_vocab_size, activation='softmax'))(hidden_layer)

    # Create Model from parameters defined above
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    return model


# Reshaping the input to work with a BiLSTM
tmp_x = pad(preproc_english_sentences, max_french_sequence_length)
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2], 1))

x_train, x_test, y_train, y_test = train_test_split(tmp_x, preproc_french_sentences, test_size=0.2, random_state=42)


# Train the neural network with BiLSTM
simple_bilstm_model = simple_bilstm_model(
        x_train.shape,
    max_french_sequence_length,
    english_vocab_size,
    french_vocab_size)
simple_bilstm_model.fit(x_train, y_train, batch_size=1024, epochs=50, validation_split=0.2)

simple_bilstm_model.summary()



Epoch 1/50
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 285ms/step - accuracy: 0.1836 - loss: 5.4870 - val_accuracy: 0.4528 - val_loss: 3.3523
Epoch 2/50
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 272ms/step - accuracy: 0.4667 - loss: 2.9606 - val_accuracy: 0.4856 - val_loss: 2.5038
Epoch 3/50
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 270ms/step - accuracy: 0.4840 - loss: 2.4528 - val_accuracy: 0.4949 - val_loss: 2.3142
Epoch 4/50
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 271ms/step - accuracy: 0.4941 - loss: 2.2774 - val_accuracy: 0.5046 - val_loss: 2.1532
Epoch 5/50
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 273ms/step - accuracy: 0.5102 - loss: 2.1204 - val_accuracy: 0.5311 - val_loss: 2.0160
Epoch 6/50
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 276ms/step - accuracy: 0.5354 - loss: 1.9867 - val_accuracy: 0.5457 - val_loss: 1.9077
Epoch 7/50
[1m87/87[

In [None]:
modelnamefile = 'bilstm.sav'
pickle.dump(simple_bilstm_model, open(modelnamefile, 'wb'))

In [None]:

# Apply BLEU score on the 500 last sentences
bleu_score = calculate_bleu(simple_bilstm_model, tmp_x[-500:], preproc_french_sentences[-500:], french_tokenizer)
print(f'BLEU Score: {bleu_score}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 675ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step


  true_sentence = [tokenizer.index_word.get(int(idx), '<UNK>') for idx in y_test[i]]  # Convertir chaque élément en int


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46

----

# GRU

In [None]:
def gru_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """
    Build and train a basic RNN with GRU on x and y
    :param input_shape: Tuple of input shape
    :param output_sequence_length: Length of output sequence
    :param english_vocab_size: Number of unique English words in the dataset
    :param french_vocab_size: Number of unique French words in the dataset
    :return: Keras model built, but not trained
    """
    # Config Hyperparameters
    learning_rate = 0.01

    # Config Model
    inputs = Input(shape=input_shape[1:])
    hidden_layer = GRU(output_sequence_length, return_sequences=True)(inputs)
    outputs = TimeDistributed(Dense(french_vocab_size, activation='softmax'))(hidden_layer)

    # Create Model
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    return model

# Example usage:
# Reshaping the input to work with a basic RNN
tmp_x = pad_sequences(preproc_english_sentences, maxlen=max_french_sequence_length, padding='post')
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2], 1))

x_train, x_test, y_train, y_test = train_test_split(tmp_x, preproc_french_sentences, test_size=0.2, random_state=42)

# Train the neural network
gru_model = gru_model(
    x_train.shape,
    max_french_sequence_length,
    english_vocab_size,
    french_vocab_size
)
gru_model.fit(x_train, y_train, batch_size=1024, epochs=50, validation_split=0.2)
gru_model.summary()
print(logits_to_text(gru_model.predict(x_test[:1])[0], french_tokenizer))

Epoch 1/50
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 289ms/step - accuracy: 0.4076 - loss: 3.7276 - val_accuracy: 0.4876 - val_loss: 2.3737
Epoch 2/50
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 242ms/step - accuracy: 0.4792 - loss: 2.3273 - val_accuracy: 0.4862 - val_loss: 2.1941
Epoch 3/50
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 243ms/step - accuracy: 0.4991 - loss: 2.1357 - val_accuracy: 0.5452 - val_loss: 1.9463
Epoch 4/50
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 266ms/step - accuracy: 0.5484 - loss: 1.8989 - val_accuracy: 0.5686 - val_loss: 1.7802
Epoch 5/50
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 247ms/step - accuracy: 0.5717 - loss: 1.7430 - val_accuracy: 0.5835 - val_loss: 1.6493
Epoch 6/50
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 245ms/step - accuracy: 0.5856 - loss: 1.6231 - val_accuracy: 0.5887 - val_loss: 1.5763
Epoch 7/50
[1m87/87[

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 509ms/step
californie est parfois parfois en l' et il il parfois parfois parfois en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [None]:
modelnamefile = 'gru.sav'
pickle.dump(gru_model, open(modelnamefile, 'wb'))

In [None]:

# Apply BLEU score on the 500 last sentences
bleu_score = calculate_bleu(gru_model, tmp_x[-500:], preproc_french_sentences[-500:], french_tokenizer)
print(f'BLEU Score: {bleu_score}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step

  true_sentence = [tokenizer.index_word.get(int(idx), '<UNK>') for idx in y_test[i]]  # Convertir chaque élément en int


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52

-----

# BERT

In [None]:
from transformers import CamembertModel, CamembertTokenizer
from transformers import pipeline

In [None]:
tokenizer = CamembertTokenizer.from_pretrained("camembert/camembert-base-wikipedia-4gb")
camembert = CamembertModel.from_pretrained("camembert/camembert-base-wikipedia-4gb")


sentencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/509 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

In [None]:
camembert_fill_mask  = pipeline("fill-mask", model="camembert/camembert-base-wikipedia-4gb", tokenizer="camembert/camembert-base-wikipedia-4gb")
results = camembert_fill_mask("Le camembert est un fromage de <mask>!")

config.json:   0%|          | 0.00/455 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/793k [00:00<?, ?B/s]

In [None]:
phrases = ["La france est située en <mask>!", "J'adore manger des <mask>", "Le chocolat est issu de la fêve de <mask>!",
           "Emmanuel <mask> est le président de la France", "Le panda <mask> fait partie de la famille des ours",
             "J'ai pris un coup de <mask> à la plage"]

result = []
for phrase in phrases:
    result.append(camembert_fill_mask(phrase))

In [None]:
result

[[{'score': 0.06966320425271988,
   'token': 845,
   'token_str': 'mer',
   'sequence': 'La france est située en mer !'},
  {'score': 0.047582078725099564,
   'token': 1107,
   'token_str': 'terre',
   'sequence': 'La france est située en terre !'},
  {'score': 0.03956795111298561,
   'token': 1114,
   'token_str': '!',
   'sequence': 'La france est située en ! !'},
  {'score': 0.038430895656347275,
   'token': 90,
   'token_str': 'France',
   'sequence': 'La france est située en France !'},
  {'score': 0.02393495850265026,
   'token': 2301,
   'token_str': 'forêt',
   'sequence': 'La france est située en forêt !'}],
 [{'score': 0.0340404212474823,
   'token': 17431,
   'token_str': 'champignon',
   'sequence': "J'adore manger des champignon"},
  {'score': 0.02699596807360649,
   'token': 7260,
   'token_str': 'poissons',
   'sequence': "J'adore manger des poissons"},
  {'score': 0.025112522765994072,
   'token': 18401,
   'token_str': 'champignons',
   'sequence': "J'adore manger des 

----

# GPT

In [None]:
from transformers import pipeline, set_seed
generator = pipeline('text-generation', model='gpt2')
set_seed(42)

model.safetensors:  65%|######5   | 357M/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
generator("This tree is,", max_length=30, num_return_sequences=5)


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


[{'generated_text': 'This tree is, after all, for sure:\n\n\nCynthia\n\nThe Great Tree of Athens (and later the Great Tree of Rhodes'},
 {'generated_text': 'This tree is, apparently, still in the works.\n\nAnd that\'s an important point.\n\nThe word "tree" is a generic'},
 {'generated_text': 'This tree is, as ever, much more resilient than the one before it. It grows and grows every day, but the soil will lose its protective'},
 {'generated_text': 'This tree is, of course, very much like the trees of the Old World.\n\nThat\'s what we learned in my book, "The'},
 {'generated_text': 'This tree is, I\'m glad you\'re here with her," the owner said to them. "What are you getting for the extra money?"\n'}]

In [None]:
generator("After having a few beers, I ", max_length=30, num_return_sequences=5)


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


[{'generated_text': "After having a few beers, I iced the coffee with my favorite coffee grind: Fertiliser.\n\nOn to today's recipe,"},
 {'generated_text': 'After having a few beers, I \xa0hoped to see what would happen.\nA couple of beers later, I was in a car on'},
 {'generated_text': 'After having a few beers, I \xa0changed the topic.\nI know this is a pretty difficult topic to answer and may take a while,'},
 {'generated_text': 'After having a few beers, I \xa0thought to myself this is a much better opportunity for me to show a little bit more humility. I mean'},
 {'generated_text': 'After having a few beers, I \xa0had this thought: \xa0"Hey, hey, are you all right?" And so I walked over'}]

In [None]:
generator("This bear is eating fish ", max_length=30, num_return_sequences=5)


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


[{'generated_text': "This bear is eating fish urchin, but I think that's it.\n\nAs a bear, we usually eat fish, especially in cold"},
 {'generated_text': "This bear is eating fish \xa0with an \xa0incredible number of teeth. That's right, the bears are swallowing your teeth.\nB"},
 {'generated_text': 'This bear is eating fish \xa0for fish. He\'s not a bad man and he hasn\'t got a clue what he\'s doing."\nI'},
 {'generated_text': "This bear is eating fish \xa0or dead or somewhere, which seems odd to most people, but it's a very curious behavior to most humans."},
 {'generated_text': 'This bear is eating fish !! (I just started looking for this bear and got one and it was dead).\n\nTired hunting bears!!'}]

In [None]:
generator("My family is having fun", max_length=30, num_return_sequences=5)


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


[{'generated_text': "My family is having fun with our children and with my grandchildren. It's all wonderful, I have a great sense of humor and an appreciation for people"},
 {'generated_text': 'My family is having fun," she said when asked about her current social standing.\n\nIn addition, Marder mentioned that her grandfather grew up'},
 {'generated_text': "My family is having fun at this year's event. We decided to have a barbecue with our friends, a nice lunch that was great in our honor"},
 {'generated_text': 'My family is having fun and the girls are going through it like they do almost every weekend," he said. "They got in a good fight."'},
 {'generated_text': 'My family is having fun together. It\'s such a lovely, beautiful summer in Toronto."\n\nGrimness and laughter, however, have been'}]

In [None]:
generator("I have graduated this year", max_length=30, num_return_sequences=5)


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


[{'generated_text': 'I have graduated this year from the university with the first class of 2019 to get a new one to show for the new one I have just finished doing'},
 {'generated_text': "I have graduated this year in the science career as well as in my engineering career from Johns Hopkins, where I've performed most successfully for over eight years"},
 {'generated_text': 'I have graduated this year from college in Illinois, attended Yale, and attended Harvard, which is a great opportunity to do well in college. There are'},
 {'generated_text': "I have graduated this year.\n\nI wish to express my gratitude for everyone's support and prayers in the past months for continuing the success of our"},
 {'generated_text': 'I have graduated this year into the U.S. military. There are now 2,800 combat assignments by the Marine Corps Office of Special Plans—'}]

In [None]:
generator("Since this morning I ate", max_length=30, num_return_sequences=5)


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


[{'generated_text': "Since this morning I ate our new food in the afternoon and got up in time at midnight to make my breakfast. It's a small bowl which comes"},
 {'generated_text': 'Since this morning I ate a bunch of peanut butter and jelly with a spoon and came out without stopping to get some coffee. I was really surprised,'},
 {'generated_text': "Since this morning I ate. After that I went back to sleep.\nIt's not clear what took her so long, but it's been quite"},
 {'generated_text': 'Since this morning I ate it, because I knew how good it could be. It has a nice bite and is not too sweet.'},
 {'generated_text': 'Since this morning I ate a sandwich with some fresh chicken, some vegetables, a lot of lettuce and some onions. It turned into a delicious meal.'}]

----

# Fine-Tuning

In [1]:
!pip install datasets
!pip install transformers



In [22]:
from datasets import load_dataset

books = load_dataset("opus_books", "en-fr")

In [23]:
books = books["train"].train_test_split(test_size=0.2)

In [24]:
books["train"][0]

{'id': '110530',
 'translation': {'en': 'My uncle retraced his steps.',
  'fr': 'Mon oncle revint sur ses pas.'}}

In [25]:
books['train']

Dataset({
    features: ['id', 'translation'],
    num_rows: 101668
})

In [26]:
books['train'] = books['train'].select(range(100))
books['test'] = books['test'].select(range(100))


In [27]:
from transformers import AutoTokenizer

checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [28]:
source_lang = "en"
target_lang = "fr"
prefix = "translate English to French: "


def preprocess_function(examples):
    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs

In [29]:
tokenized_books = books.map(preprocess_function, batched=True)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [30]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, return_tensors="tf")

In [31]:
!pip install evaluate



In [32]:
!pip install sacrebleu



In [33]:
import evaluate

metric = evaluate.load("sacrebleu")

In [34]:
import numpy as np


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [35]:
from transformers import AdamWeightDecay

optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)

In [36]:
from transformers import TFAutoModelForSeq2SeqLM

model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint)

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [41]:
tf_train_set = model.prepare_tf_dataset(
    tokenized_books["train"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

tf_test_set = model.prepare_tf_dataset(
    tokenized_books["test"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

In [42]:
import tensorflow as tf

model.compile(optimizer=optimizer)  # No loss argument!

In [43]:
train_validation_split = tokenized_books["train"].train_test_split(test_size=0.2)
tokenized_books["train"] = train_validation_split["train"]
tokenized_books["validation"] = train_validation_split["test"]

tf_validation_set = model.prepare_tf_dataset(
    tokenized_books["validation"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

In [44]:
model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x7fa6f6e78e50>

In [45]:
text = "translate English to French: Legumes share resources with nitrogen-fixing bacteria."

In [46]:
from transformers import pipeline

translator = pipeline("translation_en_to_fr", model= model, tokenizer = tokenizer)
translator(text)

Device set to use 0


[{'translation_text': "Les légumes partagent leurs ressources avec les bactéries fixatrices d'azote."}]

In [40]:
for batch in test_data:
    print(batch)
    break

({'input_ids': array([[13959,  1566,    12, ...,     0,     0,     0],
       [13959,  1566,    12, ...,   239,     5,     1],
       [13959,  1566,    12, ...,     0,     0,     0],
       ...,
       [13959,  1566,    12, ...,     0,     0,     0],
       [13959,  1566,    12, ...,     0,     0,     0],
       [13959,  1566,    12, ...,     0,     0,     0]]), 'attention_mask': array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])}, array([[ 2180, 24308,     3, ...,  -100,  -100,  -100],
       [  695,   197,   798, ...,  -100,  -100,  -100],
       [  802,     3,  3925, ...,  -100,  -100,  -100],
       ...,
       [ 6400,  4502,  3931, ...,  -100,  -100,  -100],
       [ 1636, 10857, 15419, ...,  -100,  -100,  -100],
       [    3,   104, 18926, ...,  -100,  -100,  -100]]))


In [47]:
phrases = train_validation_split["test"]['translation']


{'en': '"Hé! the Mystery," said Gringoire.',
 'fr': '– Hé ! le mystère, dit Gringoire.'}

In [54]:
# Access the first key-value pair in the dictionary
phrase_anglais = []
phrase_francais = []
for phrase in phrases[:20]:
  first_key = list(phrase.keys())[0]
  first_value = phrase[first_key]
  phrase_anglais.append(first_value)
  first_key = list(phrase.keys())[1]
  first_value = phrase[first_key]
  phrase_francais.append(first_value)      # Get the value for the first key


In [62]:
predictions = []
for phrase in phrase_anglais:
  translator = pipeline("translation_en_to_fr", model= model, tokenizer = tokenizer)
  predictions.append(translator(phrase))

Device set to use 0
Device set to use 0
Device set to use 0
Device set to use 0
Device set to use 0
Device set to use 0
Device set to use 0
Device set to use 0
Device set to use 0
Device set to use 0
Device set to use 0
Device set to use 0
Device set to use 0
Device set to use 0
Device set to use 0
Device set to use 0


In [56]:
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu

In [63]:
predictions

[[{'translation_text': '"Hé! das Mystery", sagte Gringoire.'}],
 [{'translation_text': '"Have you been long here?'}],
 [{'translation_text': '"Let him come in," sagte ich.'}],
 [{'translation_text': "Un dieser chieftains s'est rapproché du Nautilus, et l'a examiné avec soin."}],
 [{'translation_text': 'Sie verschwunden während mehrere Wochen.'}],
 [{'translation_text': 'Fogg dropped, she saw that he was meditating some serious project.'}],
 [{'translation_text': 'Warum waren Sie so schön?'}],
 [{'translation_text': 'stung him as if his skin had been branded with a red-hot iron. When his thoughts settled on the pain this gash caused him, he suffered cruelly.'}],
 [{'translation_text': 'The Nautilus kept to its southeasterly heading.'}],
 [{'translation_text': 'He struggled, gave his word of honour that he was too much in a hurry.'}],
 [{'translation_text': 'My child, mychild!"'}],
 [{'translation_text': 'Ohne zu denken über men oder matrimonial, weder über matrimonial noch über matrimon

In [66]:
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu

# Extract predicted text from the nested structure
predictions_text = [item[0]['translation_text'] for item in predictions]

# Ensure the reference sentences are tokenized
tokenized_references = [real.split() for real in phrase_francais]

# Tokenize the predicted sentences
tokenized_predictions = [pred.split() for pred in predictions_text]

# Calculate BLEU score for each sentence (sentence-by-sentence)
bleu_scores = [
    sentence_bleu([real], pred)
    for real, pred in zip(tokenized_references, tokenized_predictions)
]
print("Scores BLEU par phrase:", bleu_scores)

# Calculate BLEU score for the entire corpus
corpus_bleu_score = corpus_bleu(
    [[real] for real in tokenized_references],
    tokenized_predictions
)
print("Score BLEU pour l'ensemble du corpus:", corpus_bleu_score)


Scores BLEU par phrase: [8.166726842395623e-232, 0, 0, 6.223629500679345e-155, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Score BLEU pour l'ensemble du corpus: 1.5046411150799745e-155


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [None]:
from transformers import MarianMTModel

# Load the pre-trained model for English to French translation
model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-fr")
