# MarianMT Translation: English → French

In [1]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google'

### Set Project Directory on Google Drive

In [2]:
import os

project_path = "/content/drive/My Drive/en-fon"
os.chdir(project_path)

print("Répertoire actuel :", os.getcwd())


Répertoire actuel : /content/drive/My Drive/en-fon


### Define Languages and File Paths for Training and Testing

In [3]:
source_lang = "fr"
target_lang = "fon"

data_path = "/content/drive/My Drive/en-fon"  # à adapter si nécessaire
bpe_codes = f"{data_path}/bpe.codes.4000"
src_vocab = f"{data_path}/src_vocab.txt"
trg_vocab = f"{data_path}/trg_vocab.txt"
test_src = f"{data_path}/test.fr"
test_trg = f"{data_path}/test.fon"


### Create Directory to Save the Model Checkpoint

In [4]:
import os

save_path = f"/content/drive/My Drive/masakhane/{source_lang}-{target_lang}-checkpoint"
os.makedirs(save_path, exist_ok=True)
print("Dossier de sauvegarde :", save_path)


Dossier de sauvegarde : /content/drive/My Drive/masakhane/fr-fon-checkpoint


### Load MarianMT Model and Tokenizer from Hugging Face Hub

In [5]:
from transformers import MarianTokenizer, MarianMTModel
model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"  # English -> French


tokenizer = MarianTokenizer.from_pretrained(model_checkpoint)
model = MarianMTModel.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


### Test on model Translation capacity ( Function Using MarianMT Model)

In [6]:
from transformers import MarianTokenizer, MarianMTModel

# Load the model and tokenizer
model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = MarianTokenizer.from_pretrained(model_checkpoint)
model = MarianMTModel.from_pretrained(model_checkpoint)

# Translation function
def translate(text):
    # Prepare the input
    inputs = tokenizer(text, return_tensors="pt", padding=True)

    # Generate the translation
    translated = model.generate(**inputs)

    # Decode the result
    translation = tokenizer.decode(translated[0], skip_special_tokens=True)

    return translation

# Test
text_en = "Hello, how are you today?"
text_fr = translate(text_en)
print(f"English: {text_en}")
print(f"French: {text_fr}")

# Translate multiple sentences
texts = [
    "The weather is beautiful today.",
    "I love learning new languages.",
    "Machine translation is fascinating."
]

for text in texts:
    print(f"\n'{text}' → '{translate(text)}'")


English: Hello, how are you today?
French: Bonjour, comment allez-vous aujourd'hui ?

'The weather is beautiful today.' → 'Le temps est beau aujourd'hui.'

'I love learning new languages.' → 'J'adore apprendre de nouvelles langues.'

'Machine translation is fascinating.' → 'La traduction automatique est fascinante.'


### Translate Full Paragraphs Using MarianMT Model

In [7]:
# Pour des paragraphes entiers
long_text = """
Artificial intelligence has revolutionized many industries.
Machine learning algorithms can now perform tasks that were
once thought to be exclusively human. The future of AI is
both exciting and challenging.
"""

translation = translate(long_text)
print(translation)

L'intelligence artificielle a révolutionné de nombreuses industries. Les algorithmes d'apprentissage automatique peuvent maintenant effectuer des tâches qui étaient autrefois considérées comme exclusivement humaines. L'avenir de l'IA est à la fois passionnant et stimulant.


### Read and Translate a File

In [8]:
def translate_file(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as f:
        text = f.read()

    translation = translate(text)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(translation)

    print(f"Translation saved in {output_file}")


### Interactive English → French Translator

In [9]:
print("=== English → French Translator ===")
print("Type 'quit' to exit\n")

while True:
    text = input("English: ")
    if text.lower() == 'quit':
        break

    translation = translate(text)
    print(f"French: {translation}\n")


### Bidirectional English ↔ French Translation Test

In [10]:
model_fr_en = "Helsinki-NLP/opus-mt-fr-en"
tokenizer_fr_en = MarianTokenizer.from_pretrained(model_fr_en)
model_fr_en_loaded = MarianMTModel.from_pretrained(model_fr_en)

def translate_fr_to_en(text):
    # Prepare input
    inputs = tokenizer_fr_en(text, return_tensors="pt", padding=True)

    # Generate translation
    translated = model_fr_en_loaded.generate(**inputs)

    # Decode result
    return tokenizer_fr_en.decode(translated[0], skip_special_tokens=True)

# Bidirectional test
en_text = "I love Python programming."
fr_text = translate(en_text)
back_to_en = translate_fr_to_en(fr_text)

print(f"Original (EN): {en_text}")
print(f"French: {fr_text}")
print(f"Back to English: {back_to_en}")


Original (EN): I love Python programming.
French: J'adore la programmation de Python.
Back to English: I love Python programming.


## Evaluation of MarianMT Machine Translation Models (EN↔FR) – Speed, Quality, Memory, and Stress Tests

In [11]:
from transformers import MarianTokenizer, MarianMTModel
import time
import torch
from sacrebleu.metrics import BLEU, CHRF
import numpy as np
import psutil
import os

# ===============================
# 1. Set up models and tokenizers
# ===============================

models = {
    "en-fr": "Helsinki-NLP/opus-mt-en-fr",
    "fr-en": "Helsinki-NLP/opus-mt-fr-en"
}

tokenizers = {}
model_objects = {}

for key, checkpoint in models.items():
    tokenizers[key] = MarianTokenizer.from_pretrained(checkpoint)
    model_objects[key] = MarianMTModel.from_pretrained(checkpoint)
    model_objects[key].eval()

# ===============================
# 2. Translation function
# ===============================

def translate(texts, model_key="en-fr"):
    """Translate text or list of texts using specified model."""
    tokenizer = tokenizers[model_key]
    model = model_objects[model_key]

    if isinstance(texts, str):
        texts = [texts]

    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model.generate(**inputs)
    translations = [tokenizer.decode(t, skip_special_tokens=True) for t in outputs]

    return translations if len(translations) > 1 else translations[0]

# ===============================
# 3. Speed measurement
# ===============================

def measure_speed(texts, model_key="en-fr", num_runs=3):
    times = []
    for _ in range(num_runs):
        start_time = time.time()
        translate(texts, model_key)
        end_time = time.time()
        times.append(end_time - start_time)

    avg_time = np.mean(times)
    std_time = np.std(times)
    return {
        'avg_time': avg_time,
        'std_time': std_time,
        'texts_per_second': len(texts) / avg_time,
        'ms_per_text': (avg_time / len(texts)) * 1000
    }

# ===============================
# 4. Quality measurement (BLEU & chrF)
# ===============================

def calculate_bleu(predictions, references):
    bleu = BLEU()
    return bleu.corpus_score(predictions, [references]).score

def calculate_chrf(predictions, references):
    chrf = CHRF()
    return chrf.corpus_score(predictions, [references]).score

def evaluate_quality(test_dataset, model_key="en-fr"):
    sources = [item["source"] for item in test_dataset]
    references = [item["reference"] for item in test_dataset]
    predictions = [translate(src, model_key) for src in sources]

    bleu_score = calculate_bleu(predictions, references)
    chrf_score = calculate_chrf(predictions, references)

    return predictions, bleu_score, chrf_score

# ===============================
# 5. Memory measurement
# ===============================

def measure_memory(texts, model_key="en-fr"):
    model = model_objects[model_key]
    tokenizer = tokenizers[model_key]

    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
        inputs = tokenizer(texts, return_tensors="pt", padding=True).to('cuda')
        model.to('cuda')
        with torch.no_grad():
            model.generate(**inputs)
        memory_mb = torch.cuda.max_memory_allocated() / 1024 / 1024
        model.to('cpu')
    else:
        process = psutil.Process(os.getpid())
        memory_mb = process.memory_info().rss / 1024 / 1024

    return memory_mb

# ===============================
# 6. Stress test (scalability)
# ===============================

def stress_test(text, model_key="en-fr", batch_sizes=[1,5,10,20,50]):
    results = []
    for batch_size in batch_sizes:
        texts = [text] * batch_size
        start_time = time.time()
        translate(texts, model_key)
        end_time = time.time()
        duration = end_time - start_time
        throughput = batch_size / duration
        results.append({'batch_size': batch_size, 'time': duration, 'throughput': throughput})
        print(f"Batch size {batch_size:3d}: {duration:.3f}s ({throughput:.2f} texts/s)")
    return results

# ===============================
# 7. Test datasets
# ===============================

test_dataset_en_fr = [
    {"source": "Hello, how are you?", "reference": "Bonjour, comment allez-vous ?"},
    {"source": "The weather is beautiful today.", "reference": "Le temps est magnifique aujourd'hui."},
    {"source": "I love learning new languages.", "reference": "J'adore apprendre de nouvelles langues."},
    {"source": "Machine translation is fascinating.", "reference": "La traduction automatique est fascinante."},
    {"source": "Thank you very much for your help.", "reference": "Merci beaucoup pour votre aide."}
]

test_dataset_fr_en = [
    {"source": "Bonjour, comment allez-vous ?", "reference": "Hello, how are you?"},
    {"source": "Le temps est magnifique aujourd'hui.", "reference": "The weather is beautiful today."},
    {"source": "J'adore apprendre de nouvelles langues.", "reference": "I love learning new languages."},
    {"source": "La traduction automatique est fascinante.", "reference": "Machine translation is fascinating."},
    {"source": "Merci beaucoup pour votre aide.", "reference": "Thank you very much for your help."}
]

# ===============================
# 8. Run evaluations
# ===============================

for model_key, test_dataset in [("en-fr", test_dataset_en_fr), ("fr-en", test_dataset_fr_en)]:
    print("\n" + "="*60)
    print(f"Performance evaluation for model {model_key}")
    print("="*60)

    # Speed
    sources = [item["source"] for item in test_dataset]
    speed = measure_speed(sources, model_key)
    print(f"Average time: {speed['avg_time']:.3f}s ± {speed['std_time']:.3f}s")
    print(f"Texts per second: {speed['texts_per_second']:.2f}")
    print(f"Milliseconds per text: {speed['ms_per_text']:.2f}ms")

    # Quality
    predictions, bleu, chrf = evaluate_quality(test_dataset, model_key)
    print(f"BLEU score: {bleu:.2f}")
    print(f"chrF score: {chrf:.2f}")

    print("\nTranslations comparison:")
    for i, (src, pred, ref) in enumerate(zip([item["source"] for item in test_dataset],
                                             predictions,
                                             [item["reference"] for item in test_dataset]), 1):
        print(f"{i}. Source: {src}")
        print(f"   Prediction: {pred}")
        print(f"   Reference:  {ref}")

    # Memory
    mem = measure_memory(sources, model_key)
    print(f"\nMemory usage: {mem:.2f} MB")

    # Stress test
    print("\nStress test:")
    stress_test("This is a performance test sentence.", model_key)



Performance evaluation for model en-fr
Average time: 1.041s ± 0.017s
Texts per second: 4.80
Milliseconds per text: 208.20ms
BLEU score: 87.48
chrF score: 92.56

Translations comparison:
1. Source: Hello, how are you?
   Prediction: Bonjour, comment allez-vous ?
   Reference:  Bonjour, comment allez-vous ?
2. Source: The weather is beautiful today.
   Prediction: Le temps est beau aujourd'hui.
   Reference:  Le temps est magnifique aujourd'hui.
3. Source: I love learning new languages.
   Prediction: J'adore apprendre de nouvelles langues.
   Reference:  J'adore apprendre de nouvelles langues.
4. Source: Machine translation is fascinating.
   Prediction: La traduction automatique est fascinante.
   Reference:  La traduction automatique est fascinante.
5. Source: Thank you very much for your help.
   Prediction: Merci beaucoup pour votre aide.
   Reference:  Merci beaucoup pour votre aide.

Memory usage: 2631.77 MB

Stress test:
Batch size   1: 1.140s (0.88 texts/s)
Batch size   5: 1.54

In [12]:

save_path = "./my_translation_model"
tokenizer.save_pretrained(save_path)
model.save_pretrained(save_path)
print(f"Modèle sauvegardé dans {save_path}")



Modèle sauvegardé dans ./my_translation_model
