In [None]:
pip install datasets

In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AdamW
from datasets import load_dataset
from datasets import Dataset, DatasetDict, load_dataset

In [2]:
# Load the dataset
dataset = load_dataset("wmt14", "fr-en", split="train", streaming=True)

streamed_samples = []
for i, sample in enumerate(dataset.take(5500)):
    # Flatten the sample (remove "translation" key)
    streamed_samples.append({
        "fr": sample["translation"]["fr"],
        "en": sample["translation"]["en"]
    })

# Convert the list of samples into a Hugging Face Dataset
train_dataset = Dataset.from_dict({
    "fr": [item["fr"] for item in streamed_samples],
    "en": [item["en"] for item in streamed_samples]
})
# Load the tokenizer and model for XLM-RoBERTa
model_name = "facebook/m2m100_418M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]



M2M100ForConditionalGeneration(
  (model): M2M100Model(
    (shared): M2M100ScaledWordEmbedding(128112, 1024, padding_idx=1)
    (encoder): M2M100Encoder(
      (embed_tokens): M2M100ScaledWordEmbedding(128112, 1024, padding_idx=1)
      (embed_positions): M2M100SinusoidalPositionalEmbedding()
      (layers): ModuleList(
        (0-11): 12 x M2M100EncoderLayer(
          (self_attn): M2M100Attention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
       

In [3]:
# Define a custom dataset class
class TranslationDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_length=128):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        data = self.dataset[idx]
        source_text = data["fr"]
        target_text = data["en"]

        # Tokenize the source and target texts
        inputs = self.tokenizer(
            source_text,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        targets = self.tokenizer(
            target_text,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

        # Return tokenized input and target data
        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze(),
            "labels": targets["input_ids"].squeeze(),
        }

# Prepare the dataset and DataLoader
train_dataset = TranslationDataset(train_dataset, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

In [4]:
# Define optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)
epochs = 4
model.train()

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    epoch_loss = 0
    batch_count = 0
    for batch in train_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )

        loss = outputs.loss
        epoch_loss += loss.item()
        batch_count += 1

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    avg_epoch_loss = epoch_loss / batch_count
    print(f"Epoch {epoch + 1} Average Loss: {avg_epoch_loss:.4f}\n")



Epoch 1/4
Epoch 1 Average Loss: 0.9590

Epoch 2/4
Epoch 2 Average Loss: 0.3808

Epoch 3/4
Epoch 3 Average Loss: 0.2983

Epoch 4/4
Epoch 4 Average Loss: 0.2344



In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

external_texts = [
    "Bonjour, comment ça va ?",
    "J'aime apprendre de nouvelles choses.",
    "La machine traduit ce texte.",
    "C'est un test de traduction automatique."
]

model_name = "facebook/m2m100_418M"
tokenizer = AutoTokenizer.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device) 
def tokenize_function(texts):
    return tokenizer(
        texts,   
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

inputs = tokenize_function(external_texts)
input_ids = inputs["input_ids"].to(device)
attention_mask = inputs["attention_mask"].to(device)

model.eval()
with torch.no_grad():
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=128,
        forced_bos_token_id=tokenizer.lang_code_to_id["en"]
    )

translated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)

for original, translated in zip(external_texts, translated_texts):
    print(f"Original (FR): {original}")
    print(f"Translated (EN): {translated}\n")



Original (FR): Bonjour, comment ça va ?
Translated (EN): Hello, how is it?

Original (FR): J'aime apprendre de nouvelles choses.
Translated (EN): I like to learn new things.

Original (FR): La machine traduit ce texte.
Translated (EN): The machine translates the text.

Original (FR): C'est un test de traduction automatique.
Translated (EN): This is an automatic translation test.



In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from nltk.translate.bleu_score import corpus_bleu
from datasets import load_dataset

dataset = load_dataset("wmt14", "fr-en", split="validation")

def translate(text, model, tokenizer, device):
    model.eval()
    with torch.no_grad():
        inputs = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=128
        ).to(device)

        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=128,
            forced_bos_token_id=tokenizer.lang_code_to_id["en"]
        )

        return tokenizer.decode(outputs[0], skip_special_tokens=True)

test_examples = dataset.select(range(2000, 2500))

predictions = []
references = []

print("Evaluating BLEU score on 10 samples...")
for i, example in enumerate(test_examples):
    if i >= 10:
        break

    french_text = example["translation"]["fr"]
    expected_english = example["translation"]["en"]

    try:
        translated_text = translate(french_text, model, tokenizer, device)

        predictions.append(translated_text.split())
        references.append([expected_english.split()])

        print(f"French: {french_text}")
        print(f"Expected English: {expected_english}")
        print(f"Model Output: {translated_text}")
        print("-" * 50)
    except Exception as e:
        print(f"Error during translation: {e}")

bleu_score = corpus_bleu(references, predictions)
print(f"\nBLEU Score: {bleu_score:.4f}")