# **Language translation**

In this notebook, we use the [M2M-100](https://huggingface.co/facebook/m2m100_418M) multilingual translation model to translate news articles from English into French, German, Spanish, Polish, and Russian.

This is the first step of our first experiment. With these translated articles, we will afterwards train a classifier that labels each article as an opinion, report, or satire piece.

In [None]:
!pip install pandas transformers torch nltk datasets accelerate scikit-learn

## **Model**

In [None]:
import csv
import pandas as pd
import torch
import nltk

from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
from nltk.tokenize import sent_tokenize
nltk.download('punkt_tab')

In [None]:
# Tokenizer and model are loaded from the same checkpoint.
checkpoint = "facebook/m2m100_418M"

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

tokenizer = M2M100Tokenizer.from_pretrained(checkpoint)
model = M2M100ForConditionalGeneration.from_pretrained(checkpoint)
model.to(device)

In [None]:
# Sanity check: translate one Chinese sentence into English.
text = "生活就像一盒巧克力。"

# The source language has to be set on the tokenizer before encoding the input.
tokenizer.src_lang = "zh"
encoded_zh = tokenizer(text, return_tensors="pt").to(device)

# Forcing the first generated token to the English language id selects
# English as the output language.
english_id = tokenizer.get_lang_id("en")
generated_tokens = model.generate(**encoded_zh, forced_bos_token_id=english_id)

# Decode the generated token ids back into text.
translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

print(f"ZH: {text} ---> EN: {translation[0]}")

## **Dataset**


In [None]:
# Download the English articles CSV from the project's GitHub repository and
# save it in the working directory as articles_en.csv (-O sets the output file name).
!wget https://raw.githubusercontent.com/VladWero08/mt-pattern-preserve/refs/heads/main/data/articles_en.csv -O articles_en.csv

In [None]:
# Load the English articles from the local CSV file into a DataFrame.
articles_path = "articles_en.csv"
articles_en = pd.read_csv(articles_path)

In [None]:
# Preview: split the first article of the dataset into sentences with NLTK.
text = articles_en["full_articles"].iloc[0]
sentences = sent_tokenize(text)
print(f"Split into {len(sentences)} sentences")

In [None]:
# Demo: translate the sentences produced by the previous cell into French,
# one sentence at a time.
tokenizer.src_lang = "en"
french_id = tokenizer.get_lang_id("fr")  # forced first token selects the output language

translations = []
for sent in sentences:
    # Only translate fragments that have at least three characters once stripped.
    if len(sent.strip()) >= 3:
        encoded = tokenizer(sent, return_tensors="pt", truncation=True, max_length=512).to(device)
        generated = model.generate(
            **encoded,
            forced_bos_token_id=french_id,
            max_new_tokens=128,
            num_beams=4,
        )
        translated = tokenizer.batch_decode(generated, skip_special_tokens=True)[0]
        translations.append(translated)

# Stitch the translated sentences back together into one text.
full_translation = " ".join(translations)
print(f"FR: {full_translation}")


## **Translation**

In [None]:
# All translations go from English into each of the target languages.
target_languages = ["fr", "de", "es", "pl", "ru"]
tokenizer.src_lang = "en"
# Only controls how often progress is printed: sentences are still
# translated one at a time.
batch_size = 50


def translate_article(article, target_lang_id):
    """Translate one English article sentence by sentence.

    Args:
        article: Raw article text. Any non-string value (for example the NaN
            that pandas produces for an empty CSV cell) is treated as an
            article without text.
        target_lang_id: Id of the target-language token, as returned by
            `tokenizer.get_lang_id`; it is forced as the first generated token.

    Returns:
        The translated sentences joined by single spaces, or an empty string
        when there is nothing to translate.
    """
    if not isinstance(article, str):
        return ""

    translated_sentences = []
    # The article is split into sentences because the tokenizer input is
    # truncated at 512 tokens, so feeding a whole article could drop text.
    for sentence in sent_tokenize(article):
        encoded = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=512).to(device)
        generated_tokens = model.generate(
            **encoded,
            forced_bos_token_id=target_lang_id,
            max_new_tokens=128,  # each translated sentence is capped at 128 new tokens
            num_beams=5,
        )
        translated = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
        translated_sentences.append(translated)

    return " ".join(translated_sentences)


for target_language in target_languages:
    # The language id does not change within a language, so look it up once.
    target_lang_id = tokenizer.get_lang_id(target_language)
    target_language_dataset = []

    for start_idx in range(0, len(articles_en), batch_size):
        end_idx = min(start_idx + batch_size, len(articles_en))
        batch = articles_en.iloc[start_idx:end_idx]

        print(f"Translating batch {start_idx + 1} to {end_idx}...")

        for id_, genre, article in zip(batch["id"], batch["genre"], batch["full_articles"]):
            if not isinstance(article, str):
                # Report the gap instead of letting sent_tokenize raise and
                # abort the whole run.
                print(f"Article {id_} has no text; writing an empty translation.")

            target_language_dataset.append({
                "id": id_,
                "genre": genre,
                "full_articles": translate_article(article, target_lang_id),
            })

    # Save the translated dataset for this language.
    output_csv = f"articles_{target_language}.csv"
    with open(output_csv, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=["id", "genre", "full_articles"])
        writer.writeheader()
        writer.writerows(target_language_dataset)
    print(f"Successfully saved .csv for EN -> {target_language.upper()} translations!")
