In [None]:
from datasets import load_dataset, Dataset
import pandas as pd
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
import torch

### LOAD AND PREPROCESS DATA :

In [None]:
# Load the parallel corpus from the tab-separated, Tatoeba-style file.
# Column 0 is used as the English (target) text and column 1 as the Arabic
# (source) text; any further columns on a line are ignored.
src_texts, tgt_texts = [], []
with open('ara.txt', encoding='utf-8') as f:
    for line in f:
        parts = line.strip().split('\t')
        # Skip blank/malformed lines and pairs where either side is empty,
        # since an empty sentence gives the model nothing to learn from.
        if len(parts) >= 2 and parts[0] and parts[1]:
            tgt_texts.append(parts[0])  # English
            src_texts.append(parts[1])  # Arabic

# Convert to a Hugging Face Dataset with one 'translation' column holding
# {'ar': ..., 'en': ...} records, which is what the tokenization step reads.
df = pd.DataFrame({'translation': [{'ar': ar, 'en': en} for ar, en in zip(src_texts, tgt_texts)]})
dataset = Dataset.from_pandas(df)
# Hold out 10% for evaluation. The seed is fixed so the same examples end up
# in the test split on every run, keeping evaluation results comparable.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

### LOAD MBART MODEL AND TOKENIZER :

In [None]:
# Hub identifier of the pretrained checkpoint used for both the model weights
# and the tokenizer files.
model_name = "facebook/mbart-large-50-many-to-many-mmt"

model = MBartForConditionalGeneration.from_pretrained(model_name)
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)

# Tell the tokenizer which language codes to use: Arabic for the inputs,
# English for the targets.
tokenizer.src_lang = "ar_AR"
tokenizer.tgt_lang = "en_XX"


### TOKENIZE THE DATA :

In [None]:
def tokenize_function(examples, max_length=128):
    """Tokenize a batch of Arabic/English pairs for seq2seq training.

    Args:
        examples: A batch whose 'translation' entry is a list of dicts with
            an 'ar' (source) and an 'en' (target) string.
        max_length: Maximum number of tokens kept per source and per target
            sequence; longer sequences are truncated.

    Returns:
        The tokenizer output for the source texts with the tokenized targets
        stored under the "labels" key.
    """
    source_texts = [item['ar'] for item in examples['translation']]
    target_texts = [item['en'] for item in examples['translation']]

    # Sequences are deliberately NOT padded here. Padding labels to a fixed
    # length fills them with the pad token id, which the loss would then
    # count as real targets. The seq2seq data collator pads each batch
    # instead and uses the ignore index for label padding.
    # `text_target` replaces the deprecated `as_target_tokenizer()` context
    # manager and tokenizes the targets with the target-language settings.
    model_inputs = tokenizer(
        source_texts,
        text_target=target_texts,
        max_length=max_length,
        truncation=True,
    )
    return model_inputs


# Run tokenize_function over every split of the dataset; batched=True hands
# the function a batch of examples per call rather than one example.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

### TRAINING ARGUMENTS :

In [None]:
from transformers import Seq2SeqTrainingArguments

# Hyperparameters and bookkeeping options for the fine-tuning run.
training_args = Seq2SeqTrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./mbart-finetuned-ar-en",
    logging_dir="./logs",
    # Schedule: how long to train and how often to evaluate, log and prune.
    num_train_epochs=3,
    eval_strategy="epoch",
    logging_steps=10,
    save_total_limit=3,
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Mixed precision is switched on only when a CUDA device is present.
    fp16=torch.cuda.is_available(),
    # Evaluation setting.
    predict_with_generate=True,
)

### DEFINE TRAINER :

In [None]:
# Wire the model, the tokenized splits and the training configuration into a
# trainer. The collator receives the model as well as the tokenizer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model),
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)


### TRAIN THE MODEL :

In [None]:
# Start fine-tuning with the trainer configured earlier in the notebook.
trainer.train()


### SAVE THE MODEL :

In [None]:
# Write the fine-tuned model first and then the tokenizer into the same
# directory, so both can be reloaded from that one path.
for component in (model, tokenizer):
    component.save_pretrained("./mbart-finetuned-ar-en")

### LOAD THE MODEL AND TOKENIZER FROM FOLDER :

In [None]:
import torch
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# Directory containing the saved model and tokenizer files.
model_path = "./mbart-finetuned-ar-en"

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Restore the tokenizer and the model, placing the model on the chosen device.
tokenizer = MBart50TokenizerFast.from_pretrained(model_path)
model = MBartForConditionalGeneration.from_pretrained(model_path).to(device)


In [2]:
# NOTE(review): this cell repeats the load cell directly before it. It was
# presumably kept so inference can be run from a fresh kernel without
# retraining; confirm, and keep only one copy if so.
import torch
from transformers import MBart50TokenizerFast, MBartForConditionalGeneration

# Directory containing the saved model and tokenizer files.
model_path = "./mbart-finetuned-ar-en"

tokenizer = MBart50TokenizerFast.from_pretrained(model_path)
model = MBartForConditionalGeneration.from_pretrained(model_path)

# Select the device explicitly, then move the model onto it.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = model.to(device)


In [3]:
def translate_arabic(text, max_length=128):
    """Translate Arabic text to English with the notebook's global model.

    Uses the module-level `model` and `tokenizer`; the model is moved to the
    GPU when CUDA is available, and the tokenizer's source language is set to
    Arabic as a side effect.

    Args:
        text: Arabic input to translate.
        max_length: Token limit applied both when truncating the input and
            when generating the output.

    Returns:
        The first decoded translation, with special tokens removed.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Move model to device (no-op if it is already there).
    model.to(device)

    tokenizer.src_lang = "ar_AR"
    # padding=True pads only up to the longest sequence in the call, so a
    # single sentence is not inflated to `max_length` tokens of padding that
    # the encoder would otherwise have to process on every call.
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_length,
    )

    # Move input tensors to the same device as the model.
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    generated_tokens = model.generate(
        **encoded,
        max_length=max_length,
        # Force English as the first generated token. Looking the language
        # code up through the vocabulary avoids relying on the tokenizer's
        # `lang_code_to_id` attribute.
        forced_bos_token_id=tokenizer.convert_tokens_to_ids("en_XX"),
    )

    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

In [13]:
# Example call with a multi-word Arabic phrase.
translate_arabic("الترجمة من اللغة العربية الى الانجليزية")

'Translate from Arabic to English'

In [None]:
# Example call with a single-word input.
translate_arabic("مرحبًا")

'Hello!'

In [None]:
# Example call with a short input ending in an exclamation mark.
translate_arabic('اخفض رأسك!')

'Duck!'

In [None]:
# Example call whose input is written without any spaces.
translate_arabic('الذكاءالصناعي')

'Artificial Intelligence'