In [None]:
!pip install transformers datasets sacrebleu torch sentencepiece

### Load Pre-trained Model & Dataset

In [None]:
from datasets import load_dataset
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
import torch

# FLORES-200 sentence pairs; the config name selects the English/Arabic pair.
FLORES_REPO = "facebook/flores"
FLORES_PAIR = "eng_Latn-arb_Arab"
dataset = load_dataset(FLORES_REPO, FLORES_PAIR)

# Pre-trained checkpoint; the tokenizer and the model are loaded from the same name.
model_name = "Helsinki-NLP/opus-mt-en-ar"
tokenizer, model = (
    MarianTokenizer.from_pretrained(model_name),
    MarianMTModel.from_pretrained(model_name),
)


### Tokenize the Data

In [None]:
def preprocess_data(examples, source_column="sentence_eng_Latn",
                    target_column="sentence_arb_Arab", max_length=128):
    """Tokenize a batch of sentence pairs into model inputs and training labels.

    Intended for ``dataset.map(..., batched=True)``: each column of ``examples``
    is expected to be a list of strings.

    Args:
        examples: Batch mapping column names to lists of sentences.
        source_column: Column holding the source sentences. The FLORES pair
            configuration names its columns ``sentence_<lang code>``.
        target_column: Column holding the reference translations.
        max_length: Length every sequence is padded/truncated to.

    Returns:
        The tokenizer output for the source sentences with an added ``labels``
        entry built from the target sentences.
    """
    # `text_target` makes the Marian tokenizer encode the references with its
    # target-side vocabulary instead of the source-side one.
    model_inputs = tokenizer(
        examples[source_column],
        text_target=examples[target_column],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

    # Padding positions must not contribute to the loss: -100 is the label
    # value the loss ignores.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in model_inputs["labels"]
    ]
    return model_inputs

# Apply preprocessing to every split
tokenized_dataset = dataset.map(preprocess_data, batched=True)

# FLORES is an evaluation benchmark and only provides "dev" and "devtest"
# splits (there is no "train"/"validation"), so fine-tune on "dev" and keep
# "devtest" held out for validation.
train_dataset = tokenized_dataset["dev"]
val_dataset = tokenized_dataset["devtest"]


### Fine-Tuning the Model

In [None]:
# Fine-tuning configuration; arguments are grouped by what they control.
training_args = Seq2SeqTrainingArguments(
    # Optimisation
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluation and checkpointing both happen once per epoch
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    output_dir="./bert_translation",
    predict_with_generate=True,
    # Logging
    logging_dir="./logs",
    logging_steps=100,
)

# Hand the model, both data splits and the tokenizer to the trainer, then run
# the fine-tuning loop.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": val_dataset,
    "tokenizer": tokenizer,
}
trainer = Seq2SeqTrainer(**trainer_kwargs)

trainer.train()


### Evaluating the Model

In [None]:
from sacrebleu import corpus_bleu

# Function to translate a sentence
def translate(text):
    """Translate ``text`` with the notebook's ``model`` and return the result.

    Args:
        text: Sentence to translate. A list of sentences is also accepted by
            the tokenizer, but only the first translation is returned.

    Returns:
        The first generated sequence decoded to a string, with special tokens
        removed.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # The tokenizer produces CPU tensors, while the model may live on an
    # accelerator after training; generate() needs both on the same device.
    inputs = inputs.to(model.device)
    outputs = model.generate(**inputs)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

# Example Evaluation: score one translated sentence against one reference.
source_sentences = ["Welcome to the world"]
references = ["مرحبًا بك في العالم"]
predictions = list(map(translate, source_sentences))

# corpus_bleu takes the hypotheses plus a list of reference streams.
reference_streams = [references]
bleu_score = corpus_bleu(predictions, reference_streams)

print(f"BLEU Score: {bleu_score.score}")


In [None]:
# Write the model and the tokenizer into the same directory.
FINETUNED_DIR = "./bert_translation_finetuned"
for artifact in (model, tokenizer):
    artifact.save_pretrained(FINETUNED_DIR)

# Load Later
from transformers import MarianMTModel, MarianTokenizer
model = MarianMTModel.from_pretrained(FINETUNED_DIR)
tokenizer = MarianTokenizer.from_pretrained(FINETUNED_DIR)
