In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install  datasets sacrebleu tensorboard -q


In [4]:
!pip install evaluate


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


# importing Libraries

In [5]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer
from datasets import Dataset
from evaluate import load as load_metric
import pandas as pd
import torch


# Data preparation

In [6]:

data = []
with open("/kaggle/input/arabic-english-parallel-data/ara_.txt", "r", encoding="utf-8") as f:
    for line in f:
        if '\t' in line:
            en, ar = line.strip().split('\t')
            data.append({'translation': {'ar': ar, 'en': en}})

df = pd.DataFrame(data)
dataset = Dataset.from_pandas(df)


# Model Translation 

In [7]:
model_name = "Helsinki-NLP/opus-mt-ar-en"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

# Preprocessing

In [8]:
def preprocess(examples):
    inputs = [ex['ar'] for ex in examples['translation']]
    targets = [ex['en'] for ex in examples['translation']]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length").input_ids
    model_inputs["labels"] = labels
    return model_inputs

tokenized_dataset = dataset.map(preprocess, batched=True)


Map:   0%|          | 0/10742 [00:00<?, ? examples/s]



model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

# Training

In [9]:
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=100,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    save_steps=500,
    save_total_limit=2,
    report_to="tensorboard",
    fp16=torch.cuda.is_available(),
)

In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

trainer.train()




Step,Training Loss
100,0.6084
200,0.0357
300,0.0339
400,0.0254
500,0.0234
600,0.022
700,0.0207
800,0.0164
900,0.016
1000,0.017




TrainOutput(global_step=1008, training_loss=0.08138397770623367, metrics={'train_runtime': 466.464, 'train_samples_per_second': 69.086, 'train_steps_per_second': 2.161, 'total_flos': 1092408708169728.0, 'train_loss': 0.08138397770623367, 'epoch': 3.0})

In [11]:
model.save_pretrained("./my_finetuned_model")
tokenizer.save_pretrained("./my_finetuned_model")


('./my_finetuned_model/tokenizer_config.json',
 './my_finetuned_model/special_tokens_map.json',
 './my_finetuned_model/vocab.json',
 './my_finetuned_model/source.spm',
 './my_finetuned_model/target.spm',
 './my_finetuned_model/added_tokens.json')

# function for translation

In [12]:
def translate(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(model.device)
    output_ids = model.generate(**inputs, max_length=128, num_beams=4, early_stopping=True)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


In [14]:
for src, pred, ref in zip(srcs[:5], preds[:5], refs[:5]):
    print(f"Source: {src}")
    print(f"Predicted: {pred}")
    print(f"Reference: {ref}")
    print("="*50)


Source: مرحبًا.
Predicted: Hi.
Reference: Hi.
Source: اركض!
Predicted: Run!
Reference: Run!
Source: النجدة!
Predicted: Help!
Reference: Help!
Source: اقفز!
Predicted: Jump!
Reference: Jump!
Source: قف!
Predicted: Stop!
Reference: Stop!


# **Evaluation**

In [20]:
from evaluate import load as load_metric

metric = load_metric("sacrebleu")
sample = dataset.select(range(300)) 

srcs = [example['translation']['ar'] for example in sample]
refs = [example['translation']['en'] for example in sample]

preds = [translate(text) for text in srcs]

refs_lower = [ref.lower() for ref in refs]
preds_lower = [pred.lower() for pred in preds]

results = metric.compute(predictions=preds_lower, references=[[ref] for ref in refs_lower])
print("BLEU score:", results["score"])


BLEU score: 74.91640002191458


In [21]:
translate("البرمجة مفيدة")

'Programming is useful.'

In [22]:
translate("اخر سنة في الجامعة")

'Last year at the university.'

In [23]:
translate("ماذا تفعل الان؟")

'What are you doing now?'

In [25]:
translate("ما هو موضوع مشروع تخرجك؟")

"What's the subject of your graduate project?"