# Install Library

In [1]:
!pip install transformers datasets evaluate sacrebleu

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.4.2-py3-none-any.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.0/58.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-2.10.0-py3-none-any.whl.metadata (8.5 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacrebleu-2.4.2-py3-none-any.whl (106 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-2.10.0-py3-none-any.whl (18 kB)
Installing collected packages: portalocker, sacrebleu, evaluate
Successfully installed evaluate-0.4.2 portalocker-2.10.0 sacrebleu-2.4.2


# Login HF

In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
device

cuda


device(type='cuda')

# Download Data

In [4]:
from datasets import load_dataset

data = load_dataset("Helsinki-NLP/un_ga", "ar_to_en")

Downloading readme:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/74067 [00:00<?, ? examples/s]

In [5]:
data

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 74067
    })
})

In [6]:
data = data['train'].train_test_split(test_size=0.2)

In [7]:
data['train'][0]

{'id': '68548',
 'translation': {'ar': 'وإذ تشير كذلك إلى نتائج مؤتمر القمة العالمي لعام 2005([1]) المرجع نفسه.)، بما فيها الإقرار بالحاجة إلى تلبية الاحتياجات الخاصة لأفريقيا، وإذ تشير أيضا إلى قرارها 60/265 المؤرخ 30 حزيران/يونيه 2006،',
  'en': 'Recalling further the 2005 World Summit Outcome,Ibid. including the recognition of the need to meet the special needs of Africa, and recalling also its resolution 60/265 of 30 June 2006,'}}

# Loading Model

In [8]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ar-en")

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]



In [9]:
source_lang = "ar"
target_lang = "en"
prefix = "translate Arabic to English: "


def preprocess_function(examples):
    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs

In [10]:
tokenized_data = data.map(preprocess_function, batched=True)

Map:   0%|          | 0/59253 [00:00<?, ? examples/s]

Map:   0%|          | 0/14814 [00:00<?, ? examples/s]

# Loading CollectionSeq2Seq

In [11]:
from transformers import DataCollatorForSeq2Seq

checkpoint = 'Helsinki-NLP/opus-mt-ar-en'
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

2024-07-05 04:04:07.394401: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-05 04:04:07.394498: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-05 04:04:07.528178: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [12]:
import evaluate

metric = evaluate.load("sacrebleu")

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [13]:
import numpy as np


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

# Loading Model

In [14]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

# Building Arguments

In [15]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./Helsinki-mt-ar-en",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    num_train_epochs=1,
    fp16=True,
    warmup_steps=2000,
    logging_steps=2000
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Train THe Model

In [16]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
2000,0.3438
4000,0.3059
6000,0.2897


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}


TrainOutput(global_step=7407, training_loss=0.30681530486496095, metrics={'train_runtime': 999.5494, 'train_samples_per_second': 59.28, 'train_steps_per_second': 7.41, 'total_flos': 1610634728964096.0, 'train_loss': 0.30681530486496095, 'epoch': 1.0})

# Testing The Model

In [23]:
text = tokenized_data['train'][0]['translation']['ar']

tokenized_data['train'][0]['translation']

{'ar': 'وإذ تشير كذلك إلى نتائج مؤتمر القمة العالمي لعام 2005([1]) المرجع نفسه.)، بما فيها الإقرار بالحاجة إلى تلبية الاحتياجات الخاصة لأفريقيا، وإذ تشير أيضا إلى قرارها 60/265 المؤرخ 30 حزيران/يونيه 2006،',
 'en': 'Recalling further the 2005 World Summit Outcome,Ibid. including the recognition of the need to meet the special needs of Africa, and recalling also its resolution 60/265 of 30 June 2006,'}

In [27]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("/kaggle/working/Helsinki-mt-ar-en/checkpoint-3000")
inputs = tokenizer(text, return_tensors="pt").input_ids



In [28]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("/kaggle/working/Helsinki-mt-ar-en/checkpoint-3000")
outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)

In [29]:
tokenizer.decode(outputs[0], skip_special_tokens=True)

'Recalling further the 2005 World Summit Outcome,Ibid. including the recognition of the need to address the special needs of Africa, and recalling also its resolution 60/265 of 30 June 2006,'

In [37]:
def predict(text, model_checkpoint):
    print('Input: ', text)
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    inputs = tokenizer(text, return_tensors="pt").input_ids
    model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
    outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)
    return "\n\nOutput:  " + tokenizer.decode(outputs[0], skip_special_tokens=True)

In [38]:
predict(text, '/kaggle/working/Helsinki-mt-ar-en/checkpoint-7000')

Input:  وإذ تشير كذلك إلى نتائج مؤتمر القمة العالمي لعام 2005([1]) المرجع نفسه.)، بما فيها الإقرار بالحاجة إلى تلبية الاحتياجات الخاصة لأفريقيا، وإذ تشير أيضا إلى قرارها 60/265 المؤرخ 30 حزيران/يونيه 2006،


'\n\nOutput:  Recalling further the 2005 World Summit Outcome,Ibid., including the recognition of the need to address the special needs of Africa, and recalling also its resolution 60/265 of 30 June 2006,'