In [1]:
#Importing necessary APIs and transformer to use, and loading necessary variables

from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
# Local checkpoint directory that the pipeline, tokenizer and model are all loaded from.
model_checkpoint = "./v1/en_to_fil/v1.0"

# Translation pipeline built directly from the checkpoint path.
translator_en2fil = pipeline("translation", model=model_checkpoint)

# `return_tensors` is an argument of the tokenizer *call*, not of `from_pretrained`,
# so it is not passed here; request a tensor type when invoking the tokenizer instead.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Batch collator for seq2seq training, given the model alongside the tokenizer.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
#Loading dataset from HuggingFace and sacreBLEU to evaluate BLEU Score

from datasets import load_dataset, load_metric
# Fixed seed so the random train/validation split is identical on every run.
SPLIT_SEED = 42

# English/Tagalog pairs from the `bible_para` dataset.
raw_bible_dataset = load_dataset('bible_para', lang1='en', lang2='tl')

# 92% of the pairs go to training; the remaining 8% are held out.
bible_dataset = raw_bible_dataset['train'].train_test_split(
    train_size=0.92,
    test_size=0.08,
    seed=SPLIT_SEED,
)
# Rename the held-out split from "test" to "validation".
bible_dataset["validation"] = bible_dataset.pop('test')

# sacreBLEU metric used later to score decoded predictions.
metric = load_metric('sacrebleu')

In [None]:
#Function to tokenize dataset

# Maximum token lengths for the source (input) and target sides; longer
# sequences are truncated to these limits during tokenization.
max_input_length, max_target_length = 256, 256


def preprocess_function(bibledataset):
    """Tokenize a batch of English/Tagalog translation pairs.

    Args:
        bibledataset: A batch whose "translation" entry is a sequence of
            mappings with an "en" (source) and a "tl" (target) string.

    Returns:
        The tokenizer output for the English sentences, with the token ids
        of the Tagalog sentences stored under the "labels" key.
    """
    pairs = bibledataset["translation"]
    source_texts = [pair["en"] for pair in pairs]
    target_texts = [pair["tl"] for pair in pairs]

    model_inputs = tokenizer(
        source_texts,
        max_length=max_input_length,
        truncation=True,
    )

    # Targets are encoded inside the tokenizer's target-side context.
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(
            target_texts,
            max_length=max_target_length,
            truncation=True,
        )

    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs

In [None]:
#Tokenization of dataset

# Drop the raw columns once they have been tokenized, so that only the
# fields produced by `preprocess_function` remain in each split.
raw_column_names = bible_dataset["train"].column_names

# `batched=True` hands `preprocess_function` whole batches rather than single rows.
tokenized_bible_dataset = bible_dataset.map(
    preprocess_function, batched=True, remove_columns=raw_column_names
)

In [None]:
#Function to detokenize outputs for BLEU evaluation

import numpy as np

def compute_metrics(eval_preds):
    """Decode predictions and labels and score them with the sacreBLEU metric.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may itself be a tuple, in which case only its
            first element is used.

    Returns:
        A dict with a single key ``"bleu"`` holding the metric's ``"score"``.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is not a real token id and cannot be decoded. Labels use it for
    # ignored positions, and predictions can be padded with it as well, so
    # both arrays are mapped back to the pad token before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.strip() for pred in decoded_preds]
    # Each prediction gets a list of references (here exactly one).
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}


In [None]:
#Defining necessary arguments for transformer training

args = Seq2SeqTrainingArguments(
    # Where checkpoints are written, how often, and how many are retained.
    output_dir="v2/en_to_fil/v2.0",
    save_strategy="epoch",
    save_total_limit=3,
    # Evaluation settings; the strategy is "no", so none is scheduled during training.
    evaluation_strategy="no",
    per_device_eval_batch_size=64,
    predict_with_generate=True,
    # Optimisation hyper-parameters.
    num_train_epochs=3,
    per_device_train_batch_size=32,
    learning_rate=2e-5,
    weight_decay=0.01,
)

In [None]:
#Setting the trainer API

# Wire the model, training arguments, tokenized splits, collator, tokenizer
# and BLEU callback into a single trainer object.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_bible_dataset["train"],
    eval_dataset=tokenized_bible_dataset["validation"],
    compute_metrics=compute_metrics,
)

In [None]:
#Training of transformer using the trainer API
#Kept as the cell's last expression so that the value returned by train() is displayed.

trainer.train()