In [None]:
#Importing necessary APIs and transformer to use, and loading necessary variables

from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, AdamW, get_scheduler

# Fine-tuning starts from the locally saved v2.0 English -> Filipino checkpoint.
model_checkpoint = "./v2/en_to_fil/v2.0"

# Ready-made translation pipeline built from the same checkpoint.
translator_en2fil = pipeline("translation", model=model_checkpoint)

# `return_tensors` is an argument of the tokenizer *call*, not of `from_pretrained`,
# and this notebook trains with PyTorch, so the former `return_tensors="tf"` is dropped.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# The collator is given the model as well as the tokenizer.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
optimizer = AdamW(model.parameters(), lr=2e-5)

In [None]:
#Loading dataset from HuggingFace and sacreBLEU to evaluate BLEU Score

from datasets import load_dataset, concatenate_datasets, load_metric

# Fixed seed so the train/validation split contains the same examples on every run.
# Without it, each kernel restart reshuffles which sentences are held out.
split_seed = 42

# English -> Tagalog parallel corpora. The extra columns ('id', plus 'meta' for
# OpenSubtitles) are dropped right after loading so that the corpora can be
# concatenated below (assumes 'translation' is the only column left -- TODO confirm).
raw_tatoeba_dataset = load_dataset('tatoeba', lang1='en', lang2='tl').remove_columns(['id'])
raw_tedtalks1_dataset = load_dataset('ted_talks_iwslt', language_pair=("en", "tl"), year="2014")
raw_tedtalks2_dataset = load_dataset('ted_talks_iwslt', language_pair=("en", "tl"), year="2015")
raw_tedtalks3_dataset = load_dataset('ted_talks_iwslt', language_pair=("en", "tl"), year="2016")
raw_gnome_dataset = load_dataset('opus_gnome', lang1='en', lang2='tl').remove_columns(['id'])
raw_paracrawl_dataset = load_dataset('opus_paracrawl', lang1='en', lang2='tl').remove_columns(['id'])
raw_subtitles_dataset = load_dataset("open_subtitles", lang1="en", lang2="tl").remove_columns(['id', 'meta'])
raw_ubuntu_dataset = load_dataset('opus_ubuntu', lang1='en', lang2='tl').remove_columns(['id'])
raw_multiparacrawl_dataset = load_dataset('multi_para_crawl', lang1='en', lang2='tl').remove_columns(['id'])
raw_qedamara_dataset = load_dataset('qed_amara', lang1='en', lang2='tl').remove_columns(['id'])

# Merge the 'train' split of every corpus into one dataset (order preserved from the original).
raw_combined_dataset = concatenate_datasets([
    raw_tatoeba_dataset['train'],
    raw_tedtalks1_dataset['train'],
    raw_tedtalks2_dataset['train'],
    raw_tedtalks3_dataset['train'],
    raw_gnome_dataset['train'],
    raw_paracrawl_dataset['train'],
    raw_subtitles_dataset['train'],
    raw_ubuntu_dataset['train'],
    raw_multiparacrawl_dataset['train'],
    raw_qedamara_dataset['train'],
])

# 92% / 8% split; the held-out part is renamed from 'test' to 'validation'.
combined_dataset = raw_combined_dataset.train_test_split(
    train_size=0.92,
    test_size=0.08,
    seed=split_seed,
)
combined_dataset["validation"] = combined_dataset.pop('test')

# sacreBLEU metric object used to score translations.
metric = load_metric('sacrebleu')

In [None]:
#Function to tokenize dataset

# Token limits applied (with truncation) to source and target sentences.
max_input_length = max_target_length = 256


def preprocess_function(combineddataset):
    """Tokenize a batch of English/Tagalog translation pairs.

    Args:
        combineddataset: a batch whose "translation" entry is a sequence of
            mappings with an "en" (source) and a "tl" (target) string.

    Returns:
        The tokenizer output for the English sentences, with the token ids of
        the Tagalog sentences stored under the "labels" key.
    """
    pairs = combineddataset["translation"]
    sources = [pair["en"] for pair in pairs]
    references = [pair["tl"] for pair in pairs]

    # Source side: tokenized normally, truncated to max_input_length.
    encoded = tokenizer(sources, max_length=max_input_length, truncation=True)

    # Target side: tokenized inside the target-tokenizer context, truncated to max_target_length.
    with tokenizer.as_target_tokenizer():
        encoded_targets = tokenizer(references, max_length=max_target_length, truncation=True)

    encoded["labels"] = encoded_targets["input_ids"]
    return encoded

In [None]:
#Tokenization of dataset

# Run preprocess_function over every split in batches. All original columns of
# the train split are removed, so only the function's outputs remain.
tokenized_combined_dataset = combined_dataset.map(
    function=preprocess_function,
    remove_columns=combined_dataset["train"].column_names,
    batched=True,
)

In [None]:
#Building the DataLoader and setting dataset to return PyTorch tensors

from torch.utils.data import DataLoader

# Make the tokenized splits return PyTorch tensors.
tokenized_combined_dataset.set_format("torch")

# Only shard 0 of 5 of the training split is used here; batches of 48 are
# shuffled and assembled by data_collator.
train_dataloader = DataLoader(
    dataset=tokenized_combined_dataset["train"].shard(index=0, num_shards=5),
    batch_size=48,
    collate_fn=data_collator,
    shuffle=True,
)

#eval_dataloader = DataLoader(
#    tokenized_combined_dataset["validation"], collate_fn=data_collator, batch_size=48
#)

In [None]:
#Importing Accelerator and preparing settings

from accelerate import Accelerator

accelerator = Accelerator()

# The three names are rebound to the objects returned by accelerator.prepare.
# The evaluation dataloader is not prepared because it is currently disabled:
#,eval_dataloader
model, optimizer, train_dataloader = accelerator.prepare(model, optimizer, train_dataloader)

In [None]:
#Preparing scheduler and output directory

# Where the fine-tuned model and tokenizer are written.
output_dir = "./v3/en_to_fil/v3.0"

# One optimizer update per batch, so the total number of scheduler steps is
# epochs * batches-per-epoch.
num_train_epochs = 1
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# Linear schedule with no warmup, spanning all training steps.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=0,
)

In [None]:
#Initializing post-processing function

import numpy as np

def postprocess(predictions, labels):
    """Decode predicted and reference token ids into stripped strings.

    Args:
        predictions: tensor of generated token ids (moved to CPU and converted to NumPy here).
        labels: tensor of reference token ids; entries equal to -100 are swapped
            for the tokenizer's pad token id before decoding.

    Returns:
        A tuple ``(decoded_preds, decoded_labels)`` where ``decoded_preds`` is a
        list of strings and ``decoded_labels`` is a list of one-element lists,
        i.e. one reference per prediction.
    """
    pred_ids = predictions.cpu().numpy()
    label_ids = labels.cpu().numpy()

    # -100 is not a real token id, so it is replaced with the pad id before decoding.
    label_ids = np.where(label_ids == -100, tokenizer.pad_token_id, label_ids)

    pred_texts = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_texts = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    return (
        [text.strip() for text in pred_texts],
        [[text.strip()] for text in label_texts],
    )

In [None]:
#Training the transformer using PyTorch and Accelerate and saving to output directory (the evaluation step is currently commented out)

from tqdm.auto import tqdm
import torch

# One progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Put the model in training mode before iterating over the training batches.
    model.train()
    for batch in train_dataloader:
        # The loss is read from the model output, so this relies on the batch
        # containing labels.
        outputs = model(**batch)
        loss = outputs.loss
        # The backward pass is routed through the accelerator instead of loss.backward().
        accelerator.backward(loss)

        # Apply the update, advance the learning-rate schedule, then clear the
        # gradients so they do not carry over into the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation is disabled: the block below needs eval_dataloader, which is
    # also commented out where the dataloaders are built.
    #model.eval()
    #for batch in tqdm(eval_dataloader):
    #    with torch.no_grad():
    #        generated_tokens = accelerator.unwrap_model(model).generate(
    #            batch["input_ids"],
    #            attention_mask=batch["attention_mask"],
    #            max_length=256,
    #        )
    #    labels = batch["labels"]

    #    generated_tokens = accelerator.pad_across_processes(
    #        generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
    #    )
    #    labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

    #    predictions_gathered = accelerator.gather(generated_tokens)
    #    labels_gathered = accelerator.gather(labels)

    #    decoded_preds, decoded_labels = postprocess(predictions_gathered, labels_gathered)
    #    metric.add_batch(predictions=decoded_preds, references=decoded_labels)

    #results = metric.compute()
    #print(f"epoch {epoch}, BLEU score: {results['score']:.2f}")

    # Checkpoint at the end of every epoch, always into the same output_dir.
    # All processes wait here first; the model is then unwrapped from the
    # accelerator before being saved.
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    # Only the main process writes the tokenizer files.
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)