In [1]:
#Importing Necessary libraries

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, AdamW
from datasets import load_dataset, concatenate_datasets, load_metric
from torch.utils.data import DataLoader
from accelerate import Accelerator
from tqdm.auto import tqdm
import torch
import numpy as np
accelerator = Accelerator()

In [2]:
#Importing SalamaThanks Transformer v3 for English-to-Filipino Translation

model_v3_checkpoint_en2fil = "SalamaThanks/SalamaThanksTransformer_en2fil_v3"
tokenizer_v3_en2fil = AutoTokenizer.from_pretrained(model_v3_checkpoint_en2fil, return_tensors="tf")
model_v3_en2fil = AutoModelForSeq2SeqLM.from_pretrained(model_v3_checkpoint_en2fil)
data_collator_v3_en2fil = DataCollatorForSeq2Seq(tokenizer_v3_en2fil, model=model_v3_en2fil)
optimizer_v3_en2fil = AdamW(model_v3_en2fil.parameters(), lr=2e-5)



In [3]:
#Importing SalamaThanks Transformer v3 for Filipino-to-English Translation

model_v3_checkpoint_fil2en = "SalamaThanks/SalamaThanksTransformer_fil2en_v3"
tokenizer_v3_fil2en = AutoTokenizer.from_pretrained(model_v3_checkpoint_fil2en, return_tensors="tf")
model_v3_fil2en = AutoModelForSeq2SeqLM.from_pretrained(model_v3_checkpoint_fil2en)
data_collator_v3_fil2en = DataCollatorForSeq2Seq(tokenizer_v3_fil2en, model=model_v3_fil2en)
optimizer_v3_fil2en = AdamW(model_v3_fil2en.parameters(), lr=2e-5)

In [4]:
#Loading sacreBLEU to evaluate BLEU Score and defining length of dataset sentences.

metric = load_metric('sacrebleu')

max_input_length = 256
max_target_length = 256

In [5]:
#Loading bible_para dataset from Huggingface

raw_bible_dataset = load_dataset('bible_para', lang1='en', lang2='tl')
raw_bible_dataset = raw_bible_dataset.remove_columns(['id'])
bible_dataset = raw_bible_dataset['train'].train_test_split(train_size=0.92, test_size=0.08)
bible_dataset["validation"] = bible_dataset.pop('test')

Using custom data configuration en-tl-lang1=en,lang2=tl
Reusing dataset bible_para (C:\Users\Mary\.cache\huggingface\datasets\bible_para\en-tl-lang1=en,lang2=tl\0.0.0\b6cc20bcbfb0299beeba1dcc80a8420b975938ca0eef75b3ed30b50df7d950b1)


  0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
#Loading datasets from Huggingface and combining them into one dataset

raw_tatoeba_dataset = load_dataset('tatoeba', lang1='en', lang2='tl')
raw_tatoeba_dataset = raw_tatoeba_dataset.remove_columns(['id'])
raw_tedtalks1_dataset = load_dataset('ted_talks_iwslt', language_pair=("en", "tl"), year="2014")
raw_tedtalks2_dataset = load_dataset('ted_talks_iwslt', language_pair=("en", "tl"), year="2015")
raw_tedtalks3_dataset = load_dataset('ted_talks_iwslt', language_pair=("en", "tl"), year="2016")
raw_gnome_dataset = load_dataset('opus_gnome', lang1='en', lang2='tl')
raw_gnome_dataset = raw_gnome_dataset.remove_columns(['id'])
raw_paracrawl_dataset = load_dataset('opus_paracrawl', lang1='en', lang2='tl')
raw_paracrawl_dataset = raw_paracrawl_dataset.remove_columns(['id'])
raw_subtitles_dataset = load_dataset("open_subtitles", lang1="en", lang2="tl")
raw_subtitles_dataset = raw_subtitles_dataset.remove_columns(['id'])
raw_subtitles_dataset = raw_subtitles_dataset.remove_columns(['meta'])
raw_ubuntu_dataset = load_dataset('opus_ubuntu', lang1='en', lang2='tl')
raw_ubuntu_dataset = raw_ubuntu_dataset.remove_columns(['id'])
raw_multiparacrawl_dataset = load_dataset('multi_para_crawl', lang1='en', lang2='tl')
raw_multiparacrawl_dataset = raw_multiparacrawl_dataset.remove_columns(['id'])
raw_qedamara_dataset = load_dataset('qed_amara', lang1 = 'en', lang2 = 'tl')
raw_qedamara_dataset = raw_qedamara_dataset.remove_columns(['id'])


raw_combined_dataset = concatenate_datasets([raw_tatoeba_dataset['train'], raw_tedtalks1_dataset['train'], raw_tedtalks2_dataset['train'], raw_tedtalks3_dataset['train'], raw_gnome_dataset['train'], raw_paracrawl_dataset['train'], raw_subtitles_dataset['train'], raw_ubuntu_dataset['train'], raw_multiparacrawl_dataset['train'], raw_qedamara_dataset['train']])

combined_dataset = raw_combined_dataset.train_test_split(train_size=0.92, test_size=0.08)
combined_dataset["validation"] = combined_dataset.pop('test')

Using custom data configuration en-tl-lang1=en,lang2=tl
Reusing dataset tatoeba (C:\Users\Mary\.cache\huggingface\datasets\tatoeba\en-tl-lang1=en,lang2=tl\0.0.0\b3ea9c6bb2af47699c5fc0a155643f5a0da287c7095ea14824ee0a8afd74daf6)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration en_tl_2014-e2e7a3ad21b13d2b
Reusing dataset ted_talks_iwslt (C:\Users\Mary\.cache\huggingface\datasets\ted_talks_iwslt\en_tl_2014-e2e7a3ad21b13d2b\1.1.0\43935b3fe470c753a023642e1f54b068c590847f9928bd3f2ec99f15702ad6a6)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration en_tl_2015-e6db70baf321c7f2
Reusing dataset ted_talks_iwslt (C:\Users\Mary\.cache\huggingface\datasets\ted_talks_iwslt\en_tl_2015-e6db70baf321c7f2\1.1.0\43935b3fe470c753a023642e1f54b068c590847f9928bd3f2ec99f15702ad6a6)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration en_tl_2016-9907aa0a67465528
Reusing dataset ted_talks_iwslt (C:\Users\Mary\.cache\huggingface\datasets\ted_talks_iwslt\en_tl_2016-9907aa0a67465528\1.1.0\43935b3fe470c753a023642e1f54b068c590847f9928bd3f2ec99f15702ad6a6)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration en-tl-lang1=en,lang2=tl
Reusing dataset opus_gnome (C:\Users\Mary\.cache\huggingface\datasets\opus_gnome\en-tl-lang1=en,lang2=tl\0.0.0\c00e5dfb1b3b508d7898e160feee1d391e67a3651a06570b45d54ab6a8886217)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration en-tl-lang1=en,lang2=tl
Reusing dataset opus_para_crawl (C:\Users\Mary\.cache\huggingface\datasets\opus_para_crawl\en-tl-lang1=en,lang2=tl\0.0.0\d0becb3ac754eb295ccf6b4b87f391d12d2f4217dbc4f87f2a9718ba1f2de4a3)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration en-tl-lang1=en,lang2=tl
Reusing dataset open_subtitles (C:\Users\Mary\.cache\huggingface\datasets\open_subtitles\en-tl-lang1=en,lang2=tl\0.0.0\c1ec973ca4b6e588740d8f167cc0e24ea3f626e70bc7ffe467e944730500e198)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration en-tl-lang1=en,lang2=tl
Reusing dataset opus_ubuntu (C:\Users\Mary\.cache\huggingface\datasets\opus_ubuntu\en-tl-lang1=en,lang2=tl\0.0.0\7ac83b46edf6d0b6ff96bc86d5aadfb8b877c2f136a94af490988c442d3814b8)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration en-tl-lang1=en,lang2=tl
Reusing dataset multi_para_crawl (C:\Users\Mary\.cache\huggingface\datasets\multi_para_crawl\en-tl-lang1=en,lang2=tl\0.0.0\923bd780ac54acc2d7228bf36806e2a2309aaab30aa3bee613145aaff39eb83c)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration en-tl-lang1=en,lang2=tl
Reusing dataset qed_amara (C:\Users\Mary\.cache\huggingface\datasets\qed_amara\en-tl-lang1=en,lang2=tl\0.0.0\3662cb8fbbe21ebafc420ac0a1b3d1898312661d4f898adc79149fa09d073ba0)


  0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
bible_dataset

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 57219
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 4976
    })
})

In [8]:
combined_dataset

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 489822
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 42594
    })
})

In [9]:
#BLEU Evaluation of SalamaThanks v3 Transformer (English-to-Filipino) using Bible Dataset

def preprocess_function1(bibledataset):
    inputs = [x["en"] for x in bibledataset["translation"]]
    targets = [y["tl"] for y in bibledataset["translation"]]
    
    model_inputs = tokenizer_v3_en2fil(inputs, max_length=max_input_length, truncation=True)
    with tokenizer_v3_en2fil.as_target_tokenizer():
        labels = tokenizer_v3_en2fil(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_bible_dataset1 = bible_dataset.map(
    preprocess_function1,
    batched=True,
    remove_columns=bible_dataset["train"].column_names,
)

tokenized_bible_dataset1.set_format("torch")

eval_dataloader_v3_en2fil = DataLoader(
    tokenized_bible_dataset1["validation"].shard(num_shards=5, index=0), collate_fn=data_collator_v3_en2fil, batch_size=48
)

model_v3_en2fil, optimizer_v3_en2fil, eval_dataloader_v3_en2fil = accelerator.prepare(
    model_v3_en2fil, optimizer_v3_en2fil, eval_dataloader_v3_en2fil
)

def postprocess(predictions, labels):
    predictions = predictions.cpu().numpy()
    labels = labels.cpu().numpy()

    decoded_preds = tokenizer_v3_en2fil.batch_decode(predictions, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer_v3_en2fil.pad_token_id)
    decoded_labels = tokenizer_v3_en2fil.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]
    return decoded_preds, decoded_labels

model_v3_en2fil.eval()
for batch in tqdm(eval_dataloader_v3_en2fil):
    with torch.no_grad():
        generated_tokens = accelerator.unwrap_model(model_v3_en2fil).generate(
            batch["input_ids"],
            attention_mask=batch["attention_mask"],
            max_length=256,
        )
    labels = batch["labels"]

    generated_tokens = accelerator.pad_across_processes(
        generated_tokens, dim=1, pad_index=tokenizer_v3_en2fil.pad_token_id
    )
    labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

    predictions_gathered = accelerator.gather(generated_tokens)
    labels_gathered = accelerator.gather(labels)

    decoded_preds, decoded_labels = postprocess(predictions_gathered, labels_gathered)
    metric.add_batch(predictions=decoded_preds, references=decoded_labels)

results_v3_en2fil = metric.compute()
print(f"BLEU score: {results_v3_en2fil['score']:.2f}")
print("Precisions:", list(results_v3_en2fil['precisions']))
print(f"Brevity Penalty: {results_v3_en2fil['bp']:.2f}")

  0%|          | 0/58 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/21 [00:00<?, ?it/s]

BLEU score: 67.34
Precisions: [84.52908273788354, 71.41950355794609, 62.17990807616546, 54.79403027808597]
Brevity Penalty: 1.00


In [10]:
#BLEU Evaluation of SalamaThanks v3 Transformer (Filipino-to-English) using Bible Dataset

def preprocess_function2(bibledataset):
    inputs = [x["tl"] for x in bibledataset["translation"]]
    targets = [y["en"] for y in bibledataset["translation"]]
    
    model_inputs = tokenizer_v3_fil2en(inputs, max_length=max_input_length, truncation=True)
    with tokenizer_v3_fil2en.as_target_tokenizer():
        labels = tokenizer_v3_fil2en(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_bible_dataset2 = bible_dataset.map(
    preprocess_function2,
    batched=True,
    remove_columns=bible_dataset["train"].column_names,
)

tokenized_bible_dataset2.set_format("torch")

eval_dataloader_v3_fil2en = DataLoader(
    tokenized_bible_dataset2["validation"].shard(num_shards=5, index=0), collate_fn=data_collator_v3_fil2en, batch_size=48
)

model_v3_fil2en, optimizer_v3_fil2en, eval_dataloader_v3_fil2en = accelerator.prepare(
    model_v3_fil2en, optimizer_v3_fil2en, eval_dataloader_v3_fil2en
)

def postprocess(predictions, labels):
    predictions = predictions.cpu().numpy()
    labels = labels.cpu().numpy()

    decoded_preds = tokenizer_v3_fil2en.batch_decode(predictions, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer_v3_fil2en.pad_token_id)
    decoded_labels = tokenizer_v3_fil2en.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]
    return decoded_preds, decoded_labels

model_v3_fil2en.eval()
for batch in tqdm(eval_dataloader_v3_fil2en):
    with torch.no_grad():
        generated_tokens = accelerator.unwrap_model(model_v3_fil2en).generate(
            batch["input_ids"],
            attention_mask=batch["attention_mask"],
            max_length=256,
        )
    labels = batch["labels"]

    generated_tokens = accelerator.pad_across_processes(
        generated_tokens, dim=1, pad_index=tokenizer_v3_fil2en.pad_token_id
    )
    labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

    predictions_gathered = accelerator.gather(generated_tokens)
    labels_gathered = accelerator.gather(labels)

    decoded_preds, decoded_labels = postprocess(predictions_gathered, labels_gathered)
    metric.add_batch(predictions=decoded_preds, references=decoded_labels)

results_v3_fil2en = metric.compute()
print(f"BLEU score: {results_v3_fil2en['score']:.2f}")
print("Precisions:", list(results_v3_fil2en['precisions']))
print(f"Brevity Penalty: {results_v3_fil2en['bp']:.2f}")

  0%|          | 0/58 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/21 [00:00<?, ?it/s]

BLEU score: 59.15
Precisions: [80.0895008605852, 64.47565409567264, 53.784463005395814, 45.61430435116261]
Brevity Penalty: 0.99


In [11]:
#BLEU Evaluation of SalamaThanks v3 Transformer (English-to-Filipino) using Combined Dataset

def preprocess_function3(combineddataset):
    inputs = [x["en"] for x in combineddataset["translation"]]
    targets = [y["tl"] for y in combineddataset["translation"]]
    
    model_inputs = tokenizer_v3_en2fil(inputs, max_length=max_input_length, truncation=True)
    with tokenizer_v3_en2fil.as_target_tokenizer():
        labels = tokenizer_v3_en2fil(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_combined_dataset1 = combined_dataset.map(
    preprocess_function3,
    batched=True,
    remove_columns=combined_dataset["train"].column_names,
)

tokenized_combined_dataset1.set_format("torch")

eval_dataloader_v3_en2fil = DataLoader(
    tokenized_combined_dataset1["validation"].shard(num_shards=5, index=0), collate_fn=data_collator_v3_en2fil, batch_size=48
)

model_v3_en2fil, optimizer_v3_en2fil, eval_dataloader_v3_en2fil = accelerator.prepare(
    model_v3_en2fil, optimizer_v3_en2fil, eval_dataloader_v3_en2fil
)

def postprocess(predictions, labels):
    predictions = predictions.cpu().numpy()
    labels = labels.cpu().numpy()

    decoded_preds = tokenizer_v3_en2fil.batch_decode(predictions, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer_v3_en2fil.pad_token_id)
    decoded_labels = tokenizer_v3_en2fil.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]
    return decoded_preds, decoded_labels

model_v3_en2fil.eval()
for batch in tqdm(eval_dataloader_v3_en2fil):
    with torch.no_grad():
        generated_tokens = accelerator.unwrap_model(model_v3_en2fil).generate(
            batch["input_ids"],
            attention_mask=batch["attention_mask"],
            max_length=256,
        )
    labels = batch["labels"]

    generated_tokens = accelerator.pad_across_processes(
        generated_tokens, dim=1, pad_index=tokenizer_v3_en2fil.pad_token_id
    )
    labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

    predictions_gathered = accelerator.gather(generated_tokens)
    labels_gathered = accelerator.gather(labels)

    decoded_preds, decoded_labels = postprocess(predictions_gathered, labels_gathered)
    metric.add_batch(predictions=decoded_preds, references=decoded_labels)

results_v3_en2fil = metric.compute()
print(f"BLEU score: {results_v3_en2fil['score']:.2f}")
print("Precisions:", list(results_v3_en2fil['precisions']))
print(f"Brevity Penalty: {results_v3_en2fil['bp']:.2f}")

  0%|          | 0/490 [00:00<?, ?ba/s]

  0%|          | 0/43 [00:00<?, ?ba/s]

  0%|          | 0/178 [00:00<?, ?it/s]

BLEU score: 40.15
Precisions: [66.9912465608786, 45.137650555294854, 33.52748688452313, 25.622672960057628]
Brevity Penalty: 1.00


In [12]:
#BLEU Evaluation of SalamaThanks v3 Transformer (Filipino-to-English) using Combined Dataset

def preprocess_function4(combineddataset):
    inputs = [x["tl"] for x in combineddataset["translation"]]
    targets = [y["en"] for y in combineddataset["translation"]]
    
    model_inputs = tokenizer_v3_fil2en(inputs, max_length=max_input_length, truncation=True)
    with tokenizer_v3_fil2en.as_target_tokenizer():
        labels = tokenizer_v3_fil2en(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_combined_dataset2 = combined_dataset.map(
    preprocess_function4,
    batched=True,
    remove_columns=combined_dataset["train"].column_names,
)

tokenized_combined_dataset2.set_format("torch")

eval_dataloader_v3_fil2en = DataLoader(
    tokenized_combined_dataset2["validation"].shard(num_shards=5, index=0), collate_fn=data_collator_v3_fil2en, batch_size=48
)

model_v3_fil2en, optimizer_v3_fil2en, eval_dataloader_v3_fil2en = accelerator.prepare(
    model_v3_fil2en, optimizer_v3_fil2en, eval_dataloader_v3_fil2en
)

def postprocess(predictions, labels):
    predictions = predictions.cpu().numpy()
    labels = labels.cpu().numpy()

    decoded_preds = tokenizer_v3_fil2en.batch_decode(predictions, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer_v3_fil2en.pad_token_id)
    decoded_labels = tokenizer_v3_fil2en.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]
    return decoded_preds, decoded_labels

model_v3_fil2en.eval()
for batch in tqdm(eval_dataloader_v3_fil2en):
    with torch.no_grad():
        generated_tokens = accelerator.unwrap_model(model_v3_fil2en).generate(
            batch["input_ids"],
            attention_mask=batch["attention_mask"],
            max_length=256,
        )
    labels = batch["labels"]

    generated_tokens = accelerator.pad_across_processes(
        generated_tokens, dim=1, pad_index=tokenizer_v3_fil2en.pad_token_id
    )
    labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

    predictions_gathered = accelerator.gather(generated_tokens)
    labels_gathered = accelerator.gather(labels)

    decoded_preds, decoded_labels = postprocess(predictions_gathered, labels_gathered)
    metric.add_batch(predictions=decoded_preds, references=decoded_labels)

results_v3_fil2en = metric.compute()
print(f"BLEU score: {results_v3_fil2en['score']:.2f}")
print("Precisions:", list(results_v3_fil2en['precisions']))
print(f"Brevity Penalty: {results_v3_fil2en['bp']:.2f}")

  0%|          | 0/490 [00:00<?, ?ba/s]

  0%|          | 0/43 [00:00<?, ?ba/s]

  0%|          | 0/178 [00:00<?, ?it/s]

BLEU score: 39.92
Precisions: [68.67504890118707, 46.87815247153769, 34.97988827338837, 26.730042656916513]
Brevity Penalty: 0.96
