In [8]:
from datasets import load_dataset
from bpemb import BPEmb

dataset = load_dataset("copenlu/answerable_tydiqa")
train_set = dataset["train"]
validation_set = dataset["validation"]

# Define the languages of interest
languages = ["arabic", "bengali", "indonesian"]
language_to_bpe = {'bengali': BPEmb(lang="bn", dim=50), 'indonesian': BPEmb(lang="id", dim=50), 'arabic': BPEmb(lang="ar", dim=50)}

Looking at our sample

In [18]:
sample_row = train_set.filter(lambda example: example['language'] == 'indonesian' and example['document_plaintext'] is not None)[0]
sample_row['document_plaintext']

Filter:   0%|          | 0/116067 [00:00<?, ? examples/s]

'Ernest Douwes Dekker wafat dini hari tanggal 28 Agustus 1950 (tertulis di batu nisannya; 29 Agustus 1950 versi van der Veur, 2006) dan dimakamkan di TMP Cikutra, Bandung.'

Tokenizing using BPEmp

In [27]:
bpemb_model = language_to_bpe[sample_row['language']]

question_bpe = bpemb_model.encode(sample_row['question_text'])
document_bpe = bpemb_model.encode(sample_row['document_plaintext'])
print("BPE tokens:", document_bpe[:10])

BPE tokens: ['▁ern', 'est', '▁d', 'ou', 'w', 'es', '▁dek', 'ker', '▁wafat', '▁dini']


Tokenize using GPT2

In [28]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('indonesian-nlp/gpt2-medium-indonesian')

question_gpt2 = tokenizer.tokenize(sample_row['question_text'])
document_gpt2 = tokenizer.tokenize(sample_row['document_plaintext'])
print("GPT2 tokens:", document_gpt2[:10])

GPT2 tokens: ['Er', 'nest', 'ĠD', 'ouw', 'es', 'ĠDek', 'ker', 'Ġwafat', 'Ġdini', 'Ġhari']


In [30]:
from nltk.util import pad_sequence
from nltk.util import bigrams
from nltk.util import ngrams
from nltk.util import everygrams
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten
from nltk.lm.preprocessing import pad_both_ends

n = 2

padded_text = list(pad_both_ends(document_gpt2, n=n))
print(padded_text[:10])

['<s>', 'Er', 'nest', 'ĠD', 'ouw', 'es', 'ĠDek', 'ker', 'Ġwafat', 'Ġdini']


In [31]:
padded_n_grams = ngrams(padded_text, n=n)
print(list(padded_n_grams))

[('<s>', 'Er'), ('Er', 'nest'), ('nest', 'ĠD'), ('ĠD', 'ouw'), ('ouw', 'es'), ('es', 'ĠDek'), ('ĠDek', 'ker'), ('ker', 'Ġwafat'), ('Ġwafat', 'Ġdini'), ('Ġdini', 'Ġhari'), ('Ġhari', 'Ġtanggal'), ('Ġtanggal', 'Ġ28'), ('Ġ28', 'ĠAgustus'), ('ĠAgustus', 'Ġ1950'), ('Ġ1950', 'Ġ('), ('Ġ(', 'ter'), ('ter', 'tulis'), ('tulis', 'Ġdi'), ('Ġdi', 'Ġbatu'), ('Ġbatu', 'Ġnis'), ('Ġnis', 'annya'), ('annya', ';'), (';', 'Ġ29'), ('Ġ29', 'ĠAgustus'), ('ĠAgustus', 'Ġ1950'), ('Ġ1950', 'Ġversi'), ('Ġversi', 'Ġvan'), ('Ġvan', 'Ġder'), ('Ġder', 'ĠV'), ('ĠV', 'eur'), ('eur', ','), (',', 'Ġ2006'), ('Ġ2006', ')'), (')', 'Ġdan'), ('Ġdan', 'Ġdimakamkan'), ('Ġdimakamkan', 'Ġdi'), ('Ġdi', 'ĠT'), ('ĠT', 'MP'), ('MP', 'ĠC'), ('ĠC', 'ikut'), ('ikut', 'ra'), ('ra', ','), (',', 'ĠBandung'), ('ĠBandung', '.'), ('.', '</s>')]


In [48]:
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline

train_data, padded_sents = padded_everygram_pipeline(n, [document_gpt2])
model = MLE(n)
model.fit(train_data, padded_sents)
print(model.generate(20, random_seed=7))

['nest', 'ĠD', 'ouw', 'es', 'ĠDek', 'ker', 'Ġwafat', 'Ġdini', 'Ġhari', 'Ġtanggal', 'Ġ28', 'ĠAgustus', 'Ġ1950', 'Ġversi', 'Ġvan', 'Ġder', 'ĠV', 'eur', ',', 'Ġ2006']


In [50]:
from transformers import GPT2Tokenizer

language = 'indonesian'
n = 2

tokenizer = GPT2Tokenizer.from_pretrained('indonesian-nlp/gpt2-medium-indonesian')
gpt2_tokens = tokenizer.tokenize(" ".join(row['document_plaintext'] for row in train_set if row['language'] == language))
train_data, padded_sents = padded_everygram_pipeline(n, [gpt2_tokens])
model = MLE(n)
model.fit(train_data, padded_sents)

In [59]:
gpt2_tokens = tokenizer.tokenize(" ".join(row['document_plaintext'] for row in validation_set if row['language'] == language))
# validation_data, _ = padded_everygram_pipeline(n, [gpt2_tokens])
padded_tokens_validation = list(pad_sequence(gpt2_tokens, pad_left=True, left_pad_symbol="<s>", pad_right=True, right_pad_symbol="</s>", n=n))
perplexity = model.perplexity(bigrams(padded_tokens_validation))
# Calculate perplexity
print(list(bigrams(padded_tokens_validation))[0])

('<s>', 'Kol')
