In [13]:
from datasets import load_dataset
from bpemb import BPEmb

dataset = load_dataset("copenlu/answerable_tydiqa")
train_set = dataset["train"]
validation_set = dataset["validation"]

# Define the languages of interest
languages = ["arabic", "bengali", "indonesian"]
language_to_bpe = {'bengali': BPEmb(lang="bn", dim=50), 'indonesian': BPEmb(lang="id", dim=50), 'arabic': BPEmb(lang="ar", dim=50)}

Looking at our sample

In [14]:
sample_row = train_set.filter(lambda example: example['language'] == 'indonesian' and example['document_plaintext'] is not None)[0]
sample_row['document_plaintext']

'Ernest Douwes Dekker wafat dini hari tanggal 28 Agustus 1950 (tertulis di batu nisannya; 29 Agustus 1950 versi van der Veur, 2006) dan dimakamkan di TMP Cikutra, Bandung.'

Tokenizing using BPEmp

In [15]:
bpemb_model = language_to_bpe[sample_row['language']]

question_bpe = bpemb_model.encode(sample_row['question_text'])
document_bpe = bpemb_model.encode(sample_row['document_plaintext'])
print("BPE tokens:", document_bpe[:10])

BPE tokens: ['▁ern', 'est', '▁d', 'ou', 'w', 'es', '▁dek', 'ker', '▁wafat', '▁dini']


Tokenize using GPT2

In [16]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('indonesian-nlp/gpt2-medium-indonesian')

sample_question_gpt2 = tokenizer.tokenize(sample_row['question_text'])
sample_document_gpt2 = tokenizer.tokenize(sample_row['document_plaintext'])
print("GPT2 tokens:", sample_document_gpt2[:10])

GPT2 tokens: ['Er', 'nest', 'ĠD', 'ouw', 'es', 'ĠDek', 'ker', 'Ġwafat', 'Ġdini', 'Ġhari']


In [17]:
from nltk.util import pad_sequence
from nltk.util import bigrams
from nltk.util import ngrams
from nltk.util import everygrams
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten
from nltk.lm.preprocessing import pad_both_ends

n = 3

padded_text = list(pad_both_ends(sample_document_gpt2, n=n))
print(padded_text[:10])

['<s>', '<s>', 'Er', 'nest', 'ĠD', 'ouw', 'es', 'ĠDek', 'ker', 'Ġwafat']


In [18]:
padded_n_grams = ngrams(padded_text, n=n)
print(list(padded_n_grams))

[('<s>', '<s>', 'Er'), ('<s>', 'Er', 'nest'), ('Er', 'nest', 'ĠD'), ('nest', 'ĠD', 'ouw'), ('ĠD', 'ouw', 'es'), ('ouw', 'es', 'ĠDek'), ('es', 'ĠDek', 'ker'), ('ĠDek', 'ker', 'Ġwafat'), ('ker', 'Ġwafat', 'Ġdini'), ('Ġwafat', 'Ġdini', 'Ġhari'), ('Ġdini', 'Ġhari', 'Ġtanggal'), ('Ġhari', 'Ġtanggal', 'Ġ28'), ('Ġtanggal', 'Ġ28', 'ĠAgustus'), ('Ġ28', 'ĠAgustus', 'Ġ1950'), ('ĠAgustus', 'Ġ1950', 'Ġ('), ('Ġ1950', 'Ġ(', 'ter'), ('Ġ(', 'ter', 'tulis'), ('ter', 'tulis', 'Ġdi'), ('tulis', 'Ġdi', 'Ġbatu'), ('Ġdi', 'Ġbatu', 'Ġnis'), ('Ġbatu', 'Ġnis', 'annya'), ('Ġnis', 'annya', ';'), ('annya', ';', 'Ġ29'), (';', 'Ġ29', 'ĠAgustus'), ('Ġ29', 'ĠAgustus', 'Ġ1950'), ('ĠAgustus', 'Ġ1950', 'Ġversi'), ('Ġ1950', 'Ġversi', 'Ġvan'), ('Ġversi', 'Ġvan', 'Ġder'), ('Ġvan', 'Ġder', 'ĠV'), ('Ġder', 'ĠV', 'eur'), ('ĠV', 'eur', ','), ('eur', ',', 'Ġ2006'), (',', 'Ġ2006', ')'), ('Ġ2006', ')', 'Ġdan'), (')', 'Ġdan', 'Ġdimakamkan'), ('Ġdan', 'Ġdimakamkan', 'Ġdi'), ('Ġdimakamkan', 'Ġdi', 'ĠT'), ('Ġdi', 'ĠT', 'MP'), ('ĠT', '

In [19]:
from nltk.lm import Laplace
from nltk.lm.preprocessing import padded_everygram_pipeline

train_data, padded_sents = padded_everygram_pipeline(n, [sample_document_gpt2])
model = Laplace(n)
model.fit(train_data, padded_sents)
print(model.generate(20, random_seed=7))

['nest', 'ĠD', 'ouw', 'es', 'ĠDek', 'ker', 'Ġwafat', 'Ġdini', 'Ġhari', 'Ġtanggal', 'Ġ28', 'ĠAgustus', 'Ġ1950', 'Ġversi', 'Ġvan', 'Ġder', 'ĠV', 'eur', ',', 'Ġ2006']


Now on all data

In [29]:
from transformers import GPT2Tokenizer

language = 'indonesian'
n = 3

tokenizer = GPT2Tokenizer.from_pretrained('indonesian-nlp/gpt2-medium-indonesian')
gpt2_tokens = tokenizer.tokenize(" ".join(row['document_plaintext'] for row in train_set if row['language'] == language))
train_data, padded_sents = padded_everygram_pipeline(n, [gpt2_tokens])
model = Laplace(n)
model.fit(train_data, padded_sents)

In [30]:
gpt2_tokens = tokenizer.tokenize(" ".join(row['document_plaintext'] for row in validation_set if row['language'] == language))
padded_tokens_validation = list(pad_sequence(gpt2_tokens, pad_left=True, left_pad_symbol="<s>", pad_right=True, right_pad_symbol="</s>", n=n))


# Calculate perplexity
perplexity = model.perplexity(ngrams(padded_tokens_validation, n=n))
print(perplexity)
# list(bigrams(padded_tokens_validation))[:10]

21322.709156468547


In [31]:
print(model.score("<s>", "Kol"))
print(len(model.vocab))

2.8512773722627736e-05
35072
