In [2]:
from nltk.lm import Laplace
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.util import ngrams
from datasets import load_dataset
from bpemb import BPEmb

dataset = load_dataset("copenlu/answerable_tydiqa")
train_set = dataset["train"]
validation_set = dataset["validation"]

# Define the languages of interest
languages = ["arabic", "bengali", "indonesian"]
language_to_bpe = {
    'bengali': BPEmb(lang="bn", dim=50), 
    'indonesian': BPEmb(lang="id", dim=50), 
    'arabic': BPEmb(lang="ar", dim=50)
}

# Define an order for the n-grams
N = 3

def tokenize_dataset(dataset, lang, language_to_bpe, question_only=False, document_only=False):
    tokenized_data = []
    for entry in dataset:
        if entry['language'] != lang:
            continue
        
        
        if question_only:
            text = entry['question_text']
        elif document_only:
            text = entry['document_plaintext']
        else:
            text = entry['question_text'] + ' ' + entry['document_plaintext']
            
        bpe_text = language_to_bpe[lang].encode(text)
        tokenized_data.append(bpe_text)
    return tokenized_data

# Create and evaluate a Laplace model for each language
for lang in languages:
    for n in range(1, N+1):
        tokenized_train = tokenize_dataset(train_set, lang, language_to_bpe, question_only=True)
        tokenized_validation = tokenize_dataset(validation_set, lang, language_to_bpe, question_only=True)

        train_data, padded_vocab = padded_everygram_pipeline(N, tokenized_train)
        validation_data, _ = padded_everygram_pipeline(N, tokenized_validation)

        lm = Laplace(n)
        lm.fit(train_data, padded_vocab)

        # Calculate perplexity on validation data for the current language
        validation_ngrams = [ng for sent in tokenized_validation for ng in ngrams(sent, n)]
        perplexity = lm.perplexity(validation_ngrams)
        print(f"Perplexity for {lang} using {n}-grams in question: {perplexity}")
        

# Create and evaluate a Laplace model for each language
for lang in languages:
    for n in range(1, N+1):
        tokenized_train = tokenize_dataset(train_set, lang, language_to_bpe, document_only=True)
        tokenized_validation = tokenize_dataset(validation_set, lang, language_to_bpe, document_only=True)

        train_data, padded_vocab = padded_everygram_pipeline(N, tokenized_train)
        validation_data, _ = padded_everygram_pipeline(N, tokenized_validation)

        lm = Laplace(n)
        lm.fit(train_data, padded_vocab)

        # Calculate perplexity on validation data for the current language
        validation_ngrams = [ng for sent in tokenized_validation for ng in ngrams(sent, n)]
        perplexity = lm.perplexity(validation_ngrams)
        print(f"Perplexity for {lang} using {n}-grams document: {perplexity}")

  from .autonotebook import tqdm as notebook_tqdm


Perplexity for arabic using 1-grams in question: 1113.8889298334905
Perplexity for arabic using 2-grams in question: 806.2036869725847
Perplexity for arabic using 3-grams in question: 2892.8206527586626
Perplexity for bengali using 1-grams in question: 1033.544124486604
Perplexity for bengali using 2-grams in question: 392.7413803882843
Perplexity for bengali using 3-grams in question: 907.161208880907
Perplexity for indonesian using 1-grams in question: 1054.0973057955289
Perplexity for indonesian using 2-grams in question: 690.5564623641836
Perplexity for indonesian using 3-grams in question: 2156.901599065793
Perplexity for arabic using 1-grams document: 2361.35061286874
Perplexity for arabic using 2-grams document: 1057.5043929730352
Perplexity for arabic using 3-grams document: 4318.201644807369
Perplexity for bengali using 1-grams document: 2322.6386501576
Perplexity for bengali using 2-grams document: 2030.7911510277181
Perplexity for bengali using 3-grams document: 5243.6071885