In [2]:
from datasets import load_dataset

dataset = load_dataset("copenlu/answerable_tydiqa")
train_set = dataset["train"]
validation_set = dataset["validation"]

# Define the languages of interest
languages = ["arabic", "bengali", "indonesian"]

In [3]:
def oracle(answer: str, answer_start: int, document: str) -> bool:
    """Returns True if the answer is in the document at the given index."""
    return document[answer_start:answer_start + len(answer)] == answer

In [4]:
def classifier_1(question: str, document: str, language: str) -> bool:
    return True # question in document

In [5]:
def classifier_bow(question: str, document: str, language: str) -> bool:
    """Returns True if the question is in the document."""
    tracker = [word in document for word in question.split()]
    contained_ratio = sum(tracker) / len(tracker)
    return contained_ratio > 0.4

In [6]:
from bpemb import BPEmb

language_to_bpe = {'bengali': BPEmb(lang="bn", dim=50), 'indonesian': BPEmb(lang="id", dim=50), 'arabic': BPEmb(lang="ar", dim=50)}

def classifier_bpe(question: str, document: str, language: str) -> bool:
    bpemb_model = language_to_bpe[language]
    # Tokenize question and document
    question_bpe = bpemb_model.encode(question)
    document_bpe = bpemb_model.encode(document)
    tracker = [token_bpe in document_bpe for token_bpe in question_bpe]
    contained_ratio = sum(tracker) / len(tracker)
    return contained_ratio > 0.2

In [7]:
import pandas as pd

# Try out the classifiers and evaluate them using the oracle function
results = []
for classifier in [classifier_1, classifier_bow, classifier_bpe]:
    for sample in validation_set:
        result = {}
        if sample['language'] not in languages:
            continue
        
        result['classifier'] = classifier.__name__
        result['language'] = sample['language']
        
        classification = classifier(sample["question_text"], sample["document_plaintext"], sample['language'])
        truth = oracle(sample["annotations"]["answer_text"][0], sample["annotations"]["answer_start"][0], sample["document_plaintext"])
        result['correct'] = classification == truth
        
        results.append(result)
        
results = pd.DataFrame(results)


In [None]:
# Display results
for language in languages:
    lang_results = results[results['language'] == language]
    print(f"Results for {language}")
    
    # Calculate the mean correctness for each classifier
    classifier_mean = lang_results.groupby('classifier')['correct'].mean().reset_index()
    print( classifier_mean)
    
    # Calculate the number of rows (samples) for each classifier
    classifier_count = len(lang_results)
    print("Count:", classifier_count)
    print()


Results for arabic
       classifier   correct
0    classifier_1  1.000000
1  classifier_bow  0.404858
2  classifier_bpe  0.704608
Count: 88794

Results for bengali
       classifier   correct
0    classifier_1  1.000000
1  classifier_bow  0.316384
2  classifier_bpe  0.680268
Count: 14337

Results for indonesian
       classifier   correct
0    classifier_1  1.000000
1  classifier_bow  0.250922
2  classifier_bpe  0.722222
Count: 34182

