In [39]:
from datasets import load_dataset
from scipy import spatial
from collections import Counter
from nlp import classifier_bow_cosine


# Load the answerable TyDi QA dataset and keep handles on the two splits used below.
dataset = load_dataset("copenlu/answerable_tydiqa")
train_set, validation_set = dataset["train"], dataset["validation"]

# Languages the classifiers are evaluated on.
languages = ["arabic", "bengali", "indonesian"]

# Show the validation split summary as this cell's output.
validation_set

Dataset({
    features: ['question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url'],
    num_rows: 13325
})

In [40]:

def _is_unanswerable(example) -> bool:
    """Return True when the example's first annotation has answer_start == -1."""
    return example["annotations"]["answer_start"][0] == -1


# Split the training set on whether the first annotation carries an answer position.
unanswerable = train_set.filter(_is_unanswerable)
answerable = train_set.filter(lambda example: not _is_unanswerable(example))
unanswerable[0]

{'question_text': 'Milloin Charles Fort syntyi?',
 'document_title': 'Charles Fort',
 'language': 'finnish',
 'annotations': {'answer_start': [-1], 'answer_text': ['']},
 'document_plaintext': 'Fortin ystävät perustivat Fortean Societyn jo hänen elinaikanan, ja seuraa johti kirjailija Tiffany Thayer, puoliksi tosissaan ja puoliksi vitsin vuoksi, kuten Fortin itsensä työ. Fort kuitenkin torjui Societyn ja ankarana autoritaarisuuden vastustajana kieltäytyi sen johtajuudesta ja edelleen vastusti seuraa, sillä se saattaisi houkutella spiritualisteja, kiihkoilijoita ja niitä, jotka vastustivat tiedettä, koska se ei ollut hyväksynyt heitä. Se houkuttelisi niitä, jotka uskoivat omaan lempi-ilmiöönsä ylitse muiden: täysin vastakohtainen asenne fortilaisuudelle. On siis ironista, että sittemmin useita sellaisia "fortilaisia" seuroja on perustettu lisää. ',
 'document_url': 'https://fi.wikipedia.org/wiki/Charles%20Fort'}

In [41]:
def oracle(answer: str, answer_start: int, document: str) -> bool:
    """Check whether `answer` occurs in `document` exactly at `answer_start`.

    Args:
        answer: Expected answer text.
        answer_start: Character offset of the answer in `document`; -1 marks
            "no answer".
        document: Text the offset refers to.

    Returns:
        False when `answer_start` is -1; otherwise whether the slice of
        `document` starting at `answer_start` with the length of `answer`
        equals `answer`.
    """
    if answer_start == -1:
        return False
    answer_end = answer_start + len(answer)
    return document[answer_start:answer_end] == answer

In [42]:
def classifier_1(question: str, document: str, language: str) -> bool:
    """Constant baseline: returns True for every input.

    All three arguments are ignored; they are accepted so this function can be
    called with the same arguments as the other classifiers.
    """
    return True

In [43]:
def classifier_bow(question: str, document: str, language: str, threshold: float = 0.4) -> bool:
    """Predict answerability from the share of question words found in the document.

    The question is split on whitespace and each word is looked up with a
    substring test against the raw document text, so a word also counts when it
    appears inside a longer document word.

    Args:
        question: Question text.
        document: Document text to search.
        language: Unused; accepted so all classifiers can be called the same way.
        threshold: The prediction is True when the contained ratio is strictly
            greater than this value. Defaults to 0.4.

    Returns:
        True if more than `threshold` of the question words occur in the
        document, False otherwise. A question without any words yields False.
    """
    words = question.split()
    # An empty or whitespace-only question would otherwise divide by zero.
    if not words:
        return False
    contained_ratio = sum(word in document for word in words) / len(words)
    return contained_ratio > threshold

In [45]:
from bpemb import BPEmb

# 50-dimensional BPEmb models, keyed by the language name used in the dataset.
language_to_bpe = {
    name: BPEmb(lang=code, dim=50)
    for name, code in (("bengali", "bn"), ("indonesian", "id"), ("arabic", "ar"))
}

def classifier_bpe(question: str, document: str, language: str, threshold: float = 0.4) -> bool:
    """Predict answerability from the share of question BPE tokens found in the document.

    Both texts are encoded with the model registered for `language` in
    `language_to_bpe`; a question token counts as contained when the same token
    appears anywhere in the encoded document.

    Args:
        question: Question text.
        document: Document text.
        language: Key into `language_to_bpe` selecting the model.
        threshold: The prediction is True when the contained ratio is strictly
            greater than this value. Defaults to 0.4.

    Returns:
        True if more than `threshold` of the question tokens occur among the
        document tokens, False otherwise. A question that encodes to no tokens
        yields False.

    Raises:
        KeyError: If `language` has no entry in `language_to_bpe`.
    """
    bpemb_model = language_to_bpe[language]
    question_bpe = bpemb_model.encode(question)
    # A question that produces no tokens would otherwise divide by zero.
    if len(question_bpe) == 0:
        return False
    # Set lookup instead of scanning the document token list once per question token.
    # Assumes encode() yields hashable tokens (subword strings) -- TODO confirm.
    document_tokens = set(bpemb_model.encode(document))
    contained_ratio = sum(token in document_tokens for token in question_bpe) / len(question_bpe)
    return contained_ratio > threshold

In [46]:
import pandas as pd

# Try out the classifiers and evaluate them using the oracle function.

# The language filter and the oracle label do not depend on the classifier,
# so compute them once instead of once per classifier.
eval_samples = []
for sample in validation_set:
    if sample["language"] not in languages:
        continue
    annotations = sample["annotations"]
    truth = oracle(annotations["answer_text"][0], annotations["answer_start"][0], sample["document_plaintext"])
    eval_samples.append((sample, truth))

# One record per (classifier, sample) pair, in classifier-major order.
records = []
for classifier in [classifier_1, classifier_bow, classifier_bpe, classifier_bow_cosine]:
    for sample, truth in eval_samples:
        classification = classifier(sample["question_text"], sample["document_plaintext"], sample["language"])
        records.append({
            "classifier": classifier.__name__,
            "language": sample["language"],
            "correct": classification == truth,
        })

# Build the frame from a separate list so `results` is only ever a DataFrame
# and re-running this cell always starts from an empty record list.
results = pd.DataFrame(records)


0.4
0.6405126152203485
0.6154413221175971
0.6240377207533828
0.447213595499958
0.6154574548966637
0.5
1
0.6060915267313264
0.35355339059327373
0.5
0.7627700713964738
0.680336051416609
0.7035264706814484
0.316227766016838
0.7298004491997617
0.447213595499958
0.5
0.7071067811865475
0.7302967433402215
0.7071067811865475
0.7071067811865475
0.5163977794943222
0.4508348173337162
0.5345224838248487
0.44721359549995787
0.47140452079103157
0.7071067811865475
0.7071067811865475
0.8894991799933215
0.5222329678670935
0.5
0.6666666666666666
0.5163977794943222
0.408248290463863
0.5477225575051661
0.7071067811865475
0.5477225575051661
0.7427813527082074
0.4714045207910317
0.674199862463242
0.50709255283711
0.5
0.8
0.7071067811865475
0.6666666666666666
0.7071067811865475
0.40824829046386313
0.5262348115842175
0.5
1
0.408248290463863
0.6324555320336759
1
0.6324555320336759
0.7071067811865475
0.316227766016838
0.5773502691896257
0.447213595499958
0.6324555320336759
1
0.5
0.752772652709081
0.5
0.81649658

  dist = 1.0 - uv / np.sqrt(uu * vv)


0.7276068751089989
0.7071067811865475
0.6
0.33333333333333326
0.674199862463242
0.7745966692414833
0.408248290463863
0.447213595499958
0.6933752452815365
0.7302967433402215
0.6634888026970371
1
0.6069769786668839
0.5773502691896257
0.6324555320336759
0.6324555320336759
0.7745966692414833
0.447213595499958
0.816496580927726
0.8339078479367938
0.5715476066494082
0.6546536707079772
0.6831300510639732
0.408248290463863
0.6546536707079771
0.5
0.5797509043642028
0.7302967433402214
0.5
1
0.5773502691896258
0.7745966692414833
0.6531972647421809
0.6324555320336759
0.5773502691896257
0.5304671043874534
0.5773502691896257
0.6324555320336759
1
0.7302967433402215
0.6708203932499369
0.5
0.447213595499958
0.5
0.6681531047810609
0.539163866017192
1
0.6546536707079772
0.408248290463863
0.5
0.8333333333333334
0.5
0.5345224838248487
0.7027819284987273
0.5
0.6642296295965939
1
0.5773502691896257
0.5
0.7223151185146152
0.7071067811865475
0.5345224838248487
0.4714045207910317
0.5
0.7171371656006361
0.5
0.65

In [47]:
# Report, per language, how often each classifier agreed with the oracle.
for language in languages:
    in_language = results["language"] == language
    lang_results = results.loc[in_language]
    print(f"Results for {language}")

    # Mean of the boolean `correct` column per classifier.
    classifier_mean = (
        lang_results
        .groupby("classifier")["correct"]
        .mean()
        .reset_index()
    )
    print(classifier_mean)

    # Total number of rows for this language, counted across all classifiers.
    classifier_count = lang_results.shape[0]
    print("Count:", classifier_count)
    print()


Results for arabic
              classifier   correct
0           classifier_1  0.500000
1         classifier_bow  0.707676
2  classifier_bow_cosine  0.425342
3         classifier_bpe  0.730810
Count: 7608

Results for bengali
              classifier   correct
0           classifier_1  0.500000
1         classifier_bow  0.714286
2  classifier_bow_cosine  0.366071
3         classifier_bpe  0.723214
Count: 896

Results for indonesian
              classifier   correct
0           classifier_1  0.501259
1         classifier_bow  0.669186
2  classifier_bow_cosine  0.404702
3         classifier_bpe  0.733837
Count: 4764

