# NLP Project

In [99]:
from datasets import load_dataset
import pandas as pd
import re
import spacy
from spacy.lang.fi import Finnish
from spacy.lang.en import English
from spacy.lang.ja import Japanese
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_infix_regex, compile_suffix_regex

from enum import Enum

In [2]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
%%capture
!python -m spacy download en_core_web_sm
!python -m spacy download fi_core_news_sm
!python -m spacy download ja_core_news_sm

In [4]:
DATA_RELATIVE_PATH = "data"

## Q1.1a

First, lets define the path where the preprocessed data will be stored for later use.

In [5]:
path_train_set = DATA_RELATIVE_PATH + "/train_set.csv"
path_validation_set = DATA_RELATIVE_PATH + "/train_set.csv"

Let's download the dataset from the web.

In [6]:
%%capture
dataset_raw = load_dataset("copenlu/answerable_tydiqa")

In [7]:
train_set_raw = dataset_raw["train"].to_pandas()
validation_set_raw = dataset_raw["validation"].to_pandas()

In [8]:
train_set_raw.head()

Unnamed: 0,question_text,document_title,language,annotations,document_plaintext,document_url
0,Milloin Charles Fort syntyi?,Charles Fort,finnish,"{'answer_start': [18], 'answer_text': ['6. elo...",Charles Hoy Fort (6. elokuuta (joidenkin lähte...,https://fi.wikipedia.org/wiki/Charles%20Fort
1,“ダン” ダニエル・ジャドソン・キャラハンの出身はどこ,ダニエル・J・キャラハン,japanese,"{'answer_start': [35], 'answer_text': ['カリフォルニ...",“ダン”こと、ダニエル・ジャドソン・キャラハンは1890年7月26日、カリフォルニア州サンフ...,https://ja.wikipedia.org/wiki/%E3%83%80%E3%83%...
2,వేప చెట్టు యొక్క శాస్త్రీయ నామం ఏమిటి?,వేప,telugu,"{'answer_start': [12], 'answer_text': ['Azadir...","వేప (లాటిన్ Azadirachta indica, syn. Melia aza...",https://te.wikipedia.org/wiki/%E0%B0%B5%E0%B1%...
3,চেঙ্গিস খান কোন বংশের রাজা ছিলেন ?,চেঙ্গিজ খান,bengali,"{'answer_start': [414], 'answer_text': ['বোরজি...",চেঙ্গিজ খান (মঙ্গোলীয়: Чингис Хаан আ-ধ্ব-ব: ...,https://bn.wikipedia.org/wiki/%E0%A6%9A%E0%A7%...
4,రెయ్యలగడ్ద గ్రామ విస్తీర్ణత ఎంత?,రెయ్యలగడ్ద,telugu,"{'answer_start': [259], 'answer_text': ['27 హె...","రెయ్యలగడ్ద, విశాఖపట్నం జిల్లా, గంగరాజు మాడుగుల...",https://te.wikipedia.org/wiki/%E0%B0%B0%E0%B1%...


In [71]:
def print_preprocessing_summary(df):
    num_unanswered = df[df['document_answer_region'] == UNANSWERED].shape[0]
    num_answered_but_failed = df[df['document_answer_region'] == BAD_TOKENIZATION_OR_DATA].shape[0]
    num_answered = df.shape[0] - num_answered_but_failed - num_unanswered

    print("[Parsing Info] {} answered questions. {} unanswered questions. Failed to parse {} (answered) questions.".format(num_answered, num_unanswered, num_answered_but_failed))

Let's do some preprocessing!

In [72]:
def our_tokenizer(nlp):
    infixes = nlp.Defaults.infixes + [r"[\.\,]?\[\S*"]
    infix_re = spacy.util.compile_infix_regex(infixes)
    prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)

    return Tokenizer(
        nlp.vocab,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
        #token_match=nlp.tokenizer.token_match,
        rules=nlp.Defaults.tokenizer_exceptions)

In [73]:
def our_nlp(pipeline_name: str):
    nlp = spacy.load(pipeline_name)
    nlp.tokenizer = our_tokenizer(nlp)
    return nlp

In [105]:
class Annotation_error(Enum):
    UNANSWERED = -1
    BAD_TOKENIZATION_OR_DATA = -2
    IGNORED = -3

def preprocess_annotation(raw_sample, nlp):
    document_doc = nlp(raw_sample['document_plaintext'])
    annotations = raw_sample['annotations']

    # extract answer annotations
    start = annotations['answer_start'][0]
    if start == -1: # unanswered question
        region = UNANSWERED
    else:
        length = len(annotations['answer_text'][0])
        end = start + length

        span = document_doc.char_span(start, end) # or None if the span is within a token

        if span == None: # the answer region does not match token boundaries (either due to poor tokenzation or poor data labelling)
            region = BAD_TOKENIZATION_OR_DATA
        else:
            region = (span.start, span.end)

    return region

def preprocess_language(raw_df, nlp, preprocess_annotations: bool = False, num_max_rows=-1):
    rows = len(raw_df)
    
    if num_max_rows > 0: # @Remove
        rows = min(rows, num_max_rows)
    
    columns = ['language', 'question', 'document_title', 'document', 'document_answer_region']
    df = pd.DataFrame(columns=columns, index=range(rows))
    
    df['language'] = raw_df['language'].iloc[0]
 
    for i in range(rows):
        raw_sample = raw_df.iloc[i]

        df.at[i,'question'] = [t.text for t in nlp(raw_sample['question_text'])]
        df.at[i,'document_title'] = [t.text for t in nlp(raw_sample['document_title'])]
        df.at[i,'document'] = [t.text for t in nlp(raw_sample['document_plaintext'])]
        
        
        if preprocess_annotations:
            answer_region = preprocess_annotation(raw_sample, nlp)
        else:
            answer_region = IGNORED    
        
        df.at[i,'document_answer_region'] = answer_region
        
        
        if i % 1000  == 0:
            print("sample {}/{}".format(i, rows))
            
        if i == num_max_rows: # @Remove
            break
    
    return df

In [106]:
def preprocess(raw_data, max_rows_per_language=-1):
    # english
    raw_data_en = raw_data[raw_data['language'] == 'english']
    data_en = preprocess_language(raw_data_en, our_nlp('en_core_web_sm'), num_max_rows=max_rows_per_language)
    print_preprocessing_summary(data_en)
    
    # finnish
    raw_data_fi = raw_data[raw_data['language'] == 'finnish']
    data_fi = preprocess_language(raw_data_fi, our_nlp('fi_core_news_sm'), num_max_rows=max_rows_per_language)
    print_preprocessing_summary(data_fi)
    
    # japanese
    # @Note: for some reason, the pretrained pipeline doesn't work well with finding the answer. Japenese() works a lot better. However, maybe it just tokenizes each symbol
    raw_data_jp = raw_data[raw_data['language'] == 'japanese']
    data_jp = preprocess_language(raw_data_jp, our_nlp('ja_core_news_sm'), num_max_rows=max_rows_per_language) 
    print_preprocessing_summary(data_jp)
    
    # concat
    data = pd.concat([data_en, data_fi, data_jp])
    
    return data


Let's preprocess the training data

In [107]:
train_set = preprocess(train_set_raw, max_rows_per_language = 100)

sample 0/100
[Parsing Info] 100 answered questions. 0 unanswered questions. Failed to parse 0 (answered) questions.
sample 0/100
[Parsing Info] 100 answered questions. 0 unanswered questions. Failed to parse 0 (answered) questions.
sample 0/100
[Parsing Info] 100 answered questions. 0 unanswered questions. Failed to parse 0 (answered) questions.


In [108]:
train_set.head()

Unnamed: 0,language,question,document_title,document,document_answer_region
0,english,"[When, was, quantum, field, theory, developed, ?]","[Quantum, field, theory]","[Quantum, field, theory, naturally, began, wit...",-3
1,english,"[Who, was, the, first, Nobel, prize, winner, f...","[List, of, Nobel, laureates, in, Literature]","[The, Nobel, Prize, in, Literature, (, Swedish...",-3
2,english,"[When, is, the, dialectical, method, used, ?]",[Dialectic],"[Dialectic, or, dialectics, (, Greek, :, διαλε...",-3
3,english,"[Who, invented, Hangul, ?]","[Origin, of, Hangul]","[Hangul, was, personally, created, and, promul...",-3
4,english,"[What, do, Grasshoppers, eat, ?]",[Grasshopper],"[Grasshoppers, are, plant, -, eaters, ,, with,...",-3


Let's preprocess the validation data

In [109]:
validation_set = preprocess(validation_set_raw, max_rows_per_language = 100)

sample 0/100
[Parsing Info] 100 answered questions. 0 unanswered questions. Failed to parse 0 (answered) questions.
sample 0/100
[Parsing Info] 100 answered questions. 0 unanswered questions. Failed to parse 0 (answered) questions.
sample 0/100
[Parsing Info] 100 answered questions. 0 unanswered questions. Failed to parse 0 (answered) questions.


In [110]:
validation_set.head()

Unnamed: 0,language,question,document_title,document,document_answer_region
0,english,"[What, is, a, way, to, increase, your, wound, ...","[Wound, healing]","[Wound, care, encourages, and, speeds, wound, ...",-3
1,english,"[Who, founded, the, Burntisland, Shipbuilding,...","[Burntisland, Shipbuilding, Company]","[Brothers, Amos, and, Wilfrid, Ayre, founded, ...",-3
2,english,"[What, is, the, surface, area, of, the, human,...","[Cerebral, cortex]","[For, species, of, mammals, ,, larger, brains,...",-3
3,english,"[When, did, the, case, of, R, (, Factortame, L...","[R, (, Factortame, Ltd, ), v, Secretary, of, S...","[As, from, 31, March, 1989, ,, fishing, vessel...",-3
4,english,"[When, was, Quezon, City, founded, ?]","[Quezon, City]","[When, Quezon, City, was, created, in, 1939, ,...",-3


### Save pre-processed training and validation data

In [17]:
train_set.to_csv(path_train_set, index=False)

In [111]:
validation_set.to_csv(path_validation_set, index=False)

### Load pre-processed training and validation data

In [21]:
train_set2 = pd.read_csv(path_train_set)

In [20]:
train_set2.head()

Unnamed: 0,language,question,answer,answer_region
0,english,"['When', 'was', 'quantum', 'field', 'theory', ...","['Quantum', 'field', 'theory', 'naturally', 'b...","(25, 26)"
1,english,"['Who', 'was', 'the', 'first', 'Nobel', 'prize...","['The', 'Nobel', 'Prize', 'in', 'Literature', ...","(111, 113)"
2,english,"['When', 'is', 'the', 'dialectical', 'method',...","['Dialectic', 'or', 'dialectics', '(', 'Greek'...","(26, 49)"
3,english,"['Who', 'invented', 'Hangul', '?']","['Hangul', 'was', 'personally', 'created', 'an...","(15, 18)"
4,english,"['What', 'do', 'Grasshoppers', 'eat', '?']","['Grasshoppers', 'are', 'plant', '-', 'eaters'...","(0, 37)"


In [112]:
validation_set2 = pd.read_csv(path_validation_set)

In [113]:
validation_set2.head()

Unnamed: 0,language,question,document_title,document,document_answer_region
0,english,"['What', 'is', 'a', 'way', 'to', 'increase', '...","['Wound', 'healing']","['Wound', 'care', 'encourages', 'and', 'speeds...",-3
1,english,"['Who', 'founded', 'the', 'Burntisland', 'Ship...","['Burntisland', 'Shipbuilding', 'Company']","['Brothers', 'Amos', 'and', 'Wilfrid', 'Ayre',...",-3
2,english,"['What', 'is', 'the', 'surface', 'area', 'of',...","['Cerebral', 'cortex']","['For', 'species', 'of', 'mammals', ',', 'larg...",-3
3,english,"['When', 'did', 'the', 'case', 'of', 'R', '(',...","['R', '(', 'Factortame', 'Ltd', ')', 'v', 'Sec...","['As', 'from', '31', 'March', '1989', ',', 'fi...",-3
4,english,"['When', 'was', 'Quezon', 'City', 'founded', '?']","['Quezon', 'City']","['When', 'Quezon', 'City', 'was', 'created', '...",-3


## Q1.1b