# Install

Restart the runtime if `de_core_news_sm` cannot be loaded after installation (apparently a bug on some Colab VMs).

In [None]:
%%capture
# Install spaCy and plotly, then download the small German spaCy pipeline that is loaded further below.
!pip3 install spacy plotly
!python3 -m spacy download de_core_news_sm

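The next cell pins PyYAML to work around `TypeError: load() missing 1 required positional argument: 'Loader'` on Google Colab, see https://stackoverflow.com/questions/69564817/typeerror-load-missing-1-required-positional-argument-loader-in-google-col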

In [None]:
%%capture
# Pin PyYAML to 5.4.1 (workaround for the issue linked in the markdown cell above).
!pip install pyyaml==5.4.1

# Utility

In [None]:
import requests
from pathlib import Path
from tqdm.auto import tqdm

def download_from_url(url: str, timeout: float = 60.0) -> str:
    """Stream ``url`` into a file in the current working directory.

    Args:
        url: Address of the file to download. Redirects are followed.
        timeout: Seconds to wait for the connection and for each chunk of
            data before giving up.

    Returns:
        The local file name, i.e. the last path segment of ``url``.

    Raises:
        ValueError: If no file name can be derived from ``url``.
        requests.HTTPError: If the server answers with an error status.
    """
    from urllib.parse import urlparse  # local import keeps this cell self-contained

    # Use only the URL *path* so that "?query" or "#fragment" never end up in the file name.
    filename = urlparse(url).path.rsplit("/", 1)[-1]
    if not filename:
        raise ValueError(f"Cannot derive a file name from URL '{url}'.")

    block_size = 2**10  # 1 Kibibyte

    # The context manager releases the connection even if writing fails half-way.
    with requests.get(url, allow_redirects=True, stream=True, timeout=timeout) as response:
        # Without this check an HTML error page would be saved as if it were the archive.
        response.raise_for_status()
        # Servers may omit content-length; the progress bar then runs without a total.
        total_size_in_bytes = int(response.headers.get('content-length', 0))

        with tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True) as progress_bar:
            with open(filename, 'wb') as f:
                for data in response.iter_content(block_size):
                    progress_bar.update(len(data))
                    f.write(data)

    return filename

In [None]:
import tarfile
import zipfile

def unpack_download(filename: str) -> None:
    """Extract a downloaded ``.tar.gz`` or ``.zip`` archive into the working directory.

    Args:
        filename: Path of the archive. The archive type is chosen from the
            file name *suffix* (case-insensitive).

    Raises:
        ValueError: If ``filename`` ends with neither ``.tar.gz`` nor ``.zip``.
    """
    lowered = filename.lower()
    # Match on the suffix: a substring test would also treat names such as
    # "data.zip.bak" or "x.tar.gz.sha256" as archives.
    if lowered.endswith(".tar.gz"):
        # Archives come from the network: use the "data" extraction filter where this
        # Python version provides it, so members cannot escape the target directory.
        extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
        with tarfile.open(filename, 'r:gz') as tar_ref:
            members = tar_ref.getmembers()
            for member in tqdm(iterable=members, total=len(members)):
                tar_ref.extract(member=member, **extract_kwargs)
    elif lowered.endswith(".zip"):
        with zipfile.ZipFile(filename, 'r') as zip_ref:
            names = zip_ref.namelist()
            for name in tqdm(iterable=names, total=len(names)):
                zip_ref.extract(member=name)
    else:
        raise ValueError(f"Unknown file extension '{filename}'.")

# Download dataset

In [None]:
# Download locations of the archives analysed in this notebook.

# From https://www.deepset.ai/germanquad
germanquad_download_link = "https://germanquad.s3.amazonaws.com/GermanQuAD.zip"

# From https://github.com/deepmind/xquad
xquad_download_link = "https://github.com/deepmind/xquad/archive/refs/heads/master.zip"

# From https://github.com/facebookresearch/MLQA
mlqa_download_link = "https://dl.fbaipublicfiles.com/MLQA/MLQA_V1.zip"
machine_translated_squad_train_link = "https://dl.fbaipublicfiles.com/MLQA/mlqa-translate-train.tar.gz"

# Every archive in this list is downloaded and unpacked by the next cell.
dataset_links = [germanquad_download_link,
                 xquad_download_link,
                 mlqa_download_link,
                 machine_translated_squad_train_link]

In [None]:
# Fetch each archive into the working directory and unpack it right away.
for link in dataset_links:
    print(f"Downloading {link}")
    filename = download_from_url(link)

    print(f"Unpacking {filename}")
    unpack_download(filename)

Downloading https://germanquad.s3.amazonaws.com/GermanQuAD.zip


  0%|          | 0.00/2.73M [00:00<?, ?iB/s]

Unpacking GermanQuAD.zip


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading https://github.com/deepmind/xquad/archive/refs/heads/master.zip


0.00iB [00:00, ?iB/s]

Unpacking master.zip


  0%|          | 0/16 [00:00<?, ?it/s]

Downloading https://dl.fbaipublicfiles.com/MLQA/MLQA_V1.zip


  0%|          | 0.00/75.7M [00:00<?, ?iB/s]

Unpacking MLQA_V1.zip


  0%|          | 0/102 [00:00<?, ?it/s]

Downloading https://dl.fbaipublicfiles.com/MLQA/mlqa-translate-train.tar.gz


  0%|          | 0.00/63.4M [00:00<?, ?iB/s]

Unpacking mlqa-translate-train.tar.gz


  0%|          | 0/16 [00:00<?, ?it/s]

# Select dataset

In [None]:
from enum import Enum

class DatasetEnum(Enum):
    """Datasets known to this notebook."""

    GermanQuAD = 1
    SQuAD = 2
    XQuAD = 3
    MLQA = 4
    MLQA_SQuAD_Machine_Translated = 5


# Colab form field: the name of the dataset to analyse.
dataset_to_use = "GermanQuAD" #@param ["GermanQuAD", "XQuAD", "MLQA", "MLQA_SQuAD_Machine_Translated"]

# Translate the form value into an enum member. SQuAD is deliberately absent from
# the table: it cannot be selected through the form and is rejected like any
# other unknown name.
TRAIN_DATASET = {
    "GermanQuAD": DatasetEnum.GermanQuAD,
    "XQuAD": DatasetEnum.XQuAD,
    "MLQA": DatasetEnum.MLQA,
    "MLQA_SQuAD_Machine_Translated": DatasetEnum.MLQA_SQuAD_Machine_Translated,
}.get(dataset_to_use)

if TRAIN_DATASET is None:
    raise RuntimeError("Unknown dataset")

In [None]:
from pathlib import Path

# Locations of the unpacked dataset files, relative to the working directory.

# GermanQuAD
germanquad_path = Path("GermanQuAD")
germanquad_train = germanquad_path.joinpath("GermanQuAD_train.json")
germanquad_test = germanquad_path.joinpath("GermanQuAD_test.json")

# XQuAD (a single German file)
xquad_path = Path("xquad-master")
xquad = xquad_path.joinpath("xquad.de.json")

# MLQA (German context, German question)
mlqa_path = Path("MLQA_V1")
mlqa_dev = mlqa_path.joinpath("dev", "dev-context-de-question-de.json")
mlqa_test = mlqa_path.joinpath("test", "test-context-de-question-de.json")

# SQuAD machine-translated to German (from the MLQA paper / repository)
squad_machine_translated_path = Path("mlqa-translate-train")
squad_machine_translated_train = squad_machine_translated_path.joinpath(
    "de_squad-translate-train-train-v1.1.json")
squad_machine_translated_test = squad_machine_translated_path.joinpath(
    "de_squad-translate-train-dev-v1.1.json")

In [None]:
# Absolute paths (as strings) of the files analysed in the "Training split"
# and "Test split" sections below.
train_file = None
test_file = None

if TRAIN_DATASET is DatasetEnum.GermanQuAD:
    train_file, test_file = (str(germanquad_train.resolve()),
                             str(germanquad_test.resolve()))
elif TRAIN_DATASET is DatasetEnum.XQuAD:
    # XQuAD has one file only, so test_file stays None.
    train_file = str(xquad.resolve())
elif TRAIN_DATASET is DatasetEnum.MLQA:
    # NOTE(review): the MLQA *test* file is analysed as the training split and the
    # *dev* file as the test split; presumably intentional, confirm.
    train_file, test_file = (str(mlqa_test.resolve()),
                             str(mlqa_dev.resolve()))
elif TRAIN_DATASET is DatasetEnum.MLQA_SQuAD_Machine_Translated:
    train_file, test_file = (str(squad_machine_translated_train.resolve()),
                             str(squad_machine_translated_test.resolve()))
else:
    raise RuntimeError("Unknown dataset")

# Methods & shared vars

In [None]:
import spacy
# Load the spaCy pipeline matching the dataset language: English for SQuAD, German otherwise.
# NOTE(review): "en_core_web_sm" is not downloaded by the install cell, and SQuAD
# cannot be selected in the form above, so the English branch looks unreachable.
nlp = spacy.load(
    "en_core_web_sm" if TRAIN_DATASET is DatasetEnum.SQuAD else "de_core_news_sm"
)

# Components skipped when tagging questions; only the fine-grained POS tags
# (`token.tag_`) are needed. "tok2vec" is intentionally NOT listed: in spaCy v3
# pipelines the tagger takes its features from the shared tok2vec component,
# so skipping it would leave the tagger without input. On pipelines without such
# a component, names in this list that do not exist are simply ignored.
disable_components = ["morphologizer", "senter", "lemmatizer", "ner"]

In [None]:
from typing import Tuple, List


def ends_with_question_mark(text: str) -> bool:
    """Tell whether the last token of ``text`` is a sentence-final question mark.

    The text is run through the module-level spaCy pipeline ``nlp``; the final
    token must carry the STTS tag ``$.`` and have the literal text ``?``.

    Args:
        text: The sentence to inspect. Empty or ``None`` yields ``False``.

    Returns:
        ``True`` if the last token is a ``?`` tagged ``$.``, else ``False``.
    """
    if not text:
        return False
    SENTENCE_PUNCTUATION = "$."
    # `disable` must be passed by keyword: it is keyword-only in spaCy v3, and
    # the keyword form is accepted by older versions as well.
    text_doc = nlp(text, disable=disable_components)
    # Defensive: never index into a Doc that contains no tokens.
    if len(text_doc) == 0:
        return False
    last_token = text_doc[-1]
    return (last_token.tag_ == SENTENCE_PUNCTUATION and
            last_token.text == "?")


def get_interrogative_pronouns(text: str) -> Tuple[bool, List[str], List[str]]:
    """Collect the interrogative words of ``text`` by their STTS tag.

    A token counts as interrogative when its fine-grained tag is one of
    ``PWS``, ``PWAT`` or ``PWAV``.

    Args:
        text: The sentence to inspect.

    Returns:
        A 3-tuple ``(found, texts, tags)``: ``found`` says whether at least one
        interrogative token exists, ``texts`` holds the token texts and ``tags``
        the matching STTS tags. When nothing is found (including empty input)
        both lists contain the single placeholder ``"Unbekannt"``.
    """
    INTERROGATIVPRONOMEN_STTS_TAGS = ["PWS", "PWAT", "PWAV"]
    UNKNOWN = "Unbekannt"
    if not text:
        # Keep the return shape stable: callers index into the result, so a bare
        # `False` here would raise a TypeError for empty questions.
        return (False, [UNKNOWN], [UNKNOWN])
    # `disable` is keyword-only in spaCy v3; the keyword form also works on older versions.
    text_doc = nlp(text, disable=disable_components)
    # The interrogative word is not always at the start of the sentence,
    # so every token is checked.
    interrogative_pronouns = [
        (token.text, token.tag_)
        for token in text_doc
        if token.tag_ in INTERROGATIVPRONOMEN_STTS_TAGS
    ]
    has_interrogative_pronoun = len(interrogative_pronouns) >= 1
    if not interrogative_pronouns:
        interrogative_pronouns.append((UNKNOWN, UNKNOWN))

    return (has_interrogative_pronoun,
            [p[0] for p in interrogative_pronouns],
            [p[1] for p in interrogative_pronouns])


def is_question(text):
    """Classify ``text`` as a question.

    A text counts as a question only if it ends with a question mark *and*
    contains at least one interrogative word.
    """
    # Guard clause: without a closing question mark the second check is skipped.
    if not ends_with_question_mark(text):
        return False
    has_interrogative_pronoun = get_interrogative_pronouns(text)[0]
    return has_interrogative_pronoun


In [None]:
from typing import Iterator, Any
import json
from collections import namedtuple


def get_interrogative_pronouns_from_squad_file(filepath: Path) -> Iterator[Any]:
    """Yield one analysis row per question of a SQuAD-format JSON file.

    Args:
        filepath: Path of a JSON file with the layout
            ``data -> paragraphs -> qas -> question``.

    Yields:
        Tuples ``(question, is_question, has_interrogative_pronoun,
        interrogative_texts, interrogative_tags, ends_with_question_mark)``,
        where ``question`` is the stripped question text.
    """
    # Read the whole file first so it is not kept open while the generator is suspended.
    with open(str(filepath), encoding="utf-8") as f:
        squad = json.load(f)

    for article in squad["data"]:
        for paragraph in article["paragraphs"]:
            for qa in paragraph["qas"]:
                question = qa["question"].strip()
                # Analyse each question once: every helper call runs the spaCy
                # pipeline, so results are reused rather than recomputed per column.
                has_pronoun, pronoun_texts, pronoun_tags = get_interrogative_pronouns(question)
                has_question_mark = ends_with_question_mark(question)
                yield (question,
                       has_question_mark and has_pronoun,  # same rule as is_question()
                       has_pronoun,
                       pronoun_texts,
                       pronoun_tags,
                       has_question_mark)

In [None]:
import pandas

# Column names for the rows yielded by get_interrogative_pronouns_from_squad_file,
# in the same order as the yielded tuple.
cols = [
    "text",
    "is_question",
    # NOTE(review): despite its name this column holds a boolean
    # (whether any interrogative word was found), not a count.
    "interrogative pronoun count",
    "interrogative pronoun token text",
    "interrogative pronoun STTS tag",
    "ends with question mark",
]

# Training split

## Question words, classification and STTS tag

In [None]:
# Analyse every question of the training file and show the result as an interactive table.
df = pandas.DataFrame(
    data=get_interrogative_pronouns_from_squad_file(train_file),
    columns=cols,
)
from google.colab import data_table
data_table.DataTable(df)

Unnamed: 0,text,is_question,interrogative pronoun count,interrogative pronoun token text,interrogative pronoun STTS tag,ends with question mark
0,Von welchem Gesetzt stammt das Amerikanische ab?,True,True,[welchem],[PWAT],True
1,Warum unterscheidet sich das amerikanische Rec...,True,True,[Warum],[PWAV],True
2,Wie viele ethnische Gruppen und indigenen Völk...,True,True,[Wie],[PWAV],True
3,Woher kommt die Sexuelle Orientierung von Mens...,True,True,[Woher],[PWAV],True
4,Seit wann gehört Guam zu dem Gebiet der Verein...,True,True,[wann],[PWAV],True
...,...,...,...,...,...,...
11513,Welchem Staat wurde Eritrea nach dem Zweiten W...,True,True,[Welchem],[PWAT],True
11514,In welchem Jahr annektierte Haile Selassie Eri...,True,True,[welchem],[PWAT],True
11515,Wie lange dauerte der Unabhängigkeitskampf in ...,True,True,[Wie],[PWAV],True
11516,Wann endete der Kampf um die Unabhängigkeit Er...,True,True,[Wann],[PWAV],True


## Distribution of interrogative words

In [None]:
# Flatten the per-question lists of interrogative words into one lower-cased list.
# Questions without a match contribute the placeholder "Unbekannt".
interrogative_token_texts = list(df["interrogative pronoun token text"])
interrogative_tokens = [
    token.lower()
    for token_list in interrogative_token_texts
    for token in token_list
]

# Frequency of each distinct word, most frequent first.
interrogative_token_df = pandas.DataFrame(
    interrogative_tokens,
    columns=["interrogative pronoun token text"],
)
new_df = (
    interrogative_token_df["interrogative pronoun token text"]
    .value_counts()
    .rename_axis("unique_values")
    .reset_index(name="counts")
)
print(new_df["counts"].sum())
from google.colab import data_table
data_table.DataTable(new_df)

11540


Unnamed: 0,unique_values,counts
0,wie,2281
1,welche,1934
2,was,1822
3,wann,1292
4,wer,590
5,welcher,561
6,warum,542
7,wo,479
8,welchem,478
9,welchen,453


In [None]:
import plotly.express as px
# Bar chart of the interrogative-word frequencies.
fig = px.bar(new_df, x="unique_values", y="counts", template="plotly_dark")
fig.show()

# Total number of counted tokens (displayed as the cell output).
new_df["counts"].sum()

  defaults = yaml.load(f)


11540

In [None]:
import plotly.express as px
# Same distribution as a pie chart (share of each interrogative word).
fig = px.pie(new_df, values="counts", names="unique_values", template="plotly_dark")
fig.show()

## Number of questions vs. classified questions

In [None]:
# How many questions does is_question() accept (True) or reject (False)?
is_question_list = list(df["is_question"])
is_question_df = pandas.DataFrame(is_question_list, columns=["is_question"])
new_is_question_df = (
    is_question_df["is_question"]
    .value_counts()
    .rename_axis("unique_values")
    .reset_index(name="counts")
)
from google.colab import data_table
data_table.DataTable(new_is_question_df)

Unnamed: 0,unique_values,counts
0,True,11362
1,False,156


In [None]:
# Every item in the dataset is a question, so the share classified as a
# question is the recall: TP / (TP + FN).
row = new_is_question_df.loc[new_is_question_df["unique_values"] == True]
# Sum the selected counts instead of looking up index label 0: the `True` row
# only carries label 0 when `True` is the most frequent value, and the sum is
# simply 0 when nothing was classified as a question.
classified_count = int(row["counts"].sum())
total_count = int(new_is_question_df["counts"].sum())
print("total questions classified:", classified_count)
print("total questions in dataset:", total_count)
print("recall: ", classified_count / total_count if total_count else float("nan")) # TP / TP + FN

total questions classified: 11362
total questions in dataset: 11518
recall:  0.9864559819413092


In [None]:
import plotly.express as px
# Share of questions accepted vs. rejected by is_question().
pie_options = dict(
    values="counts",
    names="unique_values",
    title="Classified count return value is_question method",
    template="plotly_dark",
)
fig = px.pie(new_is_question_df, **pie_options)
fig.show()

## Distribution of STTS tags for interrogative pronouns

In [None]:
# Flatten the per-question lists of STTS tags into one list.
# Questions without a match contribute the placeholder "Unbekannt".
stts_tags = list(df["interrogative pronoun STTS tag"])
stts_tokens = [token for token_list in stts_tags for token in token_list]
stts_tags_df = pandas.DataFrame(stts_tokens, columns=["interrogative pronoun STTS tag"])

# Frequency of each tag, most frequent first.
new_stts_tags_df = (
    stts_tags_df["interrogative pronoun STTS tag"]
    .value_counts()
    .rename_axis("unique_values")
    .reset_index(name="counts")
)
from google.colab import data_table
data_table.DataTable(new_stts_tags_df)

Unnamed: 0,unique_values,counts
0,PWAV,5049
1,PWAT,3800
2,PWS,2559
3,Unbekannt,132


In [None]:
import plotly.express as px
# Pie chart of the STTS tag distribution with enlarged labels.
fig = px.pie(
    new_stts_tags_df,
    values='counts',
    names='unique_values',
    template="plotly_dark")
fig.update_traces(textfont_size=20)
# Only enlarge the legend font; its colour is left to the template. A hard-coded
# black font would be practically invisible on the dark background of "plotly_dark".
fig.update_layout(legend = dict(font = dict(size = 24)))
fig.show()

# Total number of counted tags (displayed as the cell output).
new_stts_tags_df['counts'].sum()

11540

## Ends with question mark count

In [None]:
# How many questions end with a question mark (True) and how many do not (False)?
ends_with_question_mark_list = list(df["ends with question mark"])
ends_with_question_mark_df = pandas.DataFrame(
    ends_with_question_mark_list,
    columns=["ends_with_question_mark"],
)
new_ends_with_question_mark_df = (
    ends_with_question_mark_df["ends_with_question_mark"]
    .value_counts()
    .rename_axis("unique_values")
    .reset_index(name="counts")
)
from google.colab import data_table
data_table.DataTable(new_ends_with_question_mark_df)

Unnamed: 0,unique_values,counts
0,True,11490
1,False,28


In [None]:
# Every item in the dataset is a question, so the share detected by the
# question-mark check is the recall: TP / (TP + FN).
row = new_ends_with_question_mark_df.loc[new_ends_with_question_mark_df["unique_values"] == True]
# Sum the selected counts instead of looking up index label 0: the `True` row
# only carries label 0 when `True` is the most frequent value, and the sum is
# simply 0 when no question ends with a question mark.
classified_count = int(row["counts"].sum())
total_count = int(new_ends_with_question_mark_df["counts"].sum())
print("total questions classified:", classified_count)
print("total questions in dataset:", total_count)
print("recall: ", classified_count / total_count if total_count else float("nan")) # TP / TP + FN

total questions classified: 11490
total questions in dataset: 11518
recall:  0.9975690223997222


In [None]:
import plotly.express as px
# Share of questions that end with a question mark.
pie_options = dict(
    values="counts",
    names="unique_values",
    title="Questions ending with question mark",
    template="plotly_dark",
)
fig = px.pie(new_ends_with_question_mark_df, **pie_options)
fig.show()

# Test split

## Question words, classification and STTS tag

In [None]:
# Not every dataset has a test split (test_file stays None for XQuAD). Fail with
# a clear message instead of trying to open a file literally named "None".
if test_file is None:
    raise ValueError("The selected dataset has no test split; skip the 'Test split' section.")

# Analyse every question of the test file. NOTE: this rebinds `df`, so the
# cells below now refer to the test split.
df = pandas.DataFrame(get_interrogative_pronouns_from_squad_file(test_file), columns=cols)
from google.colab import data_table
data_table.DataTable(df)

Unnamed: 0,text,is_question,interrogative pronoun count,interrogative pronoun token text,interrogative pronoun STTS tag,ends with question mark
0,Was kann den Verschleiß des seillosen Aufzuges...,True,True,[Was],[PWS],True
1,In welcher deutschen Stadt wird der seillose A...,True,True,[welcher],[PWAT],True
2,Wo wurde ein seilloser Aufzug entwickelt?,True,True,[Wo],[PWAV],True
3,Wie funktioniert ein seilloser Aufzug?,True,True,[Wie],[PWAV],True
4,Wann muss man die Zieletage in seillosen Aufzü...,True,True,[Wann],[PWAV],True
...,...,...,...,...,...,...
2199,In welchem Teil Indies entstand das tamilische...,True,True,[welchem],[PWAT],True
2200,Wie viele Dynastien regierten im 8. Jhd. über ...,True,True,[Wie],[PWAV],True
2201,Welche Persönlichkeiten führten die Unabhängig...,True,True,[Welche],[PWAT],True
2202,Wann wurde Bangladesch gegründet?,True,True,[Wann],[PWAV],True


## Distribution of interrogative words

In [None]:
# Flatten the per-question lists of interrogative words (test split) into one
# lower-cased list. Questions without a match contribute the placeholder "Unbekannt".
interrogative_token_texts = list(df["interrogative pronoun token text"])
interrogative_tokens = [
    token.lower()
    for token_list in interrogative_token_texts
    for token in token_list
]

# Frequency of each distinct word, most frequent first.
interrogative_token_df = pandas.DataFrame(
    interrogative_tokens,
    columns=["interrogative pronoun token text"],
)
new_df = (
    interrogative_token_df["interrogative pronoun token text"]
    .value_counts()
    .rename_axis("unique_values")
    .reset_index(name="counts")
)
print(new_df["counts"].sum())
from google.colab import data_table
data_table.DataTable(new_df)

2217


Unnamed: 0,unique_values,counts
0,was,448
1,wie,398
2,welche,252
3,wann,249
4,wer,207
5,welcher,108
6,wo,107
7,welchem,77
8,warum,74
9,welches,62


In [None]:
import plotly.express as px
# Bar chart of the interrogative-word frequencies in the test split.
fig = px.bar(new_df, x="unique_values", y="counts", template="plotly_dark")
fig.show()

# Total number of counted tokens (displayed as the cell output).
new_df["counts"].sum()

2217

In [None]:
import plotly.express as px
# Same test-split distribution as a pie chart.
fig = px.pie(new_df, values="counts", names="unique_values", template="plotly_dark")
fig.show()

## Number of questions vs. classified questions

In [None]:
# How many test-split questions does is_question() accept (True) or reject (False)?
is_question_list = list(df["is_question"])
is_question_df = pandas.DataFrame(is_question_list, columns=["is_question"])
new_is_question_df = (
    is_question_df["is_question"]
    .value_counts()
    .rename_axis("unique_values")
    .reset_index(name="counts")
)
from google.colab import data_table
data_table.DataTable(new_is_question_df)

Unnamed: 0,unique_values,counts
0,True,2146
1,False,58


## Distribution of STTS tags for interrogative pronouns

In [None]:
# Flatten the per-question lists of STTS tags (test split) into one list.
# Questions without a match contribute the placeholder "Unbekannt".
stts_tags = list(df["interrogative pronoun STTS tag"])
stts_tokens = [token for token_list in stts_tags for token in token_list]
stts_tags_df = pandas.DataFrame(stts_tokens, columns=["interrogative pronoun STTS tag"])

# Frequency of each tag, most frequent first.
new_stts_tags_df = (
    stts_tags_df["interrogative pronoun STTS tag"]
    .value_counts()
    .rename_axis("unique_values")
    .reset_index(name="counts")
)
from google.colab import data_table
data_table.DataTable(new_stts_tags_df)

Unnamed: 0,unique_values,counts
0,PWAV,930
1,PWS,686
2,PWAT,556
3,Unbekannt,45


In [None]:
import plotly.express as px
# Pie chart of the STTS tag distribution in the test split (default template,
# enlarged labels, black legend text).
fig = px.pie(new_stts_tags_df, values="counts", names="unique_values")
fig.update_traces(textfont_size=20)
legend_font = dict(size=24, color="black")
fig.update_layout(legend=dict(font=legend_font))
fig.show()

# Total number of counted tags (displayed as the cell output).
new_stts_tags_df["counts"].sum()

2217

## Ends with question mark count

In [None]:
# How many test-split questions end with a question mark (True) and how many do not (False)?
ends_with_question_mark_list = list(df["ends with question mark"])
ends_with_question_mark_df = pandas.DataFrame(
    ends_with_question_mark_list,
    columns=["ends_with_question_mark"],
)
new_ends_with_question_mark_df = (
    ends_with_question_mark_df["ends_with_question_mark"]
    .value_counts()
    .rename_axis("unique_values")
    .reset_index(name="counts")
)
from google.colab import data_table
data_table.DataTable(new_ends_with_question_mark_df)

Unnamed: 0,unique_values,counts
0,True,2189
1,False,15


In [None]:
# Every item in the dataset is a question, so the share detected by the
# question-mark check is the recall: TP / (TP + FN).
row = new_ends_with_question_mark_df.loc[new_ends_with_question_mark_df["unique_values"] == True]
# Sum the selected counts instead of looking up index label 0: the `True` row
# only carries label 0 when `True` is the most frequent value, and the sum is
# simply 0 when no question ends with a question mark.
classified_count = int(row["counts"].sum())
total_count = int(new_ends_with_question_mark_df["counts"].sum())
print("total questions classified:", classified_count)
print("total questions in dataset:", total_count)
print("recall: ", classified_count / total_count if total_count else float("nan")) # TP / TP + FN

total questions classified: 2189
total questions in dataset: 2204
recall:  0.9931941923774955


In [None]:
import plotly.express as px
# Share of test-split questions that end with a question mark.
pie_options = dict(
    values="counts",
    names="unique_values",
    title="Questions ending with question mark",
    template="plotly_dark",
)
fig = px.pie(new_ends_with_question_mark_df, **pie_options)
fig.show()