In [None]:
import csv
import itertools
import os
from io import open

import spacy
from conllu import parse_incr, TokenList
from tqdm.notebook import tqdm

Zuerst werden die UD-HDT-Daten geladen, sodass diese später gefiltert werden können:

In [None]:
# Load every CoNLL-U file found in the UD German-HDT directory.
#   ud_hdt_sentences     : file name -> list of the sentences parsed from that file
#   all_ud_hdt_sentences : the sentences of all files in one flat list
ud_hdt_sentences = {}
all_ud_hdt_sentences = []
filepath = "../../libs/UD_German-HDT"
# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# all_ud_hdt_sentences reproducible across runs and machines.
for filename in tqdm(sorted(os.listdir(filepath))):
    if not filename.endswith('.conllu'):
        continue
    # parse_incr() reads lazily, so it is consumed completely inside the
    # `with` block; the file handle is closed as soon as parsing is done.
    with open(os.path.join(filepath, filename), "r", encoding="utf-8") as data_file:
        cur_sentences = list(parse_incr(data_file))
    ud_hdt_sentences[filename] = cur_sentences
    all_ud_hdt_sentences += cur_sentences

# Display only the number of sentences per file; rendering every parsed
# sentence of the treebank would flood the cell output.
{filename: len(sentences) for filename, sentences in ud_hdt_sentences.items()}

Nun werden die Beispieldaten benötigt. Hierzu werden die Beispielsätze aus dem Konstruktikon verwendet, die nach Konstruktionen sortiert werden.

In [None]:
# Read the example dataset; every CSV row becomes a dict keyed by the header names.
# newline="" lets the csv module handle line endings itself (needed for quoted
# fields that contain line breaks); the encoding is stated explicitly so the
# result does not depend on the platform default.
with open("../../data/pseudowords/CoMaPP_Dataset.csv", "r", encoding="utf-8", newline="") as csv_file:
    rows = list(csv.DictReader(csv_file))


def _label_of(row):
    """Return the grouping key (the "label" column) of one dataset row."""
    return row["label"]


# Group the rows into a list of lists, one inner list per distinct label.
# itertools.groupby() only merges *adjacent* equal keys, hence the sort first.
data = [list(group) for _, group in itertools.groupby(sorted(rows, key=_label_of), key=_label_of)]

data

In [None]:
nlp = spacy.load("de_core_news_sm")

# Per construction label, collect
#   example_sentences[label] : set of (sentence text, tuple of UPOS tags)
#   kees[label]              : set of lower-cased values of the "ambiguous_word" column
example_sentences = {}
kees = {}
for group in tqdm(data):
    label = group[0]["label"]  # the first row's label is used for the whole group
    sentences = example_sentences[label] = set()
    key_elements = kees[label] = set()
    for example in group:
        # NOTE(security): eval() executes whatever the CSV cell contains. That is
        # only acceptable for a trusted dataset; if the column holds plain list
        # literals, ast.literal_eval() would be the safe replacement.
        annotated_tags = eval(example["pos_tags"])
        if len(annotated_tags) > 0:
            # POS tags are annotated in the dataset: use them as they are
            tags = tuple(annotated_tags)
        else:
            # No annotation: fall back to the POS tags predicted by spaCy
            tags = tuple(token.pos_ for token in nlp(example["text"]))
        sentences.add((example["text"], tags))
        key_elements.add(example["ambiguous_word"].lower())

example_sentences

Nun können wir alle Korpussätze aus UD-HDT herausfiltern, die POS-Tag-Sequenzen enthalten, die in den Konstruktionen im Konstruktikon definiert sind.

In [None]:
# For every construction, collect the corpus sentences that
#   (a) contain at least one of the construction's KEEs as a (lower-cased) token, and
#   (b) contain the UPOS sequence of at least one example as a contiguous run of
#       whole tags.
# Tags are compared as tuples, not as a joined string: a plain substring test
# would also match inside longer tags (e.g. "X" inside "AUX").
corpus_sentences = ud_hdt_sentences["de_hdt-ud-test.conllu"]  # or: all_ud_hdt_sentences

# The tag tuple and the set of word forms depend only on the corpus sentence,
# so they are computed once instead of once per example and KEE.
corpus_index = [
    (
        corpus_sentence,
        tuple(token["upos"] for token in corpus_sentence),
        {token["form"].lower() for token in corpus_sentence},
    )
    for corpus_sentence in corpus_sentences
]

matches = {}
for constr, group in tqdm(example_sentences.items()):
    # Different example texts can share one tag sequence; check each sequence once.
    patterns = {ex_pos for _, ex_pos in group}
    # Iterating over the corpus (outer) and the patterns (inner, via any()) lists
    # every sentence at most once per construction, in corpus order.
    matches[constr] = [
        corpus_sentence
        for corpus_sentence, corpus_pos, corpus_forms in corpus_index
        if not kees[constr].isdisjoint(corpus_forms)
        and any(
            corpus_pos[start:start + len(pattern)] == pattern
            for pattern in patterns
            for start in range(len(corpus_pos) - len(pattern) + 1)
        )
    ]

In [None]:
matches

In [None]:
# Write the UPOS matches as plain text: one "<construction>:" header line per
# construction, followed by one tab-indented line per matching sentence
# (the "text" entry of the sentence metadata).
# The encoding is explicit because the sentences are German text and the
# platform default codec may not be able to represent every character.
with open("../../out/matches_upos.txt", "w", encoding="utf-8") as out_file:
    for constr_label, matched_sentences in matches.items():
        out_file.write(constr_label + ":\n")
        for sentence in matched_sentences:
            out_file.write("\t" + sentence.metadata["text"] + "\n")

Äquivalenter Test mit XPOS:

In [None]:
nlp = spacy.load("de_core_news_sm")

# Per construction label, collect
#   example_sentences[label] : set of (sentence text, tuple of XPOS tags)
#   kees[label]              : set of lower-cased values of the "ambiguous_word" column
example_sentences = {}
kees = {}
for group in tqdm(data):
    label = group[0]["label"]  # the first row's label is used for the whole group
    sentences = example_sentences[label] = set()
    key_elements = kees[label] = set()
    for example in group:
        # NOTE(security): eval() executes whatever the CSV cell contains. That is
        # only acceptable for a trusted dataset; if the column holds plain list
        # literals, ast.literal_eval() would be the safe replacement.
        annotated_tags = eval(example["xpos_tags"])
        if len(annotated_tags) > 0:
            # XPOS tags are annotated in the dataset: use them as they are
            tags = tuple(annotated_tags)
        else:
            # No annotation: fall back to the fine-grained tags predicted by spaCy
            tags = tuple(token.tag_ for token in nlp(example["text"]))
        sentences.add((example["text"], tags))
        key_elements.add(example["ambiguous_word"].lower())

example_sentences

In [None]:
# For every construction, collect the corpus sentences that
#   (a) contain at least one of the construction's KEEs as a (lower-cased) token, and
#   (b) contain the XPOS sequence of at least one example as a contiguous run of
#       whole tags.
# Tags are compared as tuples, not as a joined string: a plain substring test
# would also match inside longer tags (e.g. "APPR" inside "APPRART").
corpus_sentences = ud_hdt_sentences["de_hdt-ud-test.conllu"]  # or: all_ud_hdt_sentences

# The tag tuple and the set of word forms depend only on the corpus sentence,
# so they are computed once instead of once per example and KEE.
# str() keeps a missing XPOS value comparable (it becomes the string "None").
corpus_index = [
    (
        corpus_sentence,
        tuple(str(token["xpos"]) for token in corpus_sentence),
        {token["form"].lower() for token in corpus_sentence},
    )
    for corpus_sentence in corpus_sentences
]

matches = {}
for constr, group in tqdm(example_sentences.items()):
    # Different example texts can share one tag sequence; check each sequence once.
    patterns = {ex_tag for _, ex_tag in group}
    # Iterating over the corpus (outer) and the patterns (inner, via any()) lists
    # every sentence at most once per construction, in corpus order.
    matches[constr] = [
        corpus_sentence
        for corpus_sentence, corpus_tag, corpus_forms in corpus_index
        if not kees[constr].isdisjoint(corpus_forms)
        and any(
            corpus_tag[start:start + len(pattern)] == pattern
            for pattern in patterns
            for start in range(len(corpus_tag) - len(pattern) + 1)
        )
    ]

In [None]:
matches

In [None]:
# Write the XPOS matches as plain text: one "<construction>:" header line per
# construction, followed by one tab-indented line per matching sentence
# (the "text" entry of the sentence metadata).
# The encoding is explicit because the sentences are German text and the
# platform default codec may not be able to represent every character.
with open("../../out/matches_xpos.txt", "w", encoding="utf-8") as out_file:
    for constr_label, matched_sentences in matches.items():
        out_file.write(constr_label + ":\n")
        for sentence in matched_sentences:
            out_file.write("\t" + sentence.metadata["text"] + "\n")

Äquivalenter Test mit DEPs:

In [None]:
nlp = spacy.load("de_core_news_sm")

# Per construction label, collect
#   example_sentences[label] : set of (sentence text, tuple of upper-cased dependency relations)
#   kees[label]              : set of lower-cased values of the "ambiguous_word" column
example_sentences = {}
kees = {}
for group in tqdm(data):
    label = group[0]["label"]  # the first row's label is used for the whole group
    sentences = example_sentences[label] = set()
    key_elements = kees[label] = set()
    for example in group:
        # NOTE(security): eval() executes whatever the CSV cell contains. That is
        # only acceptable for a trusted dataset; if the column holds plain list
        # literals, ast.literal_eval() would be the safe replacement.
        annotated_rels = eval(example["dep_rels"])
        if len(annotated_rels) > 0:
            # Dependency relations are annotated in the dataset: use them (upper-cased)
            rels = tuple(rel.upper() for rel in annotated_rels)
        else:
            # No annotation: fall back to the relations predicted by spaCy.
            # NOTE(review): verify that the label scheme of the spaCy model is
            # comparable with the deprel column of the UD corpus.
            rels = tuple(str(token.dep_).upper() for token in nlp(example["text"]))
        sentences.add((example["text"], rels))
        key_elements.add(example["ambiguous_word"].lower())

example_sentences

In [None]:
# For every construction, collect the corpus sentences that
#   (a) contain at least one of the construction's KEEs as a (lower-cased) token, and
#   (b) contain the dependency-relation sequence of at least one example as a
#       contiguous run of whole labels.
# Labels are compared as tuples, not as a joined string: a plain substring test
# would also match inside longer labels (e.g. "OBJ" inside "IOBJ").
corpus_sentences = ud_hdt_sentences["de_hdt-ud-test.conllu"]  # or: all_ud_hdt_sentences

# The label tuple and the set of word forms depend only on the corpus sentence,
# so they are computed once instead of once per example and KEE.
# Corpus labels are upper-cased because the example labels are upper-cased too.
corpus_index = [
    (
        corpus_sentence,
        tuple(str(token["deprel"]).upper() for token in corpus_sentence),
        {token["form"].lower() for token in corpus_sentence},
    )
    for corpus_sentence in corpus_sentences
]

matches = {}
for constr, group in tqdm(example_sentences.items()):
    # Different example texts can share one label sequence; check each sequence once.
    patterns = {ex_dep for _, ex_dep in group}
    # Iterating over the corpus (outer) and the patterns (inner, via any()) lists
    # every sentence at most once per construction, in corpus order.
    matches[constr] = [
        corpus_sentence
        for corpus_sentence, corpus_dep, corpus_forms in corpus_index
        if not kees[constr].isdisjoint(corpus_forms)
        and any(
            corpus_dep[start:start + len(pattern)] == pattern
            for pattern in patterns
            for start in range(len(corpus_dep) - len(pattern) + 1)
        )
    ]

In [None]:
matches

In [None]:
# Write the dependency-relation matches as plain text: one "<construction>:"
# header line per construction, followed by one tab-indented line per matching
# sentence (the "text" entry of the sentence metadata).
# The encoding is explicit because the sentences are German text and the
# platform default codec may not be able to represent every character.
with open("../../out/matches_dep.txt", "w", encoding="utf-8") as out_file:
    for constr_label, matched_sentences in matches.items():
        out_file.write(constr_label + ":\n")
        for sentence in matched_sentences:
            out_file.write("\t" + sentence.metadata["text"] + "\n")

In [None]:
import spacy
from spacy import displacy

# Scratch cell: run the German spaCy pipeline on a single sample sentence.
# `doc` is only consumed by the (disabled) displacy call below.
nlp = spacy.load("de_core_news_sm")
doc = nlp("Dies ist ein Beispielsatz.")
# Uncomment to render the dependency parse of `doc` in the notebook:
# displacy.render(doc, style="dep")

In [None]:
# Placeholder cell: performs no operation.
pass