In [1]:
import ast
import csv
import itertools
import os
import re
from io import open

import spacy
from conllu import parse_incr, TokenList
from spacy import displacy
#from spacy_conll import init_parser
#from spacy_conll.parser import ConllParser
from tqdm.notebook import tqdm

In [2]:
def get_id(input_string):
    """Return the integer formed by the digits that end ``input_string``.

    Returns -1 when the string does not end in at least one digit.
    """
    trailing_digits = re.search(r'\d+$', input_string)
    if trailing_digits is None:
        return -1
    return int(trailing_digits.group())

In [None]:
# Load every CoNLL-U file of the UD German-HDT treebank.
#   ud_hdt_sentences:     file name -> list of parsed sentences (conllu TokenList)
#   all_ud_hdt_sentences: all sentences of all files in one flat list
ud_hdt_sentences = {}
all_ud_hdt_sentences = []
faulty_sentences = {}
filepath = "../../libs/UD_German-HDT"

# os.listdir() returns entries in arbitrary order; sorting keeps the position of
# each sentence in all_ud_hdt_sentences stable between runs.
for filename in tqdm(sorted(os.listdir(filepath))):
    if not filename.endswith('.conllu'):
        continue
    # parse_incr() reads lazily, so the sentences are materialised while the
    # file is still open; the with-block then closes the handle.
    with open(os.path.join(filepath, filename), "r", encoding="utf-8") as data_file:
        cur_sentences = list(parse_incr(data_file))
    ud_hdt_sentences[filename] = cur_sentences
    all_ud_hdt_sentences += cur_sentences

# Show a per-file sentence count instead of dumping the whole treebank into the cell output.
{filename: len(sentences) for filename, sentences in ud_hdt_sentences.items()}

  0%|          | 0/12 [00:00<?, ?it/s]

In [None]:
nlp = spacy.load("de_core_news_sm")

# newline="" is what the csv module asks for; the encoding is made explicit so
# reading does not depend on the platform default.
# NOTE(review): assumes the dataset is stored as UTF-8 - confirm.
with open("../../data/pseudowords/CoMaPP_Dataset.csv", "r", encoding="utf-8", newline="") as csv_file:
    data = list(csv.DictReader(csv_file))

# Annotation columns are stored as the text of a Python list. If such a list is
# empty, the attribute is filled in from the spaCy parse of the example text.
spacy_fallbacks = {
    "pos_tags": lambda doc: [str(token.pos_) for token in doc],
    "xpos_tags": lambda doc: [str(token.tag_) for token in doc],
    "dep_rels": lambda doc: [str(token.dep_).upper() for token in doc],
}

completed_data = []
for example in tqdm(data):
    nlp_example = nlp(example["text"])
    for column, annotate in spacy_fallbacks.items():
        # SECURITY: this used to be eval(), which executes arbitrary code found
        # in the CSV. ast.literal_eval only accepts Python literals.
        example[column] = ast.literal_eval(example[column])
        if len(example[column]) == 0:
            example[column] = annotate(nlp_example)
    # The heads are always taken from spaCy (lower-cased head word forms); any
    # pre-annotated "dep_heads" value is replaced, so it is not parsed at all.
    example["dep_heads"] = [str(token.head).lower() for token in nlp_example]
    completed_data.append(example)

# Group the examples by the numeric suffix of their label:
# construction id -> list of examples.
completed_data.sort(key=lambda example: get_id(example["label"]))  # groupby only merges adjacent items, so sort first
data = {
    constr: list(group)
    for constr, group in itertools.groupby(completed_data, key=lambda example: get_id(example["label"]))
}

data

In [None]:
# Construction ids that serve as the keys of `data`.
list(data)

In [None]:
# For every construction, collect the dependency triples
# (dependent, relation, head) that occur in ALL of its examples.
# Dependent and head are each described in three ways (lower-cased word form,
# fine-grained tag, coarse POS), which gives 3 x 3 variants per dependency.
common_deps = {}
for constr, group in tqdm(data.items()):
    group_deps = []
    for example in group:
        doc = nlp(example["text"])  # parse once and reuse for forms, head tags and head POS

        dep_rels = [str(dep) for dep in example["dep_rels"]]
        # NOTE(review): the zip below pairs the annotations by position and stops
        # at the shortest list, so it assumes pre-annotated tags follow the same
        # tokenisation as spaCy - confirm against the dataset.
        dependent_views = (
            [str(token).lower() for token in doc],         # word form
            [str(tag) for tag in example["xpos_tags"]],    # fine-grained tag
            [str(pos) for pos in example["pos_tags"]],     # coarse POS
        )
        head_views = (
            [str(head) for head in example["dep_heads"]],  # word form
            [str(token.head.tag_) for token in doc],       # fine-grained tag
            [str(token.head.pos_) for token in doc],       # coarse POS
        )

        group_deps.append({
            triple
            for heads in head_views
            for dependents in dependent_views
            for triple in zip(dependents, dep_rels, heads)
        })

    # keep only the dependencies that are present in every example of the construction
    common_deps[constr] = set.intersection(*group_deps)
common_deps

In [None]:
def _corpus_sentence_edges(corpus_sentence):
    """Return all (dependent, relation, head) triples a corpus sentence offers.

    Every token that has a head contributes 3 x 3 triples: dependent and head
    are each given as lower-cased word form, ``xpos`` and ``upos``. The relation
    is the upper-cased ``deprel``. Tags are kept in their original case on both
    sides so they compare equal to the tags stored in the patterns.
    """
    edges = set()
    for token in corpus_sentence:
        head_id = token["head"]
        # `not head_id` is true for 0 (the CoNLL-U root has no head token) and for a missing head.
        if not head_id:
            continue
        # NOTE(review): assumes the token with id k sits at position k - 1, i.e.
        # the sentence has no multiword-token or empty-node lines - confirm.
        head = corpus_sentence[head_id - 1]
        deprel = str(token["deprel"]).upper()
        dependent_keys = (str(token["form"]).lower(), token["xpos"], token["upos"])
        head_keys = (str(head["form"]).lower(), head["xpos"], head["upos"])
        edges.update(
            (dependent, deprel, head_key)
            for dependent in dependent_keys
            for head_key in head_keys
        )
    return edges


graphics_dir = "../../out/matches/graphics"
os.makedirs(graphics_dir, exist_ok=True)

# matches: construction id -> {corpus sentence text: share of the construction's
# common dependencies found in that sentence (0 < score <= 1)}
matches = {constr: {} for constr in common_deps}
scores = {}
# Constructions without any common dependency cannot match anything.
patterns_by_constr = {constr: group for constr, group in common_deps.items() if group}

for i, corpus_sentence in enumerate(tqdm(all_ud_hdt_sentences)):  # or: ud_hdt_sentences["de_hdt-ud-test.conllu"]
    sentence_edges = _corpus_sentence_edges(corpus_sentence)  # computed once per sentence
    svg = None
    for constr, group in patterns_by_constr.items():
        # Each pattern counts at most once, however many tokens realise it.
        found = len(group & sentence_edges)
        if found == 0:
            continue
        text = corpus_sentence.metadata["text"]
        if svg is None:  # render the sentence only once, even if several constructions match
            svg = displacy.render(nlp(text), style="dep", jupyter=False)
        with open(f"{graphics_dir}/{constr}_{i}.svg", "w", encoding="utf-8") as out_file:
            out_file.write(svg)
        matches[constr][text] = found / len(group)

In [None]:
# Inspect the per-construction results: each value maps a corpus sentence text to its score.
matches.values()

In [None]:
# Write a plain-text report: one header line per construction id, followed by a
# tab-indented line per matching corpus sentence with its score in percent.
# The encoding is explicit so German characters are written the same way on every platform.
with open("../../out/matches_dep_constr.txt", "w", encoding="utf-8") as out_file:
    for constr, sentence_scores in matches.items():
        out_file.write(f"{constr}:\n")
        for sentence, score in sentence_scores.items():
            out_file.write(f"\t{score * 100}%\t{sentence}\n")