In [1]:
import json
import os
import time
from pathlib import Path

import pandas as pd
import requests
from tqdm.notebook import tqdm
import xml.etree.ElementTree as ET

from transformers import BertTokenizer

In [2]:
def parse_masked_sentences(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()
    
    sentences_data = []
    
    constr_id = int(root.attrib.get('id'))
    
    for sentence in root.findall('.//sentence'):
        if sentence.attrib.get('uid') is None:
            continue
        
        uid = sentence.attrib.get('uid')
        text = sentence.find('.//text').text.strip()
        
        text_pos = []
        text_xpos = []
        text_dep = []
        text_head = []
        kees = []
        kees_idx = []
        kes = []
        kes_idx = []
        
        # Loop through annotations:
        for layer in sentence.findall('.//layer'):
            layer_name = layer.attrib.get('name')
            
            # Get the KEE annotations:
            if "KE-" in layer_name or "KEE-" in layer_name:
                # Loop over all KEEs or KEs:
                for label in layer.findall('.//label'):
                    start = int(label.attrib.get('start'))
                    end = int(label.attrib.get('end'))
                    if "KEE-" in layer_name:
                        kees.append(text[start:end])  # Read the KEE
                        kees_idx.append((start, end))  # Log the position of the KEE
                    else:
                        kes.append(text[start:end])  # Read the KE
                        kes_idx.append((start, end))  # Log the position of the KE
            elif "UPOS" == layer_name:
                for label in layer.findall('.//label'):
                    text_pos.append(label.attrib.get('name'))
            elif "XPOS" == layer_name:
                for label in layer.findall('.//label'):
                    text_xpos.append(label.attrib.get('name'))
            elif "DEP_REL" == layer_name:
                for label in layer.findall('.//label'):
                    text_dep.append(label.attrib.get('name'))
            elif "DEP_HEAD" == layer_name:
                for label in layer.findall('../label'):
                    text_head.append(label.attrib.get('name'))
                    
        sentences_data.append({
            'uid': uid,
            'constr_id': constr_id,
            'text': text,
            'text_pos': text_pos,
            'text_xpos': text_xpos,
            'text_dep': text_dep,
            'text_head': text_head,
            'kees': kees,
            'kees_idx': kees_idx,
            'kes': kes,
            'kes_idx': kes_idx,
        })
    return sentences_data
    
sentence_list = []
xml_directory = ('../../data/constructicon/construction')

for filename in tqdm(list(os.listdir(xml_directory))):
    if filename.endswith('.xml'):
        constr_id = Path(filename).stem
        if b"fa-triangle-exclamation" in requests.get(f"https://gsw.phil.hhu.de/constructicon/construction?id={constr_id}").content:
            print(constr_id, "does not exist online!")
            continue
        xml_file = os.path.join(xml_directory, filename)
        data = parse_masked_sentences(xml_file)
        if data:
            sentence_list += data
        time.sleep(.5)

sentences = pd.DataFrame.from_dict(sentence_list)
# sentences.set_index('uid', inplace=True)
sentences

  0%|          | 0/212 [00:00<?, ?it/s]

1286 does not exist online!


Unnamed: 0,uid,constr_id,text,text_pos,text_xpos,text_dep,text_head,kees,kees_idx,kes,kes_idx
0,57B9D1BEB68F44E7A51D1920F1EA6951E39469B5,10,Und dann ist da noch das generelle Problem mit...,"[CCONJ, ADV, AUX, ADV, ADV, DET, ADJ, NOUN, AD...",[],"[CC, ADVMOD, COP, ROOT, ADVMOD, DET, AMOD, NSU...",[],[geschweige denn],"[(128, 143)]","[dass, nicht, jeder Sprecher und Führer , der ...","[(55, 59), (60, 65), (66, 110), (111, 125), (1..."
1,D86B38CFA5D2458D9F3615EDD55C9308DF423DB0,10,»Ohne Europa sind viele Fragen nicht mehr seri...,"[PUNCT, PROPN, AUX, ADJ, NOUN, PART, ADV, ADJ,...",[],"[PUNCT, OBJ, CCOMP, DET, NSUBJ, ADVMOD:NEG, AD...",[],[geschweige denn],"[(72, 87)]","[»Ohne Europa sind viele Fragen, nicht, mehr s...","[(0, 30), (31, 36), (37, 48), (49, 69), (88, 97)]"
2,7B3FAB8B01ED862D3D3FA18531B0904BCF24DFF0,10,"Dies lässt sich damit begründen , dass vor und...","[PRON, VERB, PRON, ADV, VERB, PUNCT, SCONJ, AD...",[],"[NSUBJ, ROOT, OBJ, ADVMOD, XCOMP, PUNCT, MARK,...",[],[geschweige denn],"[(126, 141)]",[dass vor und neben Alexander – soweit wir wis...,"[(34, 84), (85, 89), (90, 97), (98, 123), (142..."
3,B003E526D0AA301DA46458D32A82CEEFD7ADADA5,10,"Dennoch hat Hamas es nicht geschafft , ihre mi...","[ADV, AUX, PROPN, PRON, PART, VERB, PUNCT, DET...",[],"[ADVMOD, AUX, NSUBJ, EXPL, ADVMOD:NEG, ROOT, P...",[],[geschweige denn],"[(83, 98)]","[Dennoch hat Hamas es, nicht, geschafft, ihre ...","[(0, 20), (21, 26), (27, 36), (39, 80), (99, 1..."
4,C3FFF1CA55CA8FF0A806C33E14A0FB331582698E,10,"Abermillionen rings um die Welt , die das Spek...","[NOUN, ADV, ADP, DET, NOUN, PUNCT, PRON, DET, ...",[],"[NSUBJ, ADVMOD, CASE, DET, NMOD, PUNCT, NSUBJ,...",[],[geschweige denn],"[(183, 198)]","[Abermillionen rings um die Welt , die das Spe...","[(0, 92), (93, 98), (99, 180), (201, 246)]"
...,...,...,...,...,...,...,...,...,...,...,...
7023,ABB2D854C007CDAF94AA0F10FD7CEC5E1BF51033,99,Vor dieser Zeit waren Immobilien in Deutschlan...,"[ADP, DET, NOUN, AUX, NOUN, ADP, PROPN, ADJ, A...",[],"[CASE, DET, OBL, COP, NSUBJ, CASE, OBL, ADVMOD...",[],[er],"[(63, 65)]","[Immobilien in Deutschland, deutlich, billig, ...","[(22, 47), (48, 56), (57, 63), (66, 107)]"
7024,B8CBF505FD474F8EFBFF7FDDEC5DA1B3C775B622,99,"Wichtiger als die Frage , wer sich aus diesem ...","[ADJ, ADP, DET, NOUN, PUNCT, PRON, PRON, ADP, ...",[],"[ROOT, CASE, DET, OBL, PUNCT, NSUBJ, IOBJ, CAS...",[],[er],"[(7, 9)]","[Wichtig, als die Frage , wer sich aus diesem ...","[(0, 7), (10, 88), (109, 176)]"
7025,3AE63AAADC213E35AD70B3A83D13415FA5EBE61F,99,Damit wäre der Geländewagen deutlich günstiger...,"[ADV, AUX, DET, NOUN, ADJ, ADJ, ADP, ADJ, NOUN...",[],"[ADVMOD, COP, DET, NSUBJ, ADVMOD, ROOT, CASE, ...",[],[er],"[(44, 46)]","[der Geländewagen, deutlich, günstig, als heut...","[(11, 27), (28, 36), (37, 44), (47, 86)]"
7026,79339294317D620CA5F7C70A6F5DED39AAB8F0BD,99,Auch andere Unternehmen wie Uber kooperieren i...,"[ADV, ADJ, NOUN, ADP, PROPN, VERB, ADV, ADP, P...",[],"[ADVMOD, AMOD, NSUBJ, CASE, NMOD, ROOT, ADVMOD...",[],[er],"[(144, 146)]","[sicher, gestalten soll]","[(138, 144), (170, 184)]"


In [3]:
sentences = sentences.explode(["kees", "kees_idx"], ignore_index=True)
sentences = sentences.explode(["kes", "kes_idx"], ignore_index=True)
sentences

Unnamed: 0,uid,constr_id,text,text_pos,text_xpos,text_dep,text_head,kees,kees_idx,kes,kes_idx
0,57B9D1BEB68F44E7A51D1920F1EA6951E39469B5,10,Und dann ist da noch das generelle Problem mit...,"[CCONJ, ADV, AUX, ADV, ADV, DET, ADJ, NOUN, AD...",[],"[CC, ADVMOD, COP, ROOT, ADVMOD, DET, AMOD, NSU...",[],geschweige denn,"(128, 143)",dass,"(55, 59)"
1,57B9D1BEB68F44E7A51D1920F1EA6951E39469B5,10,Und dann ist da noch das generelle Problem mit...,"[CCONJ, ADV, AUX, ADV, ADV, DET, ADJ, NOUN, AD...",[],"[CC, ADVMOD, COP, ROOT, ADVMOD, DET, AMOD, NSU...",[],geschweige denn,"(128, 143)",nicht,"(60, 65)"
2,57B9D1BEB68F44E7A51D1920F1EA6951E39469B5,10,Und dann ist da noch das generelle Problem mit...,"[CCONJ, ADV, AUX, ADV, ADV, DET, ADJ, NOUN, AD...",[],"[CC, ADVMOD, COP, ROOT, ADVMOD, DET, AMOD, NSU...",[],geschweige denn,"(128, 143)","jeder Sprecher und Führer , der redet , auch","(66, 110)"
3,57B9D1BEB68F44E7A51D1920F1EA6951E39469B5,10,Und dann ist da noch das generelle Problem mit...,"[CCONJ, ADV, AUX, ADV, ADV, DET, ADJ, NOUN, AD...",[],"[CC, ADVMOD, COP, ROOT, ADVMOD, DET, AMOD, NSU...",[],geschweige denn,"(128, 143)",etwas zu sagen,"(111, 125)"
4,57B9D1BEB68F44E7A51D1920F1EA6951E39469B5,10,Und dann ist da noch das generelle Problem mit...,"[CCONJ, ADV, AUX, ADV, ADV, DET, ADJ, NOUN, AD...",[],"[CC, ADVMOD, COP, ROOT, ADVMOD, DET, AMOD, NSU...",[],geschweige denn,"(128, 143)",das letzte Wort,"(144, 159)"
...,...,...,...,...,...,...,...,...,...,...,...
22999,79339294317D620CA5F7C70A6F5DED39AAB8F0BD,99,Auch andere Unternehmen wie Uber kooperieren i...,"[ADV, ADJ, NOUN, ADP, PROPN, VERB, ADV, ADP, P...",[],"[ADVMOD, AMOD, NSUBJ, CASE, NMOD, ROOT, ADVMOD...",[],er,"(144, 146)",sicher,"(138, 144)"
23000,79339294317D620CA5F7C70A6F5DED39AAB8F0BD,99,Auch andere Unternehmen wie Uber kooperieren i...,"[ADV, ADJ, NOUN, ADP, PROPN, VERB, ADV, ADP, P...",[],"[ADVMOD, AMOD, NSUBJ, CASE, NMOD, ROOT, ADVMOD...",[],er,"(144, 146)",gestalten soll,"(170, 184)"
23001,572AF707F11010A60B24BC7700543A7BB090D905,99,Rund 45 . 000 Dollar soll der Byton - SUV kost...,"[ADV, NUM, NUM, NUM, NOUN, AUX, DET, NOUN, PUN...",[],"[ADVMOD, NUMMOD, CC, CONJ, OBJ, AUX, DET, NSUB...",[],er,"(72, 74)",er,"(51, 53)"
23002,572AF707F11010A60B24BC7700543A7BB090D905,99,Rund 45 . 000 Dollar soll der Byton - SUV kost...,"[ADV, NUM, NUM, NUM, NOUN, AUX, DET, NOUN, PUN...",[],"[ADVMOD, NUMMOD, CC, CONJ, OBJ, AUX, DET, NSUB...",[],er,"(72, 74)",günstig,"(65, 72)"


In [13]:
json_comapp = []
csv_comapp = []
errors = 0
problematic_constructions = set()
unproblematic_constructions = set()
tokenizer = BertTokenizer.from_pretrained('bert-base-german-cased')

for _, row in tqdm(sentences.iterrows(), total=len(sentences)):
    tokenized = str(row["text"]).split() # tokenizer.tokenize(...)
    
    try:
        ke_start, ke_end = row["kes_idx"]
        tokenized_kees = str(row["kees"]).split()  # tokenizer.tokenize(...)  # Split the KEEs if there are multi-word KEEs
    except TypeError:
        continue  # If there is nothing to mask or if there is no KEE, we can't use this example.
    
    tokenized_kes = str(row["text"][ke_start:ke_end]).split()
    
    masked_texts = []
    tokenized_masked_list = []
    
    for k, tokenized_ke in enumerate(tokenized_kes):
        masked_text = (
                row["text"][:ke_start] + " "
                + " ".join(tokenized_kes[:k]) 
                + " [MASK] " 
                + " ".join(tokenized_kes[k+1:]) 
                + " " + row["text"][ke_end:]
        ).replace("  ", " ").replace("  ", " ")
        masked_texts.append(masked_text)
        tokenized_masked_list.append(str(masked_text).split())
        
    for masked_text, tokenized_masked in zip(masked_texts, tokenized_masked_list):
        try:
            kee_idx = [tokenized.index(tokenized_kee) for tokenized_kee in tokenized_kees]
            kee_query_idx = []
            for i in kee_idx:
                kee_query_idx.append(i)
            assert len(masked_text.split()) == len(row["text"].split())
        except (ValueError, AssertionError) as e:
            print(row["constr_id"], type(e), e, "... Continuing ...")
            errors += 1
            problematic_constructions.add(row["constr_id"])
            continue
            
        unproblematic_constructions.add(row["constr_id"])
        
        out_json = [{
            "label": kee + str(row["constr_id"]),
            "target1": row["text"], 
            "target1_idx": idx, 
            "query": masked_text,
            "query_idx": q
        } for kee, idx, q in zip(tokenized_kees, kee_idx, kee_query_idx)]  # Split the KEEs if there are multi-word KEEs
        out_csv = [{
            "text": row["text"],
            "pos_tags": row["text_pos"],
            "xpos_tags": row["text_xpos"],
            "dep_rels": row["text_dep"],
            "dep_heads": row["text_head"],
            "mask": row["text"][ke_start:ke_end],
            "ambiguous_word": kee,
            "label": kee + str(row["constr_id"])  # This will be the new token that we will add to the LLM (named "<kee><i>" where <kee> and <i> are replaced by the KEE's name and i will be replaced by the construction it appears in).
        } for kee in tokenized_kees]
        
        json_comapp += out_json
        csv_comapp += out_csv
        
with open("../../data/pseudowords/CoMaPP_all_bert.json", "w") as file:
    json.dump(json_comapp, file, ensure_ascii=False)

f"{errors} elements from {len(problematic_constructions)} different constructions could not be saved as intended."

  0%|          | 0/23004 [00:00<?, ?it/s]

1004 <class 'ValueError'> 'er' is not in list ... Continuing ...
1004 <class 'ValueError'> 'er' is not in list ... Continuing ...
1004 <class 'ValueError'> 'er' is not in list ... Continuing ...
1004 <class 'ValueError'> 'er' is not in list ... Continuing ...
1004 <class 'ValueError'> 'er' is not in list ... Continuing ...
1004 <class 'ValueError'> 'er' is not in list ... Continuing ...
1004 <class 'ValueError'> 'er' is not in list ... Continuing ...
1004 <class 'ValueError'> 'er' is not in list ... Continuing ...
1004 <class 'ValueError'> 'er' is not in list ... Continuing ...
1004 <class 'ValueError'> 'er' is not in list ... Continuing ...
1004 <class 'ValueError'> 'er' is not in list ... Continuing ...
1004 <class 'ValueError'> 'er' is not in list ... Continuing ...
1004 <class 'ValueError'> 'er' is not in list ... Continuing ...
1004 <class 'ValueError'> 'er' is not in list ... Continuing ...
1004 <class 'ValueError'> 'er' is not in list ... Continuing ...
1004 <class 'ValueError'>

'8174 elements from 112 different constructions could not be saved as intended.'

In [14]:
import csv

with open("../../data/pseudowords/CoMapp_Dataset_bert.csv", "w+", newline="") as file:
    writer = csv.DictWriter(file, fieldnames=["label", "text", "pos_tags", "xpos_tags", "dep_rels", "dep_heads", "mask", "ambiguous_word"])
    writer.writeheader()
    writer.writerows(csv_comapp)