In [9]:
import json
import os
import pickle
import re
import time
from pathlib import Path
import requests

import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
from nltk import download
from nltk.tokenize import word_tokenize
import torch
from tqdm.auto import tqdm
from transformers import MBart50Tokenizer, MBartForConditionalGeneration
from transformers import pipeline

# Fetch the NLTK 'punkt' package unless it is already present locally
# (presumably required by word_tokenize, imported above; confirm against later cells).
download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Tim\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
def parse_sentences(xml_file):
    """Extract the annotated example sentences from one construction XML file.

    Parameters
    ----------
    xml_file : str, path-like or file object
        Anything accepted by ``xml.etree.ElementTree.parse``. The root element
        is expected to carry an ``id`` attribute and, optionally, a ``name``
        attribute of the form ``"<category>:<name>"``.

    Returns
    -------
    list of dict
        One record per ``<sentence>`` element that has a ``uid`` attribute and
        a non-empty ``<text>`` child. Keys, in order: ``uid``, ``text``,
        ``pos_tags``, ``xpos_tags``, ``dep_rels``, ``dep_heads``, ``kees``,
        ``construction_id``, ``category``, ``name``. The tag lists hold the
        ``name`` attribute of every label in the matching layer, as strings.

    Raises
    ------
    TypeError, ValueError
        If at least one sentence is emitted and the root ``id`` is missing or
        not an integer, or if a KEE label lacks integer ``start``/``end``.
    """
    root = ET.parse(xml_file).getroot()
    construction_id = root.attrib.get('id')

    # Split only on the FIRST colon so that names which themselves contain a
    # colon keep their category instead of being dumped whole into `category`.
    full_name = root.attrib.get('name')
    if full_name is None:
        category, name = None, None
    else:
        category, colon, remainder = full_name.partition(':')
        name = remainder if colon else None

    # Layers whose label names are copied verbatim into a record field.
    layer_to_field = {
        'UPOS': 'pos_tags',
        'XPOS': 'xpos_tags',
        'DEP_REL': 'dep_rels',
        'DEP_HEAD': 'dep_heads',
    }

    sentences_data = []

    for sentence in root.findall('.//sentence'):
        uid = sentence.attrib.get('uid')
        if uid is None:
            continue

        # A sentence without usable text cannot be annotated; skip it rather
        # than aborting the whole file.
        text_node = sentence.find('.//text')
        if text_node is None or text_node.text is None:
            continue
        text = text_node.text.strip()

        record = {
            'uid': uid,
            'text': text,
            'pos_tags': [],
            'xpos_tags': [],
            'dep_rels': [],
            'dep_heads': [],
            'kees': [],
            'construction_id': int(construction_id),
            'category': category,
            'name': name,
        }

        for layer in sentence.findall('.//layer'):
            layer_name = layer.attrib.get('name')
            if layer_name is None:
                continue
            labels = layer.findall('.//label')

            if 'KEE-' in layer_name:
                # NOTE(review): offsets are applied to the *stripped* text and
                # `end` is treated as exclusive; confirm both against the
                # annotation format.
                for label in labels:
                    start = int(label.attrib.get('start'))
                    end = int(label.attrib.get('end'))
                    record['kees'].append(text[start:end])

            field = layer_to_field.get(layer_name)
            if field is not None:
                record[field].extend(label.attrib.get('name') for label in labels)

        sentences_data.append(record)

    return sentences_data

# Collect the sentence records of every local construction file whose
# construction page is still available on the online constructicon.
kee_list = []  # one dict per annotated sentence, as returned by parse_sentences
xml_directory = '../../data/constructicon/construction'
construction_url = "https://gsw.phil.hhu.de/constructicon/construction"

# sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
# row order of the resulting frame reproducible across machines.
for filename in tqdm(sorted(os.listdir(xml_directory))):
    if not filename.endswith('.xml'):
        continue

    constr_id = Path(filename).stem
    # A timeout keeps one stalled connection from hanging the whole loop.
    response = requests.get(construction_url, params={"id": constr_id}, timeout=30)
    time.sleep(.5)  # throttle after every request, including skipped constructions

    # The page is treated as missing when its body contains this marker
    # (presumably the warning icon of the site's error page; confirm).
    if b"fa-triangle-exclamation" in response.content:
        print(constr_id, "does not exist online!")
        continue

    xml_file = os.path.join(xml_directory, filename)
    data = parse_sentences(xml_file)
    kee_list.extend(data)

# Build the frame in one step and index it by sentence uid.
sentences = pd.DataFrame(kee_list).set_index('uid')
sentences

  0%|          | 0/212 [00:00<?, ?it/s]

1286 does not exist online!


Unnamed: 0_level_0,text,pos_tags,xpos_tags,dep_rels,dep_heads,kees,construction_id,category,name
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
57B9D1BEB68F44E7A51D1920F1EA6951E39469B5,Und dann ist da noch das generelle Problem mit...,"[CCONJ, ADV, AUX, ADV, ADV, DET, ADJ, NOUN, AD...",[],"[CC, ADVMOD, COP, ROOT, ADVMOD, DET, AMOD, NSU...","[4, 4, 4, 0, 4, 8, 8, 4, 10, 8, 25, 25, 25, 15...",[geschweige denn],10,Negation,NEG_X_geschweige_denn_Y
D86B38CFA5D2458D9F3615EDD55C9308DF423DB0,»Ohne Europa sind viele Fragen nicht mehr seri...,"[PUNCT, PROPN, AUX, ADJ, NOUN, PART, ADV, ADJ,...",[],"[PUNCT, OBJ, CCOMP, DET, NSUBJ, ADVMOD:NEG, AD...","[3, 3, 20, 5, 3, 3, 6, 3, 10, 3, 12, 3, 12, 3,...",[geschweige denn],10,Negation,NEG_X_geschweige_denn_Y
7B3FAB8B01ED862D3D3FA18531B0904BCF24DFF0,"Dies lässt sich damit begründen , dass vor und...","[PRON, VERB, PRON, ADV, VERB, PUNCT, SCONJ, AD...",[],"[NSUBJ, ROOT, OBJ, ADVMOD, XCOMP, PUNCT, MARK,...","[2, 0, 2, 2, 2, 22, 22, 22, 11, 11, 8, 2, 15, ...",[geschweige denn],10,Negation,NEG_X_geschweige_denn_Y
B003E526D0AA301DA46458D32A82CEEFD7ADADA5,"Dennoch hat Hamas es nicht geschafft , ihre mi...","[ADV, AUX, PROPN, PRON, PART, VERB, PUNCT, DET...",[],"[ADVMOD, AUX, NSUBJ, EXPL, ADVMOD:NEG, ROOT, P...","[6, 6, 6, 6, 6, 0, 11, 10, 10, 11, 6, 11, 11, ...",[geschweige denn],10,Negation,NEG_X_geschweige_denn_Y
C3FFF1CA55CA8FF0A806C33E14A0FB331582698E,"Abermillionen rings um die Welt , die das Spek...","[NOUN, ADV, ADP, DET, NOUN, PUNCT, PRON, DET, ...",[],"[NSUBJ, ADVMOD, CASE, DET, NMOD, PUNCT, NSUBJ,...","[16, 5, 5, 5, 1, 13, 13, 9, 13, 12, 12, 13, 5,...",[geschweige denn],10,Negation,NEG_X_geschweige_denn_Y
...,...,...,...,...,...,...,...,...,...
ABB2D854C007CDAF94AA0F10FD7CEC5E1BF51033,Vor dieser Zeit waren Immobilien in Deutschlan...,"[ADP, DET, NOUN, AUX, NOUN, ADP, PROPN, ADJ, A...",[],"[CASE, DET, OBL, COP, NSUBJ, CASE, OBL, ADVMOD...","[3, 3, 9, 9, 9, 7, 9, 9, 0, 12, 12, 9, 16, 16,...",[er],99,Komparativ,ADJ-er_als_X
B8CBF505FD474F8EFBFF7FDDEC5DA1B3C775B622,"Wichtiger als die Frage , wer sich aus diesem ...","[ADJ, ADP, DET, NOUN, PUNCT, PRON, PRON, ADP, ...",[],"[ROOT, CASE, DET, OBL, PUNCT, NSUBJ, IOBJ, CAS...","[0, 4, 4, 1, 14, 14, 14, 11, 11, 11, 14, 13, 1...",[er],99,Komparativ,ADJ-er_als_X
3AE63AAADC213E35AD70B3A83D13415FA5EBE61F,Damit wäre der Geländewagen deutlich günstiger...,"[ADV, AUX, DET, NOUN, ADJ, ADJ, ADP, ADJ, NOUN...",[],"[ADVMOD, COP, DET, NSUBJ, ADVMOD, ROOT, CASE, ...","[6, 6, 4, 6, 6, 0, 9, 9, 6, 12, 12, 9, 6]",[er],99,Komparativ,ADJ-er_als_X
79339294317D620CA5F7C70A6F5DED39AAB8F0BD,Auch andere Unternehmen wie Uber kooperieren i...,"[ADV, ADJ, NOUN, ADP, PROPN, VERB, ADV, ADP, P...",[],"[ADVMOD, AMOD, NSUBJ, CASE, NMOD, ROOT, ADVMOD...","[3, 3, 6, 5, 3, 0, 6, 9, 6, 11, 6, 26, 26, 15,...",[er],99,Komparativ,ADJ-er_als_X


In [15]:
# Persist the sentence annotations (indexed by uid) as CSV so that later
# steps can reload them without re-parsing the XML files.
sentences.to_csv(path_or_buf="../../data/pseudowords/annotations.csv")