In [None]:
import os
import time
from pathlib import Path
import requests

import pandas as pd
import xml.etree.ElementTree as ET
from nltk import download
from tqdm.auto import tqdm

# Fetch NLTK's 'punkt' resource via nltk.download.
# NOTE(review): nothing in the cells shown here uses it — presumably a later
# tokenisation step needs it; confirm, or drop this call.
download('punkt')

### Read data from the German FrameNet Constructicon in order to do dependency analysis:

In [None]:
def parse_sentences(xml_file):
    """Extract the annotated example sentences from one construction XML file.

    Parameters
    ----------
    xml_file : str, path-like or file object
        Anything accepted by ``xml.etree.ElementTree.parse``. The root element
        is expected to carry an ``id`` and a ``name`` attribute.

    Returns
    -------
    list of dict
        One record per usable ``<sentence>`` with the keys ``uid``, ``text``,
        ``pos_tags``, ``xpos_tags``, ``dep_rels``, ``dep_heads``, ``kees``,
        ``construction_id``, ``category`` and ``name``.

    Notes
    -----
    * Sentences without a ``uid`` attribute, or without a ``<text>`` element
      that has content, are skipped.
    * Layers without a ``name`` attribute are ignored.
    * ``dep_heads`` holds the lower-cased head *token* of every label, looked
      up by its 1-based position in the whitespace-split sentence. A head
      index that does not point at a token (``0`` — presumably the root
      marker, as in CoNLL-U; TODO confirm — or a value past the last token)
      is recorded as ``None`` instead of wrapping around to the last token
      or raising ``IndexError``.
    """
    root = ET.parse(xml_file).getroot()

    construction_id = root.attrib.get('id')
    try:
        category, name = root.attrib.get('name').split(':')
    except ValueError:
        # Not exactly one colon: keep the whole string as the category.
        category = root.attrib.get('name')
        name = None

    sentences_data = []

    for sentence in root.findall('.//sentence'):
        uid = sentence.attrib.get('uid')
        if uid is None:
            continue

        text_node = sentence.find('.//text')
        if text_node is None or text_node.text is None:
            # Nothing to slice KEEs from or to look heads up in.
            continue
        text = text_node.text.strip()
        # Split once per sentence; the head lookup below reuses it for every label.
        tokens = text.split()

        # Layers whose label names are copied over verbatim.
        tag_layers = {'UPOS': [], 'XPOS': [], 'DEP_REL': []}
        text_head = []
        kees = []

        # Loop through annotations:
        for layer in sentence.findall('.//layer'):
            layer_name = layer.attrib.get('name')
            if layer_name is None:
                continue
            labels = layer.findall('.//label')

            if "KEE-" in layer_name:
                # Each label marks a character span of the sentence text.
                for label in labels:
                    start = int(label.attrib.get('start'))
                    end = int(label.attrib.get('end'))
                    kees.append(text[start:end])  # Read the KEE

            if layer_name in tag_layers:
                tag_layers[layer_name].extend(
                    label.attrib.get('name') for label in labels
                )
            elif layer_name == "DEP_HEAD":
                for label in labels:
                    idx = int(label.attrib.get('name')) - 1  # 1-based -> 0-based
                    if 0 <= idx < len(tokens):
                        text_head.append(tokens[idx].lower())
                    else:
                        # Root (0) or an index the whitespace tokens do not cover.
                        text_head.append(None)

        sentences_data.append({
            'uid': uid,
            'text': text,
            "pos_tags": tag_layers['UPOS'],
            "xpos_tags": tag_layers['XPOS'],
            "dep_rels": tag_layers['DEP_REL'],
            "dep_heads": text_head,
            "kees": kees,
            'construction_id': int(construction_id),
            'category': category,
            'name': name
        })

    return sentences_data

kee_list = []
xml_directory = '../../data/constructicon/construction'
# Seconds to wait for the website before giving up; without a timeout a single
# stalled connection would block the whole run indefinitely.
request_timeout = 30

# Sorted so the row order of the resulting frame is the same on every run
# (os.listdir returns entries in arbitrary order).
xml_filenames = sorted(
    entry for entry in os.listdir(xml_directory) if entry.endswith('.xml')
)

for filename in tqdm(xml_filenames):
    constr_id = Path(filename).stem
    response = requests.get(
        f"https://gsw.phil.hhu.de/constructicon/construction?id={constr_id}",
        timeout=request_timeout,
    )
    # Pause after every request, including those for missing constructions,
    # so the server is never hit back-to-back.
    time.sleep(.5)
    # The page for a construction that is not online contains this marker
    # (presumably the CSS class of a warning icon — TODO confirm).
    if b"fa-triangle-exclamation" in response.content:
        print(constr_id, "does not exist online!")
        continue
    xml_file = os.path.join(xml_directory, filename)
    data = parse_sentences(xml_file)
    if data:
        kee_list += data

# One row per annotated sentence, indexed by the sentence uid.
sentences = pd.DataFrame(kee_list).set_index('uid')
sentences

In [None]:
# Save the per-sentence annotations (indexed by uid) as CSV for the later generation step.
# NOTE: the list-valued columns (pos_tags, xpos_tags, dep_rels, dep_heads, kees) are written
# as their string representation and have to be parsed again after reading the file back.
annotations_path = Path("../../data/pseudowords/annotations.csv")
# to_csv does not create missing directories; make sure the target exists so the
# result of the slow scraping loop is not lost on a fresh checkout.
annotations_path.parent.mkdir(parents=True, exist_ok=True)
sentences.to_csv(annotations_path)