In [26]:
import json
import os
import re
import time
from pathlib import Path
import requests

import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
from nltk import download
from nltk.tokenize import word_tokenize
import torch
from tqdm.auto import tqdm
from transformers import MBart50Tokenizer, MBartForConditionalGeneration
from transformers import pipeline

download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Tim\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Konstruktikon:

In [27]:
def parse_sentences(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()

    construction_id = root.attrib.get('id')
    try:
        category, name = root.attrib.get('name').split(':')
    except ValueError:
        category = root.attrib.get('name')
        name = None

    sentences_data = []
    
    for sentence in root.findall('.//sentence'):
        if sentence.attrib.get('uid') is None:
            continue
        uid = sentence.attrib.get('uid')
        text = sentence.find('.//text').text.strip()
        contextleft = sentence.find('.//contextleft').text.strip()
        contextright = sentence.find('.//contextright').text.strip()
        
        sentences_data.append({
            'uid': uid,
            'text': text,
            'contextleft': contextleft,
            'contextright': contextright,
            'construction_id': int(construction_id),
            'category': category,
            'name': name,
        })

    return sentences_data

kee_list = []
xml_directory = '../../data/constructicon/construction'

for filename in tqdm(list(os.listdir(xml_directory))):
    if filename.endswith('.xml'):
        constr_id = Path(filename).stem
        if b"fa-triangle-exclamation" in requests.get(f"https://gsw.phil.hhu.de/constructicon/construction?id={constr_id}").content:
            print(constr_id, "does not exist online!")
            continue
        xml_file = os.path.join(xml_directory, filename)
        data = parse_sentences(xml_file)
        if data:
            kee_list += data
        time.sleep(.5)

sentences = pd.DataFrame.from_dict(kee_list)
sentences.set_index('uid', inplace=True)
sentences

  0%|          | 0/443 [00:00<?, ?it/s]

1009 does not exist online!
1011 does not exist online!
1014 does not exist online!
1016 does not exist online!
1018 does not exist online!
1026 does not exist online!
1066 does not exist online!
1070 does not exist online!
1073 does not exist online!
1075 does not exist online!
1083 does not exist online!
1086 does not exist online!
1088 does not exist online!
1091 does not exist online!
1092 does not exist online!
1093 does not exist online!
1097 does not exist online!
1099 does not exist online!
1102 does not exist online!
1114 does not exist online!
1117 does not exist online!
1123 does not exist online!
1135 does not exist online!
1147 does not exist online!
1149 does not exist online!
1154 does not exist online!
1158 does not exist online!
1159 does not exist online!
1160 does not exist online!
1163 does not exist online!
1167 does not exist online!
1169 does not exist online!
1171 does not exist online!
1181 does not exist online!
1183 does not exist online!
1187 does not exist 

Unnamed: 0_level_0,text,contextleft,contextright,construction_id,category,name
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
57B9D1BEB68F44E7A51D1920F1EA6951E39469B5,Und dann ist da noch das generelle Problem mit...,"""""Wir haben die Gaza-Offensive nicht begonnen,...",Der ehemalige Ministerpräsident Hanija gehört ...,10,Negation,NEG_X_geschweige_denn_Y
D86B38CFA5D2458D9F3615EDD55C9308DF423DB0,»Ohne Europa sind viele Fragen nicht mehr seri...,Die Zeit nationaler Alleingänge sei längst vor...,"Die Frage ist: Kann es Europa gelingen, sich i...",10,Negation,NEG_X_geschweige_denn_Y
7B3FAB8B01ED862D3D3FA18531B0904BCF24DFF0,"Dies lässt sich damit begründen , dass vor und...","Wenn Alexander das Heer nicht geführt hätte, d...",Ganz anders ist die Beweislage für den ersten ...,10,Negation,NEG_X_geschweige_denn_Y
B003E526D0AA301DA46458D32A82CEEFD7ADADA5,"Dennoch hat Hamas es nicht geschafft , ihre mi...",DerCouncil of Foreign Relations schätzt das jä...,"Es ist in erster Linie die politische Führung,...",10,Negation,NEG_X_geschweige_denn_Y
C3FFF1CA55CA8FF0A806C33E14A0FB331582698E,"Abermillionen rings um die Welt , die das Spek...",Nichts hätte die Mär vom Niedergang der Superm...,"Doch wie schon die Wahl Obamas global war, ist...",10,Negation,NEG_X_geschweige_denn_Y
...,...,...,...,...,...,...
ABB2D854C007CDAF94AA0F10FD7CEC5E1BF51033,Vor dieser Zeit waren Immobilien in Deutschlan...,Das ist nur ein Auswuchs von zehn Jahren Immob...,Doch die Deutschen hätten diese Hochpreislände...,99,Komparativ,ADJ-er_als_X
B8CBF505FD474F8EFBFF7FDDEC5DA1B3C775B622,"Wichtiger als die Frage , wer sich aus diesem ...","""Jedes Kraut braucht seine eigene Zeit""""""""""","Die erste Fährte, auf die der unverhofft zum K...",99,Komparativ,ADJ-er_als_X
3AE63AAADC213E35AD70B3A83D13415FA5EBE61F,Damit wäre der Geländewagen deutlich günstiger...,Sein Elektro-SUV verspricht eine Reichweite vo...,"""Unsere Wettbewerber sehen wir bei den großen ...",99,Komparativ,ADJ-er_als_X
79339294317D620CA5F7C70A6F5DED39AAB8F0BD,Auch andere Unternehmen wie Uber kooperieren i...,Die Technologie Drive IX hilft selbstfahrenden...,"Technik, die mitfühlt",99,Komparativ,ADJ-er_als_X


In [28]:
constructions = sentences[['construction_id', 'category', 'name']].drop_duplicates()
constructions.set_index('construction_id', inplace=True)
constructions

Unnamed: 0_level_0,category,name
construction_id,Unnamed: 1_level_1,Unnamed: 2_level_1
10,Negation,NEG_X_geschweige_denn_Y
100,Äquativ_Plural,gleich_ADJ
1004,Superlativ_Klimax,ADJ1_ADJ1-er_NP
1006,Superlativ,PRÄP_ADJ-ster_NP
101,Äquativ,ADJ_wie_NP
...,...,...
97,Komparativ,ADJ1-er_als_ADJ1
973,Disjunktion_Doppeltitel,X_oder_Y
976,Korrelation_Affirmation,Wo_X_ist_ist_Y
98,Äquativ,so_ADJ_wie_XP


In [29]:
constructions['category'].value_counts()

category
Intensivierung                      13
Exklamativ                          13
Äquativ                             12
Superlativ                          11
Kategorisierung                      7
                                    ..
Passe-partout-Kompositum             1
Konkretisierung_restriktiv           1
Konkretisierung_exemplifizierend     1
Konkretisierung_exhaustiv            1
Korrelation_Affirmation              1
Name: count, Length: 126, dtype: int64

In [30]:
def parse_kee(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()

    construction_id = root.attrib.get('id')

    ke_data = []
    
    for ke in root.findall('.//ke'):
        ke_data.append({
            'ke_id': int(ke.attrib.get("id")),
            'ke_name': ke.attrib.get("name"),
            'ke_coretype': ke.attrib.get("coretype"),
            'ke_attributes': ke.attrib.get("attributes")
        })

    kee_data = []
    
    for kee in root.findall('.//kee'):
        for ke in ke_data:
            kee_data.append({
                'kee_id': int(kee.attrib.get("id")),
                'kee_name': kee.attrib.get("name"),
                'construction_id': int(construction_id)
            } | ke)

    return kee_data

kee_list = []
xml_directory = '../../data/constructicon/construction'

for filename in tqdm(list(os.listdir(xml_directory))):
    if filename.endswith('.xml'):
        constr_id = Path(filename).stem
        if b"fa-triangle-exclamation" in requests.get(f"https://gsw.phil.hhu.de/constructicon/construction?id={constr_id}").content:
            print(constr_id, "does not exist online!")
            continue
        xml_file = os.path.join(xml_directory, filename)
        data = parse_kee(xml_file)
        if data:
            kee_list += data
        time.sleep(.5)

kees = pd.DataFrame.from_dict(kee_list)
kees.set_index('kee_id', inplace=True)
kees

  0%|          | 0/443 [00:00<?, ?it/s]

1009 does not exist online!
1011 does not exist online!
1014 does not exist online!
1016 does not exist online!
1018 does not exist online!
1026 does not exist online!
1066 does not exist online!
1070 does not exist online!
1073 does not exist online!
1075 does not exist online!
1083 does not exist online!
1086 does not exist online!
1088 does not exist online!
1091 does not exist online!
1092 does not exist online!
1093 does not exist online!
1097 does not exist online!
1099 does not exist online!
1102 does not exist online!
1114 does not exist online!
1117 does not exist online!
1123 does not exist online!
1135 does not exist online!
1147 does not exist online!
1149 does not exist online!
1154 does not exist online!
1158 does not exist online!
1159 does not exist online!
1160 does not exist online!
1163 does not exist online!
1167 does not exist online!
1169 does not exist online!
1171 does not exist online!
1181 does not exist online!
1183 does not exist online!
1187 does not exist 

Unnamed: 0_level_0,kee_name,construction_id,ke_id,ke_name,ke_coretype,ke_attributes
kee_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,geschweige_denn,10,1,Fokuskontext,CORE,
1,geschweige_denn,10,2,Erstes_Konjunkt,CORE,
1,geschweige_denn,10,3,Zweites_Konjunkt,CORE,[]
1,geschweige_denn,10,2350,Negator,CORE,
1,geschweige_denn,10,1,Fokuskontext,,
...,...,...,...,...,...,...
643,er,99,336,Dimension,CORE,
643,er,99,337,Bezugswert,CORE,
643,er,99,338,Quantifizierung,NONCORE,
643,er,99,333,Annäherung,NONCORE,


In [31]:
kees['ke_name'].value_counts()

ke_name
Verglichenes        96
Bezugswert          77
Dimension           76
Annäherung          53
Klammerausdruck     49
                    ..
doch                 1
Direkte_Rede         1
Sprecher             1
Sprecher_X           1
Von_X_Geäußertes     1
Name: count, Length: 143, dtype: int64

In [32]:
kees['kee_name'].value_counts()

kee_name
(_)          75
Klammern     52
pro          24
im           24
wie          20
             ..
lich          1
ileinchen     1
elchen        1
ilein         1
lecker        1
Name: count, Length: 215, dtype: int64

Die KEEs sind markiert wie folgt:

```
            <annotationset>
            ...
                <layer name="KEE-Negation:NEG_X_geschweige_denn_Y" descriptor="KEE-Negation:NEG_X_geschweige_denn_Y">
                    <label start="72" end="87" name="geschweige_denn" refid="GC_KEE:1" itype="null" groupid="0"></label>
                </layer>
            ...
            </annotationset>
```

In [33]:
def parse_masked_sentences(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()
    
    sentences_data = []
    
    constr_id = int(root.attrib.get('id'))
    
    for sentence in root.findall('.//sentence'):
        if sentence.attrib.get('uid') is None:
            continue
        
        uid = sentence.attrib.get('uid')
        text = sentence.find('.//text').text.strip()
        
        text_pos = []
        kees = []
        kees_idx = []
        kes = []
        kes_idx = []
        
        # Loop through annotations:
        for layer in sentence.findall('.//layer'):
            layer_name = layer.attrib.get('name')
            
            # Get the KEE annotations:
            if "KE-" in layer_name or "KEE-" in layer_name:
                # Loop over all KEEs or KEs:
                for label in layer.findall('.//label'):
                    start = int(label.attrib.get('start'))
                    end = int(label.attrib.get('end'))
                    if "KEE-" in layer_name:
                        kees.append(text[start:end])  # Read the KEE
                        kees_idx.append((start, end))  # Log the position of the KEE
                    else:
                        kes.append(text[start:end])  # Read the KE
                        kes_idx.append((start, end))  # Log the position of the KE
            elif "UPOS" == layer_name:
                for label in layer.findall('.//label'):
                    text_pos.append(label.attrib.get('name'))
        
        sentences_data.append({
            'uid': uid,
            'constr_id': constr_id,
            'text': text,
            'text_pos': text_pos,
            'kees': kees,
            'kees_idx': kees_idx,
            'kes': kes,
            'kes_idx': kes_idx,
        })
    return sentences_data
    
sentence_list = []
xml_directory = ('../../data/constructicon/construction')

for filename in tqdm(list(os.listdir(xml_directory))):
    if filename.endswith('.xml'):
        constr_id = Path(filename).stem
        if b"fa-triangle-exclamation" in requests.get(f"https://gsw.phil.hhu.de/constructicon/construction?id={constr_id}").content:
            print(constr_id, "does not exist online!")
            continue
        xml_file = os.path.join(xml_directory, filename)
        data = parse_masked_sentences(xml_file)
        if data:
            sentence_list += data
        time.sleep(.5)

sentences = pd.DataFrame.from_dict(sentence_list)
# sentences.set_index('uid', inplace=True)
sentences

  0%|          | 0/443 [00:00<?, ?it/s]

1009 does not exist online!
1011 does not exist online!
1014 does not exist online!
1016 does not exist online!
1018 does not exist online!
1026 does not exist online!
1066 does not exist online!
1070 does not exist online!
1073 does not exist online!
1075 does not exist online!
1083 does not exist online!
1086 does not exist online!
1088 does not exist online!
1091 does not exist online!
1092 does not exist online!
1093 does not exist online!
1097 does not exist online!
1099 does not exist online!
1102 does not exist online!
1114 does not exist online!
1117 does not exist online!
1123 does not exist online!
1135 does not exist online!
1147 does not exist online!
1149 does not exist online!
1154 does not exist online!
1158 does not exist online!
1159 does not exist online!
1160 does not exist online!
1163 does not exist online!
1167 does not exist online!
1169 does not exist online!
1171 does not exist online!
1181 does not exist online!
1183 does not exist online!
1187 does not exist 

Unnamed: 0,uid,constr_id,text,text_pos,kees,kees_idx,kes,kes_idx
0,57B9D1BEB68F44E7A51D1920F1EA6951E39469B5,10,Und dann ist da noch das generelle Problem mit...,"[CCONJ, ADV, AUX, ADV, ADV, DET, ADJ, NOUN, AD...",[geschweige denn],"[(128, 143)]","[dass, nicht, jeder Sprecher und Führer , der ...","[(55, 59), (60, 65), (66, 110), (111, 125), (1..."
1,D86B38CFA5D2458D9F3615EDD55C9308DF423DB0,10,»Ohne Europa sind viele Fragen nicht mehr seri...,"[PUNCT, PROPN, AUX, ADJ, NOUN, PART, ADV, ADJ,...",[geschweige denn],"[(72, 87)]","[»Ohne Europa sind viele Fragen, nicht, mehr s...","[(0, 30), (31, 36), (37, 48), (49, 69), (88, 97)]"
2,7B3FAB8B01ED862D3D3FA18531B0904BCF24DFF0,10,"Dies lässt sich damit begründen , dass vor und...","[PRON, VERB, PRON, ADV, VERB, PUNCT, SCONJ, AD...",[geschweige denn],"[(126, 141)]",[dass vor und neben Alexander – soweit wir wis...,"[(34, 84), (85, 89), (90, 97), (98, 123), (142..."
3,B003E526D0AA301DA46458D32A82CEEFD7ADADA5,10,"Dennoch hat Hamas es nicht geschafft , ihre mi...","[ADV, AUX, PROPN, PRON, PART, VERB, PUNCT, DET...",[geschweige denn],"[(83, 98)]","[Dennoch hat Hamas es, nicht, geschafft, ihre ...","[(0, 20), (21, 26), (27, 36), (39, 80), (99, 1..."
4,C3FFF1CA55CA8FF0A806C33E14A0FB331582698E,10,"Abermillionen rings um die Welt , die das Spek...","[NOUN, ADV, ADP, DET, NOUN, PUNCT, PRON, DET, ...",[geschweige denn],"[(183, 198)]","[Abermillionen rings um die Welt , die das Spe...","[(0, 92), (93, 98), (99, 180), (201, 246)]"
...,...,...,...,...,...,...,...,...
7372,ABB2D854C007CDAF94AA0F10FD7CEC5E1BF51033,99,Vor dieser Zeit waren Immobilien in Deutschlan...,"[ADP, DET, NOUN, AUX, NOUN, ADP, PROPN, ADJ, A...",[er],"[(63, 65)]","[Immobilien in Deutschland, deutlich, billig, ...","[(22, 47), (48, 56), (57, 63), (66, 107)]"
7373,B8CBF505FD474F8EFBFF7FDDEC5DA1B3C775B622,99,"Wichtiger als die Frage , wer sich aus diesem ...","[ADJ, ADP, DET, NOUN, PUNCT, PRON, PRON, ADP, ...",[er],"[(7, 9)]","[Wichtig, als die Frage , wer sich aus diesem ...","[(0, 7), (10, 88), (109, 176)]"
7374,3AE63AAADC213E35AD70B3A83D13415FA5EBE61F,99,Damit wäre der Geländewagen deutlich günstiger...,"[ADV, AUX, DET, NOUN, ADJ, ADJ, ADP, ADJ, NOUN...",[er],"[(44, 46)]","[der Geländewagen, deutlich, günstig, als heut...","[(11, 27), (28, 36), (37, 44), (47, 86)]"
7375,79339294317D620CA5F7C70A6F5DED39AAB8F0BD,99,Auch andere Unternehmen wie Uber kooperieren i...,"[ADV, ADJ, NOUN, ADP, PROPN, VERB, ADV, ADP, P...",[er],"[(144, 146)]","[sicher, gestalten soll]","[(138, 144), (170, 184)]"


In [34]:
sentences = sentences.explode(["kees", "kees_idx"], ignore_index=True)
sentences

Unnamed: 0,uid,constr_id,text,text_pos,kees,kees_idx,kes,kes_idx
0,57B9D1BEB68F44E7A51D1920F1EA6951E39469B5,10,Und dann ist da noch das generelle Problem mit...,"[CCONJ, ADV, AUX, ADV, ADV, DET, ADJ, NOUN, AD...",geschweige denn,"(128, 143)","[dass, nicht, jeder Sprecher und Führer , der ...","[(55, 59), (60, 65), (66, 110), (111, 125), (1..."
1,D86B38CFA5D2458D9F3615EDD55C9308DF423DB0,10,»Ohne Europa sind viele Fragen nicht mehr seri...,"[PUNCT, PROPN, AUX, ADJ, NOUN, PART, ADV, ADJ,...",geschweige denn,"(72, 87)","[»Ohne Europa sind viele Fragen, nicht, mehr s...","[(0, 30), (31, 36), (37, 48), (49, 69), (88, 97)]"
2,7B3FAB8B01ED862D3D3FA18531B0904BCF24DFF0,10,"Dies lässt sich damit begründen , dass vor und...","[PRON, VERB, PRON, ADV, VERB, PUNCT, SCONJ, AD...",geschweige denn,"(126, 141)",[dass vor und neben Alexander – soweit wir wis...,"[(34, 84), (85, 89), (90, 97), (98, 123), (142..."
3,B003E526D0AA301DA46458D32A82CEEFD7ADADA5,10,"Dennoch hat Hamas es nicht geschafft , ihre mi...","[ADV, AUX, PROPN, PRON, PART, VERB, PUNCT, DET...",geschweige denn,"(83, 98)","[Dennoch hat Hamas es, nicht, geschafft, ihre ...","[(0, 20), (21, 26), (27, 36), (39, 80), (99, 1..."
4,C3FFF1CA55CA8FF0A806C33E14A0FB331582698E,10,"Abermillionen rings um die Welt , die das Spek...","[NOUN, ADV, ADP, DET, NOUN, PUNCT, PRON, DET, ...",geschweige denn,"(183, 198)","[Abermillionen rings um die Welt , die das Spe...","[(0, 92), (93, 98), (99, 180), (201, 246)]"
...,...,...,...,...,...,...,...,...
9691,ABB2D854C007CDAF94AA0F10FD7CEC5E1BF51033,99,Vor dieser Zeit waren Immobilien in Deutschlan...,"[ADP, DET, NOUN, AUX, NOUN, ADP, PROPN, ADJ, A...",er,"(63, 65)","[Immobilien in Deutschland, deutlich, billig, ...","[(22, 47), (48, 56), (57, 63), (66, 107)]"
9692,B8CBF505FD474F8EFBFF7FDDEC5DA1B3C775B622,99,"Wichtiger als die Frage , wer sich aus diesem ...","[ADJ, ADP, DET, NOUN, PUNCT, PRON, PRON, ADP, ...",er,"(7, 9)","[Wichtig, als die Frage , wer sich aus diesem ...","[(0, 7), (10, 88), (109, 176)]"
9693,3AE63AAADC213E35AD70B3A83D13415FA5EBE61F,99,Damit wäre der Geländewagen deutlich günstiger...,"[ADV, AUX, DET, NOUN, ADJ, ADJ, ADP, ADJ, NOUN...",er,"(44, 46)","[der Geländewagen, deutlich, günstig, als heut...","[(11, 27), (28, 36), (37, 44), (47, 86)]"
9694,79339294317D620CA5F7C70A6F5DED39AAB8F0BD,99,Auch andere Unternehmen wie Uber kooperieren i...,"[ADV, ADJ, NOUN, ADP, PROPN, VERB, ADV, ADP, P...",er,"(144, 146)","[sicher, gestalten soll]","[(138, 144), (170, 184)]"


In [35]:
sentences = sentences.explode(["kes", "kes_idx"], ignore_index=True)
sentences

Unnamed: 0,uid,constr_id,text,text_pos,kees,kees_idx,kes,kes_idx
0,57B9D1BEB68F44E7A51D1920F1EA6951E39469B5,10,Und dann ist da noch das generelle Problem mit...,"[CCONJ, ADV, AUX, ADV, ADV, DET, ADJ, NOUN, AD...",geschweige denn,"(128, 143)",dass,"(55, 59)"
1,57B9D1BEB68F44E7A51D1920F1EA6951E39469B5,10,Und dann ist da noch das generelle Problem mit...,"[CCONJ, ADV, AUX, ADV, ADV, DET, ADJ, NOUN, AD...",geschweige denn,"(128, 143)",nicht,"(60, 65)"
2,57B9D1BEB68F44E7A51D1920F1EA6951E39469B5,10,Und dann ist da noch das generelle Problem mit...,"[CCONJ, ADV, AUX, ADV, ADV, DET, ADJ, NOUN, AD...",geschweige denn,"(128, 143)","jeder Sprecher und Führer , der redet , auch","(66, 110)"
3,57B9D1BEB68F44E7A51D1920F1EA6951E39469B5,10,Und dann ist da noch das generelle Problem mit...,"[CCONJ, ADV, AUX, ADV, ADV, DET, ADJ, NOUN, AD...",geschweige denn,"(128, 143)",etwas zu sagen,"(111, 125)"
4,57B9D1BEB68F44E7A51D1920F1EA6951E39469B5,10,Und dann ist da noch das generelle Problem mit...,"[CCONJ, ADV, AUX, ADV, ADV, DET, ADJ, NOUN, AD...",geschweige denn,"(128, 143)",das letzte Wort,"(144, 159)"
...,...,...,...,...,...,...,...,...
23781,79339294317D620CA5F7C70A6F5DED39AAB8F0BD,99,Auch andere Unternehmen wie Uber kooperieren i...,"[ADV, ADJ, NOUN, ADP, PROPN, VERB, ADV, ADP, P...",er,"(144, 146)",sicher,"(138, 144)"
23782,79339294317D620CA5F7C70A6F5DED39AAB8F0BD,99,Auch andere Unternehmen wie Uber kooperieren i...,"[ADV, ADJ, NOUN, ADP, PROPN, VERB, ADV, ADP, P...",er,"(144, 146)",gestalten soll,"(170, 184)"
23783,572AF707F11010A60B24BC7700543A7BB090D905,99,Rund 45 . 000 Dollar soll der Byton - SUV kost...,"[ADV, NUM, NUM, NUM, NOUN, AUX, DET, NOUN, PUN...",er,"(72, 74)",er,"(51, 53)"
23784,572AF707F11010A60B24BC7700543A7BB090D905,99,Rund 45 . 000 Dollar soll der Byton - SUV kost...,"[ADV, NUM, NUM, NUM, NOUN, AUX, DET, NOUN, PUN...",er,"(72, 74)",günstig,"(65, 72)"


In [36]:
json_comapp = []
csv_comapp = []
errors = 0
problematic_constructions = set()
unproblematic_constructions = set()


for _, row in tqdm(sentences.iterrows(), total=len(sentences)):
    tokenized = str(row["text"]).split()  # tokenizer.tokenize(...)
    
    try:
        ke_start, ke_end = row["kes_idx"]
        tokenized_kees = str(row["kees"]).split()  # tokenizer.tokenize(...)  # Split the KEEs if there are multi-word KEEs
    except TypeError:
        continue  # If there is nothing to mask or if there is no KEE, we can't use this example.
        
    masked_text = row["text"][:ke_start] + "<mask>" + row["text"][ke_end:]
    tokenized_masked = str(masked_text).split()
    # Rejoin "<", "mask" and ">".
    tokenized_masked = [
        '<mask>' if tokenized_masked[i] == '<' else tokenized_masked[i]  # add "<mask>" back in instead of "<"
        for i in range(len(tokenized_masked)) 
        if tokenized_masked[i] != 'mask' and tokenized_masked[i] != '>'  # remove "mask" and ">"
    ]
    
    try:
        kee_idx = [tokenized.index(tokenized_kee) for tokenized_kee in tokenized_kees]
        kee_query_idx = []
        for i in kee_idx:
            if tokenized_masked.index("<mask>") < i:  # indicates that the query_idx might have been shifted because the multi-token mask comes first
                offset = len(tokenized) - len(tokenized_masked)
                assert i - offset >= 0
                kee_query_idx.append(i - offset)
            else:
                kee_query_idx.append(i)
    except (ValueError, AssertionError) as e:
        print(row["constr_id"], type(e), e, "... Continuing ...")
        errors += 1
        problematic_constructions.add(row["constr_id"])
        continue
    
    unproblematic_constructions.add(row["constr_id"])
    
    out_json = [{
        "label": kee + str(row["constr_id"]),
        "target1": row["text"], 
        "target1_idx": idx, 
        "query": masked_text,
        "query_idx": q
    } for kee, idx, q in zip(tokenized_kees, kee_idx, kee_query_idx)]  # Split the KEEs if there are multi-word KEEs
    out_csv = [{
        "text": row["text"],
        "pos_tags": row["text_pos"],
        "mask": row["text"][ke_start:ke_end],
        "ambiguous_word": kee,
        "label": kee + str(row["constr_id"])  # This will be the new token that we will add to the LLM (named "<kee><i>" where <kee> and <i> are replaced by the KEE's name and i will be replaced by the construction it appears in).
    } for kee in tokenized_kees]
    
    json_comapp += out_json
    csv_comapp += out_csv
    
with open("../../data/pseudowords/CoMaPP_all.json", "w") as file:
    json.dump(json_comapp, file, ensure_ascii=False)

f"{errors} elements from {len(problematic_constructions)} different constructions could not be saved as intended."

  0%|          | 0/23786 [00:00<?, ?it/s]

1004 <class 'ValueError'> 'er' is not in list ... Continuing ...
1004 <class 'ValueError'> 'er' is not in list ... Continuing ...
1004 <class 'ValueError'> 'er' is not in list ... Continuing ...
1004 <class 'ValueError'> 'er' is not in list ... Continuing ...
1004 <class 'ValueError'> 'er' is not in list ... Continuing ...
1004 <class 'ValueError'> 'er' is not in list ... Continuing ...
1004 <class 'ValueError'> 'er' is not in list ... Continuing ...
1004 <class 'ValueError'> 'er' is not in list ... Continuing ...
1004 <class 'ValueError'> 'er' is not in list ... Continuing ...
1004 <class 'ValueError'> 'er' is not in list ... Continuing ...
1004 <class 'ValueError'> 'er' is not in list ... Continuing ...
1004 <class 'ValueError'> 'er' is not in list ... Continuing ...
1004 <class 'ValueError'> 'er' is not in list ... Continuing ...
1004 <class 'ValueError'> 'er' is not in list ... Continuing ...
1004 <class 'ValueError'> 'er' is not in list ... Continuing ...
1004 <class 'ValueError'>

'8929 elements from 150 different constructions could not be saved as intended.'

In [37]:
import csv

with open("../../data/pseudowords/CoMapp_Dataset.csv", "w+", newline="") as file:
    writer = csv.DictWriter(file, fieldnames=["label", "text", "pos_tags", "mask", "ambiguous_word"])
    writer.writeheader()
    writer.writerows(csv_comapp)

In [38]:
# Load the pre-trained MBart-50 model and tokenizer
model_name = "facebook/mbart-large-50"
model = MBartForConditionalGeneration.from_pretrained(model_name)
tokenizer = MBart50Tokenizer.from_pretrained(model_name, src_lang="de_DE", tgt_lang="de_DE")

In [39]:
# Define a masked input sequence and convert it into tokens
masked_sequence = "Ich bin <mask> gegangen."
input_ids = tokenizer.encode(masked_sequence, return_tensors="pt")

In [40]:
# Generate predictions
with torch.no_grad():
    outputs = model.generate(input_ids, max_length=100, num_return_sequences=15, num_beams=20)

In [41]:
predicted_texts = []
for output in outputs:
    predicted_texts.append(tokenizer.decode(output, skip_special_tokens=True))
predicted_texts

['Ich bin nach Hause gegangen.',
 'Ich bin wieder nach Hause gegangen.',
 'Ich bin ins Bett gegangen.',
 'Ich bin wieder ins Bett gegangen.',
 'Ich bin in den Wald gegangen.',
 'Ich bin aus dem Haus gegangen.',
 'Ich bin ins Ausland gegangen.',
 'Ich bin ins Krankenhaus gegangen.',
 'Ich bin in eine andere Richtung gegangen.',
 'Ich bin in eine andere Welt gegangen.',
 'Ich bin gerade ins Bett gegangen.',
 'Ich bin in die Stadt gegangen.',
 'Ich bin heute früh gegangen.',
 'Ich bin wieder ins Leben gegangen.',
 'Ich bin nicht ins Bett gegangen.']

In [42]:
# Create a dataframe which matches the index of the constructions dataframe and the constr_id of the kees dataframe
kee_categories = sentences.join(constructions, on="constr_id")[["constr_id", "kees", "category", "name"]].drop_duplicates().reset_index(drop=True)
kee_categories

Unnamed: 0,constr_id,kees,category,name
0,10,geschweige denn,Negation,NEG_X_geschweige_denn_Y
1,10,Geschweige denn,Negation,NEG_X_geschweige_denn_Y
2,100,gleich,Äquativ_Plural,gleich_ADJ
3,1004,er,Superlativ_Klimax,ADJ1_ADJ1-er_NP
4,1006,st,Superlativ,PRÄP_ADJ-ster_NP
...,...,...,...,...
657,98,gleich,Äquativ,so_ADJ_wie_XP
658,98,ebenso,Äquativ,so_ADJ_wie_XP
659,98,so,Äquativ,so_ADJ_wie_XP
660,98,So,Äquativ,so_ADJ_wie_XP


In [43]:
# Count the distinct names:
kee_categories["name"].value_counts()

name
Klammern               61
w-VL-Satz              54
es_gab_eine_Zeit_XP    26
NP_VP_REFL_PART        14
X_Kopula_ART_Y-ding    13
                       ..
V2_V2                   1
NP_par_excellence       1
zu_Tode_X               1
X_wie_wenn_Y            1
ADJ-er_als_X            1
Name: count, Length: 185, dtype: int64

Vergleiche wie ähnlich sich cased und uncased sind:

In [44]:
input_text_1 = "Wie Schulbücher den Mauerfall darstellen"
input_text_2 = "wie Schulbücher den Mauerfall darstellen"

input_tokens_1 = tokenizer(input_text_1, return_tensors="pt", padding=True, truncation=True)
input_tokens_2 = tokenizer(input_text_2, return_tensors="pt", padding=True, truncation=True)

torch.stack((input_tokens_1.input_ids, input_tokens_2.input_ids))

tensor([[[250003,   4887,  27325, 206709,    168,  23788,     56,   9146,
            1011,  17929,      2]],

        [[250003,   1225,  27325, 206709,    168,  23788,     56,   9146,
            1011,  17929,      2]]])

In [45]:
with torch.no_grad():
    model_output_1 = model(**input_tokens_1)
    model_output_2 = model(**input_tokens_2)
    
hidden_states_1 = model_output_1.encoder_last_hidden_state
hidden_states_2 = model_output_2.encoder_last_hidden_state

sequence_emb_1 = hidden_states_1[:, 5, :]
sequence_emb_2 = hidden_states_2[:, 5, :]

sequence_emb_1, sequence_emb_2

(tensor([[-0.2712,  0.2646,  0.1158,  ..., -0.3121, -0.0238,  0.2057]]),
 tensor([[-0.3570,  0.2023,  0.1094,  ..., -0.2422, -0.1210,  0.2481]]))

In [46]:
# TODO Markiere die zu desambiguierenden Wörter
# Position ergibt keinen Sinn, weil die maskierten Bereiche beliebig lang sein können (und damit der Index variabel ist in Abhängigkeit der länge der generierten Phrase.
# Idee wäre festgelegtes Symbol (z. B. §) oder gar kein Symbol und einfach heuristisch von .index(wort) ausgehen.

In [47]:
from transformers import Text2TextGenerationPipeline, MBartForConditionalGeneration
from transformers import MBart50Tokenizer

model_name = "facebook/mbart-large-50"  # "facebook/mbart-large-cc25" 

model = MBartForConditionalGeneration.from_pretrained(model_name)
tokenizer = MBart50Tokenizer.from_pretrained(model_name, src_lang="de_DE", tgt_lang="de_DE")

nlp = Text2TextGenerationPipeline(model=model, tokenizer=tokenizer, device=0)
masked_sequence = "Ich bin <mask> gegangen."

predicted_texts = nlp(masked_sequence, max_length=100, num_return_sequences=5, num_beams=20)
predicted_texts = [result['generated_text'] for result in predicted_texts]

print(predicted_texts)

['Ich bin nach Hause gegangen.', 'Ich bin wieder nach Hause gegangen.', 'Ich bin ins Bett gegangen.', 'Ich bin wieder ins Bett gegangen.', 'Ich bin in den Wald gegangen.']


In [48]:
x = tokenizer("Moritz sitzt neben mir.", text_target="Moritz sitzt neben mir.")

tokenizer.decode(x.input_ids)

'de_DE Moritz sitzt neben mir.</s>'

In [49]:
from transformers import BartTokenizer

tokenizer2 = BartTokenizer.from_pretrained("facebook/bart-large")

x = tokenizer2("Moritz sitzt neben mir.", text_target="Moritz sitzt neben mir.")

tokenizer2.decode(x.input_ids), tokenizer2.decode(x.labels)

('<s>Moritz sitzt neben mir.</s>', '<s>Moritz sitzt neben mir.</s>')