In [37]:
import ast
import csv

import numpy as np
import pandas as pd
import torch
from tqdm.notebook import tqdm
from transformers import BertForMaskedLM, BertTokenizer

In [38]:
# Read the tab-separated matches table; its first line holds the column names.
matches_path = "../../out/matches/matches_dep_constr_kelex.tsv"
matches = pd.read_csv(matches_path, sep="\t", header=0)
matches

Unnamed: 0,constr,fuzziness (common dep),fuzziness (matches),sentence
0,12,0.937500,0.857143,"Dieser hat allerdings weder die Möglichkeit , ..."
1,12,0.937500,0.857143,Das gestrige Ereignis hatte weder verletzte Pe...
2,12,0.937500,0.857143,Möglich werden @-Namen durch eine Funktion de...
3,15,0.933333,0.857143,"Auch im Cyberspace müsse es Grenzen geben , di..."
4,15,0.933333,0.857143,"Die dritte Möglichkeit wäre , dass sich die Ge..."
...,...,...,...,...
402905,1986,0.823529,0.800000,"Da eine Bestätigung per E-Mail folgte , hatte ..."
402906,1986,0.823529,0.800000,Für die Konkurrenz der Deutschen Telekom wird ...
402907,1986,0.823529,0.850000,"Inlandsgespräche waren um 7,9 Prozent billiger..."
402908,1986,0.823529,0.800000,Ohne Hightech und neue Ideen hat auch das trad...


In [50]:
# Load the annotation rows. The csv module asks for newline="" on the file
# object, and the encoding is pinned because the annotations contain German
# umlauts and the platform default encoding is not guaranteed to be UTF-8.
with open("../../data/pseudowords/annotations.csv", "r", encoding="utf-8", newline="") as csv_file:
    data = list(csv.DictReader(csv_file))

# Map each construction id to the set of single words that occur in any of its
# "kees" entries (multi-word entries are split on whitespace).
kelex_dict = {}
for example in data:
    # The "kees" cell holds a Python literal (an iterable of strings).
    # ast.literal_eval parses literals only, so file content is never executed
    # as code the way it would be with eval().
    kee_phrases = ast.literal_eval(example["kees"])
    kelex_dict[int(example["construction_id"])] = {
        word for kee in kee_phrases for word in kee.split()
    }

kelex_dict

{10: {'denn', 'geschweige'},
 100: {'gleich'},
 1004: {'er'},
 1006: {'st'},
 101: {'wie'},
 1029: {'Mit', 'wem', 'würden'},
 103: {'wie'},
 1033: {'Hauptsache'},
 1034: {'Von', 'zu'},
 1035: {'von', 'zu'},
 104: {'das', 'nämliche'},
 1043: set(),
 1051: set(),
 1054: set(),
 11: {'kaum', 'und', 'wohl'},
 111: {'Umso', 'je'},
 112: set(),
 1126: {'Generation'},
 1134: {'Sowohl', 'als', 'auch'},
 1140: {'und'},
 1162: {'nicht', 'oder'},
 12: {'noch', 'weder'},
 1219: set(),
 122: {'genauso'},
 123: {'ähnlich'},
 125: {'es', 'sei', 'wie'},
 1257: set(),
 127: {'ein', 'einem', 'von'},
 128: {'artig'},
 1289: {'lassen', 'sein'},
 129: {'Art', 'eine'},
 1291: {'Wenn'},
 13: {'Was', 'für'},
 130: {'wie', 'wäre'},
 1300: {'So', 'geht'},
 1301: {'Augenblick', 'Der'},
 1313: {'Es', 'war', 'zum'},
 1315: {'Als', 'ob'},
 1316: {'Raus', 'aus', 'in', 'rein'},
 132: {'gleicht'},
 1320: {'lein'},
 1322: {'chen'},
 1323: {'pur'},
 1324: {'satt'},
 1329: {'jedermanns'},
 133: {'als', 'ob'},
 1337: {'i'

In [40]:
# cf. evaluate_comapp_bert.ipynb

# The pseudoword embeddings are split over 15 .npy files whose names carry a
# start/end offset in steps of 37; stack them in file order.
pseudowords = np.concatenate([
    np.load(f"../../data/pseudowords/bert/pseudowords_comapp_bert_{start}_{start + 37}.npy")
    for start in range(0, 15 * 37, 37)
])

# The matching label files are numbered 1..15. Each is a headerless,
# ";"-separated table whose first column becomes the index ("order") and whose
# second column is the pseudoword label.
csv_data = pd.concat([
    pd.read_csv(
        f"../../data/pseudowords/bert/order_bert_{part}.csv",
        sep=";",
        index_col=0,
        header=None,
        quotechar="|",
        names=["order", "label"],
    )
    for part in range(1, 16)
])

# "label" is the only non-index column, i.e. what position 0 of each value row held.
bert_tokens = csv_data["label"].tolist()
bert_tokens

['""Was13',
 '"647',
 '"Wir-äh-spielen-äh-in-der-äh-Champions-League647',
 '(1597',
 '(1600',
 '(1602',
 '(1624',
 '(1637',
 '(1639',
 '(1641',
 '(1643',
 '(1645',
 '(379',
 '(579',
 '(581',
 '(584',
 '(590',
 '(592',
 '(600',
 '(886',
 '(889',
 '(892',
 '(900',
 '(905',
 '(907',
 '(909',
 '(911',
 '(917',
 '(919',
 '(921',
 '(923',
 ')1597',
 ')1600',
 ')1602',
 ')1624',
 ')1637',
 ')1639',
 ')1641',
 ')1643',
 ')1645',
 ')1792',
 ')379',
 ')579',
 ')581',
 ')584',
 ')590',
 ')592',
 ')600',
 ')886',
 ')889',
 ')892',
 ')900',
 ')905',
 ')907',
 ')909',
 ')911',
 ')917',
 ')919',
 ')921',
 ')923',
 ')«579',
 ',1459',
 ',973',
 '-128',
 '-651',
 '-654',
 '-875',
 '-973',
 ':595',
 ':875',
 ':973',
 'Abstand683',
 'Allein20',
 'Aller1630',
 'Als1315',
 'Als133',
 'Als1770',
 'Am488',
 'Am492',
 'Am500',
 'Amerika605',
 'Anstatt320',
 'Art129',
 'Arzt1509',
 'Augenblick1301',
 'Ausmaß1777',
 'BRUTAL1503',
 'Besser1762',
 'Bis559',
 'Brutal1503',
 'Buche1346',
 'Das1313',
 'Das1461',
 'Da

In [41]:
# Load the German BERT masked-LM model and the tokenizer of the same checkpoint.
checkpoint_name = 'bert-base-german-cased'
model = BertForMaskedLM.from_pretrained(checkpoint_name, return_dict=True)
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)

# Stack the pseudoword vectors underneath the pretrained word-embedding rows and
# install the enlarged matrix as the model's word-embedding layer.
bert_embeddings = model.bert.embeddings
pretrained_rows = bert_embeddings.word_embeddings.weight
pseudoword_rows = torch.tensor(pseudowords)
combined_embeddings = torch.cat((pretrained_rows, pseudoword_rows), dim=0)
bert_embeddings.word_embeddings = torch.nn.Embedding.from_pretrained(combined_embeddings)
bert_embeddings.word_embeddings

Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Embedding(30555, 768)

In [42]:
# Register every pseudoword label as an additional tokenizer entry.
tokenizer.add_tokens(bert_tokens)

# The embedding matrix was already extended with one row per pseudoword
# (presumably row i belongs to bert_tokens[i] -- verify against the files that
# produced them). If the tokenizer ended up with a different number of entries,
# e.g. because a label was a duplicate or already in the vocabulary, the resize
# below would silently truncate or pad the matrix and the new token ids would
# point at the wrong rows. Fail loudly instead.
embedding_rows = model.bert.embeddings.word_embeddings.num_embeddings
assert len(tokenizer) == embedding_rows, (
    f"tokenizer has {len(tokenizer)} entries but the embedding matrix has "
    f"{embedding_rows} rows; pseudoword ids and embeddings would be misaligned"
)

model.resize_token_embeddings(len(tokenizer))

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 30555. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


Embedding(30555, 768)

After everything has been prepared, we can build a dictionary that maps each (construction, KE-lex) pair to two contextual embeddings: one for the standard BERT token(s) of the word and one for the corresponding pseudoword token.

In [61]:
# (construction id, KE-lex word) -> (original embeddings, pseudoword embeddings)
contextual_embeds = {}

# Membership is tested once per (construction, word) pair; a set avoids
# re-scanning the whole bert_tokens list every time.
known_pseudowords = set(bert_tokens)

for constr, kelexes in tqdm(kelex_dict.items()):
    for kelex in kelexes:
        # Pseudoword labels are the word immediately followed by the construction id.
        pseudoword = kelex + str(constr)
        if pseudoword not in known_pseudowords:
            # No pseudoword is available for this pair: report it and move on.
            print(pseudoword)
            continue

        original_ids = tokenizer(kelex, return_tensors='pt')['input_ids']
        pseudoword_ids = tokenizer(pseudoword, return_tensors='pt')['input_ids']
        with torch.no_grad():  # inference only, no gradients needed
            original_outputs = model(original_ids, output_hidden_states=True)
            pseudoword_outputs = model(pseudoword_ids, output_hidden_states=True)

        # hidden_states[12] selects the hidden state at index 12, [0] the single
        # sequence in the batch, and [1:-1] drops the first and last positions
        # (presumably the [CLS] / [SEP] special tokens -- confirm).
        original_contextual_embed = original_outputs.hidden_states[12][0][1:-1]
        pseudoword_contextual_embed = pseudoword_outputs.hidden_states[12][0][1:-1]
        contextual_embeds[constr, kelex] = (original_contextual_embed, pseudoword_contextual_embed)

contextual_embeds.keys()

  0%|          | 0/210 [00:00<?, ?it/s]

st1006
würden1029
zu1034
zu1035
ähnlich123
zum1313
lein1320
chen1322
i1337
zu1347
ähnlich139
eln1514
lich1521
leinchen1524
vor1574
Muster1594
ding1621
zum1630
kram1660
Standard1684
un1690
keineswegs1690
en1756
ge1756
ling1842
er381
en392
würdest392
häufigst498
sten500
et571
st623
st631
st65
sau653
en696
über85


dict_keys([(10, 'geschweige'), (10, 'denn'), (100, 'gleich'), (1004, 'er'), (101, 'wie'), (1029, 'wem'), (1029, 'Mit'), (103, 'wie'), (1033, 'Hauptsache'), (1034, 'Von'), (1035, 'von'), (104, 'nämliche'), (104, 'das'), (11, 'kaum'), (11, 'wohl'), (11, 'und'), (111, 'Umso'), (111, 'je'), (1126, 'Generation'), (1134, 'Sowohl'), (1134, 'als'), (1134, 'auch'), (1140, 'und'), (1162, 'oder'), (1162, 'nicht'), (12, 'noch'), (12, 'weder'), (122, 'genauso'), (125, 'es'), (125, 'sei'), (125, 'wie'), (127, 'einem'), (127, 'von'), (127, 'ein'), (128, 'artig'), (1289, 'sein'), (1289, 'lassen'), (129, 'Art'), (129, 'eine'), (1291, 'Wenn'), (13, 'Was'), (13, 'für'), (130, 'wäre'), (130, 'wie'), (1300, 'So'), (1300, 'geht'), (1301, 'Augenblick'), (1301, 'Der'), (1313, 'war'), (1313, 'Es'), (1315, 'ob'), (1315, 'Als'), (1316, 'rein'), (1316, 'in'), (1316, 'Raus'), (1316, 'aus'), (132, 'gleicht'), (1323, 'pur'), (1324, 'satt'), (1329, 'jedermanns'), (133, 'ob'), (133, 'als'), (1342, 'excellence'), (1342

In [62]:
# Spot-check one entry: each value is an (original, pseudoword) pair of
# hidden-state tensors for the given (construction id, word) key.
contextual_embeds[10, "geschweige"]

(tensor([[-0.1567, -0.2954,  1.1641,  ..., -0.5217,  0.1245, -0.9498],
         [-0.2695, -0.1156,  0.7473,  ...,  0.0997, -0.4645, -0.5735]]),
 tensor([[-5.9337e-01,  1.8925e-01,  8.8151e-01,  7.6941e-01, -1.0799e-01,
          -5.4874e-02, -1.1751e+00, -7.0002e-01, -1.0662e-01,  3.7848e-01,
          -2.2536e-01,  4.8179e-01,  4.9365e-01, -5.8546e-01,  1.5485e-01,
           3.4184e-01,  2.3076e-01, -8.4451e-02, -1.3702e-01, -1.7958e-01,
          -3.6692e-01, -5.9974e-01, -2.9299e-01, -2.8951e-01, -1.0670e+00,
           2.8323e-01, -8.1848e-01, -8.2333e-01,  2.2542e-01,  1.1402e-01,
           9.8367e-01, -3.2775e-01, -1.6153e-02,  4.4972e-01, -4.7634e-01,
           3.5021e-01,  4.6307e-01,  8.4411e-02,  9.5915e-01, -8.9424e-01,
           8.2238e-01,  4.6471e-01, -1.5551e-01, -2.2514e-01,  8.2434e-01,
           2.2404e-01, -6.1623e-01,  4.6392e-01, -2.3587e-02,  4.7538e-01,
          -9.8099e-01,  2.8590e-01,  4.8032e-01, -2.5809e-01, -1.6818e-01,
           4.9187e-01, -3.0922e

Now we can compare the distance between a pseudoword and a sentence with the distance between the original token(s) and the same sentence. Since one pseudoword token can correspond to several original (sub-word) tokens, we first average the distances of all original tokens to the sentence and then compare this average with the distance of the single pseudoword token.

In [ ]:
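# TODO: for each entry in contextual_embeds, compute the distance of the pseudoword
# embedding to a sentence and compare it with the average distance of the original
# token embeddings to the same sentence.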