In [1]:
import itertools
import pickle
import random
import re

import numpy as np
import pandas as pd
import torch
from transformers import BertForNextSentencePrediction, BertTokenizer

from tqdm.notebook import tqdm
tqdm.pandas()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load the 15 pseudoword embedding shards and stack them into one array.
# The file names carry an index range (i*37 .. i*37+37); presumably that is the
# row range stored in each shard -- TODO confirm.
pseudowords = np.concatenate([
    np.load(f"../../data/pseudowords/bsbbert/pseudowords_comapp_bsbbert_{i*37}_{i*37+37}.npy")
    for i in range(15)
])

# The matching label files are numbered 1..15; each is a ';'-separated table
# without a header whose first column ("order") becomes the index.
csv_data = pd.concat([
    pd.read_csv(f"../../data/pseudowords/bsbbert/order_bsbbert_{i}.csv", sep=";", index_col=0, header=None, quotechar="|", names=["order", "label"])
    for i in range(1, 16)
])

# First (and, given the column names above, only) data column: the pseudoword labels.
bert_tokens = csv_data.iloc[:, 0].tolist()
bert_tokens

['""Was13',
 '"647',
 '"Wir-äh-spielen-äh-in-der-äh-Champions-League647',
 '(1597',
 '(1600',
 '(1602',
 '(1624',
 '(1637',
 '(1639',
 '(1641',
 '(1643',
 '(1645',
 '(379',
 '(579',
 '(581',
 '(584',
 '(590',
 '(592',
 '(600',
 '(886',
 '(889',
 '(892',
 '(900',
 '(905',
 '(907',
 '(909',
 '(911',
 '(917',
 '(919',
 '(921',
 '(923',
 ')1597',
 ')1600',
 ')1602',
 ')1624',
 ')1637',
 ')1639',
 ')1641',
 ')1643',
 ')1645',
 ')1792',
 ')379',
 ')579',
 ')581',
 ')584',
 ')590',
 ')592',
 ')600',
 ')886',
 ')889',
 ')892',
 ')900',
 ')905',
 ')907',
 ')909',
 ')911',
 ')917',
 ')919',
 ')921',
 ')923',
 ')«579',
 ',1459',
 ',973',
 '-128',
 '-651',
 '-654',
 '-875',
 '-973',
 ':595',
 ':875',
 ':973',
 'Abstand683',
 'Allein20',
 'Aller1630',
 'Als1315',
 'Als133',
 'Als1770',
 'Am488',
 'Am492',
 'Am500',
 'Amerika605',
 'Anstatt320',
 'Art129',
 'Arzt1509',
 'Augenblick1301',
 'Ausmaß1777',
 'BRUTAL1503',
 'Besser1762',
 'Bis559',
 'Brutal1503',
 'Buche1346',
 'Das1313',
 'Das1461',
 'Da

In [3]:
# Load German BERT with a next-sentence-prediction (NSP) head, plus its tokenizer.
# NOTE(review): transformers reports 'cls.seq_relationship.*' (the NSP head) as
# newly initialised for this checkpoint, i.e. NSP scores come from an untrained
# classifier unless the head is fine-tuned first -- confirm this is intended.
model = BertForNextSentencePrediction.from_pretrained("dbmdz/bert-base-german-cased", return_dict=True)
tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased')

# Append the pseudoword vectors below the pretrained vocabulary rows. The vectors
# are cast to the dtype of the pretrained matrix so the concatenation cannot
# promote the whole embedding table (e.g. if the .npy files hold float64).
pretrained_embeddings = model.bert.embeddings.word_embeddings.weight.detach()
pseudoword_embeddings = torch.as_tensor(pseudowords, dtype=pretrained_embeddings.dtype)
combined_embeddings = torch.cat((pretrained_embeddings, pseudoword_embeddings), dim=0)
model.bert.embeddings.word_embeddings = torch.nn.Embedding.from_pretrained(combined_embeddings)

# Register the pseudoword labels as new tokens; new ids are handed out after the
# existing vocabulary, in list order.
num_added = tokenizer.add_tokens(bert_tokens)

# add_tokens() does not add tokens that are already known. If that happens (or
# the label and vector counts differ) token ids and embedding rows no longer
# line up, and resize_token_embeddings() would silently truncate or randomly
# initialise rows. Fail loudly instead of producing meaningless results.
if len(tokenizer) != combined_embeddings.shape[0]:
    raise ValueError(
        f"Tokenizer has {len(tokenizer)} entries after adding {num_added} of "
        f"{len(bert_tokens)} pseudoword tokens, but the embedding matrix has "
        f"{combined_embeddings.shape[0]} rows; pseudoword embeddings would be misaligned."
    )
model.resize_token_embeddings(len(tokenizer))

# Prefer the first GPU, but stay runnable on CPU-only machines.
model.to("cuda:0" if torch.cuda.is_available() else "cpu")

Some weights of BertForNextSentencePrediction were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForNextSentencePrediction(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31657, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

In [4]:
def _read_pickle(path):
    """Unpickle and return the object stored at ``path``.

    Only use this on files you produced yourself: unpickling can execute
    arbitrary code.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Inputs produced by earlier pipeline steps (contents are not visible here;
# later cells index both by construction id).
definitions = _read_pickle("../../out/definitions.pickle")
sentences = _read_pickle("../../out/sentences.pickle")

In [5]:
def find_examples(definition, examples):
    """Return the candidate that the NSP head rates as the best continuation of ``definition``.

    For every candidate, the prompt ``"<definition> Zum Beispiel: "`` is paired with
    the candidate and scored by the global ``model`` (tokenised with the global
    ``tokenizer``). The score is the softmax probability of class 0 ("is the next
    sentence"), so candidates are compared on a common scale.

    Args:
        definition: Text describing the construction.
        examples: Non-empty sequence of candidate sentences.

    Returns:
        The element of ``examples`` with the highest score; on ties, the earliest one.

    Raises:
        ValueError: If ``examples`` is empty.
    """
    separator = " Zum Beispiel: "  # TODO Deutsch (note carried over from the original code)
    max_length = 512  # size of the model's position embedding table
    if len(examples) == 0:
        raise ValueError("find_examples() needs at least one candidate example")

    best_num, best_score = 0, None
    for num, example in enumerate(examples):
        # Character-level heuristic: shorten the definition so that prompt plus
        # example stay within `max_length` characters, marking the cut with an ellipsis.
        budget = max_length - len(separator) - len(example)
        if len(definition) > budget:
            # Clamp at 0: a negative slice end would cut from the *end* of the
            # definition instead of shortening it to the budget.
            prompt = definition[:max(0, budget - 1)] + "…" + separator
        else:
            prompt = definition + separator

        # Token-level safety net: character counts do not bound token counts, and
        # the model cannot process more than `max_length` positions.
        inputs = tokenizer(
            prompt, example, return_tensors="pt", truncation=True, max_length=max_length
        ).to(model.device)
        with torch.no_grad():
            outputs = model(**inputs)

        # A raw class-0 logit is only meaningful relative to the class-1 logit;
        # softmax turns the pair into the probability that `example` follows.
        score = torch.softmax(outputs.logits, dim=-1)[0, 0].item()
        if best_score is None or score > best_score:
            best_num, best_score = num, score
    return examples[best_num]

In [6]:
# Baseline experiment: for every construction, pick one of its own sentences and
# `false_positives` sentences from other constructions, then check whether
# find_examples() selects the construction's own sentence.
# NOTE(review): if the sentence containers are sets, their iteration order (and
# therefore the draws below) depends on PYTHONHASHSEED -- confirm.
random.seed(15)
attempts = 10
for false_positives in range(2, 8):
    rows = []
    for key, definition in tqdm(definitions.items()):
        constr_id = int(key)
        try:
            gold_candidates = list(sentences[constr_id])
        except KeyError:
            gold_candidates = []
        if not gold_candidates:
            # No sentences for this construction: keep one placeholder row per attempt.
            rows.extend(
                {"constr": key, "definition": definition, "example": None, "prediction": None, "correct": None}
                for _ in range(attempts)
            )
            continue

        # All distinct sentences of the *other* constructions. This only depends on
        # the construction, so build it once instead of once per attempt;
        # dict.fromkeys de-duplicates while keeping a deterministic order.
        distractor_pool = list(dict.fromkeys(
            other
            for constr, sentence_list in sentences.items()
            if int(constr) != constr_id
            for other in sentence_list
        ))

        for attempt in range(attempts):
            sentence = random.choice(gold_candidates)
            # Sample WITHOUT replacement so a "1 vs k" trial really has k distinct
            # distractors. One spare is drawn in case the gold sentence itself
            # also occurs under another construction.
            drawn = random.sample(distractor_pool, min(false_positives + 1, len(distractor_pool)))
            others = [other for other in drawn if other != sentence][:false_positives]
            examples = others + [sentence]
            # Seeded shuffle: the gold position is random but reproducible
            # (find_examples breaks ties in favour of the earliest candidate).
            random.shuffle(examples)
            prediction = find_examples(definition, examples)
            rows.append({"constr": key, "definition": definition, "example": sentence, "prediction": prediction, "correct": prediction == sentence})
    result = pd.DataFrame(rows)
    result.to_csv(f"../../out/comapp/result_1_vs_{false_positives}_{attempts}attempts_bsbbert.tsv", sep="\t")

  0%|          | 0/211 [00:00<?, ?it/s]

  0%|          | 0/211 [00:00<?, ?it/s]

  0%|          | 0/211 [00:00<?, ?it/s]

  0%|          | 0/211 [00:00<?, ?it/s]

  0%|          | 0/211 [00:00<?, ?it/s]

  0%|          | 0/211 [00:00<?, ?it/s]

In [7]:
# Map each construction id to the set of its pseudoword labels.
# A label is a surface form followed by the construction number (e.g. 'Was13');
# the first run of digits in the label is taken as the id.
# The pattern is a raw string: '\d' in a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
kelex = (
    csv_data
    .assign(constr=csv_data["label"].str.extract(r"(\d+)", expand=False).astype(int))
    .groupby("constr")["label"]
    .apply(set)
    .to_dict()
)
kelex

{5: {'Und5', 'erst5', 'gar5', 'nicht5', 'recht5', 'schon5', 'und5'},
 10: {'Geschweige10', 'denn10', 'geschweige10'},
 11: {'Und11', 'kaum11', 'und11', 'wohl11'},
 12: {'Weder12', 'noch12', 'weder12'},
 13: {'""Was13', 'Was13', 'für13', 'was13'},
 14: {'Wie14'},
 15: {'Welch15', 'welch15'},
 16: {'Was16'},
 19: {'Diese19', 'Dieser19', 'Dieses19', 'dieses19'},
 20: {'Allein20'},
 21: {'Dass21'},
 22: {'So22'},
 74: {'So74', 'so74'},
 78: {'Solch78', 'Solche78', 'solch78'},
 83: {'gegen83'},
 97: {'als97'},
 98: {'So98', 'ebenso98', 'genauso98', 'gleich98', 'so98'},
 99: {'er99'},
 100: {'gleich100'},
 101: {'wie101'},
 103: {'wie103'},
 104: {'Dasselbe104',
  'Gleiche104',
  'das104',
  'dasselbe104',
  'gleiche104',
  'nämliche104'},
 111: {'Desto111', 'Je111', 'Umso111', 'desto111', 'je111', 'umso111'},
 122: {'ebenso122', 'genauso122'},
 125: {'es125', 'ist125', 'sei125', 'wie125'},
 127: {'ein127', 'einem127', 'einer127', 'von127'},
 128: {'-128', 'artig128'},
 129: {'Art129', 'eine

In [8]:
# Kelex experiment: like the baseline, but tokens of the gold sentence that match
# a pseudoword's surface form are replaced by the pseudoword token.
# Constructions without pseudowords are skipped.
# NOTE(review): distractors are used verbatim, so only the gold example can
# contain pseudoword tokens -- confirm this asymmetry is intended.
random.seed(15)
attempts = 10
for false_positives in range(2, 8):
    rows = []
    for key, definition in tqdm(definitions.items()):
        constr_id = int(key)
        try:
            gold_candidates = list(sentences[constr_id])
        except KeyError:
            gold_candidates = []
        if not gold_candidates:
            # No sentences for this construction: keep one placeholder row per attempt.
            rows.extend(
                {"constr": key, "definition": definition, "example": None, "example_kelex": None, "prediction": None, "correct": None}
                for _ in range(attempts)
            )
            continue

        # kelex is keyed by integer construction ids, so look it up with the same
        # int conversion that is used for `sentences`.
        constr_pseudowords = kelex.get(constr_id)
        if not constr_pseudowords:
            continue  # skip constructions without kelex

        # Surface form (leading non-digit part of the label) -> pseudoword token,
        # computed once per construction instead of once per sentence token.
        surface_to_pseudoword = {}
        for pseudoword in constr_pseudowords:
            surface_to_pseudoword.setdefault(re.findall(r'\D+', pseudoword)[0], pseudoword)

        # All distinct sentences of the *other* constructions, in deterministic order.
        distractor_pool = list(dict.fromkeys(
            other
            for constr, sentence_list in sentences.items()
            if int(constr) != constr_id
            for other in sentence_list
        ))

        for attempt in range(attempts):
            sentence = random.choice(gold_candidates)
            # Only whole whitespace-separated tokens are replaced.
            sentence_kelex = " ".join(surface_to_pseudoword.get(token, token) for token in sentence.split())

            # Sample WITHOUT replacement so a "1 vs k" trial really has k distinct
            # distractors; one spare covers a gold sentence that also occurs elsewhere.
            drawn = random.sample(distractor_pool, min(false_positives + 1, len(distractor_pool)))
            others = [other for other in drawn if other not in (sentence, sentence_kelex)][:false_positives]
            examples = others + [sentence_kelex]
            # Seeded shuffle: the gold position is random but reproducible.
            random.shuffle(examples)
            prediction = find_examples(definition, examples)
            rows.append({"constr": key, "definition": definition, "example": sentence, "example_kelex": sentence_kelex, "prediction": prediction, "correct": prediction == sentence_kelex})
    result = pd.DataFrame(rows)
    result.to_csv(f"../../out/comapp/result_1_vs_{false_positives}_kelex_{attempts}attempts_bsbbert.tsv", sep="\t")

  0%|          | 0/211 [00:00<?, ?it/s]

  0%|          | 0/211 [00:00<?, ?it/s]

  0%|          | 0/211 [00:00<?, ?it/s]

  0%|          | 0/211 [00:00<?, ?it/s]

  0%|          | 0/211 [00:00<?, ?it/s]

  0%|          | 0/211 [00:00<?, ?it/s]

In [None]:
# Kelex experiment over ALL constructions: tokens of the gold sentence that match
# a pseudoword's surface form are replaced by the pseudoword token; constructions
# without pseudowords are kept and use the unmodified sentence.
# NOTE(review): distractors are used verbatim, so only the gold example can
# contain pseudoword tokens -- confirm this asymmetry is intended.
random.seed(15)
attempts = 10
for false_positives in range(2, 8):
    rows = []
    for key, definition in tqdm(definitions.items()):
        constr_id = int(key)
        try:
            gold_candidates = list(sentences[constr_id])
        except KeyError:
            gold_candidates = []
        if not gold_candidates:
            # No sentences for this construction: keep one placeholder row per attempt.
            rows.extend(
                {"constr": key, "definition": definition, "example": None, "example_kelex": None, "prediction": None, "correct": None}
                for _ in range(attempts)
            )
            continue

        # kelex is keyed by integer construction ids, so look it up with the same
        # int conversion that is used for `sentences`.
        # Surface form (leading non-digit part of the label) -> pseudoword token,
        # computed once per construction; empty if the construction has none.
        surface_to_pseudoword = {}
        for pseudoword in kelex.get(constr_id) or ():
            surface_to_pseudoword.setdefault(re.findall(r'\D+', pseudoword)[0], pseudoword)

        # All distinct sentences of the *other* constructions, in deterministic order.
        distractor_pool = list(dict.fromkeys(
            other
            for constr, sentence_list in sentences.items()
            if int(constr) != constr_id
            for other in sentence_list
        ))

        for attempt in range(attempts):
            sentence = random.choice(gold_candidates)
            if surface_to_pseudoword:
                # Only whole whitespace-separated tokens are replaced.
                sentence_kelex = " ".join(surface_to_pseudoword.get(token, token) for token in sentence.split())
            else:
                # No pseudowords for this construction: use the sentence unchanged.
                sentence_kelex = sentence

            # Sample WITHOUT replacement so a "1 vs k" trial really has k distinct
            # distractors; one spare covers a gold sentence that also occurs elsewhere.
            drawn = random.sample(distractor_pool, min(false_positives + 1, len(distractor_pool)))
            others = [other for other in drawn if other not in (sentence, sentence_kelex)][:false_positives]
            examples = others + [sentence_kelex]
            # Seeded shuffle: the gold position is random but reproducible.
            random.shuffle(examples)
            prediction = find_examples(definition, examples)
            rows.append({"constr": key, "definition": definition, "example": sentence, "example_kelex": sentence_kelex, "prediction": prediction, "correct": prediction == sentence_kelex})
    result = pd.DataFrame(rows)
    result.to_csv(f"../../out/comapp/result_1_vs_{false_positives}_kelex_all_{attempts}attempts_bsbbert.tsv", sep="\t")

  0%|          | 0/211 [00:00<?, ?it/s]

  0%|          | 0/211 [00:00<?, ?it/s]

  0%|          | 0/211 [00:00<?, ?it/s]

  0%|          | 0/211 [00:00<?, ?it/s]

  0%|          | 0/211 [00:00<?, ?it/s]

  0%|          | 0/211 [00:00<?, ?it/s]