In [25]:
import itertools
import pickle
import random
import re

import numpy as np
import pandas as pd
import torch
from transformers import BertForNextSentencePrediction, BertTokenizer

from tqdm.notebook import tqdm
tqdm.pandas()

In [26]:
# Read the pseudoword embedding chunks (15 .npy files whose names encode the
# ranges 0_37, 37_74, ...) and stack them into a single array.
pseudowords = np.concatenate([
    np.load(f"../../data/pseudowords/bsbbert/pseudowords_comapp_bsbbert_{start}_{start + 37}.npy")
    for start in range(0, 15 * 37, 37)
])

# Read the matching 15 "order" CSVs (';'-separated, '|'-quoted, no header):
# the first column becomes the index ("order"), the second is the "label".
csv_data = pd.concat([
    pd.read_csv(
        f"../../data/pseudowords/bsbbert/order_bsbbert_{part}.csv",
        sep=";",
        index_col=0,
        header=None,
        quotechar="|",
        names=["order", "label"],
    )
    for part in range(1, 16)
])

# The labels are the token strings that are later added to the tokenizer.
bert_tokens = [row[0] for row in csv_data.values]
bert_tokens

['""Was13',
 '"647',
 '"Wir-äh-spielen-äh-in-der-äh-Champions-League647',
 '(1597',
 '(1600',
 '(1602',
 '(1624',
 '(1637',
 '(1639',
 '(1641',
 '(1643',
 '(1645',
 '(379',
 '(579',
 '(581',
 '(584',
 '(590',
 '(592',
 '(600',
 '(886',
 '(889',
 '(892',
 '(900',
 '(905',
 '(907',
 '(909',
 '(911',
 '(917',
 '(919',
 '(921',
 '(923',
 ')1597',
 ')1600',
 ')1602',
 ')1624',
 ')1637',
 ')1639',
 ')1641',
 ')1643',
 ')1645',
 ')1792',
 ')379',
 ')579',
 ')581',
 ')584',
 ')590',
 ')592',
 ')600',
 ')886',
 ')889',
 ')892',
 ')900',
 ')905',
 ')907',
 ')909',
 ')911',
 ')917',
 ')919',
 ')921',
 ')923',
 ')«579',
 ',1459',
 ',973',
 '-128',
 '-651',
 '-654',
 '-875',
 '-973',
 ':595',
 ':875',
 ':973',
 'Abstand683',
 'Allein20',
 'Aller1630',
 'Als1315',
 'Als133',
 'Als1770',
 'Am488',
 'Am492',
 'Am500',
 'Amerika605',
 'Anstatt320',
 'Art129',
 'Arzt1509',
 'Augenblick1301',
 'Ausmaß1777',
 'BRUTAL1503',
 'Besser1762',
 'Bis559',
 'Brutal1503',
 'Buche1346',
 'Das1313',
 'Das1461',
 'Da

In [27]:
# Load German BERT with a next-sentence-prediction (NSP) head and extend its
# vocabulary with the pseudoword tokens and their pre-computed embeddings.
#
# NOTE(review): when this cell runs, transformers warns that
# 'cls.seq_relationship.weight' and 'cls.seq_relationship.bias' are not part
# of this checkpoint and are newly initialised. The NSP head therefore looks
# untrained here; confirm this is intended before interpreting predictions.
model = BertForNextSentencePrediction.from_pretrained("dbmdz/bert-base-german-cased", return_dict=True)
tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased')

# Detach so that concatenation does not build an autograd graph, and cast the
# numpy pseudoword matrix to the dtype of the existing embedding table.
base_embeddings = model.bert.embeddings.word_embeddings.weight.detach()
pseudoword_embeddings = torch.as_tensor(pseudowords, dtype=base_embeddings.dtype)
if pseudoword_embeddings.ndim != 2 or pseudoword_embeddings.shape[1] != base_embeddings.shape[1]:
    raise ValueError(
        f"pseudoword embeddings have shape {tuple(pseudoword_embeddings.shape)}, "
        f"expected (n, {base_embeddings.shape[1]})"
    )
if len(bert_tokens) != pseudoword_embeddings.shape[0]:
    raise ValueError(
        f"{len(bert_tokens)} pseudoword tokens but {pseudoword_embeddings.shape[0]} embedding rows"
    )

combined_embeddings = torch.cat((base_embeddings, pseudoword_embeddings), dim=0)
model.bert.embeddings.word_embeddings = torch.nn.Embedding.from_pretrained(combined_embeddings)

# add_tokens() skips tokens that already exist (or repeat), which would shift
# every following pseudoword onto the wrong embedding row; fail loudly instead.
num_added = tokenizer.add_tokens(bert_tokens)
if num_added != len(bert_tokens):
    raise ValueError(
        f"only {num_added} of {len(bert_tokens)} pseudoword tokens were added to the tokenizer"
    )
if len(tokenizer) != combined_embeddings.shape[0]:
    raise ValueError(
        f"tokenizer has {len(tokenizer)} tokens but the embedding table has "
        f"{combined_embeddings.shape[0]} rows"
    )

# Sizes already agree (checked above); this call keeps the model config in sync.
model.resize_token_embeddings(len(tokenizer))
model.to("cuda:0")

Some weights of BertForNextSentencePrediction were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 31657. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


BertForNextSentencePrediction(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31657, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

In [28]:
# Load the pickled inputs of the evaluation.
# `definitions` is iterated with .items() further down (key -> definition text)
# and `sentences` is indexed by integer construction id.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open("../../out/definitions.pickle", "rb") as file:
    definitions = pickle.load(file)

with open("../../out/sentences.pickle", "rb") as file:
    sentences = pickle.load(file)

In [29]:
def find_examples(definition, examples):
    """Return the examples that the NSP model accepts as a continuation of `definition`.

    Each example is prefixed with "Zum Beispiel: " and scored on its own as the
    second segment of a (definition, example) pair. An example is kept when the
    first logit (index 0) is larger than the second (index 1).

    Uses the notebook-level `tokenizer` and `model`.

    Args:
        definition: Definition text used as the first segment.
        examples: Sequence of candidate example sentences (strings).

    Returns:
        List of the accepted examples, in their original order.
    """
    max_len = 512        # the model's position-embedding table has 512 entries
    special_tokens = 3   # [CLS] and 2x [SEP]
    ellipsis_tokens = 3  # budget reserved for the appended " ..."

    tok_definition = tokenizer.tokenize(definition)
    full_prompt = tokenizer.convert_tokens_to_string(tok_definition)

    accepted = []
    for example in examples:
        tok_example = tokenizer.tokenize("Zum Beispiel: " + example)
        if len(tok_definition) + len(tok_example) + special_tokens > max_len:
            # Shorten the definition so that the example fits fully. Clamp at 0:
            # a negative bound would slice from the END of the definition and
            # keep far too many tokens when the example alone is too long.
            keep = max(0, max_len - len(tok_example) - special_tokens - ellipsis_tokens)
            prompt = tokenizer.convert_tokens_to_string(tok_definition[:keep]) + " ..."
        else:
            prompt = full_prompt

        # truncation/max_length are a safety net: re-tokenising the detokenised
        # text is not guaranteed to reproduce the token counts computed above.
        inputs = tokenizer(
            prompt,
            tokenizer.convert_tokens_to_string(tok_example),
            return_tensors="pt",
            truncation=True,
            max_length=max_len,
        ).to(model.device)  # follow the model instead of hard-coding a device
        with torch.no_grad():
            logits = model(**inputs).logits
        # logit 0 > logit 1: the next sentence is not random
        if bool(logits[0, 0] > logits[0, 1]):
            accepted.append(example)
    return accepted

In [30]:
# Evaluate find_examples() for every mix of `num_true` genuine example
# sentences and `num_false` sentences drawn from other constructions
# (0..4 each, skipping the empty 0/0 mix). For every construction,
# `attempts` random draws are scored and all rows of one mix are written
# to one TSV file.
random.seed(15)
attempts = 15


def _ratio(numerator, denominator, if_undefined):
    """Return numerator / denominator, or `if_undefined` when the denominator is 0."""
    return numerator / denominator if denominator else if_undefined


for num_true in range(0, 5):
    for num_false in range(0, 5):
        if not num_true and not num_false:
            continue  # skip (0, 0)
        result = []
        for key, definition in tqdm(definitions.items()):
            # flatten all other sentences which are not part of the current construction
            others = list(itertools.chain.from_iterable(
                sentence_list for constr, sentence_list in sentences.items() if int(constr) != int(key)
            ))

            # Materialise the pool of true sentences once per construction
            # instead of once per random draw. As before, a missing
            # construction only matters when true sentences are requested.
            try:
                true_pool = list(sentences[int(key)]) if num_true else []
            except KeyError:
                true_pool = None

            for attempt in range(attempts):
                if true_pool is None:
                    # no example sentences for this construction: placeholder row
                    result.append(pd.Series({"constr": key, "definition": definition}))
                    continue

                # pick the true positives (drawn with replacement, then de-duplicated)
                true_sentences = {random.choice(true_pool) for _ in range(num_true)}
                # pick random false positives from the other sentences
                false_sentences = set(random.choices(others, k=num_false))
                # sorted: set iteration order of strings is not stable between runs
                examples = sorted(false_sentences | true_sentences)

                positive_predicted = find_examples(definition, examples)
                predicted = set(positive_predicted)
                negative_predicted = [ex for ex in examples if ex not in predicted]

                true_positives = [pr for pr in positive_predicted if pr in true_sentences]
                false_positives = [pr for pr in positive_predicted if pr in false_sentences]
                false_negatives = [pr for pr in negative_predicted if pr in true_sentences]
                true_negatives = [pr for pr in negative_predicted if pr in false_sentences]

                # nothing predicted -> everything predicted is correct -> 1.0
                precision = _ratio(len(true_positives), len(true_positives) + len(false_positives), 1.0)
                # Recall is only vacuously 1.0 when there was nothing to find.
                # Missing every true sentence (no TP, some FN) must give 0.0.
                recall = _ratio(len(true_positives), len(true_positives) + len(false_negatives), 1.0)
                # precision == recall == 0 is possible now; define F1 as 0.0 there
                f1 = _ratio(2 * precision * recall, precision + recall, 0.0)

                result.append(pd.Series({
                    "constr": key,
                    "definition": definition,
                    "examples": examples,
                    "positive_predicted": positive_predicted,
                    "negative_predicted": negative_predicted,
                    "true_positives": true_positives,
                    "false_positives": false_positives,
                    "false_negatives": false_negatives,
                    "true_negatives": true_negatives,
                    "precision": precision,
                    "recall": recall,
                    "f1": f1,
                    "accuracy": (len(true_positives) + len(true_negatives)) / (len(true_sentences) + len(false_sentences))
                }))

        result = pd.DataFrame(result)
        result.to_csv(f"../../out/comapp/result_{num_true}t_vs_{num_false}f_{attempts}attempts_bsbbert.tsv", sep="\t", decimal=",")

  0%|          | 0/211 [00:00<?, ?it/s]

  0%|          | 0/211 [00:00<?, ?it/s]

  0%|          | 0/211 [00:00<?, ?it/s]

  0%|          | 0/211 [00:00<?, ?it/s]

  0%|          | 0/211 [00:00<?, ?it/s]

  0%|          | 0/211 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Build `kelex`: construction id (int) -> set of pseudoword labels.
# The id is the first run of digits found in each label.
# Raw string: '\d' in a normal string literal is an invalid escape sequence.
constr_ids = csv_data['label'].str.extract(r'(\d+)', expand=False).astype(int)
# .to_numpy() assigns positionally, so a repeated "order" index cannot misalign rows
kelex = (
    csv_data
    .assign(constr=constr_ids.to_numpy())
    .groupby('constr')['label']
    .apply(set)
    .to_dict()
)
kelex

In [None]:
# TODO
# 1-vs-N evaluation with pseudowords: one sentence of the construction, with
# its words replaced by the construction's pseudoword tokens, is mixed with
# `num_true` sentences from other constructions (despite its name, the loop
# variable counts the distractors). Constructions without pseudowords are
# skipped.
random.seed(15)
attempts = 10
for num_true in range(2, 8):
    result = []
    for key, definition in tqdm(definitions.items()):
        constr_id = int(key)

        # `kelex` is keyed by int construction ids, so look it up with int(key)
        # like every other per-construction lookup in this notebook.
        # Map: word form (first non-digit run of the pseudoword) -> pseudoword.
        word_to_pseudoword = {}
        for pseudoword in kelex.get(constr_id) or ():
            word = re.search(r'\D+', pseudoword)
            if word:
                word_to_pseudoword.setdefault(word.group(), pseudoword)

        # Flatten the sentences of all other constructions once per
        # construction instead of once per attempt.
        others = list(itertools.chain.from_iterable(
            sentence_list for constr, sentence_list in sentences.items() if int(constr) != constr_id
        ))

        try:
            candidates = list(sentences[constr_id])
        except KeyError:
            candidates = None

        for attempt in range(attempts):
            if candidates is None:
                # no example sentences for this construction: placeholder row
                result.append(pd.Series({"constr": key, "definition": definition, "example": None, "prediction": None, "correct": None}))
                continue
            sentence = random.choice(candidates)
            if not word_to_pseudoword:
                continue  # skip constructions without kelex
            sentence_kelex = " ".join(word_to_pseudoword.get(token, token) for token in sentence.split())

            distractors = random.choices(others, k=num_true)
            examples = list(set(distractors) | {sentence_kelex})
            positive_predicted = find_examples(definition, examples)
            result.append(pd.Series({
                "constr": key,
                "definition": definition,
                "example": sentence,
                "example_kelex": sentence_kelex,
                "prediction": positive_predicted,
                # find_examples() returns a list: correct means exactly the
                # pseudoword sentence, and nothing else, was selected.
                "correct": positive_predicted == [sentence_kelex],
            }))
    result = pd.DataFrame(result)
    result.to_csv(f"../../out/comapp/result_1_vs_{num_true}_kelex_{attempts}attempts_bsbbert.tsv", sep="\t")

In [None]:
# 1-vs-N evaluation over ALL constructions: one sentence of the construction
# is mixed with `num_true` sentences from other constructions (despite its
# name, the loop variable counts the distractors). Where the construction has
# pseudowords, matching words of the sentence are replaced by them; otherwise
# the sentence is used unchanged.
random.seed(15)
attempts = 10
for num_true in range(2, 8):
    result = []
    for key, definition in tqdm(definitions.items()):
        constr_id = int(key)

        # `kelex` is keyed by int construction ids, so look it up with int(key)
        # like every other per-construction lookup in this notebook.
        # Map: word form (first non-digit run of the pseudoword) -> pseudoword.
        has_kelex = bool(kelex.get(constr_id))
        word_to_pseudoword = {}
        for pseudoword in kelex.get(constr_id) or ():
            word = re.search(r'\D+', pseudoword)
            if word:
                word_to_pseudoword.setdefault(word.group(), pseudoword)

        # Flatten the sentences of all other constructions once per
        # construction instead of once per attempt.
        others = list(itertools.chain.from_iterable(
            sentence_list for constr, sentence_list in sentences.items() if int(constr) != constr_id
        ))

        try:
            candidates = list(sentences[constr_id])
        except KeyError:
            candidates = None

        for attempt in range(attempts):
            if candidates is None:
                # no example sentences for this construction: placeholder row
                result.append(pd.Series({"constr": key, "definition": definition, "example": None, "prediction": None, "correct": None}))
                continue
            sentence = random.choice(candidates)
            if has_kelex:
                sentence_kelex = " ".join(word_to_pseudoword.get(token, token) for token in sentence.split())
            else:
                # construction without kelex: keep the plain sentence
                sentence_kelex = sentence

            distractors = random.choices(others, k=num_true)
            examples = list(set(distractors) | {sentence_kelex})
            positive_predicted = find_examples(definition, examples)
            result.append(pd.Series({
                "constr": key,
                "definition": definition,
                "example": sentence,
                "example_kelex": sentence_kelex,
                "prediction": positive_predicted,
                # find_examples() returns a list: correct means exactly the
                # target sentence, and nothing else, was selected.
                "correct": positive_predicted == [sentence_kelex],
            }))
    result = pd.DataFrame(result)
    result.to_csv(f"../../out/comapp/result_1_vs_{num_true}_kelex_all_{attempts}attempts_bsbbert.tsv", sep="\t")