In [1]:
import itertools
import pickle
import random
import re

import numpy as np
import pandas as pd
import torch
from transformers import BertForNextSentencePrediction, BertTokenizer

from tqdm.notebook import tqdm
tqdm.pandas()

In [2]:
pseudowords = []
for i in range(15):
    pseudowords.append(np.load(f"../../data/pseudowords/bsbbert/pseudowords_comapp_bsbbert_{i*37}_{i*37+37}.npy"))
pseudowords = np.concatenate(pseudowords)

csv_data = []
for i in range(1, 16):
    csv_data.append(pd.read_csv(f"../../data/pseudowords/bsbbert/order_bsbbert_{i}.csv", sep=";", index_col=0, header=None, quotechar="|", names=["order", "label"]))
csv_data = pd.concat(csv_data)

bert_tokens = [d[0] for d in csv_data.values]
bert_tokens

['""Was13',
 '"647',
 '"Wir-äh-spielen-äh-in-der-äh-Champions-League647',
 '(1597',
 '(1600',
 '(1602',
 '(1624',
 '(1637',
 '(1639',
 '(1641',
 '(1643',
 '(1645',
 '(379',
 '(579',
 '(581',
 '(584',
 '(590',
 '(592',
 '(600',
 '(886',
 '(889',
 '(892',
 '(900',
 '(905',
 '(907',
 '(909',
 '(911',
 '(917',
 '(919',
 '(921',
 '(923',
 ')1597',
 ')1600',
 ')1602',
 ')1624',
 ')1637',
 ')1639',
 ')1641',
 ')1643',
 ')1645',
 ')1792',
 ')379',
 ')579',
 ')581',
 ')584',
 ')590',
 ')592',
 ')600',
 ')886',
 ')889',
 ')892',
 ')900',
 ')905',
 ')907',
 ')909',
 ')911',
 ')917',
 ')919',
 ')921',
 ')923',
 ')«579',
 ',1459',
 ',973',
 '-128',
 '-651',
 '-654',
 '-875',
 '-973',
 ':595',
 ':875',
 ':973',
 'Abstand683',
 'Allein20',
 'Aller1630',
 'Als1315',
 'Als133',
 'Als1770',
 'Am488',
 'Am492',
 'Am500',
 'Amerika605',
 'Anstatt320',
 'Art129',
 'Arzt1509',
 'Augenblick1301',
 'Ausmaß1777',
 'BRUTAL1503',
 'Besser1762',
 'Bis559',
 'Brutal1503',
 'Buche1346',
 'Das1313',
 'Das1461',
 'Da

In [3]:
model = BertForNextSentencePrediction.from_pretrained("dbmdz/bert-base-german-cased", return_dict=True)
tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased')

combined_embeddings = torch.cat((model.bert.embeddings.word_embeddings.weight, torch.tensor(pseudowords)), dim=0)
model.bert.embeddings.word_embeddings = torch.nn.Embedding.from_pretrained(combined_embeddings)
tokenizer.add_tokens(bert_tokens)
model.resize_token_embeddings(len(tokenizer))
model.to("cuda:0")

Some weights of BertForNextSentencePrediction were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 31657. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


BertForNextSentencePrediction(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31657, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

In [4]:
with open("../../out/definitions.pickle", "rb") as file:
    definitions = pickle.load(file)
with open("../../out/sentences.pickle", "rb") as file:
    sentences = pickle.load(file)

In [5]:
def find_examples(definition, examples, true_examples=0):
    predictions = {}
    tok_examples = [tokenizer.tokenize(example) for example in examples]
    tok_eg = tokenizer.tokenize("zum Beispiel:")
    tok_definition = tokenizer.tokenize(definition[:definition.index(".")])  # only the first sentence of the definition is used, so BERT has a chance of keeping some of the definition in mind
    for num, tok_example in enumerate(tok_examples):
        len_prompt = len(tok_definition) + len(tok_eg) + len(tok_example) + 3  # 3 extra tokens for [CLS] and [SEP] (2x)
        if len_prompt > 512:
            # shorten the definition so that the example fits fully, and add "...," (again, 4 additional tokens)
            prompt = tokenizer.convert_tokens_to_string(tok_definition[:512-len(tok_example)-len(tok_eg)-3-4]) + "..., zum Beispiel:"
        else:
            prompt = tokenizer.convert_tokens_to_string(tok_definition) + ", zum Beispiel:"
            
        inputs = tokenizer(prompt, tokenizer.convert_tokens_to_string(tok_example), return_tensors="pt").to("cuda:0")
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits
        if true_examples == 0:
            predictions[num] = logits[0, 0] > logits[0, 1]  # next sentence is not random
        else:
            predictions[num] = logits[0, 0]  # collect in order to sort later
    if true_examples == 0:
        res = [examples[num] for num, p in predictions.items() if p]  # return all sentences which have been classified as correct
    else:
        sorted_idxs = sorted(range(len(examples)), key=lambda i: predictions[i], reverse=True)[:true_examples]
        res = [examples[i] for i in sorted_idxs]
    return res

In [12]:
def get_metrics(positive_predicted, negative_predicted, true_sentences, false_sentences, key, definition, examples):
    true_positives = [pr for pr in positive_predicted if pr in true_sentences]
    false_positives = [pr for pr in positive_predicted if pr in false_sentences]
    false_negatives = [pr for pr in negative_predicted if pr in true_sentences]
    true_negatives = [pr for pr in negative_predicted if pr in false_sentences]
    
    if len(true_positives) + len(false_positives) > 0:
        precision = len(true_positives) / (len(true_positives) + len(false_positives))
    else:
        precision = 1.0  # nothing found, so all things found are correct
    
    if len(true_positives) > 0:
        recall = len(true_positives) / (len(true_positives) + len(false_negatives))
    else:
        recall = 1.0  # all found
        
    return pd.Series({
        "constr": key, 
        "definition": definition, 
        "examples": examples, 
        "positive_predicted": positive_predicted,
        "negative_predicted": negative_predicted,
        "true_positives": true_positives,
        "false_positives": false_positives,
        "false_negatives": false_negatives,
        "true_negatives": true_negatives,
        "precision": precision,
        "recall": recall,
        "f1": (2 * precision * recall) / (precision + recall),
        "accuracy": (len(true_positives) + len(true_negatives)) / (len(true_sentences) + len(false_sentences))
    })

In [12]:
random.seed(15)
attempts = 15
for num_true in range(0, 6):
    for num_false in range(0, 6):
        if not num_true and not num_false:
            continue  # skip (0, 0)
        result = []
        for key, definition in tqdm(definitions.items()):
            others = list(itertools.chain.from_iterable([sentence_list for constr, sentence_list in sentences.items() if int(constr) != int(key)]))  # flatten all other sentences which are not part of the current construction
            
            for attempt in range(attempts):
                try:
                    # pick the true elements of the current construction
                    true_sentences = {random.choice(list(sentences[int(key)])) for t in range(num_true)}
                except KeyError:
                    result.append(pd.Series({"constr": key, "definition": definition}))
                    continue
    
                # pick random false positives from the other sentences
                false_sentences = set(random.choices(others, k=num_false))
                examples = list(false_sentences | true_sentences)
                
                positive_predicted = find_examples(definition, examples)
                negative_predicted = [ex for ex in examples if ex not in positive_predicted]
                
                result.append(get_metrics(positive_predicted, negative_predicted, true_sentences, false_sentences, key, definition, examples))
                
        result = pd.DataFrame(result)
        result.to_csv(f"../../out/comapp/result_{num_true}t_vs_{num_false}f_{attempts}attempts_bsbbert.tsv", sep="\t", decimal=",", header=True, index=False)

  0%|          | 0/211 [00:00<?, ?it/s]

  0%|          | 0/211 [00:00<?, ?it/s]

Exception ignored in: <function tqdm.__del__ at 0x7fcbd75784c0>
Traceback (most recent call last):
  File "/home/tim/Projekte/llm-x-cxg/venv/lib/python3.10/site-packages/tqdm/std.py", line 1148, in __del__
    def __del__(self):
KeyboardInterrupt: 


KeyboardInterrupt: 

In [7]:
kelex = csv_data.copy()
kelex['constr'] = csv_data['label'].str.extract('(\d+)').astype(int)
#kelex.set_index('constr', inplace=True)
kelex = kelex.groupby('constr')['label'].apply(set).to_dict()
kelex

{5: {'Und5', 'erst5', 'gar5', 'nicht5', 'recht5', 'schon5', 'und5'},
 10: {'Geschweige10', 'denn10', 'geschweige10'},
 11: {'Und11', 'kaum11', 'und11', 'wohl11'},
 12: {'Weder12', 'noch12', 'weder12'},
 13: {'""Was13', 'Was13', 'für13', 'was13'},
 14: {'Wie14'},
 15: {'Welch15', 'welch15'},
 16: {'Was16'},
 19: {'Diese19', 'Dieser19', 'Dieses19', 'dieses19'},
 20: {'Allein20'},
 21: {'Dass21'},
 22: {'So22'},
 74: {'So74', 'so74'},
 78: {'Solch78', 'Solche78', 'solch78'},
 83: {'gegen83'},
 97: {'als97'},
 98: {'So98', 'ebenso98', 'genauso98', 'gleich98', 'so98'},
 99: {'er99'},
 100: {'gleich100'},
 101: {'wie101'},
 103: {'wie103'},
 104: {'Dasselbe104',
  'Gleiche104',
  'das104',
  'dasselbe104',
  'gleiche104',
  'nämliche104'},
 111: {'Desto111', 'Je111', 'Umso111', 'desto111', 'je111', 'umso111'},
 122: {'ebenso122', 'genauso122'},
 125: {'es125', 'ist125', 'sei125', 'wie125'},
 127: {'ein127', 'einem127', 'einer127', 'von127'},
 128: {'-128', 'artig128'},
 129: {'Art129', 'eine

In [17]:
random.seed(15)
attempts = 15
for num_true in range(1, 6):
    for num_false in range(0, 6):
        if not num_true and not num_false:
            continue  # skip (0, 0)
        result = []
        for key, definition in tqdm(definitions.items()):
            others = list(itertools.chain.from_iterable([sentence_list for constr, sentence_list in sentences.items() if int(constr) != int(key)]))
            for attempt in range(attempts):
                try:
                    # pick the true elements of the current construction
                    true_sentences = {random.choice(list(sentences[int(key)])) for t in range(num_true)}
                except KeyError:
                    result.append(pd.Series({"constr": key, "definition": definition}))
                    continue
                    
                true_sentences_kelex = []
                if kelex.get(key):  # if there is a predefined KE-lex in the construction
                    # if len(true_sentences) > 0:
                        for pseudoword in kelex[key]:  # replace each predefined KE-lex in the sentence one-by-one (if there are multiple)
                            cur_true_sentences = []
                            for sentence in true_sentences:
                                pseudoword_found = False
                                sentence_kelex = []
                                for token in sentence.split():
                                    new_token = token
                                    if token == re.findall(r'\D+', pseudoword)[0]:
                                        new_token = pseudoword
                                        pseudoword_found = True
                                    sentence_kelex.append(new_token)
                                if pseudoword_found:  # only add the sentence if it has a replaced pseudoword
                                    cur_true_sentences.append(" ".join(sentence_kelex))
                            #if len(cur_true_sentences) > 0:
                            true_sentences_kelex.append(set(cur_true_sentences))
                else:
                    continue  # skip constructions without kelex
                
                for true_sentences in true_sentences_kelex:
                    if len(true_sentences) == num_true:  # watch out that the number of true_sentences is still coherent with the number predefined by num_true (e.g. in case a specific KE-lex wasn't in the sentence)
                        # pick random false positives from the other sentences                    
                        false_sentences = set(random.choices(others, k=num_false))
                        examples = list(false_sentences | true_sentences)
                        
                        positive_predicted = find_examples(definition, examples)
                        negative_predicted = [ex for ex in examples if ex not in positive_predicted]
                        
                        result.append(get_metrics(positive_predicted, negative_predicted, true_sentences, false_sentences, key, definition, examples))
                    
                
        result = pd.DataFrame(result)
        result.to_csv(f"../../out/comapp/result_{num_true}t_vs_{num_false}f_kelex_{attempts}attempts_bsbbert.tsv", sep="\t", decimal=",", header=True, index=False)

  0%|          | 0/211 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [15]:
random.seed(15)
attempts = 15
for num_true in range(1, 6):
    for num_false in range(1, 6):
        if not num_true and not num_false:
            continue  # skip (0, 0)
        result = []
        for key, definition in tqdm(definitions.items()):
            others = list(itertools.chain.from_iterable([sentence_list for constr, sentence_list in sentences.items() if int(constr) != int(key)]))  # flatten all other sentences which are not part of the current construction
            
            for attempt in range(attempts):
                try:
                    # pick the true elements of the current construction
                    true_sentences = {random.choice(list(sentences[int(key)])) for t in range(num_true)}
                except KeyError:
                    result.append(pd.Series({"constr": key, "definition": definition}))
                    continue
    
                # pick random false positives from the other sentences
                false_sentences = set(random.choices(others, k=num_false))
                examples = list(false_sentences | true_sentences)
                
                positive_predicted = find_examples(definition, examples, num_true)
                negative_predicted = [ex for ex in examples if ex not in positive_predicted]
                
                result.append(get_metrics(positive_predicted, negative_predicted, true_sentences, false_sentences, key, definition, examples))
                
        result = pd.DataFrame(result)
        result.to_csv(f"../../out/comapp/result_{num_true}t_vs_{num_false}f_{attempts}attempts_bsbbert_2.tsv", sep="\t", decimal=",", header=True, index=False)

  0%|          | 0/211 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
random.seed(15)
attempts = 15
for num_true in range(1, 6):
    for num_false in range(1, 6):
        result = []
        for key, definition in tqdm(definitions.items()):
            others = list(itertools.chain.from_iterable([sentence_list for constr, sentence_list in sentences.items() if int(constr) != int(key)]))
            for attempt in range(attempts):
                try:
                    # pick the true elements of the current construction
                    true_sentences = {random.choice(list(sentences[int(key)])) for t in range(num_true)}
                except KeyError:
                    result.append(pd.Series({"constr": key, "definition": definition}))
                    continue
                    
                true_sentences_kelex = []
                if kelex.get(key):  # if there is a predefined KE-lex in the construction
                    # if len(true_sentences) > 0:
                        for pseudoword in kelex[key]:  # replace each predefined KE-lex in the sentence one-by-one (if there are multiple)
                            cur_true_sentences = []
                            for sentence in true_sentences:
                                pseudoword_found = False
                                sentence_kelex = []
                                for token in sentence.split():
                                    new_token = token
                                    if token == re.findall(r'\D+', pseudoword)[0]:
                                        new_token = pseudoword
                                        pseudoword_found = True
                                    sentence_kelex.append(new_token)
                                if pseudoword_found:  # only add the sentence if it has a replaced pseudoword
                                    cur_true_sentences.append(" ".join(sentence_kelex))
                            #if len(cur_true_sentences) > 0:
                            true_sentences_kelex.append(set(cur_true_sentences))
                else:
                    continue  # skip constructions without kelex
                
                for true_sentences in true_sentences_kelex:
                    if len(true_sentences) == num_true:  # watch out that the number of true_sentences is still coherent with the number predefined by num_true (e.g. in case a specific KE-lex wasn't in the sentence)
                        # pick random false positives from the other sentences                    
                        false_sentences = set(random.choices(others, k=num_false))
                        examples = list(false_sentences | true_sentences)
                        
                        positive_predicted = find_examples(definition, examples, true_examples=num_true)
                        negative_predicted = [ex for ex in examples if ex not in positive_predicted]
                        
                        result.append(get_metrics(positive_predicted, negative_predicted, true_sentences, false_sentences, key, definition, examples))
                        
                
        result = pd.DataFrame(result)
        result.to_csv(f"../../out/comapp/result_{num_true}t_vs_{num_false}f_kelex_{attempts}attempts_bsbbert_2.tsv", sep="\t", decimal=",", header=True, index=False)