In [1]:
# Pinned dependencies for this notebook. Uncomment and run once in a fresh environment.
# The extra index URL below points at the auto-gptq wheel index for CUDA 11.8 ("cu118").
#!pip3 install transformers==4.33.2
#!pip3 install optimum==1.13.2
#!pip3 install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/

In [2]:
import ast
import csv
import itertools
import json
import os
import pickle
import re

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from tqdm.notebook import tqdm, trange
from transformers import AutoModelForCausalLM, AutoTokenizer, BertForMaskedLM, BertTokenizer

# Hook tqdm into pandas so that the `progress_apply` family of methods becomes available.
tqdm.pandas()

### Models
First, let us load a LLAMA model:

In [3]:
# GPTQ-quantised German Llama-2 13B checkpoint; the revision name selects the quantisation variant.
llama_name_or_path = 'TheBloke/Llama-2-13B-German-Assistant-v4-GPTQ'
llama_load_options = dict(
    device_map="cuda:0",        # place the whole model on the first GPU
    trust_remote_code=False,    # never execute code shipped with the checkpoint
    revision="gptq-4bit-32g-actorder_True",
)
llama = AutoModelForCausalLM.from_pretrained(llama_name_or_path, **llama_load_options)
llama_tokenizer = AutoTokenizer.from_pretrained(llama_name_or_path)

Then, we also need a BERT model:

In [4]:
# One checkpoint name feeds both the masked-LM model and its tokenizer.
bert_name_or_path = 'dbmdz/bert-base-german-cased'
bert_base = BertForMaskedLM.from_pretrained(bert_name_or_path, return_dict=True)
bert_base_tokenizer = BertTokenizer.from_pretrained(bert_name_or_path)
# Displayed so the original vocabulary size can be compared once the pseudowords are appended.
bert_base.bert.embeddings.word_embeddings

Some weights of the model checkpoint at dbmdz/bert-base-german-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Embedding(31102, 768, padding_idx=0)

### Data
Now, we load some data we need. First, we need some definitions and their example sentences.

In [5]:
# NOTE: pickle.load can execute arbitrary code from the file; only load artefacts you produced yourself.
with open("../../out/definitions.pickle", "rb") as definitions_file, \
        open("../../out/sentences.pickle", "rb") as sentences_file:
    definitions = pickle.load(definitions_file)
    sentences = pickle.load(sentences_file)

In [6]:
# Show the loaded definitions (the driver loop further down indexes this mapping by construction id).
definitions

{10: 'Die "Negation:NEG_XgeschweigedennY-Konstruktion" korreliert zwei Propositionen, die jeweils einen Punkt auf einem pragmatisch definiertem Maßstab markieren. Die erste Proposition ist pragmatisch stärker als die zweite Proposition; so zieht, pragmatisch betrachtet, die Wahrheit der ersten Proposition automatisch die Wahrheit der zweiten Proposition nach sich. Einfacher ausgedrückt: Ist die erste Proposition wahr, so muss die zweite Proposition ebenfalls wahr sein. Die erste Proposition ist darüber hinaus auch informativer als die zweite Proposition, da die erstgenannte pragmatisch betrachtet die letztgenannte einschließt, während diese wiederum die diskursrelevantere der beiden Propositionen ist. Die Konstruktion umfasst das konstruktionsevozierende Element (KEE) "geschweige_denn", die internen Kern-Konstruktionselemente (Kern-KE) "Erstes_Konjunkt" und "Zweites_Konjunkt" sowie die externen Kern-KE "Negator" und "Fokuskontext". Das "Erste_Konjunkt" geht dem "KEE" voraus, auf welche

Next up, we load the prepared BERT pseudoword embeddings.

In [7]:
# The pseudoword embeddings are split across 15 files whose names advance in steps of 37;
# load them in order and stack them into a single array.
pseudoword_chunks = [
    np.load(f"../../data/pseudowords/bsbbert/pseudowords_comapp_bsbbert_{i*37}_{i*37+37}.npy")
    for i in range(15)
]
pseudowords = np.concatenate(pseudoword_chunks)
pseudowords

array([[-3.3182416 ,  2.3023303 ,  1.777937  , ..., -1.2179666 ,
         0.08523893, -1.138552  ],
       [-0.47602665,  1.2971725 , -0.41911647, ...,  0.74555194,
         0.5969563 ,  0.48672953],
       [-0.49536157,  0.59784025,  1.3000653 , ..., -0.81466854,
         0.5967385 , -0.31931594],
       ...,
       [-0.8017223 ,  1.0898677 ,  0.5879395 , ...,  0.28440672,
        -0.3114752 , -1.0823542 ],
       [-0.686387  ,  0.6861806 , -0.02480252, ..., -0.21791594,
         1.0139762 , -0.6977931 ],
       [-0.9309381 ,  1.19819   ,  3.1951077 , ...,  2.9199026 ,
         0.9376991 , -0.79965603]], dtype=float32)

In [8]:
# One "order" file per pseudoword chunk (numbered 1..15); each row maps a position to a token label.
order_frames = [
    pd.read_csv(
        f"../../data/pseudowords/bsbbert/order_bsbbert_{i}.csv",
        sep=";",
        index_col=0,
        header=None,
        quotechar="|",
        names=["order", "label"],
    )
    for i in range(1, 16)
]
csv_data = pd.concat(order_frames)
csv_data

Unnamed: 0_level_0,label
order,Unnamed: 1_level_1
0,"""""Was13"
1,"""647"
2,"""Wir-äh-spielen-äh-in-der-äh-Champions-League647"
3,(1597
4,(1600
...,...
550,wohl1134
551,wollen1029
552,wäre130
553,wäre1660


Also, we define a lookup table to map from construction ids to the pseudowords more quickly.

In [9]:
with open("../../data/pseudowords/annotations.csv", "r") as csv_file:
    data = list(csv.DictReader(csv_file))

# Lookup table: construction id -> set of the individual words that make up its KEEs.
kelex_dict = {}
for example in data:
    # The "kees" column holds a Python literal (a collection of strings). Parse it with
    # ast.literal_eval rather than eval so that a CSV cell can never execute code.
    kee_phrases = ast.literal_eval(example["kees"])
    # Multi-word KEEs are split on whitespace so every single word becomes its own entry.
    # A later row with the same construction id overwrites an earlier one.
    kelex_dict[int(example["construction_id"])] = {
        word for kee in kee_phrases for word in kee.split()
    }

kelex_dict

{10: {'denn', 'geschweige'},
 100: {'gleich'},
 1004: {'er'},
 1006: {'st'},
 101: {'wie'},
 1029: {'Mit', 'wem', 'würden'},
 103: {'wie'},
 1033: {'Hauptsache'},
 1034: {'Von', 'zu'},
 1035: {'von', 'zu'},
 104: {'das', 'nämliche'},
 1043: set(),
 1051: set(),
 1054: set(),
 11: {'kaum', 'und', 'wohl'},
 111: {'Umso', 'je'},
 112: set(),
 1126: {'Generation'},
 1134: {'Sowohl', 'als', 'auch'},
 1140: {'und'},
 1162: {'nicht', 'oder'},
 12: {'noch', 'weder'},
 1219: set(),
 122: {'genauso'},
 123: {'ähnlich'},
 125: {'es', 'sei', 'wie'},
 1257: set(),
 127: {'ein', 'einem', 'von'},
 128: {'artig'},
 1289: {'lassen', 'sein'},
 129: {'Art', 'eine'},
 1291: {'Wenn'},
 13: {'Was', 'für'},
 130: {'wie', 'wäre'},
 1300: {'So', 'geht'},
 1301: {'Augenblick', 'Der'},
 1313: {'Es', 'war', 'zum'},
 1315: {'Als', 'ob'},
 1316: {'Raus', 'aus', 'in', 'rein'},
 132: {'gleicht'},
 1320: {'lein'},
 1322: {'chen'},
 1323: {'pur'},
 1324: {'satt'},
 1329: {'jedermanns'},
 133: {'als', 'ob'},
 1337: {'i'

These pseudowords are now added to BERT.

In [10]:
# The token labels, in the same row order as the pseudoword embeddings loaded above.
bert_tokens = csv_data["label"].tolist()

bert_tokens, len(bert_tokens)

(['""Was13',
  '"647',
  '"Wir-äh-spielen-äh-in-der-äh-Champions-League647',
  '(1597',
  '(1600',
  '(1602',
  '(1624',
  '(1637',
  '(1639',
  '(1641',
  '(1643',
  '(1645',
  '(379',
  '(579',
  '(581',
  '(584',
  '(590',
  '(592',
  '(600',
  '(886',
  '(889',
  '(892',
  '(900',
  '(905',
  '(907',
  '(909',
  '(911',
  '(917',
  '(919',
  '(921',
  '(923',
  ')1597',
  ')1600',
  ')1602',
  ')1624',
  ')1637',
  ')1639',
  ')1641',
  ')1643',
  ')1645',
  ')1792',
  ')379',
  ')579',
  ')581',
  ')584',
  ')590',
  ')592',
  ')600',
  ')886',
  ')889',
  ')892',
  ')900',
  ')905',
  ')907',
  ')909',
  ')911',
  ')917',
  ')919',
  ')921',
  ')923',
  ')«579',
  ',1459',
  ',973',
  '-128',
  '-651',
  '-654',
  '-875',
  '-973',
  ':595',
  ':875',
  ':973',
  'Abstand683',
  'Allein20',
  'Aller1630',
  'Als1315',
  'Als133',
  'Als1770',
  'Am488',
  'Am492',
  'Am500',
  'Amerika605',
  'Anstatt320',
  'Art129',
  'Arzt1509',
  'Augenblick1301',
  'Ausmaß1777',
  'BRUTAL150

In [11]:
# Append the pseudoword vectors below the original vocabulary rows ...
original_vectors = bert_base.bert.embeddings.word_embeddings.weight
pseudoword_vectors = torch.tensor(pseudowords)
combined_embeddings = torch.cat((original_vectors, pseudoword_vectors), dim=0)
# ... and swap the enlarged table into the model (from_pretrained freezes the weights by default).
bert_base.bert.embeddings.word_embeddings = torch.nn.Embedding.from_pretrained(combined_embeddings)
bert_base.bert.embeddings.word_embeddings

Embedding(31657, 768)

In [12]:
# Register the pseudoword labels as new tokens; they receive the ids directly after the original vocabulary.
bert_base_tokenizer.add_tokens(bert_tokens)
# Guard: every pseudoword row in the embedding table must have exactly one tokenizer entry.
# If add_tokens skipped a label (duplicate or already in the vocabulary), the resize below would
# silently truncate or pad the table and the pseudoword rows would no longer line up with their tokens.
assert len(bert_base_tokenizer) == bert_base.bert.embeddings.word_embeddings.num_embeddings, (
    f"tokenizer has {len(bert_base_tokenizer)} entries but the embedding table has "
    f"{bert_base.bert.embeddings.word_embeddings.num_embeddings} rows"
)
bert_base.resize_token_embeddings(len(bert_base_tokenizer))

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 31657. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


Embedding(31657, 768)

Finally, we move both models to a GPU. If there is only one GPU, only the LLAMA model is moved there and BERT stays on the CPU.

In [13]:
# LLAMA always goes on the first GPU.
llama.to("cuda:0")
# BERT gets the second GPU when one exists; otherwise it runs on the CPU.
if torch.cuda.device_count() >= 2:
    bert_device = "cuda:1"
else:
    bert_device = "cpu"
bert_base.to(bert_device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31657, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
   

# Generation
Finally, we can start generating new examples. First, we define a function that lets LLAMA propose a list of candidate sentences.

In [14]:
def generate_examples(definition: str, sentence: str, temperature=0.75, top_p=0.95, top_k=1, max_new_tokens=1024, max_attempts=100):
    """Let LLAMA propose new sentences for a construction definition.

    The model is prompted (in German) to output a Python list of sentences. Sampling is
    repeated until the first ``[...]`` span in the continuation looks usable or the
    attempts are exhausted.

    Args:
        definition: Definition text inserted into the prompt.
        sentence: String form of the example sentences. If it has more than two characters
            (i.e. it is more than an empty ``"[]"``), it is added to the prompt as an example.
        temperature, top_p, top_k, max_new_tokens: Passed through to ``llama.generate``.
        max_attempts: Maximum number of sampling attempts.

    Returns:
        The accepted ``[...]`` span as a *string* (not yet parsed), or ``"[]"`` if no
        attempt produced a usable span.
    """
    # The assistant turn is pre-filled with the start of a list literal so the model continues it.
    list_opener = '["'
    # The leading spaces on the continuation lines are part of the prompt text and are kept unchanged.
    prompt_text = (
        f'### User: Du bist kreativ und gewissenhaft. Hier ist eine Definition: {definition} '
        'Bilde neue Sätze gemäß dieser Definition. Gib die Sätze in einer Python-Liste aus. Gib sonst nichts aus.\n'
    )
    if len(sentence) > 2:
        prompt_text += f'        ### Example: {sentence}\n'
    prompt_text += '        ### Assistant:' + list_opener

    # Substrings suggesting the model echoed meta text or placeholders instead of real sentences.
    forbidden_fragments = ("Konstruktion", "Satz", "Überschrift", "_", ":", "XY", "XP", "X ", "Y ", "X.", "Y.")

    def is_rejected(candidate):
        """Return True if the candidate list string should be discarded and re-sampled."""
        return (
            not any(char.isalpha() for char in candidate)
            or any(fragment in candidate for fragment in forbidden_fragments)
            # a "]" followed later by "[" means several lists were concatenated on one line
            or re.search(r".*\].*\[.*", candidate) is not None
        )

    # The prompt never changes between attempts, so tokenise it once.
    input_ids = llama_tokenizer(prompt_text, return_tensors='pt').input_ids.cuda()
    prompt_token_count = input_ids.shape[-1]

    first_output = ""
    for attempts_left in range(max_attempts - 1, -1, -1):
        generated_ids = llama.generate(inputs=input_ids, temperature=temperature,
                                       do_sample=True, top_p=top_p, top_k=top_k,
                                       max_new_tokens=max_new_tokens)
        # Drop the prompt by token count: the decoded text also contains special tokens, so
        # cutting the decoded string at len(prompt_text) characters is not reliable.
        continuation = llama_tokenizer.decode(generated_ids[0][prompt_token_count:], skip_special_tokens=True)
        # Re-attach the pre-filled opener so the continuation forms a complete list literal.
        list_literals = re.findall(r'\[.*\]', (list_opener + continuation).strip())
        if list_literals:
            first_output = list_literals[0]
        print(attempts_left, end=" ")
        if not is_rejected(first_output):
            break
    else:
        # Every attempt was rejected: fall back to an empty list literal.
        first_output = "[]"
    print()
    return first_output

To validate the output from LLAMA, we check each produced sentence for the presence of the necessary KE-lex. If none is present, the sentence is rejected and we try again. If one is present, we check whether its contextual representation in the sentence is closer to the pseudoword or to the general BERT embedding. We accept a sentence only if the pseudoword is at least as close according to at least one metric (cosine similarity, Euclidean distance or Manhattan distance).

First, load the contextual embeddings (they are prepared in `compare_embeddings.ipynb`).

In [15]:
# NOTE: pickle.load can execute arbitrary code from the file; only load artefacts you produced yourself.
contextual_embeds_path = "../../out/comapp/contextual_embeds_ex.pickle"
with open(contextual_embeds_path, "rb") as embeds_file:
    contextual_embeds_ex = pickle.load(embeds_file)

# Peek at one construction to see which KE-lex surface forms it provides.
contextual_embeds_ex[10].keys()

dict_keys(['Geschweige', 'denn', 'geschweige'])

In [16]:
def compare_distances(constr, sentence):
    """Check whether a sentence uses a KE-lex in the sense captured by its pseudoword.

    The sentence is encoded with BERT. For every KE-lex of the construction that occurs in
    the sentence, the layer-12 hidden states of the matching tokens are compared with the
    two reference embeddings stored in ``contextual_embeds_ex[constr][kelex]``: index 0 is
    treated as the standard BERT reference and index 1 as the pseudoword reference.

    Args:
        constr: Construction id, used as key into ``contextual_embeds_ex``.
        sentence: Candidate sentence (string).

    Returns:
        True if, for at least one KE-lex found in the sentence, the pseudoword reference is at
        least as close as the BERT reference under cosine similarity, Euclidean distance or
        Manhattan distance. False otherwise, including when the construction is unknown, no
        KE-lex occurs in the sentence, or the sentence exceeds 512 tokens (a "." is printed
        in those three cases).
    """
    sentence_ids = bert_base_tokenizer(sentence, return_tensors='pt')['input_ids']
    # Reject early: unknown construction, no KE-lex in the sentence, or sentence too long for BERT.
    if (
        constr not in contextual_embeds_ex
        or not any(kelex in sentence for kelex in contextual_embeds_ex[constr])
        or sentence_ids.size(-1) > 512
    ):
        print(".", end="")
        return False

    with torch.no_grad():
        outputs = bert_base(sentence_ids.to(bert_device), output_hidden_states=True)
        final_layer = outputs.hidden_states[12][0]
        sentence_token_ids = sentence_ids[0].tolist()

        pseudoword_fitting = []
        for kelex, embeds in contextual_embeds_ex[constr].items():
            if kelex not in sentence:
                continue

            # Tokenise the KE-lex once (dropping the first and last special token) instead of
            # once per sentence token.
            kelex_token_ids = set(
                bert_base_tokenizer(kelex, return_tensors='pt')['input_ids'][0][1:-1].tolist()
            )
            kelex_positions = [
                position for position, token_id in enumerate(sentence_token_ids)
                if token_id in kelex_token_ids
            ]
            if not kelex_positions:
                # The KE-lex occurs as a substring but none of its word pieces was found:
                # nothing to compare, so this KE-lex cannot count as fitting.
                pseudoword_fitting.append(False)
                continue

            sentence_contextual_embeds = final_layer[kelex_positions]
            bert_reference = embeds[0].to(bert_device)
            pseudoword_reference = embeds[1].to(bert_device).expand_as(sentence_contextual_embeds)

            # Now compare BERT and pseudoword under three metrics (each averaged over the matched tokens).
            bert_sim = torch.mean(F.cosine_similarity(bert_reference, sentence_contextual_embeds, dim=-1))
            pseudoword_sim = torch.mean(F.cosine_similarity(pseudoword_reference, sentence_contextual_embeds, dim=-1))
            bert_euclidean = torch.mean(torch.norm(bert_reference - sentence_contextual_embeds, p=2, dim=-1))
            pseudoword_euclidean = torch.mean(torch.norm(pseudoword_reference - sentence_contextual_embeds, p=2, dim=-1))
            bert_manhattan = torch.mean(torch.norm(bert_reference - sentence_contextual_embeds, p=1, dim=-1))
            pseudoword_manhattan = torch.mean(torch.norm(pseudoword_reference - sentence_contextual_embeds, p=1, dim=-1))

            pseudoword_fitting.append(any([
                bool(pseudoword_sim >= bert_sim),
                bool(pseudoword_euclidean <= bert_euclidean),
                bool(pseudoword_manhattan <= bert_manhattan),
            ]))
        # True if, for at least one KE-lex, the pseudoword wins on at least one metric.
        return any(pseudoword_fitting)

In [17]:
def generate_and_check_examples(constr, definition, sentence, temperature=0.75, max_new_tokens=1000, top_k=100, top_p=0.99, patience=10):
    """Generate sentences with LLAMA and keep those that pass the pseudoword check.

    Calls ``generate_examples`` up to ``patience`` times. Each returned list string is parsed
    and every sentence in it is validated with ``compare_distances``. The loop stops as soon
    as one generation round yields at least one fitting sentence.

    Args:
        constr: Construction id passed to ``compare_distances``.
        definition: Definition text passed to ``generate_examples``.
        sentence: Example-sentence string passed to ``generate_examples``.
        temperature, max_new_tokens, top_k, top_p: Sampling settings passed to ``generate_examples``.
        patience: Maximum number of generation rounds.

    Returns:
        The list of fitting sentences from the last round (empty if none fitted).
    """
    example_list_fitting = []
    # Loop until at least one example fits the construction according to the pseudowords:
    while patience and not example_list_fitting:
        patience -= 1
        example = generate_examples(definition=definition, sentence=sentence, temperature=temperature, max_new_tokens=max_new_tokens, top_k=top_k, top_p=top_p)
        example_list_fitting = []

        # The model output is untrusted text: parse it as a literal, never execute it.
        try:
            example_list = ast.literal_eval(example)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            continue  # not a well-formed literal; try another generation round
        if not isinstance(example_list, (list, tuple)):
            continue

        for e in example_list:
            if not isinstance(e, str):
                continue  # e.g. nested lists; only plain sentences can be validated
            try:
                e_fit = compare_distances(constr, e)
            except Exception as err:
                # Best effort: one problematic candidate must not abort the whole run, but report it.
                # (KeyboardInterrupt is not an Exception and therefore still stops the loop.)
                print(f"[skipped candidate: {type(err).__name__}: {err}]")
                continue
            if e_fit:
                example_list_fitting.append(e)
        print(bool(example_list_fitting), example_list, example_list_fitting)
    return example_list_fitting

In [18]:
# Generate examples for every construction, first one-shot and then zero-shot.
for shot in [1, 0]:
    # Progress is written to `examples_path`. The `_plus_bert` file name was the only one checked
    # for resuming before, so it is still honoured; entries from either file count as done.
    examples_path = f"../../out/llama_bert/examples_{shot}_shot.pickle"
    legacy_examples_path = f"../../out/llama_bert/examples_{shot}_shot_plus_bert.pickle"

    examples = {}
    for cache_path in (legacy_examples_path, examples_path):
        if os.path.exists(cache_path):
            with open(cache_path, "rb") as file:
                examples.update(pickle.load(file))
    os.makedirs(os.path.dirname(examples_path), exist_ok=True)

    for k in tqdm(definitions.keys()):
        if k in examples.keys():
            continue  # have already generated examples for this construction

        definition = definitions[k]
        try:
            # `shot` example sentences; for shot == 0 this is the string "[]" (no example in the prompt)
            sentence = str(list(sentences[int(k)])[0:shot])
        except KeyError:
            print(("[]", "[]", "This seems wrong..."))
            examples[k] = ("[]", "[]")
            continue

        example = generate_and_check_examples(
            constr=k, definition=definition, sentence=sentence, temperature=0.75,
            max_new_tokens=512, top_k=100, top_p=0.99
        )

        print((sentence, example))
        examples[k] = (sentence, example)

        print("=====")

        # Save after every construction so an interrupted run (e.g. CUDA OOM) can resume.
        with open(examples_path, "wb") as file:
            pickle.dump(examples, file)

  0%|          | 0/211 [00:00<?, ?it/s]

99 
..False ['Die meisten von ihnen haben zuvor noch nie einen Computer gesehen', 'Die meisten von ihnen haben zuvor noch nie einen Computer bedient'] []
99 
..False ['Die meisten von ihnen haben zuvor noch nie einen Computer gesehen', 'Die meisten von ihnen haben zuvor noch nie einen Computer bedient'] []
99 
False ['Die meisten Menschen haben zuvor noch nie einen Computer gesehen – geschweige denn bedient', 'Die meisten Menschen haben zuvor noch nie einen Computer bedient – geschweige denn gesehen'] []
99 
False ['Die meisten Menschen haben zuvor noch nie einen Computer gesehen und geschweige denn bedient', 'Die meisten Menschen haben zuvor noch nie einen Computer gesehen, geschweige denn bedient'] []
99 98 97 
..False ['Die meisten von ihnen haben zuvor noch nie einen Computer bedient', 'Die meisten von ihnen haben zuvor noch nie einen Computer gesehen'] []
99 
..False ['Die meisten von ihnen haben zuvor noch nie einen Computer gesehen', 'Die meisten von ihnen haben zuvor noch nie e

OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 MiB. GPU 0 has a total capacty of 11.75 GiB of which 18.31 MiB is free. Including non-PyTorch memory, this process has 11.54 GiB memory in use. Of the allocated memory 11.08 GiB is allocated by PyTorch, and 331.10 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF