In [1]:
import ast
import csv

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from tqdm.notebook import tqdm
from transformers import BertForMaskedLM, BertTokenizer

# Register tqdm's `progress_apply` on pandas objects; it is used further down
# in place of `DataFrame.apply` to get a progress bar.
tqdm.pandas()

KeyboardInterrupt: 

In [None]:
# Tab-separated table of matches; its first line is the column header.
matches_path = "../../out/matches/matches_dep_constr_kelex.tsv"
matches = pd.read_csv(matches_path, sep="\t", header=0)
matches

In [None]:
# Read the annotation table into a list of dicts (one per CSV row).
# newline="" is what the csv module expects so that line breaks inside quoted
# fields are handled by the reader rather than by the file object.
with open("../../data/pseudowords/annotations.csv", "r", newline="") as csv_file:
    data = list(csv.DictReader(csv_file))

# Map each construction id to the set of single words that occur in any of its
# "kees" entries (multi-word entries are split on whitespace).
kelex_dict = {}
for example in data:
    kees = set()
    # The "kees" cell holds the text of a Python literal (an iterable of
    # strings). ast.literal_eval parses literals only, whereas eval() would
    # execute arbitrary code found in the data file.
    for kee in ast.literal_eval(example["kees"]):
        kees |= set(kee.split())
    kelex_dict[int(example["construction_id"])] = kees

kelex_dict

In [None]:
# cf. evaluate_comapp_bert.ipynb

# The pseudoword vectors are stored in 15 .npy shards; shard i is named after
# the index range [i*37, i*37+37). Stack them into one array.
pseudowords = np.concatenate([
    np.load(f"../../data/pseudowords/bert/pseudowords_comapp_bert_{i*37}_{i*37+37}.npy")
    for i in range(15)
])

# The matching labels live in order_bert_1.csv ... order_bert_15.csv
# (";"-separated, "|"-quoted, no header row, first column used as index).
csv_data = pd.concat([
    pd.read_csv(f"../../data/pseudowords/bert/order_bert_{i}.csv", sep=";", index_col=0, header=None, quotechar="|", names=["order", "label"])
    for i in range(1, 16)
])

# First (and only) value column of every row, i.e. the label.
bert_tokens = [record[0] for record in csv_data.to_numpy()]
bert_tokens

In [None]:
# Pretrained German BERT (masked-LM variant) and its tokenizer.
model = BertForMaskedLM.from_pretrained("bert-base-german-cased", return_dict=True)
tokenizer = BertTokenizer.from_pretrained("bert-base-german-cased")

# Append the pseudoword vectors as extra rows below the original word-embedding
# matrix and install the result as the model's word-embedding layer.
original_weights = model.bert.embeddings.word_embeddings.weight
pseudoword_weights = torch.tensor(pseudowords)
combined_embeddings = torch.cat((original_weights, pseudoword_weights), dim=0)
model.bert.embeddings.word_embeddings = torch.nn.Embedding.from_pretrained(combined_embeddings)
model.bert.embeddings.word_embeddings

In [None]:
# Make the tokenizer aware of the pseudoword labels, then let the model adjust
# its embedding matrix to the tokenizer's new vocabulary size.
tokenizer.add_tokens(bert_tokens)
extended_vocab_size = len(tokenizer)
model.resize_token_embeddings(extended_vocab_size)

After everything has been prepared, we can collect the contextual embeddings of the standard BERT tokens and of the pseudoword tokens in a dictionary, keyed first by construction and then by KE-LEX.

In [None]:
def _isolated_embedding(text):
    """Feed `text` on its own through the model and return its layer-12 hidden states.

    The first and last position are dropped (presumably the special tokens the
    tokenizer puts around the input), leaving one row per remaining token.
    """
    token_ids = tokenizer(text, return_tensors='pt')['input_ids']
    with torch.no_grad():
        outputs = model(token_ids, output_hidden_states=True)
    return outputs.hidden_states[12][0][1:-1]


# construction id -> KE-LEX -> (embedding of the plain word, embedding of its pseudoword)
contextual_embeds = {}

for constr, kelexes in tqdm(kelex_dict.items()):
    per_kelex = {}
    contextual_embeds[constr] = per_kelex
    for kelex in kelexes:
        pseudoword = kelex + str(constr)
        if pseudoword not in bert_tokens:
            # No pseudoword available for this KE-LEX: report its label and move on.
            print(pseudoword)
            continue
        per_kelex[kelex] = (_isolated_embedding(kelex), _isolated_embedding(pseudoword))

contextual_embeds.keys()

In [None]:
# Spot check: the (original, pseudoword) embedding pair stored for the KE-LEX
# "geschweige" of construction 10.
contextual_embeds[10]["geschweige"]

Now we can compare the distance of the pseudowords to a sentence with the distance of the original token(s) to the same sentence. Since one pseudoword token can be equivalent to multiple original tokens, we need to take the average distance from all original tokens to compare this average distance to the distance of the pseudoword token.

In [None]:
def _mean_distances(reference, target):
    """Return (mean cosine similarity, mean Euclidean, mean Manhattan distance) of row-aligned embeddings."""
    difference = reference - target
    return (
        torch.mean(F.cosine_similarity(reference, target, dim=-1)),
        torch.mean(torch.norm(difference, p=2, dim=-1)),
        torch.mean(torch.norm(difference, p=1, dim=-1)),
    )


def distances(row):
    """Compare isolated BERT / pseudoword embeddings with their in-sentence counterparts.

    For every KE-LEX of the row's construction that occurs (as a substring) in
    the sentence, the sentence is run through the model and the layer-12
    embeddings at the positions of the KE-LEX's tokens are compared with

    * the embedding of the same token when the KE-LEX is encoded on its own
      (``bert_*`` values), and
    * the embedding of the pseudoword encoded on its own (``pseudword_*``
      values),

    using cosine similarity, Euclidean distance and Manhattan distance. Values
    are averaged over the matching positions and, for sentences longer than
    512 tokens, over all windows that contain a match.

    Reads the notebook globals ``contextual_embeds``, ``tokenizer`` and
    ``model``.

    Args:
        row: Mapping-like object with the keys ``"constr"`` and ``"sentence"``
            (a row of ``matches``).

    Returns:
        pd.Series: If the construction is unknown, a Series with the nine
        result fields (all metrics ``None``). Otherwise a Series holding one
        record dict per KE-LEX found in the sentence. Metrics are NaN when the
        KE-LEX is a substring of the sentence but none of its tokens appear in
        the tokenized sentence.
    """
    # NOTE(review): the two return shapes differ (flat fields vs. a Series of
    # record dicts); this is kept as-is so the resulting table is unchanged.
    constr = row["constr"]
    if constr not in contextual_embeds:
        print(".")
        return pd.Series({'constr': row['constr'], 'kelex': None, 'sentence': row['sentence'], 'bert_sim': None, 'pseudword_sim': None, 'bert_euclidean': None, 'pseudword_euclidean': None, 'bert_manhattan': None, 'pseudword_manhattan': None})

    return_row = []
    sentence = row["sentence"]
    sentence_ids = tokenizer(sentence, return_tensors='pt')['input_ids']
    with torch.no_grad():
        if sentence_ids.size(-1) > 512:
            # sliding window approach: one 512-token window per start offset (stride 1)
            print("Sentence needs slicing, this may take a while ...")
            sentence_id_list = [sentence_ids[:, i:i + 512] for i in range(0, sentence_ids.size(-1)-512+1)]
        else:
            sentence_id_list = [sentence_ids]
        # Only layer 12 is used below, so keep just that tensor per window
        # instead of holding on to every layer's hidden states.
        hidden_list = [model(window_ids, output_hidden_states=True).hidden_states[12][0] for window_ids in sentence_id_list]

        for kelex, embeds in contextual_embeds[constr].items():
            if kelex not in sentence:
                continue

            # Tokenize the KE-LEX once per sentence rather than once per
            # sentence token. Row j of embeds[0] belongs to kelex_token_ids[j]
            # because both come from encoding the KE-LEX on its own.
            kelex_token_ids = tokenizer(kelex, return_tensors='pt')['input_ids'][0][1:-1].tolist()
            embed_row_of_token = {}
            for embed_row, token_id in enumerate(kelex_token_ids):
                embed_row_of_token.setdefault(token_id, embed_row)

            bert_sims = []
            pseudoword_sims = []
            bert_euclideans = []
            pseudoword_euclideans = []
            bert_manhattans = []
            pseudoword_manhattans = []
            for cur_sentence_ids, hidden in zip(sentence_id_list, hidden_list):
                window_tokens = cur_sentence_ids[0].tolist()
                kelex_ids = [idx for idx, t in enumerate(window_tokens) if t in embed_row_of_token]
                if len(kelex_ids) == 0:  # the KE-LEX is not in the current segment
                    continue
                sentence_contextual_embeds = hidden[kelex_ids]

                # Pair every matching sentence position with the isolated
                # embedding of the *same* token. For a single in-order
                # occurrence this equals embeds[0]; when the number of matches
                # differs from the number of KE-LEX tokens (repeated KE-LEX,
                # shared sub-word) it avoids the shape-mismatch error that a
                # direct embeds[0] vs. sentence comparison raises.
                aligned_rows = [embed_row_of_token[window_tokens[idx]] for idx in kelex_ids]
                bert_reference = embeds[0][aligned_rows]
                pseudoword_reference = embeds[1].expand_as(sentence_contextual_embeds)

                # Now let's compare BERT and pseudoword:
                bert_sim, bert_euclidean, bert_manhattan = _mean_distances(bert_reference, sentence_contextual_embeds)
                pseudoword_sim, pseudoword_euclidean, pseudoword_manhattan = _mean_distances(pseudoword_reference, sentence_contextual_embeds)
                bert_sims.append(bert_sim)
                pseudoword_sims.append(pseudoword_sim)
                bert_euclideans.append(bert_euclidean)
                pseudoword_euclideans.append(pseudoword_euclidean)
                bert_manhattans.append(bert_manhattan)
                pseudoword_manhattans.append(pseudoword_manhattan)

            # Average over windows; an empty list yields NaN.
            return_row.append({
                'constr': row['constr'],
                'kelex': kelex,
                'sentence': row['sentence'],
                'bert_sim': float(torch.mean(torch.tensor(bert_sims))),
                'pseudword_sim': float(torch.mean(torch.tensor(pseudoword_sims))),
                'bert_euclidean': float(torch.mean(torch.tensor(bert_euclideans))),
                'pseudword_euclidean': float(torch.mean(torch.tensor(pseudoword_euclideans))),
                'bert_manhattan': float(torch.mean(torch.tensor(bert_manhattans))),
                'pseudword_manhattan': float(torch.mean(torch.tensor(pseudoword_manhattans))),
            })

        return pd.Series(return_row)

# Run `distances` on every row of `matches` (axis=1 passes rows);
# progress_apply is the tqdm-instrumented counterpart of DataFrame.apply.
similarities = matches.progress_apply(distances, axis=1)
similarities

In [ ]:
# Persist the results twice: as a TSV with "," as the decimal mark and as an
# Excel workbook.
tsv_path = "../../out/comapp/similarities_bert.tsv"
xlsx_path = "../../out/comapp/similarities_bert.xlsx"
similarities.to_csv(tsv_path, sep="\t", decimal=",")
similarities.to_excel(xlsx_path)