In [None]:
import csv
import pickle

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from tqdm.notebook import tqdm, trange
from transformers import BertForMaskedLM, BertTokenizer

tqdm.pandas()

In [None]:
# Load the matched sentences (tab-separated, first line is the header).
matches = pd.read_csv("../../out/matches/matches_dep_constr.tsv", sep="\t", header=0)
# Combined score "m": product of the two fuzziness columns. Multiplying the
# columns directly is vectorised and avoids a Python-level call per row.
matches["m"] = matches["fuzziness (common dep)"] * matches["fuzziness (matches)"]
matches

In [None]:
# Read the construction annotations; every CSV row becomes one dict.
with open("../../data/pseudowords/annotations.csv", "r") as csv_file:
    data = list(csv.DictReader(csv_file))

# Map each construction id to the set of single words that occur in any of its
# "kees" entries (multi-word entries are split on whitespace).
kelex_dict = {}
for example in data:
    # NOTE(review): eval() executes whatever expression is stored in the "kees"
    # column. This is only acceptable for a trusted annotation file; consider
    # ast.literal_eval if the column always holds a plain list literal.
    kee_phrases = eval(example["kees"])
    kelex_dict[int(example["construction_id"])] = {
        word for phrase in kee_phrases for word in phrase.split()
    }

kelex_dict

In [None]:
# cf. bert_comapp_generate_examples.ipynb

# The pseudoword arrays are split over 15 files whose names carry the index
# ranges 0_37, 37_74, ...; load them in that order and join them into one array.
pseudowords = np.concatenate([
    np.load(f"../../data/pseudowords/bsbbert/pseudowords_comapp_bsbbert_{start}_{start + 37}.npy")
    for start in range(0, 15 * 37, 37)
])

# The matching label files are numbered 1..15; the first CSV column becomes the
# index ("order"), the second one is the label.
csv_data = pd.concat([
    pd.read_csv(
        f"../../data/pseudowords/bsbbert/order_bsbbert_{part}.csv",
        sep=";",
        index_col=0,
        header=None,
        quotechar="|",
        names=["order", "label"],
    )
    for part in range(1, 16)
])

# One label per row, in file order.
bert_tokens = [record[0] for record in csv_data.values]
bert_tokens

In [None]:
# Load the pretrained German BERT model together with its tokenizer.
checkpoint = 'dbmdz/bert-base-german-cased'
model = BertForMaskedLM.from_pretrained(checkpoint, return_dict=True)
tokenizer = BertTokenizer.from_pretrained(checkpoint)

# Append the pseudoword vectors below the original word-embedding matrix and
# install the enlarged matrix as the model's new word-embedding layer.
original_weights = model.bert.embeddings.word_embeddings.weight
combined_embeddings = torch.cat((original_weights, torch.tensor(pseudowords)), dim=0)
model.bert.embeddings.word_embeddings = torch.nn.Embedding.from_pretrained(combined_embeddings)
model.bert.embeddings.word_embeddings

In [None]:
# Register every pseudoword label as an additional token of the tokenizer.
# NOTE(review): the pseudoword vectors were appended to the embedding matrix in
# file order; this presumably relies on `bert_tokens` being in the same order
# so that each new token id points at its own vector — confirm.
tokenizer.add_tokens(bert_tokens)
# Make the model's token-embedding size agree with the extended tokenizer.
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Move the model to the first CUDA device; the cells below send their input ids
# to the same device before calling the model.
model.to("cuda:0")

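After everything has been prepared, we can compute the contextual embeddings for the standard BERT tokens and for the pseudoword tokens. They are stored in a nested dictionary: construction → KE-LEX → (original embedding, pseudoword embedding).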

In [None]:
# construction id -> KE-LEX -> (embedding of the original word, embedding of its pseudoword)
contextual_embeds = {}

for constr, kelexes in tqdm(kelex_dict.items()):
    embeds_for_constr = {}
    contextual_embeds[constr] = embeds_for_constr
    for kelex in kelexes:
        pseudoword_token = kelex + str(constr)
        if pseudoword_token not in bert_tokens:
            # No pseudoword exists for this KE-LEX; report it and move on.
            print(pseudoword_token)
            continue
        # Encode the bare word and its pseudoword, each as a one-item input.
        id_pair = [
            tokenizer(text, return_tensors='pt')['input_ids'].to("cuda:0")
            for text in (kelex, pseudoword_token)
        ]
        with torch.no_grad():
            output_pair = [model(ids, output_hidden_states=True) for ids in id_pair]
        # hidden_states[12] is taken for the only sequence in the batch;
        # [1:-1] drops the first and the last position.
        embeds_for_constr[kelex] = tuple(
            out.hidden_states[12][0][1:-1] for out in output_pair
        )

contextual_embeds.keys()

In [None]:
# Spot check: show the (original, pseudoword) embedding pair stored for the
# KE-LEX "geschweige" of construction 10.
contextual_embeds[10]["geschweige"]

Now we can compare the distance of the pseudowords to a sentence with the distance of the original token(s) to the same sentence. Since one pseudoword token can be equivalent to multiple original tokens, we need to take the average distance from all original tokens to compare this average distance to the distance of the pseudoword token.

In [None]:
def distances(row):
    """Compare original-word and pseudoword embeddings against one matched sentence.

    For every KE-LEX of the row's construction that occurs (as a substring) in
    the sentence, the sentence is run through the model and the hidden states at
    the positions of the KE-LEX tokens are compared with both reference
    embeddings from the global ``contextual_embeds``: mean cosine similarity,
    mean Euclidean (L2) distance and mean Manhattan (L1) distance.

    Args:
        row: mapping with the keys "constr", "sentence",
            "fuzziness (common dep)", "fuzziness (matches)" and "m".

    Returns:
        A list of dicts, one per KE-LEX found in the sentence. If the
        construction has no entry in ``contextual_embeds``, none of its KE-LEX
        occurs in the sentence, or the sentence has more than 512 token ids, a
        single dict with ``kelex`` and all metrics set to ``None`` is returned.
        If a KE-LEX matches as a substring but none of its token ids occurs in
        the sentence, its metrics are NaN.

    Relies on the notebook globals ``tokenizer``, ``model`` and
    ``contextual_embeds``.
    """
    constr = row["constr"]
    sentence = row["sentence"]

    def make_record(kelex, bert_sim=None, pseudoword_sim=None, bert_euclidean=None,
                    pseudoword_euclidean=None, bert_manhattan=None, pseudoword_manhattan=None):
        # The key order defines the column order of the resulting DataFrame.
        # (The "pseudword_*" spelling is kept because it is the output schema.)
        return {
            'constr': row['constr'],
            'kelex': kelex,
            'sentence': row['sentence'],
            "fuzziness (common dep)": row["fuzziness (common dep)"],
            # Previously this column was filled from "fuzziness (common dep)".
            "fuzziness (matches)": row["fuzziness (matches)"],
            "m": row["m"],
            'bert_sim': bert_sim,
            'pseudword_sim': pseudoword_sim,
            'bert_euclidean': bert_euclidean,
            'pseudword_euclidean': pseudoword_euclidean,
            'bert_manhattan': bert_manhattan,
            'pseudword_manhattan': pseudoword_manhattan,
        }

    return_row = []
    sentence_ids = tokenizer(sentence, return_tensors='pt')['input_ids']
    # First, check whether the construction has ke-lex and if any ke-lex is in the sentence. Also, drop a sentence if it is way too long!
    if (constr not in contextual_embeds) or (not any(kelex in sentence for kelex in contextual_embeds[constr])) or (sentence_ids.size(-1) > 512):
        print(".", end="")
        return [make_record(None)]

    with torch.no_grad():
        sentence_id_list = [sentence_ids]
        try:
            outputs_list = [model(sentence_ids.to("cuda:0"), output_hidden_states=True)]
        except torch.cuda.OutOfMemoryError:
            # Retry on the CPU; always move the model back, even if the retry fails.
            model.to("cpu")
            try:
                outputs_list = [model(sentence_ids, output_hidden_states=True)]
            finally:
                model.to("cuda:0")

        for kelex, (bert_embed, pseudoword_embed) in contextual_embeds[constr].items():
            if kelex not in sentence:
                continue
            # Token ids of the KE-LEX without the first/last position; computed
            # once per KE-LEX instead of once per sentence token.
            kelex_token_ids = tokenizer(kelex, return_tensors='pt')['input_ids'][0][1:-1]

            bert_sims = []
            pseudoword_sims = []
            bert_euclideans = []
            pseudoword_euclideans = []
            bert_manhattans = []
            pseudoword_manhattans = []
            for cur_sentence_ids, outputs in zip(sentence_id_list, outputs_list):
                kelex_ids = [idx for idx, t in enumerate(cur_sentence_ids[0]) if t in kelex_token_ids]
                if len(kelex_ids) == 0:  # the KE-LEX is not in the current segment
                    continue
                # After the CPU fallback the hidden states live on the CPU while
                # the reference embeddings may not; align the devices.
                sentence_contextual_embeds = outputs.hidden_states[12][0][kelex_ids].to(bert_embed.device)
                # presumably the pseudoword embedding is a single row that is
                # broadcast over all matched positions — TODO confirm
                pseudoword_expanded = pseudoword_embed.expand_as(sentence_contextual_embeds)
                bert_diff = bert_embed - sentence_contextual_embeds
                pseudoword_diff = pseudoword_expanded - sentence_contextual_embeds

                # Now let's compare BERT and pseudoword:
                bert_sims.append(torch.mean(F.cosine_similarity(bert_embed, sentence_contextual_embeds, dim=-1)))
                pseudoword_sims.append(torch.mean(F.cosine_similarity(pseudoword_expanded, sentence_contextual_embeds, dim=-1)))
                bert_euclideans.append(torch.mean(torch.norm(bert_diff, p=2, dim=-1)))
                pseudoword_euclideans.append(torch.mean(torch.norm(pseudoword_diff, p=2, dim=-1)))
                bert_manhattans.append(torch.mean(torch.norm(bert_diff, p=1, dim=-1)))
                pseudoword_manhattans.append(torch.mean(torch.norm(pseudoword_diff, p=1, dim=-1)))

            bert_sim = torch.mean(torch.tensor(bert_sims))
            pseudoword_sim = torch.mean(torch.tensor(pseudoword_sims))
            bert_euclidean = torch.mean(torch.tensor(bert_euclideans))
            pseudoword_euclidean = torch.mean(torch.tensor(pseudoword_euclideans))
            bert_manhattan = torch.mean(torch.tensor(bert_manhattans))
            pseudoword_manhattan = torch.mean(torch.tensor(pseudoword_manhattans))
            return_row.append(make_record(
                kelex,
                bert_sim=float(bert_sim),
                pseudoword_sim=float(pseudoword_sim),
                bert_euclidean=float(bert_euclidean),
                pseudoword_euclidean=float(pseudoword_euclidean),
                bert_manhattan=float(bert_manhattan),
                pseudoword_manhattan=float(pseudoword_manhattan),
            ))

            # Print the rows in full whenever the pseudoword is at least as
            # close as the original word on any of the three measures.
            if any([pseudoword_sim >= bert_sim, pseudoword_euclidean <= bert_euclidean, pseudoword_manhattan <= bert_manhattan]):
                print("\n" + str(return_row))
            else:
                print(":", end="")
        return return_row

# Collect the per-sentence records in a plain list and build the DataFrame in
# one go: concatenating a growing DataFrame in every iteration copies all
# previous rows each time (quadratic in the number of matches).
similarity_records = []
save = 1000
for match in tqdm(matches.to_dict(orient="records"), total=len(matches)):
    similarity_records.extend(distances(match))
    if save > 0:
        save -= 1
    else:
        # Periodic checkpoint so a crash does not lose the records computed so far.
        save = 1000
        pd.DataFrame(similarity_records).to_csv("../../out/comapp/similarities_bsbbert.tsv", sep="\t", decimal=",", index=False)
similarities = pd.DataFrame(similarity_records)
similarities.to_csv("../../out/comapp/similarities_bsbbert.tsv", sep="\t", decimal=",", index=False)
similarities

In [None]:
# Keep only the rows without missing values (rows for sentences without a
# KE-LEX carry None in the metric columns) and renumber the index from 0.
similarities_kelex_only = similarities.dropna().reset_index(drop=True)
similarities_kelex_only.to_csv(
    "../../out/comapp/similarities_bsbbert_kelex_only.tsv",
    sep="\t",
    index=False,
)
similarities_kelex_only

#### Variant using averaged contextual embeds for the constructicon samples:

In [None]:
import json

In [None]:
with open("../../data/pseudowords/CoMaPP_all_bert.json") as json_file:
    data = json.load(json_file)


def _with_pseudoword(sentence, word_idx, label):
    """Return `sentence` with the whitespace-separated word at `word_idx` replaced by `label`."""
    words = sentence.split()
    return (" ".join(words[:word_idx]) + " " + label + " " + " ".join(words[word_idx + 1:])).strip()


# One record per sample: the original sentence, the same sentence with the
# pseudoword inserted, the pseudoword itself and the word position of the KE-LEX.
data = [
    {
        "example": d["target1"],
        "example_pseudoword": _with_pseudoword(d["target1"], d["target1_idx"], d["label"]),
        "pseudoword": d["label"],
        "kelex_idx": d["target1_idx"],
    }
    for d in data
]
df = pd.DataFrame.from_dict(data).drop_duplicates(ignore_index=True)

# The construction number is the first run of digits in the pseudoword label;
# it becomes the first column. The pattern is a raw string because "\d" is an
# invalid escape sequence in a normal string literal.
df.insert(0, 'construction', df['pseudoword'].str.extract(r'(\d+)', expand=False).astype(int))

# Group by construction and pseudoword, collecting the examples of each
# pseudoword into lists (groupby returns the groups ordered by their keys).
result_df = df.groupby(['construction', 'pseudoword']).agg({'example': list, 'example_pseudoword': list, 'kelex_idx': list}).reset_index()

result_df

In [None]:
# Nested lookup for the grouped examples:
# construction -> pseudoword -> (examples, examples with pseudoword, KE-LEX word positions)
result_dict = {}
grouped_columns = zip(
    result_df['construction'],
    result_df['pseudoword'],
    result_df['example'],
    result_df['example_pseudoword'],
    result_df['kelex_idx'],
)
for construction, pseudoword, example_list, example_pseudoword_list, kelex_idxs in grouped_columns:
    result_dict.setdefault(construction, {})[pseudoword] = (
        example_list,
        example_pseudoword_list,
        kelex_idxs,
    )

result_dict[5]["Und5"]

In [None]:
# construction -> KE-LEX -> (averaged original embedding, averaged pseudoword embedding)
contextual_embeds_ex = {}

for constr, kelexes in tqdm(result_dict.items()):
    embeds_for_constr = {}
    contextual_embeds_ex[constr] = embeds_for_constr
    for kelex, (exs, exs_pseudo, kelex_idx) in tqdm(kelexes.items(), disable=True):
        if kelex not in bert_tokens:
            # No embedding was loaded for this pseudoword label; report it.
            print(kelex)
            continue

        # Tokenise every example, first in its original form, then with the pseudoword.
        id_groups = [
            [tokenizer(text, return_tensors='pt')['input_ids'] for text in texts]
            for texts in (exs, exs_pseudo)
        ]
        with torch.no_grad():
            output_groups = [
                [model(ids.to("cuda:0"), output_hidden_states=True) for ids in ids_group]
                for ids_group in id_groups
            ]

        # The contextual embedding is calculated by getting the mean of all contextual embeddings for each example.
        # NOTE(review): the word position i is used as a token position (shifted by
        # one for the first token); this presumably assumes that every word before
        # the KE-LEX is a single token — confirm.
        averaged = [
            torch.mean(
                torch.stack([out.hidden_states[12][0][i+1:i+2] for out, i in zip(outputs, kelex_idx)]),
                dim=0,
            )
            for outputs in output_groups
        ]
        # Store under the KE-LEX with the construction number removed from the label.
        embeds_for_constr[kelex.replace(str(constr), "")] = (averaged[0], averaged[1])

contextual_embeds_ex.keys()

In [None]:
import pickle

# Persist the averaged example embeddings so that the next cell can reload them
# from disk instead of recomputing them.
with open("../../out/comapp/contextual_embeds_ex.pickle", "wb") as file:
    pickle.dump(contextual_embeds_ex, file)

In [None]:
# Reload the embeddings written by the previous cell.
# NOTE: pickle.load can execute arbitrary code contained in the file; only load
# a pickle that this notebook produced itself.
with open("../../out/comapp/contextual_embeds_ex.pickle", "rb") as file:
    contextual_embeds_ex = pickle.load(file)
    
# Displays the complete nested dictionary as the cell output.
contextual_embeds_ex

In [None]:
def distances_ex(row):
    """Compare example-averaged original and pseudoword embeddings against one matched sentence.

    Same procedure as ``distances``, but the reference embeddings come from the
    global ``contextual_embeds_ex``. For every KE-LEX of the row's construction
    that occurs (as a substring) in the sentence, the hidden states at the
    positions of the KE-LEX tokens are compared with both reference embeddings:
    mean cosine similarity, mean Euclidean (L2) distance and mean Manhattan
    (L1) distance.

    Args:
        row: mapping with the keys "constr", "sentence",
            "fuzziness (common dep)", "fuzziness (matches)" and "m".

    Returns:
        A list of dicts, one per KE-LEX found in the sentence. If the
        construction has no entry in ``contextual_embeds_ex``, none of its
        KE-LEX occurs in the sentence, or the sentence has more than 512 token
        ids, a single dict with ``kelex`` and all metrics set to ``None`` is
        returned. If a KE-LEX matches as a substring but none of its token ids
        occurs in the sentence, its metrics are NaN.

    Relies on the notebook globals ``tokenizer``, ``model`` and
    ``contextual_embeds_ex``.
    """
    constr = row["constr"]
    sentence = row["sentence"]

    def make_record(kelex, bert_sim=None, pseudoword_sim=None, bert_euclidean=None,
                    pseudoword_euclidean=None, bert_manhattan=None, pseudoword_manhattan=None):
        # The key order defines the column order of the resulting DataFrame.
        # (The "pseudword_*" spelling is kept because it is the output schema.)
        return {
            'constr': row['constr'],
            'kelex': kelex,
            'sentence': row['sentence'],
            "fuzziness (common dep)": row["fuzziness (common dep)"],
            # Previously this column was filled from "fuzziness (common dep)".
            "fuzziness (matches)": row["fuzziness (matches)"],
            "m": row["m"],
            'bert_sim': bert_sim,
            'pseudword_sim': pseudoword_sim,
            'bert_euclidean': bert_euclidean,
            'pseudword_euclidean': pseudoword_euclidean,
            'bert_manhattan': bert_manhattan,
            'pseudword_manhattan': pseudoword_manhattan,
        }

    return_row = []
    sentence_ids = tokenizer(sentence, return_tensors='pt')['input_ids']
    # First, check whether the construction has ke-lex and if any ke-lex is in the sentence. Also, drop a sentence if it is way too long!
    if (constr not in contextual_embeds_ex) or (not any(kelex in sentence for kelex in contextual_embeds_ex[constr])) or (sentence_ids.size(-1) > 512):
        print(".", end="")
        return [make_record(None)]

    with torch.no_grad():
        sentence_id_list = [sentence_ids]
        try:
            outputs_list = [model(sentence_ids.to("cuda:0"), output_hidden_states=True)]
        except torch.cuda.OutOfMemoryError:
            # Same fallback as in distances(): retry on the CPU and always move
            # the model back, even if the retry fails.
            model.to("cpu")
            try:
                outputs_list = [model(sentence_ids, output_hidden_states=True)]
            finally:
                model.to("cuda:0")

        for kelex, (bert_embed, pseudoword_embed) in contextual_embeds_ex[constr].items():
            if kelex not in sentence:
                continue
            # Token ids of the KE-LEX without the first/last position; computed
            # once per KE-LEX instead of once per sentence token.
            kelex_token_ids = tokenizer(kelex, return_tensors='pt')['input_ids'][0][1:-1]

            bert_sims = []
            pseudoword_sims = []
            bert_euclideans = []
            pseudoword_euclideans = []
            bert_manhattans = []
            pseudoword_manhattans = []
            for cur_sentence_ids, outputs in zip(sentence_id_list, outputs_list):
                kelex_ids = [idx for idx, t in enumerate(cur_sentence_ids[0]) if t in kelex_token_ids]
                if len(kelex_ids) == 0:  # the KE-LEX is not in the current segment
                    continue
                # Keep the sentence states on the device of the reference
                # embeddings (they differ after the CPU fallback).
                sentence_contextual_embeds = outputs.hidden_states[12][0][kelex_ids].to(bert_embed.device)
                pseudoword_expanded = pseudoword_embed.expand_as(sentence_contextual_embeds)
                bert_diff = bert_embed - sentence_contextual_embeds
                pseudoword_diff = pseudoword_expanded - sentence_contextual_embeds

                # Now let's compare BERT and pseudoword:
                bert_sims.append(torch.mean(F.cosine_similarity(bert_embed, sentence_contextual_embeds, dim=-1)))
                pseudoword_sims.append(torch.mean(F.cosine_similarity(pseudoword_expanded, sentence_contextual_embeds, dim=-1)))
                bert_euclideans.append(torch.mean(torch.norm(bert_diff, p=2, dim=-1)))
                pseudoword_euclideans.append(torch.mean(torch.norm(pseudoword_diff, p=2, dim=-1)))
                bert_manhattans.append(torch.mean(torch.norm(bert_diff, p=1, dim=-1)))
                pseudoword_manhattans.append(torch.mean(torch.norm(pseudoword_diff, p=1, dim=-1)))

            bert_sim = torch.mean(torch.tensor(bert_sims))
            pseudoword_sim = torch.mean(torch.tensor(pseudoword_sims))
            bert_euclidean = torch.mean(torch.tensor(bert_euclideans))
            pseudoword_euclidean = torch.mean(torch.tensor(pseudoword_euclideans))
            bert_manhattan = torch.mean(torch.tensor(bert_manhattans))
            pseudoword_manhattan = torch.mean(torch.tensor(pseudoword_manhattans))
            return_row.append(make_record(
                kelex,
                bert_sim=float(bert_sim),
                pseudoword_sim=float(pseudoword_sim),
                bert_euclidean=float(bert_euclidean),
                pseudoword_euclidean=float(pseudoword_euclidean),
                bert_manhattan=float(bert_manhattan),
                pseudoword_manhattan=float(pseudoword_manhattan),
            ))

            # Print the rows in full whenever the pseudoword is at least as
            # close as the original word on any of the three measures.
            if any([pseudoword_sim >= bert_sim, pseudoword_euclidean <= bert_euclidean, pseudoword_manhattan <= bert_manhattan]):
                print("\n" + str(return_row))
            else:
                print(":", end="")
        return return_row

# Collect the per-sentence records in a plain list and build the DataFrame in
# one go: concatenating a growing DataFrame in every iteration copies all
# previous rows each time (quadratic in the number of matches).
similarity_ex_records = []
save = 1000
for match in tqdm(matches.to_dict(orient="records"), total=len(matches)):
    similarity_ex_records.extend(distances_ex(match))
    if save > 0:
        save -= 1
    else:
        # Periodic checkpoint so a crash does not lose the records computed so far.
        save = 1000
        pd.DataFrame(similarity_ex_records).to_csv("../../out/comapp/similarities_ex_bsbbert.tsv", sep="\t", decimal=",", index=False)
similarities_ex = pd.DataFrame(similarity_ex_records)
similarities_ex.to_csv("../../out/comapp/similarities_ex_bsbbert.tsv", sep="\t", decimal=",", index=False)
similarities_ex

In [None]:
# Keep only the rows without missing values (rows for sentences without a
# KE-LEX carry None in the metric columns) and renumber the index from 0.
similarities_ex_kelex_only = similarities_ex.dropna().reset_index(drop=True)
similarities_ex_kelex_only.to_csv(
    "../../out/comapp/similarities_ex_bsbbert_kelex_only.tsv",
    sep="\t",
    index=False,
)
similarities_ex_kelex_only

In [ ]:
# NOTE(review): this looks like an unfinished scratch cell. The records built
# above contain no column named "", so this lookup presumably raises a
# KeyError — confirm, then complete the column name or remove the cell.
similarities[""]