In [None]:
import copy
import itertools
import json
import pickle
import re

import numpy as np
import pandas as pd
import torch
from tqdm.notebook import tqdm
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

tqdm.pandas()

In [None]:
# Read the CoMaPP records. The encoding is given explicitly so the file is
# decoded the same way regardless of the platform's locale default.
with open("../../data/pseudowords/CoMaPP_all.json", encoding="utf-8") as json_file:
    data = json.load(json_file)

# Keep three fields per record:
#   example    - the full target sentence ("target1")
#   cue        - the first `query_idx` whitespace-separated tokens of that sentence
#   pseudoword - the record's "label"
data = [
    {
        "example": d["target1"],
        "cue": " ".join(d["target1"].split()[:d["query_idx"]]),
        "pseudoword": d["label"],
    }
    for d in data
]
df = pd.DataFrame.from_dict(data).drop_duplicates(ignore_index=True)
df

In [None]:
# Use the first run of digits in each pseudoword label as the frame's index.
# The pattern is a raw string so that "\d" reaches the regex engine as written
# instead of being parsed as an (invalid) string escape; expand=False yields a
# Series rather than a one-column DataFrame.
df['index'] = df['pseudoword'].str.extract(r'(\d+)', expand=False).astype(int)
df = df.set_index('index')

df

In [None]:
# Lookup table used by update_cue; it is read via its 'construction_id', 'text'
# and 'contextleft' columns.
# NOTE: unpickling executes arbitrary code - only load this file from a trusted source.
contextleft_path = "../../data/pseudowords/contextleft_text.pickle"
contextleft = pd.read_pickle(contextleft_path)

def update_cue(row, context=None):
    """Return the row's ``example`` and ``cue``, adding a left context when the cue is empty.

    When ``row['cue']`` is the empty string, ``context`` is searched for entries
    whose ``construction_id`` equals the row's index label (``row.name``) and
    whose ``text`` equals ``row['example']``. If at least one entry matches, the
    first match's ``contextleft`` becomes the cue and is prepended (separated by
    one space) to the example. In every other case both values are returned
    unchanged.

    Args:
        row: One DataFrame row with ``example`` and ``cue`` entries.
        context: DataFrame with ``construction_id``, ``text`` and ``contextleft``
            columns. Defaults to the module-level ``contextleft`` table.

    Returns:
        A ``pd.Series`` indexed by ``['example', 'cue']``. The same type is
        returned on every path so that ``DataFrame.apply(..., axis=1)`` always
        expands the result into two columns, independent of which row comes first.
    """
    if context is None:
        context = contextleft

    example = row['example']
    cue = row['cue']
    if cue == '':
        is_match = (context['construction_id'] == row.name) & (context['text'] == example)
        left_contexts = context.loc[is_match, 'contextleft'].tolist()
        if left_contexts:
            cue = left_contexts[0]
            example = cue + " " + example
    return pd.Series({'example': example, 'cue': cue}, name=row.name)

# Rows that have no cue in front of the pseudoword get their left context added
# (to both the example and the cue); see update_cue.
filled_columns = df.apply(update_cue, axis=1)
df[["example", "cue"]] = filled_columns
df

In [None]:
# Turn the numeric index back into a regular column named 'construction'.
# NOTE: run this cell once per kernel session - a second run would reset the
# new default index as well and produce a duplicate 'construction' column.
df = df.reset_index().rename(columns={'index': 'construction'})

# One row per (construction, pseudoword) pair, with all of its examples and
# cues collected into lists.
grouping_keys = ['construction', 'pseudoword']
result_df = df.groupby(grouping_keys).agg({'example': list, 'cue': list})

result_df

In [None]:
# Load the pickled definitions mapping and turn it into a one-column frame
# ("definition") whose index is the mapping's keys.
# NOTE: pickle.load executes arbitrary code - only use a trusted file.
with open("../../out/definitions.pickle", "rb") as definitions_file:
    raw_definitions = pickle.load(definitions_file)

definitions = pd.DataFrame.from_dict(raw_definitions, orient="index", columns=["definition"])
definitions

In [None]:
# Attach each construction's definition to its grouped examples. The inner join
# keeps only constructions that appear in the index of `definitions`.
examples = result_df.merge(
    definitions,
    how="inner",
    left_on="construction",
    right_index=True,
)
examples

##### Generieren neuer Sätze:

In [None]:
# The pseudoword vectors are stored in eight chunk files; load them in this
# order and stack them into a single array.
pseudoword_chunk_paths = [
    "../../data/pseudowords/mbart/pseudowords_comapp_0_69.npy",
    "../../data/pseudowords/mbart/pseudowords_comapp_69_93.npy",
    "../../data/pseudowords/mbart/pseudowords_comapp_93_165.npy",
    "../../data/pseudowords/mbart/pseudowords_comapp_165_176.npy",
    "../../data/pseudowords/mbart/pseudowords_comapp_176_281.npy",
    "../../data/pseudowords/mbart/pseudowords_comapp_281_364.npy",
    "../../data/pseudowords/mbart/pseudowords_comapp_364_437.npy",
    "../../data/pseudowords/mbart/pseudowords_comapp_437_562.npy",
]
pseudowords = np.concatenate([np.load(chunk_path) for chunk_path in pseudoword_chunk_paths])
pseudowords

In [None]:
# TODO: load the mBART order
# Read order_0.csv .. order_7.csv (";"-separated, "|"-quoted, no header row);
# the first field becomes the index ("order"), the second the "label" column.
order_frames = [
    pd.read_csv(f"../../data/pseudowords/mbart/order_{i}.csv", sep=";", index_col=0, header=None, quotechar="|", names=["order", "label"])
    for i in range(0, 8)
]
csv_data = pd.concat(order_frames)
csv_data

In [None]:
# Collect the first value of every row of csv_data as the list of token labels.
mbart_tokens = list(csv_data.values[:, 0])

mbart_tokens, len(mbart_tokens)

Load the vanilla mbart model:

In [None]:
# Model and tokenizer come from the same pretrained checkpoint; the tokenizer
# is configured with German as both source and target language.
checkpoint = "facebook/mbart-large-50"
model = MBartForConditionalGeneration.from_pretrained(checkpoint, return_dict=True)
tokenizer = MBart50TokenizerFast.from_pretrained(checkpoint, src_lang="de_DE", tgt_lang="de_DE")
model.model.encoder.embed_tokens

Append the pseudoword vectors to the encoder's existing token embeddings:

In [None]:
# Append the pseudoword vectors below the encoder's existing embedding rows.
# - detach(): the weight is a trainable Parameter; concatenating it directly
#   would record an autograd graph that is never used.
# - as_tensor(..., dtype=..., device=...): the .npy arrays keep whatever dtype
#   they were saved with; casting to the weight's dtype stops torch.cat from
#   promoting the whole matrix (e.g. to float64).
# NOTE: not idempotent - running this cell twice appends the vectors twice.
# NOTE(review): `resize_token_embeddings` in the following cell rebuilds the
# embeddings from the model's shared matrix - verify that the vectors appended
# here are still present afterwards.
base_weights = model.model.encoder.embed_tokens.weight.detach()
pseudoword_weights = torch.as_tensor(pseudowords, dtype=base_weights.dtype, device=base_weights.device)
combined_embeddings = torch.cat((base_weights, pseudoword_weights), dim=0)
model.model.encoder.embed_tokens = torch.nn.Embedding.from_pretrained(combined_embeddings)
model.model.encoder.embed_tokens

Add the pseudoword labels to the tokenizer and resize the model's embeddings to the new vocabulary size:

In [None]:
# Register every pseudoword label with the tokenizer and grow the model's
# embedding matrices to the new vocabulary size.
tokenizer.add_tokens(mbart_tokens)
model.resize_token_embeddings(len(tokenizer))

# In Hugging Face's MBart implementation, resize_token_embeddings() builds the
# enlarged matrix from the model's *shared* input embedding and then points
# encoder.embed_tokens back at it. An embedding that was attached only to the
# encoder before this call is therefore replaced, and the rows of the new
# tokens hold freshly initialised values. Restore the learned vectors here:
# the encoder gets its own copy of the resized embedding (same module class,
# so any built-in scaling is kept) and each pseudoword vector is written into
# the row of its token id. Indexing by token id keeps vectors and tokens
# aligned and makes the cell safe to re-run.
if len(mbart_tokens) != len(pseudowords):
    raise ValueError(
        f"{len(mbart_tokens)} pseudoword tokens but {len(pseudowords)} pseudoword vectors; "
        "the order files and the embedding files do not line up"
    )
pseudoword_ids = tokenizer.convert_tokens_to_ids(mbart_tokens)
encoder_embeddings = copy.deepcopy(model.get_input_embeddings())
with torch.no_grad():
    encoder_embeddings.weight[pseudoword_ids] = torch.as_tensor(
        pseudowords,
        dtype=encoder_embeddings.weight.dtype,
        device=encoder_embeddings.weight.device,
    )
model.model.encoder.embed_tokens = encoder_embeddings
model.model.encoder.embed_tokens

In [None]:
# Move all model weights to the first CUDA device (a CUDA-capable GPU is required).
model.to(torch.device("cuda:0"))

Complete the cues:

In [None]:
def _unwrap_pseudoword(value):
    """Return the pseudoword label whether it arrives as a scalar or as a Series."""
    return value.iloc[0] if isinstance(value, pd.Series) else value


def _cue_result(row, pseudoword, generated, scores):
    """Assemble the record that complete_cues returns for one row."""
    return pd.Series({
        "construction": row["construction"],
        "pseudoword": pseudoword,
        "orig_example": row["example"],
        "generated": generated,
        "scores": scores,
    })


def complete_cues(row):
    """Let the model continue every cue of one construction after its pseudoword.

    For each (cue, example) pair of the row, the prompt
    ``"<cue> <pseudoword> <mask>"`` is tokenized and passed to ``model.generate``
    (beam search with 10 beams, one returned sequence). The decoded text and the
    exponentiated sequence score of each generation are collected.

    Args:
        row: A row with ``construction``, ``pseudoword``, ``example`` (list of
            sentences) and ``cue`` (list of cues, parallel to ``example``).
            ``pseudoword`` may be a plain label or, when the column was selected
            twice, a Series whose first element is the label.

    Returns:
        A ``pd.Series`` with ``construction``, ``pseudoword``, ``orig_example``,
        ``generated`` and ``scores``. On success ``generated`` and ``scores`` are
        the ``str()`` of the respective lists. If anything fails for the row -
        including a pseudoword that is not in ``mbart_tokens`` - ``generated``
        holds the error message and ``scores`` is ``"[-1.0]"``; the error is not
        re-raised, so one bad row does not abort the whole run.

    Prints ``"."`` for a completed row and ``":"`` for a failed one.
    """
    pseudoword = _unwrap_pseudoword(row["pseudoword"])
    try:
        # Skip pseudowords whose embedding has not been learned. This is an
        # explicit check rather than an `assert`, so it still runs under
        # `python -O` and leaves a readable message in the output.
        if pseudoword not in mbart_tokens:
            raise ValueError(f"pseudoword {pseudoword!r} has no learned mBART embedding")

        output_texts = []
        scores = []
        for cue, example in zip(row["cue"], row["example"]):
            input_text = cue + " " + pseudoword + " <mask>"
            # Generation budget: 1.5x the character count of the original example.
            target_length = int(1.5 * len(example))

            encoded = tokenizer(input_text, return_tensors="pt")
            generation = model.generate(
                encoded["input_ids"].to(model.device),  # follow the model instead of hard-coding a device
                max_length=target_length,
                num_return_sequences=1,
                num_beams=10,
                output_scores=True,
                return_dict_in_generate=True,
            )
            output_texts += tokenizer.batch_decode(generation.sequences, skip_special_tokens=True)
            # sequences_scores are log-scores; exp() maps them back to (0, 1].
            scores.append(float(torch.exp(generation.sequences_scores)))
        print(".", end="")
        return _cue_result(row, pseudoword, str(output_texts), str(scores))
    except Exception as e:
        print(":", end="")
        return _cue_result(row, pseudoword, str(e), "[-1.0]")

# Flatten the (construction, pseudoword) index into columns and generate a
# completion for every row. "pseudoword" is listed twice in the selection, so
# inside complete_cues row["pseudoword"] is a two-element Series rather than a
# scalar.
examples_reset = examples.reset_index()
generation_columns = ["construction", "pseudoword", "example", "cue", "pseudoword"]
pseudoword_output_scores = examples_reset[generation_columns].progress_apply(complete_cues, axis=1)
pseudoword_output_scores

In [None]:
# Reduced view of the results for export. Note that this rebinds `examples`,
# which until now held the merged examples/definitions frame.
export_columns = ["pseudoword", "generated", "scores"]
examples = pseudoword_output_scores[export_columns]

examples

#### Anzahl neuer Sätze

In [None]:
# Write the reduced results as a tab-separated file (decimal comma) and as an
# Excel workbook.
tsv_path = "../../out/comapp/mbart/data_mbart.tsv"
xlsx_path = "../../out/comapp/mbart/data_mbart.xlsx"
examples.to_csv(tsv_path, sep="\t", decimal=",")
examples.to_excel(xlsx_path)

In [ ]:
# Write the full per-construction results (including the original examples)
# as a tab-separated file with decimal commas.
complete_tsv_path = "../../out/comapp/mbart/data_mbart_complete.tsv"
pseudoword_output_scores.to_csv(complete_tsv_path, sep="\t", decimal=",")