In [None]:
import itertools
import json
import pickle
import re

import numpy as np
import pandas as pd
import torch
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

from tqdm.notebook import tqdm
tqdm.pandas()

In [None]:
# JSON is UTF-8 by specification; state the encoding explicitly so the German
# text is decoded the same way on every platform instead of via the locale default.
with open("../../data/pseudowords/CoMaPP_all.json", encoding="utf-8") as json_file:
    data = json.load(json_file)

# Reduce every entry to its sentence, its cue and its pseudoword label.
# The cue is the sentence prefix up to AND including the token at `query_idx`
# (hence the `+ 1` in the slice): the ke-lex is kept here, not replaced.
data = [
    {
        "example": d["target1"],
        "cue": " ".join(d["target1"].split()[: d["query_idx"] + 1]),
        "pseudoword": d["label"],
    }
    for d in data
]
# Identical (example, cue, pseudoword) rows are collapsed and the index renumbered.
df = pd.DataFrame(data).drop_duplicates(ignore_index=True)
df

In [None]:
# Derive the numeric construction id from the digits inside each pseudoword
# label (e.g. "Und5" -> 5) and use it as the row index.
# The pattern is a raw string: in a plain literal "\d" is an invalid escape
# sequence, which Python reports with a warning.
df['index'] = df['pseudoword'].str.extract(r'(\d+)').astype(int)
df.set_index('index', inplace=True)

df

In [None]:
# Left-context lookup table; update_cue reads its 'construction_id', 'text' and
# 'contextleft' columns.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
contextleft = pd.read_pickle("../../data/pseudowords/contextleft_text.pickle")

def update_cue(row):
    """Prepend the left context when the cue consists of a single word.

    A one-word cue means there is nothing in front of the ke-lex. In that case
    the global ``contextleft`` table is searched for an entry whose
    ``construction_id`` equals the row's index label (``row.name``) and whose
    ``text`` equals the row's example; the first match is prepended to both the
    example and the cue.

    Parameters
    ----------
    row : pd.Series
        One row of the examples frame with string fields ``example`` and
        ``cue``; ``row.name`` is compared against ``construction_id``.

    Returns
    -------
    pd.Series
        Always a Series indexed ``["example", "cue"]`` (updated or unchanged).
        Returning the same type from every branch keeps
        ``df.apply(update_cue, axis=1)`` a two-column DataFrame no matter which
        branch the first row takes.
    """
    unchanged = row[["example", "cue"]]

    # Cues with more than one word already carry context in front of the ke-lex.
    if len(row["cue"].split()) != 1:
        return unchanged

    same_construction = contextleft["construction_id"] == row.name
    same_text = contextleft["text"] == row["example"]
    matches = contextleft.loc[same_construction & same_text, "contextleft"]
    if matches.empty:
        return unchanged

    # Only the first matching left context is used.
    left_context = matches.iloc[0]
    return pd.Series(
        [left_context + " " + row["example"], left_context + " " + row["cue"]],
        index=["example", "cue"],
        name=row.name,
    )

# Add the left context if there is no cue up until the pseudoword.
# update_cue is applied row by row (axis=1); its two values per row are written
# back into the "example" and "cue" columns.
df[["example", "cue"]] = df.apply(update_cue, axis=1)
df

In [None]:
# Move the numeric index back into a regular column named "construction".
df = df.reset_index().rename(columns={"index": "construction"})

# One row per (construction, pseudoword) pair; the examples and cues of each
# pair are collected into Python lists.
result_df = df.groupby(["construction", "pseudoword"]).agg(
    dict.fromkeys(["example", "cue"], list)
)

result_df

In [None]:
# Load the pickled definitions mapping and turn it into a one-column frame:
# the mapping's keys become the index, its values the "definition" column.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open("../../out/definitions.pickle", "rb") as definitions_file:
    definitions = pd.DataFrame.from_dict(pickle.load(definitions_file), orient="index", columns=["definition"])
    
definitions

In [None]:
# Inner join: keep only the constructions that have grouped examples AND a
# definition, matching "construction" against the index of `definitions`.
examples = result_df.merge(
    definitions,
    how="inner",
    left_on="construction",
    right_index=True,
)
examples

##### Generating new sentences:

Load the vanilla mbart model:

In [None]:
# Load the pretrained mBART-50 checkpoint together with its fast tokenizer; the
# tokenizer is set up with de_DE as both source and target language.
model = MBartForConditionalGeneration.from_pretrained(
    "facebook/mbart-large-50", return_dict=True
) 
tokenizer = MBart50TokenizerFast.from_pretrained(
    "facebook/mbart-large-50", src_lang="de_DE", tgt_lang="de_DE"
)
# NOTE(review): the device is hard-coded, so this cell needs a CUDA GPU.
model.to("cuda:0")
# Last expression of the cell, i.e. what the notebook displays: the encoder's
# token-embedding module.
model.model.encoder.embed_tokens

Complete the cues:

In [20]:
def complete_cues(row):
    """Let the model complete every cue of one (construction, pseudoword) row.

    For each cue, ``"<cue> <mask>"`` is tokenized with the global ``tokenizer``
    and completed by the global ``model`` using beam search (10 beams, one
    returned sequence).

    Parameters
    ----------
    row : pd.Series
        Must provide ``construction``, ``pseudoword``, ``example`` (list of
        sentences) and ``cue`` (list of cues, parallel to ``example``).
        ``pseudoword`` may be a plain value or, when the caller selected the
        column twice, a Series of identical values.

    Returns
    -------
    pd.Series
        ``construction``, ``pseudoword``, ``orig_example``, ``generated`` (the
        ``str()`` of the list of completions) and ``scores`` (the ``str()`` of
        the list of exponentiated beam scores). If anything fails for the row,
        ``generated`` holds the error message and ``scores`` is ``"[-1.0]"``.
    """
    # A duplicated "pseudoword" label makes the lookup return a Series; unwrap
    # it, but accept a plain scalar too so the function does not depend on the
    # caller's column selection.
    pseudoword = row["pseudoword"]
    if isinstance(pseudoword, pd.Series):
        pseudoword = pseudoword.iloc[0]

    try:
        output_texts = []
        scores = []
        for cue, example in zip(row["cue"], row["example"]):
            input_text = cue + " <mask>"

            # Allow up to 1.5x the original sentence's length. Note that
            # len(example) counts characters while max_length limits tokens,
            # so this is a loose upper bound.
            target_length = int(1.5 * len(example))

            encoded = tokenizer(input_text, return_tensors="pt")
            generation = model.generate(
                # Follow the model's device instead of hard-coding "cuda:0".
                encoded["input_ids"].to(model.device),
                max_length=target_length,
                num_return_sequences=1,
                num_beams=10,
                output_scores=True,
                return_dict_in_generate=True,
            )
            output_texts += tokenizer.batch_decode(generation.sequences, skip_special_tokens=True)
            # Convert right away so no tensors are kept alive across cues.
            scores.append(float(torch.exp(generation.sequences_scores)))

        generated, score_repr = str(output_texts), str(scores)
        print(".", end="")  # progress marker: row succeeded
    except Exception as e:
        # Best effort: keep the row and record the error instead of aborting the run.
        print(":", end="")  # progress marker: row failed
        generated, score_repr = str(e), "[-1.0]"

    return pd.Series(
        {
            "construction": row["construction"],
            "pseudoword": pseudoword,
            "orig_example": row["example"],
            "generated": generated,
            "scores": score_repr,
        }
    )

# Move the index levels of `examples` into regular columns so that
# complete_cues can read them from each row.
examples_reset = examples.reset_index()
# NOTE(review): "pseudoword" is listed twice in the selection, so inside
# complete_cues row["pseudoword"] is a two-element Series, not a plain string.
pseudoword_output_scores = examples_reset[["construction", "pseudoword", "example", "cue", "pseudoword"]].progress_apply(complete_cues, axis=1)
pseudoword_output_scores

  0%|          | 0/562 [00:00<?, ?it/s]

..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

Unnamed: 0,construction,pseudoword,orig_example,generated,scores
0,5,Und5,"[""""Nicht herauskaufen"""", """"nicht erpressen las...","['""""Nicht herauskaufen"""", """"nicht erpressen la...","[0.7326326966285706, 0.3384742736816406, 0.712..."
1,5,erst5,[Trainer Lucien Favre hatte schon seine beiden...,['Trainer Lucien Favre hatte schon seine beide...,"[0.7867358326911926, 0.8212453126907349, 0.684..."
2,5,gar5,[Es hat Afghanistan nicht stabilisiert und sch...,['Es hat Afghanistan nicht stabilisiert und wi...,"[0.5909808874130249, 0.6491526365280151, 0.740..."
3,5,nicht5,[Es hat Afghanistan nicht stabilisiert und sch...,"['Es hat sich nicht gelohnt.', 'Es hat Afghani...","[0.47445887327194214, 0.4824838936328888, 0.40..."
4,5,recht5,[Trainer Lucien Favre hatte schon seine beiden...,['Trainer Lucien Favre hatte schon seine beide...,"[0.7321943640708923, 0.8531233072280884, 0.727..."
...,...,...,...,...,...
557,1884,Gold1884,"[Schweigen ist Silber , reden ist Gold ., Schw...","['Schweigen ist Silber, reden ist Gold - Seite...","[0.648520290851593, 0.6625412106513977, 0.6627..."
558,1884,Silber1884,"[Schweigen ist Silber , reden ist Gold ., Schw...","['Schweigen ist Silber und Gold', 'Schweigen i...","[0.5993227362632751, 0.5993227362632751, 0.541..."
559,1884,ist1884,"[Schweigen ist Silber , reden ist Gold ., Schw...","['Schweigen ist besser als nichts', 'Schweigen...","[0.4621695280075073, 0.4621695280075073, 0.487..."
560,1986,kaum1986,[Die Vorhut vor 20.000 Jahren war für das Ries...,['Die Vorhut vor 20.000 Jahren war nicht mehr ...,"[0.5413929224014282, 0.7501695156097412, 0.599..."


In [22]:
# Keep only the columns needed for the compact export.
# NOTE(review): this rebinds `examples`, which until here held the merged
# examples/definitions frame. Re-running the completion cell after this one
# fails unless the merge cell is re-run first.
examples = pseudoword_output_scores[["pseudoword", "generated", "scores"]]

examples

Unnamed: 0,pseudoword,generated,scores
0,Und5,"['""""Nicht herauskaufen"""", """"nicht erpressen la...","[0.7326326966285706, 0.3384742736816406, 0.712..."
1,erst5,['Trainer Lucien Favre hatte schon seine beide...,"[0.7867358326911926, 0.8212453126907349, 0.684..."
2,gar5,['Es hat Afghanistan nicht stabilisiert und wi...,"[0.5909808874130249, 0.6491526365280151, 0.740..."
3,nicht5,"['Es hat sich nicht gelohnt.', 'Es hat Afghani...","[0.47445887327194214, 0.4824838936328888, 0.40..."
4,recht5,['Trainer Lucien Favre hatte schon seine beide...,"[0.7321943640708923, 0.8531233072280884, 0.727..."
...,...,...,...
557,Gold1884,"['Schweigen ist Silber, reden ist Gold - Seite...","[0.648520290851593, 0.6625412106513977, 0.6627..."
558,Silber1884,"['Schweigen ist Silber und Gold', 'Schweigen i...","[0.5993227362632751, 0.5993227362632751, 0.541..."
559,ist1884,"['Schweigen ist besser als nichts', 'Schweigen...","[0.4621695280075073, 0.4621695280075073, 0.487..."
560,kaum1986,['Die Vorhut vor 20.000 Jahren war nicht mehr ...,"[0.5413929224014282, 0.7501695156097412, 0.599..."


#### Anzahl neuer Sätze

In [23]:
# Compact export (pseudoword, generated, scores) as tab-separated text and as
# an Excel sheet. The paths are plain string literals: they contain no
# placeholders, so no f-prefix is needed.
examples.to_csv("../../out/comapp/mbart/data_mbart_vanilla.tsv", sep="\t", decimal=",")
examples.to_excel("../../out/comapp/mbart/data_mbart_vanilla.xlsx")

In [24]:
# Full export: additionally contains the construction id and the original examples.
# NOTE(review): complete_cues stores `scores` as a string, so decimal="," presumably
# has no effect on that column - confirm if comma decimals are required.
pseudoword_output_scores.to_csv("../../out/comapp/mbart/data_mbart_vanilla_complete.tsv", sep="\t", decimal=",")