In [None]:
import pandas as pd
from transformers import BertTokenizer

In [None]:
# Read the five LLaMA result tables ("0_shot" .. "4_shot" files), keyed by that number.
llama_data = {}
for i in range(5):
    shot_table = pd.read_csv(
        f"data/{i}_shot_data.tsv",
        sep="\t",
        decimal=",",  # parse "," as the decimal mark
        index_col=0,
    )
    llama_data[i] = shot_table
    # Per row: product of "num_generated" and "unique_generated", truncated to int.
    # NOTE(review): presumably "unique_generated" is a ratio, making this a count - confirm.
    shot_table["num_unique"] = shot_table.apply(
        lambda row: int(row["num_generated"] * row["unique_generated"]),
        axis=1,
    )

In [None]:
# Load the two mBART result tables (tab-separated, first column used as index).
mbart_data = pd.read_csv(
    "data/data_mbart_complete2.tsv",
    sep="\t",
    index_col=0,
)
mbart_vanilla_data = pd.read_csv(
    "data/data_mbart_vanilla_complete2.tsv",
    sep="\t",
    index_col=0,
)

In [None]:
# Load the BERT result table (tab-separated, first column used as index).
bsbbert_data = pd.read_csv(
    "data/data_bsbbert_complete.tsv",
    sep="\t",
    index_col=0,
)

In [None]:
# Show the loaded BERT table as this cell's output.
bsbbert_data

In [None]:
def evaluate_generated(row):
    """Turn the text stored under ``row["generated"]`` back into a Python object.

    Intended for use with ``DataFrame.apply(..., axis=1)``, but any object
    supporting ``row["generated"]`` works.

    Args:
        row: Mapping-like row whose ``"generated"`` entry holds Python source
            text (presumably a list literal written out by an earlier export
            step - confirm against the data files).

    Returns:
        The evaluated object, or an empty list when the entry is missing,
        is not a string, or cannot be evaluated.
    """
    try:
        # SECURITY NOTE(review): eval() executes whatever code the cell contains.
        # Only run this on trusted files; if the cells are always plain literals,
        # ast.literal_eval would be the safer parser.
        new = eval(row["generated"])
    except Exception:
        # Best-effort parsing: any ordinary failure (missing key, NaN cell,
        # syntax error, ...) yields an empty list. KeyboardInterrupt and
        # SystemExit are deliberately not caught so a long apply() can be stopped.
        new = []
    return new

In [None]:
# Parse each row's "generated" text into a Python object and keep it as a new column.
mbart_parsed = mbart_data.apply(evaluate_generated, axis=1)
mbart_data["evaluated_generated"] = mbart_parsed
mbart_data

In [None]:
# Parse each row's "generated" text into a Python object and keep it as a new column.
mbart_vanilla_parsed = mbart_vanilla_data.apply(evaluate_generated, axis=1)
mbart_vanilla_data["evaluated_generated"] = mbart_vanilla_parsed
mbart_vanilla_data

In [None]:
# Add the parsed "generated" column to every LLaMA table, then show the 0-shot one.
for llama in llama_data.values():
    llama_parsed = llama.apply(evaluate_generated, axis=1)
    llama["evaluated_generated"] = llama_parsed
llama_data[0]

In [None]:
# Parse each row's "generated" text into a Python object and keep it as a new column.
bsbbert_parsed = bsbbert_data.apply(evaluate_generated, axis=1)
bsbbert_data["evaluated_generated"] = bsbbert_parsed
bsbbert_data

In [None]:
# Side-by-side (index-aligned) view: definition and parsed generations from the
# 0-shot table, plus the "examples" column taken from the 4-shot table.
llama_subset = pd.concat(
    [
        llama_data[0][["definition", "evaluated_generated"]],
        llama_data[4]["examples"],
    ],
    axis=1,
)
llama_subset

In [None]:
# Side-by-side (index-aligned) view: definition and parsed generations from the
# 3-shot table, plus the "examples" column taken from the 4-shot table.
llama_subset_3 = pd.concat(
    [
        llama_data[3][["definition", "evaluated_generated"]],
        llama_data[4]["examples"],
    ],
    axis=1,
)
llama_subset_3

In [None]:
# Inner-join both mBART tables on "construction"; the shared
# "evaluated_generated" column is disambiguated by the two suffixes.
mbart_columns = ["construction", "evaluated_generated"]
mbart_merged = mbart_vanilla_data[mbart_columns].merge(
    mbart_data[mbart_columns],
    on="construction",
    how="inner",
    suffixes=("_mbart_vanilla", "_mbart_pseudo"),
)
mbart_merged

In [None]:
# Join all model outputs per construction. The merge order matters: the
# "_llama"/"_bert" suffixes are only applied while both sides still carry a
# plain "evaluated_generated" column.
final_merged = (
    pd.merge(llama_subset, mbart_merged, left_index=True, right_on="construction")
    .merge(
        bsbbert_data[["construction", "evaluated_generated"]],
        on="construction",
        suffixes=("_llama", "_bert"),
    )
    .merge(llama_subset_3["evaluated_generated"], left_on="construction", right_index=True)
    # The column contributed by llama_subset_3 arrives unsuffixed; label it.
    .rename(columns={"evaluated_generated": "evaluated_generated_llama3"})
)

tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-german-cased")


def _detokenize_bert(row):
    """Convert every item of the row's BERT generations into a single string.

    Each item is iterated, its elements are str()-joined with spaces, and the
    result is passed through the tokenizer's ``convert_tokens_to_string``.
    NOTE(review): presumably each item is a list of word-piece tokens - confirm.
    """
    return [
        tokenizer.convert_tokens_to_string([" ".join(str(piece) for piece in tokens)])
        for tokens in row["evaluated_generated_bert"]
    ]


final_merged["evaluated_generated_bert"] = final_merged.apply(_detokenize_bert, axis=1)

final_merged

In [None]:
# One row per (construction, generation) pair from the "_llama" column,
# exact repeats removed, then written out as TSV.
llama_generated = (
    final_merged[["construction", "evaluated_generated_llama"]]
    .explode("evaluated_generated_llama")
    .drop_duplicates()
)
llama_generated.to_csv(
    "../../out/generated/llama_0_shot.tsv",
    sep="\t",
    decimal=",",
    index=False,
)
llama_generated

In [None]:
# One row per (construction, generation) pair from the "_llama3" column,
# exact repeats removed, then written out as TSV.
llama_generated3 = (
    final_merged[["construction", "evaluated_generated_llama3"]]
    .explode("evaluated_generated_llama3")
    .drop_duplicates()
)
llama_generated3.to_csv(
    "../../out/generated/llama_3_shot.tsv",
    sep="\t",
    decimal=",",
    index=False,
)
llama_generated3

In [None]:
# One row per (construction, generation) pair from the "_mbart_vanilla" column,
# exact repeats removed, then written out as TSV.
mbart_vanilla_generated = (
    final_merged[["construction", "evaluated_generated_mbart_vanilla"]]
    .explode("evaluated_generated_mbart_vanilla")
    .drop_duplicates()
)
mbart_vanilla_generated.to_csv(
    "../../out/generated/mbart_vanilla.tsv",
    sep="\t",
    decimal=",",
    index=False,
)
mbart_vanilla_generated

In [None]:
# One row per (construction, generation) pair from the "_mbart_pseudo" column,
# exact repeats removed, then written out as TSV.
mbart_pseudo_generated = (
    final_merged[["construction", "evaluated_generated_mbart_pseudo"]]
    .explode("evaluated_generated_mbart_pseudo")
    .drop_duplicates()
)
mbart_pseudo_generated.to_csv(
    "../../out/generated/mbart_pseudo.tsv",
    sep="\t",
    decimal=",",
    index=False,
)
mbart_pseudo_generated

In [None]:
# One row per (construction, generation) pair from the "_bert" column,
# exact repeats removed, then written out as TSV.
bert_pseudo_generated = (
    final_merged[["construction", "evaluated_generated_bert"]]
    .explode("evaluated_generated_bert")
    .drop_duplicates()
)
bert_pseudo_generated.to_csv(
    "../../out/generated/bert_pseudo.tsv",
    sep="\t",
    decimal=",",
    index=False,
)
bert_pseudo_generated