In [None]:
import pickle
import re

import pandas as pd

import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")

from tqdm.notebook import tqdm
tqdm.pandas()

In [None]:
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by this project's own pipeline.

# The definitions file does not depend on the shot count, so it is read once
# up front instead of being re-opened and re-unpickled on every iteration.
with open("../../out/definitions.pickle", "rb") as definitions_file:
    definitions = pickle.load(definitions_file)

# Maps the shot count i (0-4) to a DataFrame indexed by definition key.
examples = {}

for i in range(0, 5):
    with open(f"../../out/llama/examples_{i}_shot.pickle", "rb") as examples_file:
        cur_examples = pickle.load(examples_file)

    # One row per definition key: the definition value followed by the
    # unpacked entry stored under the same key in the i-shot file.
    # A key missing from the i-shot file raises KeyError, as before.
    rows = {d: (de, *cur_examples[d]) for (d, de) in definitions.items()}
    examples[i] = pd.DataFrame.from_dict(rows, orient="index", columns=["definition", "examples", "generated"])

examples[0]

### Number of samples:

In [None]:
def evaluate_string(s):
    """Turn a raw generated output string into a list of strings.

    The string is evaluated as a Python expression. If the result is a list,
    every element is converted to ``str`` and the list is returned.

    In every other case (the string cannot be evaluated, or it evaluates to
    something that is not a list) the raw text is returned as a single-element
    list with all square brackets removed. The function therefore always
    returns a list, so callers can safely take ``len`` of the result.

    Args:
        s: The raw output string.

    Returns:
        A list of strings.
    """
    try:
        # NOTE(review): eval executes arbitrary code contained in `s`. If the
        # outputs are expected to be plain list literals, ast.literal_eval
        # would be the safe alternative -- confirm before switching.
        evaluated_value = eval(s)

        if isinstance(evaluated_value, list):
            return [str(e) for e in evaluated_value]  # elements must be strings
    except Exception:
        # Best effort: anything that cannot be evaluated is handled by the
        # fallback below. KeyboardInterrupt/SystemExit are no longer swallowed.
        pass

    # Fallback for unparsable input and for non-list results (previously the
    # latter fell through and returned None).
    return [s.replace("[", "").replace("]", "")]

In [None]:
# Parse the raw generated text of every shot setting and count the samples.
for frame in examples.values():
    parsed = frame['generated'].apply(evaluate_string)
    frame['evaluated_generated'] = parsed
    # Number of parsed samples per row.
    frame['num_generated'] = parsed.apply(len)

examples[0]

In [None]:
# Mean number of parsed samples per row in the 4-shot results.
examples[4]["num_generated"].mean()

In [None]:
examples[4]["num_generated"].mode()  # most frequent number of example sentences (mode)

### Number of unique samples:

In [None]:
# Share of distinct samples among all parsed samples of a row
# (0.0 when a row has no samples at all).
for frame in examples.values():
    frame['unique_generated'] = frame['evaluated_generated'].apply(
        lambda samples: (len(set(samples)) / len(samples)) if len(samples) > 0 else 0.0
    )

In [None]:
# Show the 0-shot frame, now including the unique_generated column.
examples[0]

For these outputs the number of generated samples and the number of unique samples differ:

In [None]:
# Rows of the 0-shot results whose uniqueness ratio lies strictly between
# 0 and 1, i.e. rows that contain at least one repeated sample.
zero_shot = examples[0]
uniqueness = zero_shot['unique_generated']
not_all_unique = zero_shot[(uniqueness > 0.0) & (uniqueness < 1.0)]

# Percentage of affected rows.
print(len(not_all_unique) / len(zero_shot) * 100, "%")
not_all_unique

### Actually new content:

In [None]:
# Stop words are removed because they say nothing about repetition.
# The set gets its own name: rebinding `stopwords` would shadow the imported
# nltk corpus reader and make this cell fail when it is run a second time.
german_stopwords = {s.lower() for s in stopwords.words("german")}


def _content_words(text):
    """Return the set of whitespace-separated tokens of `text` that are not stop words."""
    return {w for w in text.split() if w.lower() not in german_stopwords}


def calculate_overlap(row):
    """Measure how much of the generated vocabulary already occurs in the prompt.

    Tokens are split on whitespace, stop words are dropped (compared in lower
    case), and the remaining tokens are compared case-sensitively.

    Args:
        row: A row providing 'evaluated_generated' (list of strings),
            'definition' (string) and 'examples' (list of strings).

    Returns:
        A tuple ``(definition_overlap, examples_overlap, overlap)``: the
        fraction of distinct generated tokens that also occur in the
        definition, in the examples, and in either of the two. All three are
        0.0 when no generated token remains after stop-word removal.
    """
    generated_set = _content_words(" ".join(row['evaluated_generated']))

    if not generated_set:
        return 0.0, 0.0, 0.0

    # Build each reference vocabulary once and reuse it for all three ratios.
    definition_set = _content_words(row['definition'])
    examples_set = _content_words(" ".join(row['examples']))
    total = len(generated_set)

    definition_overlap = len(generated_set & definition_set) / total
    examples_overlap = len(generated_set & examples_set) / total
    overlap = len(generated_set & (definition_set | examples_set)) / total

    return definition_overlap, examples_overlap, overlap

In [None]:
# Attach the three overlap ratios returned by calculate_overlap as columns.
overlap_columns = ['definition_overlap', 'examples_overlap', 'overlap']

for frame in examples.values():
    frame[overlap_columns] = frame.apply(calculate_overlap, axis=1, result_type='expand')

In [None]:
# Show the 3-shot frame, now including the overlap columns.
examples[3]

In [None]:
# Mean combined (definition or examples) overlap ratio across the 0-shot rows.
examples[0]["overlap"].mean()

In [None]:
# Write one TSV and one Excel file per shot setting.
for i, frame in examples.items():
    # Tab-separated, with "," as the decimal mark.
    frame.to_csv(f"../../out/llama/{i}_shot_data.tsv", sep="\t", decimal=",")
    frame.to_excel(f"../../out/llama/{i}_shot_data.xlsx")