# Notebook: Create Synthetic Examples

## Packages

In [1]:
from helper_synthesis import get_examples_as_text, xml_to_json, is_valid_xml, check_valid_aspect_xml, count_sentences_in_text, german_language_detected
from IPython.display import clear_output
from dotenv import load_dotenv
from llama_cpp import Llama
import random
import openai
import json
import uuid
import os

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nils_hellwig/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load environment variables via python-dotenv (presumably from a local .env file);
# the GPT-3 setup cell later reads OPENAI_API_KEY with os.getenv.
load_dotenv()

True

## Settings

In [3]:
# Index of the data split; used to build the dataset path, the few-shot key
# ("split_<SPLIT>"), the output path and the seed.
SPLIT = 0

# Position of the generating model inside the MODELS list.
MODEL_ID = 0

# Few-shot selection strategy: "fixed" or "random".
FEW_SHOTS = "fixed"

## Parameters

In [4]:
# Input files: the split to draw few-shot examples from, and the file listing
# the labels to generate plus the few-shot example ids per label.
DATASET_PATH = f'../07 train classifier/real/split_{SPLIT}.json'
LABELS_AND_EXAMPLES_PATH = f"few_shot_examples/few_shot_examples_{FEW_SHOTS}.json"

# LLM sampling / context settings
MAX_TOKENS = 250
CONTEXT_SIZE = 4096
TEMPERATURE = 0.7

# Seed is the decimal concatenation "43" + SPLIT + MODEL_ID, so every
# (split, model) pair gets its own seed.
SEED = int(f"43{SPLIT}{MODEL_ID}")

# Maximum number of generation attempts per few-shot example set
N_RETRIES = 25

In [5]:
# Setup Classes/Polarities for Synthesis
# Aspect categories and sentiment polarities used for synthesis
CLASSES = [
    "GENERAL-IMPRESSION",
    "FOOD",
    "SERVICE",
    "AMBIENCE",
    "PRICE",
]
POLARITIES = [
    "POSITIVE",
    "NEUTRAL",
    "NEGATIVE",
]

# Every (aspect, polarity) pair, grouped by polarity: all aspects for the
# first polarity, then all aspects for the second, and so on.
COMBINATIONS = []
for _polarity in POLARITIES:
    COMBINATIONS.extend((_aspect, _polarity) for _aspect in CLASSES)

In [6]:
# Strings at which the LLM stops generating
STOP_CRITERIA = [
    "Label:",  # the model started writing another "Label:" line
    "\n",      # end of the first generated line
]

In [7]:
# Seed Python's global `random` generator with the split/model-specific SEED.
random.seed(SEED)

In [8]:
# Supported generators; MODEL_ID indexes into this list.
MODELS = ["Llama13B", "Llama70B", "Falcon40B", "GPT-3"]

# Original note: 175B, 70B and 40B
# GGUF weight files of the locally executed models ("GPT-3" has no local file).
MODEL_PATHS = {
    "Llama13B": "llm_models/llama-2-13b.Q4_0.gguf",
    "Llama70B": "llm_models/llama-2-70b.Q4_0.gguf",
    "Falcon40B": "llm_models/falcon-40b-Q4_K_S.gguf",
}

MODEL_NAME = MODELS[MODEL_ID]

In [9]:
# Output file for the generated examples, grouped by model name and few-shot strategy.
SYNTH_PATH = f"../07 train classifier/synth/{MODEL_NAME}/{FEW_SHOTS}/split_{SPLIT}.json"

## Code

### Setup Prompt Template

In [10]:
# Read the prompt header that is prepended to every few-shot prompt.
# The encoding is given explicitly (as for the other files read in this
# notebook) so the German text is decoded the same way on every platform
# instead of depending on the locale's default encoding.
with open('../prompt_template.txt', 'r', encoding='utf-8') as file:
    PROMPT_TEMPLATE = file.read()

### Load Split

In [11]:
# Load the selected split; its entries are later matched by their 'id' field
# to pick the few-shot examples for each prompt.
with open(DATASET_PATH, mode='r', encoding='utf-8') as dataset_file:
    dataset = json.loads(dataset_file.read())

### Setup Model

In [12]:
# Local GGUF models run through llama-cpp-python. All three share the same
# loading and sampling settings; only Llama70B additionally passes n_gqa=8.
# Previously the loader and llm_model were duplicated in two branches.
if MODEL_NAME in ("Llama13B", "Llama70B", "Falcon40B"):
    llama_kwargs = dict(
        model_path=MODEL_PATHS[MODEL_NAME],
        seed=SEED,
        n_gpu_layers=1,
        n_ctx=CONTEXT_SIZE,
        verbose=False,
    )
    if MODEL_NAME == "Llama70B":
        llama_kwargs["n_gqa"] = 8

    llm = Llama(**llama_kwargs)
    # Remove whatever the model loader printed to the cell output
    clear_output(wait=False)

    def llm_model(text):
        """Generate a continuation for the prompt `text` with the local model.

        The model is called with echo=True, so the returned text begins with
        the prompt itself; the first len(text) characters are cut off so only
        the newly generated part is returned.
        """
        completion = llm(text, max_tokens=MAX_TOKENS, stop=STOP_CRITERIA, echo=True, top_p=1, temperature=TEMPERATURE)
        return completion["choices"][0]["text"][len(text):]

In [13]:
# Hosted model: only set up when GPT-3 is the selected generator.
if MODEL_NAME == "GPT-3":
    openai.api_key = os.environ.get("OPENAI_API_KEY")

    def llm_model(text):
        """Send `text` as a single user message to gpt-3.5-turbo.

        Returns the content of the first choice with surrounding whitespace
        stripped.
        """
        chat_messages = [{"role": "user", "content": text}]
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=chat_messages,
            max_tokens=MAX_TOKENS,
            temperature=TEMPERATURE,
            stop=STOP_CRITERIA,
        )
        first_choice = completion.choices[0]
        return first_choice.message.content.strip()

### Load Labels and Examples

In [14]:
# Read the few-shot configuration file and keep only the entry for the
# current split (key "split_<SPLIT>").
with open(LABELS_AND_EXAMPLES_PATH, mode='r', encoding='utf-8') as few_shot_file:
    few_shot_config = json.load(few_shot_file)

labels_and_examples = few_shot_config[f"split_{SPLIT}"]

In [15]:
# One label per example to generate: a list of (aspect, polarity) tuples.
# JSON delivers the pairs as two-element lists, so they are rebuilt as tuples.
labels = [
    [(aspect, polarity) for aspect, polarity in sub_list]
    for sub_list in labels_and_examples["labels_for_prediction"]
]

In [16]:
# Few-shot example ids, accessed in the generation loop as
# examples[str(label_index)][str(example_set_index)].
examples = labels_and_examples["few_shot_ids"]

### Create Synthetic Samples

In [17]:
# Collects one record per label for which a valid synthetic example was generated.
synth_dataset = []

In [18]:
# Generate one synthetic example per label. For every label, each few-shot
# example set is tried in turn with up to N_RETRIES generations per set; the
# first generation that passes all checks is stored in synth_dataset.
# NOTE(review): labels[:5] restricts the run to the first five labels - this
# looks like a debugging limit; confirm before producing a full dataset.
for idx, label in enumerate(labels[:5]):
    # Rejection counters. They are initialised once per label and are not
    # reset between retries or example sets, so they accumulate per label.
    invalid_xml_schema = 0
    invalid_xml_tags = 0
    aspect_polarity_in_text_but_not_in_label = 0
    more_than_one_sentences = 0
    no_german_language = 0
    no_quotes = 0

    # Record for the new synthetic example; failed attempts are logged in
    # its "llm_retry_statistic" list
    synth_example = {}
    synth_example["llm_retry_statistic"] = []

    found_valid_example = False

    # Original note (translated): all 25 example sets are to be tried 25 times each.
    for new_example_idx in range(len(examples[str(idx)])):
        for retry in range(N_RETRIES):
            # new_example_idx advances to the next few-shot example set when no valid
            # text could be generated for the label after N_RETRIES retries
            few_shot_examples = examples[str(idx)][f"{new_example_idx}"]
            # Replace the list of ids by the matching dataset entries
            few_shot_examples = [
                entry for entry in dataset if entry['id'] in few_shot_examples]

            # Build Prompt
            examples_text = get_examples_as_text(few_shot_examples)
            prompt_footer = f'\nLabel:{str(label)}\nPrediction:'
            prompt = PROMPT_TEMPLATE + examples_text + prompt_footer

            # Execute LLM
            prediction = llm_model(prompt).lstrip()

            # Validation cascade: the first failing check increments its
            # counter and ends the validation of this attempt.
            if len(prediction) < 2:
                # Too short to hold an opening and a closing quote
                no_quotes += 1
            else:
                # NOTE(review): the accepted closing quote is U+201D; the German
                # typographic closing quote is U+201C - confirm this is intended.
                if (prediction[0] != '"' and prediction[0] != '„') or (prediction[-1] != '"' and prediction[-1] != '”'):
                    no_quotes += 1
                else:
                    # Quotes are present, so strip them. From here on `prediction`
                    # (also in the log line and retry statistics) is the unquoted text.
                    prediction = prediction[1:-1]
                    if is_valid_xml(f'<input>{prediction}</input>') == False:
                        invalid_xml_schema += 1
                    else:
                        if check_valid_aspect_xml(f'<input>{prediction}</input>') == False:
                            invalid_xml_tags += 1
                        else:
                            # xml_to_json returns the string "not-in-label" as a
                            # rejection marker; otherwise a mapping with at least a
                            # "text" key is expected (see the checks below).
                            prediction_as_json = xml_to_json(
                                prediction, label, MODEL_NAME, SPLIT)
                            if prediction_as_json == "not-in-label":
                                aspect_polarity_in_text_but_not_in_label += 1
                            else:
                                if count_sentences_in_text(prediction_as_json["text"]) > 1:
                                    more_than_one_sentences += 1
                                else:
                                    if german_language_detected(prediction_as_json["text"]) == False:
                                        no_german_language += 1
                                    else:
                                        # All checks passed: store the generation metadata
                                        # and the counters accumulated so far. Note that the
                                        # full few-shot entries are stored here, whereas the
                                        # retry statistics below keep only their ids.
                                        synth_example["id"] = str(uuid.uuid4())
                                        synth_example["llm_label"] = label
                                        synth_example["llm_examples"] = few_shot_examples
                                        synth_example["llm_prompt"] = prompt
                                        synth_example["llm_prediction_raw"] = prediction
                                        synth_example["llm_invalid_xml_schema"] = invalid_xml_schema
                                        synth_example["llm_invalid_xml_tags"] = invalid_xml_tags
                                        synth_example["llm_aspect_polarity_in_text_but_not_in_label"] = aspect_polarity_in_text_but_not_in_label
                                        synth_example["llm_more_than_one_sentences"] = more_than_one_sentences
                                        synth_example["llm_no_german_language"] = no_german_language
                                        synth_example["llm_no_quotes"] = no_quotes
                                        # Copy every field of the parsed prediction into the record
                                        for key in prediction_as_json.keys():
                                            synth_example[key] = prediction_as_json[key]

                                        found_valid_example = True

            # Log current generation (n_retry = number of failed attempts recorded so far)
            print(
                f'current index: {idx}, n_retry: {len(synth_example["llm_retry_statistic"])}, text: {prediction}')

            if found_valid_example:
                break
            else:
                # Save a snapshot of this failed attempt, including the
                # cumulative counters at this point
                retry_statistic = {}
                retry_statistic["llm_label"] = label
                retry_statistic["llm_examples"] = [example["id"]
                                                   for example in few_shot_examples]
                retry_statistic["llm_prompt"] = prompt
                retry_statistic["llm_prediction_raw"] = prediction
                retry_statistic["llm_invalid_xml_schema"] = invalid_xml_schema
                retry_statistic["llm_invalid_xml_tags"] = invalid_xml_tags
                retry_statistic["llm_aspect_polarity_in_text_but_not_in_label"] = aspect_polarity_in_text_but_not_in_label
                retry_statistic["llm_more_than_one_sentences"] = more_than_one_sentences
                retry_statistic["llm_no_german_language"] = no_german_language
                retry_statistic["llm_no_quotes"] = no_quotes
                retry_statistic["llm_change_examples"] = new_example_idx
                retry_statistic["llm_retries_for_example_set"] = retry
                synth_example["llm_retry_statistic"].append(retry_statistic)

        # Stop trying further example sets once a valid example exists.
        # NOTE(review): if every example set is exhausted without success, nothing
        # is appended for this label and its retry statistics are discarded.
        if found_valid_example:
            synth_dataset.append(synth_example)
            break

current index: 0, n_retry: 0, text: Aber ich muss sagen, dass mir die <aspect-term aspect="GENERAL-IMPRESSION" polarity="POSITIVE">Atmosphäre</aspect-term> gefallen hat.
current index: 1, n_retry: 0, text: :) Wir haben es sehr gern erlebt.
current index: 2, n_retry: 0, text: Die <aspect-term aspect="SERVICE" polarity="NEGATIVE">Beratung</aspect-term> von der <aspect-term aspect="FOOD" polarity="NEGATIVE">Speisekarte</aspect-term> war ein wenig langsam, aber sonst alles in Ordnung.
current index: 3, n_retry: 0, text: :)
current index: 3, n_retry: 1, text: Die <aspect-term aspect="AMBIENCE" polarity="POSITIVE">Atmosphäre</aspect-term> war sehr angenehm.
current index: 4, n_retry: 0, text: :""
current index: 4, n_retry: 1, text: Die Preise waren nicht so teuer.


In [23]:
# Inspect the text and parsed tags of one generated example.
# NOTE(review): the index 3 is hard-coded and this cell's execution count (23)
# sits between cells 18 and 20, which suggests it was run out of order.
synth_dataset[3]["text"], synth_dataset[3]["tags"]

('Die Atmosphäre war sehr angenehm.',
 [{'text': 'Atmosphäre',
   'start': 4,
   'end': 14,
   'tag_with_polarity': 'AMBIENCE-POSITIVE',
   'tag_with_polarity_and_type': 'AMBIENCE-POSITIVE-explicit',
   'type': 'label-explicit',
   'label': 'AMBIENCE',
   'polarity': 'POSITIVE'},
  {'text': None,
   'start': 0,
   'end': 0,
   'tag_with_polarity': 'SERVICE-NEUTRAL',
   'tag_with_polarity_and_type': 'SERVICE-NEUTRAL-implicit',
   'type': 'label-implicit',
   'label': 'SERVICE',
   'polarity': 'NEUTRAL'},
  {'text': None,
   'start': 0,
   'end': 0,
   'tag_with_polarity': 'FOOD-POSITIVE',
   'tag_with_polarity_and_type': 'FOOD-POSITIVE-implicit',
   'type': 'label-implicit',
   'label': 'FOOD',
   'polarity': 'POSITIVE'}])

In [20]:
# Make sure the model/few-shot specific output folder exists, then write all
# generated examples as a single JSON document.
output_dir = os.path.dirname(SYNTH_PATH)
os.makedirs(output_dir, exist_ok=True)

with open(SYNTH_PATH, "w") as synth_file:
    json.dump(synth_dataset, synth_file)

In [21]:
# Show the full prompt that produced the first accepted synthetic example.
print(synth_dataset[0]["llm_prompt"])

Bitte erzeuge genau einen Satz einer Restaurant-Bewertung, die für das Training eines Modells für die Aspekt-basierte Sentiment Analyse verwendet werden kann.
Gegeben ist ein Label in Form eines Arrays, wobei für jedes Sentiment, mit dem eine Aspekt-Kategorie im Satz adressiert wird, ein Tuple (Aspekt-Kategorie, Aspekt-Sentiment) vorhanden ist.
Eine Aspekt-Kategorie kann mehrfach im Satz adressiert werden, auch mit verschiedenen Sentiment-Polaritäten.

* Folgende Aspekt-Kategorien werden betrachtet: ["GENERAL-IMPRESSION", "FOOD", "SERVICE", "PRICE", "AMBIENCE"]
* Folgende Sentiment-Polaritäten werden betrachtet: ["negative", "neutral", "positive"]

Auf Basis des Labels soll genau ein deutscher Satz erzeugt werden, der die in den Tuple definierten Aspekte adressiert.
Zusätzlich kann für ein im Label vorgegebenes Tuple (Aspekt-Kategorie, Aspekt-Sentiment) ein Aspekt-Begriff im Text vorliegen.
Ein Aspekt-Begriff ist eine  Entität oder Eigenschaft innerhalb eines Textes, die auf eine der b