# Notebook: Generate Synthetic Examples with an LLM

## Packages

In [1]:
from helper import divide_equally, get_examples_for_aspects_in_label, get_examples_as_text, xml_to_json, is_valid_xml, check_valid_aspect_xml
from IPython.display import clear_output
from llama_cpp import Llama
import numpy as np
import itertools
import warnings
import random
import json
import uuid

## Settings

In [2]:
# Number of the dataset split this run works on
SPLIT = 0

# Model names available for synthesis; the first entry is the active one
MODELS = ["Llama70B"]
MODEL = MODELS[0]

# How many examples the LLM is supposed to generate
N_SYNTH = 500

# Aspect categories and polarities that the synthetic labels are drawn from
CLASSES = ["GENERAL-IMPRESSION", "FOOD", "SERVICE", "AMBIENCE", "PRICE"]
POLARITY = ["POSITIVE", "NEUTRAL", "NEGATIVE"]

## Parameters

In [3]:
# Seed handed to both Python's `random` module and the Llama constructor
SEED = 43

# Limits handed to the LLM: context window (n_ctx) and tokens per completion (max_tokens)
CONTEXT_SIZE = 2048
MAX_TOKENS = 200

# JSON file with the real examples of the selected split
DATASET_PATH = f'../03 dataset split/real/real_{SPLIT}.json'

In [4]:
# Seed Python's global RNG so the shuffles and the example sampling below are repeatable
random.seed(SEED)

## Code

### Create Labels for Synthesis

#### Calculate the Number of Examples with 1, 2 or 3 Aspects and the Total Number of Aspects

In [5]:
# Split the configured number of synthetic examples into buckets: bucket i (0-based)
# holds the number of examples that will carry i + 1 aspects.
# N_SYNTH is used instead of a literal so the setting above actually takes effect.
n_for_n_aspects = divide_equally(N_SYNTH, random)

# Total number of (aspect, polarity) tuples needed: each example in bucket i consumes i + 1 tuples.
n_aspects_synth = sum(
    n_examples * aspects_per_example
    for aspects_per_example, n_examples in enumerate(n_for_n_aspects, start=1)
)

In [6]:
# Show the total number of aspect tuples and the per-bucket example counts
n_aspects_synth, n_for_n_aspects

(1001, [166, 167, 167])

#### Generate Random Tuples

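Each combination of aspect category and polarity is (almost) equally frequent: when the total number of aspects is not divisible by the number of combinations, the first combinations each receive one extra tuple.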

In [7]:
# Every (aspect category, polarity) pair that may appear in a label
combinations = list(itertools.product(CLASSES, POLARITY))
total_combinations = len(combinations)

# Full copies each pair gets, and how many pairs receive one extra copy
desired_count_per_combination, remaining_tuples = divmod(n_aspects_synth, total_combinations)

# Repeat each pair the same number of times, keeping the pairs grouped in product order
tuples = [
    pair
    for pair in combinations
    for _ in range(desired_count_per_combination)
]

# Hand the leftover slots to the first pairs (an empty slice when nothing is left over)
tuples += combinations[:remaining_tuples]

random.shuffle(tuples)

In [8]:
# Sanity check: total number of tuples and a peek at the first ten after shuffling
len(tuples), tuples[:10]

(1001,
 [('GENERAL-IMPRESSION', 'POSITIVE'),
  ('SERVICE', 'POSITIVE'),
  ('SERVICE', 'POSITIVE'),
  ('FOOD', 'POSITIVE'),
  ('SERVICE', 'NEGATIVE'),
  ('SERVICE', 'NEGATIVE'),
  ('PRICE', 'NEGATIVE'),
  ('SERVICE', 'POSITIVE'),
  ('GENERAL-IMPRESSION', 'NEGATIVE'),
  ('SERVICE', 'NEUTRAL')])

#### Generate Labels

In [9]:
# Build one label (a list of (aspect, polarity) tuples) per synthetic example.
# Bucket i of n_for_n_aspects holds the number of labels with i + 1 aspects;
# tuples are consumed front to back, each one used exactly once.
labels = []
idx_aspects = 0

for n_aspects, n_examples in enumerate(n_for_n_aspects, start=1):
    for _ in range(n_examples):
        next_idx = idx_aspects + n_aspects
        # Index element by element so running out of tuples still raises IndexError
        labels.append([tuples[i] for i in range(idx_aspects, next_idx)])
        idx_aspects = next_idx

In [10]:
# Labels were appended bucket by bucket (1, then 2, then 3 aspects); shuffle to mix the sizes
random.shuffle(labels)

### Setup Prompt Template

In [11]:
# Read the fixed prompt header that is placed in front of the few-shot examples.
# The encoding is given explicitly so the result does not depend on the platform default;
# this matches how the dataset file is opened.
with open('../prompt_template.txt', 'r', encoding='utf-8') as file:
    PROMPT_TEMPLATE = file.read()

### Load Split

In [12]:
# Parse the split's JSON file into `dataset`
with open(DATASET_PATH, mode='r', encoding='utf-8') as split_file:
    dataset = json.loads(split_file.read())

### Setup Model

In [13]:
# Load the local GGUF model file, then clear whatever the loader wrote to the cell output.
# NOTE(review): MODEL is "Llama70B" while this path names a 13B file — confirm which is intended,
# since MODEL is the value passed to xml_to_json for every generated example.
llm = Llama(
    model_path="llama-2-13b.Q4_0.gguf",
    seed=SEED,
    n_gpu_layers=1,
    n_ctx=CONTEXT_SIZE,
    verbose=False,
)
clear_output(wait=False)

### Create Synthetic Samples

In [14]:
# Number of few-shot examples placed in every prompt
N_FEW_SHOT_EXAMPLES = 10

# Generate one synthetic example per label; only the first three labels are processed here.
for label in labels[:3]:

    # Rejection counters for this label, one per validation stage.
    # They are statistics only and do not decide whether an attempt is accepted.
    invalid_xml_schema = 0
    invalid_xml_tags = 0
    aspect_polarity_in_text_but_not_in_label = 0

    # Aspects whose exact (aspect, polarity) tuple occurs once in the label.
    # This depends only on the label, so it is computed once instead of on every retry.
    # NOTE(review): an aspect listed with two different polarities is kept twice, and a tuple
    # repeated verbatim is dropped entirely — confirm this is what the helper expects.
    unique_aspects = [aspect for aspect, polarity in label if label.count((aspect, polarity)) == 1]

    valid_example = False
    while not valid_example:
        # Pick examples targeted at the label's aspects, then fill up with randomly drawn other entries
        ids_examples_for_aspects = get_examples_for_aspects_in_label(unique_aspects, dataset, random)
        additional_examples = [entry for entry in dataset if entry['id'] not in ids_examples_for_aspects]
        # Clamp at zero so more targeted examples than prompt slots cannot yield a negative sample size
        n_additional = max(0, N_FEW_SHOT_EXAMPLES - len(ids_examples_for_aspects))
        additional_examples = random.sample(additional_examples, n_additional)

        examples = additional_examples + [entry for entry in dataset if entry['id'] in ids_examples_for_aspects]
        examples_text = get_examples_as_text(examples)

        prompt_footer = f'\nLabel:{str(label)}\nPrediction:'
        prompt = PROMPT_TEMPLATE + examples_text + prompt_footer

        # echo=True returns prompt + completion, so the prompt prefix is sliced off again
        prediction = llm(prompt, max_tokens=MAX_TOKENS, stop=["Label:", "\n"], echo=True, top_p=1)["choices"][0]["text"][len(prompt):]
        print("---------\nGiven Label:", label, "\n")
        print("Prediction:", prediction, "\n")

        # Validate in stages. The attempt is accepted as soon as THIS attempt passes every stage.
        # Earlier rejections must not block acceptance: requiring all counters to be zero
        # would make the loop endless after the first rejected attempt.
        # The explicit comparison with False is kept so that any other return value of the
        # helpers is treated exactly as before.
        wrapped_prediction = f'<input>{prediction}</input>'
        if is_valid_xml(wrapped_prediction) == False:
            invalid_xml_schema += 1
        elif check_valid_aspect_xml(wrapped_prediction) == False:
            invalid_xml_tags += 1
        else:
            prediction_as_json = xml_to_json(prediction, label, MODEL, SPLIT)
            if prediction_as_json == "not-in-label":
                aspect_polarity_in_text_but_not_in_label += 1
            else:
                valid_example = True

        print(invalid_xml_schema, invalid_xml_tags, aspect_polarity_in_text_but_not_in_label)

    print(prediction_as_json)

---------
 [('FOOD', 'NEGATIVE'), ('SERVICE', 'NEUTRAL'), ('PRICE', 'NEGATIVE')] 

Wir haben uns vorher über <aspect-term aspect="SERVICE" polarity="NEUTRAL">Service</aspect-term> informiert und mögen es ganz gut, auch wenn ein Stück gebrannte Kartoffeln zu viel sind. Es hat einfach <aspect-term aspect="PRICE" polarity="NEGATIVE">Verkauf</aspect-term> und <aspect-term aspect="FOOD" polarity="NEGATIVE">Preis-Leistungsverhältnis</aspect-term> gemacht. 

0 0 0
{'tags': [{'text': 'Preis-Leistungsverhältnis', 'start': 150, 'end': 175, 'tag_with_polarity': 'FOOD-NEGATIVE', 'tag_with_polarity_and_type': 'FOOD-NEGATIVE-explicit', 'type': 'label-explicit', 'label': 'FOOD', 'polarity': 'NEGATIVE'}, {'text': 'Verkauf', 'start': 138, 'end': 145, 'tag_with_polarity': 'PRICE-NEGATIVE', 'tag_with_polarity_and_type': 'PRICE-NEGATIVE-explicit', 'type': 'label-explicit', 'label': 'PRICE', 'polarity': 'NEGATIVE'}, {'text': 'Service', 'start': 26, 'end': 33, 'tag_with_polarity': 'SERVICE-NEUTRAL', 'tag_wi


KeyboardInterrupt



In [None]:
# Inspect the raw text of the most recent completion
prediction

In [None]:
# TODO (done): check that the tags are valid XML
# TODO: check which tag names occur
# TODO: decide what to do when the text contains no aspect
# TODO: validate that the aspect tag is well-formed (" instead of ', only defined tags)
# TODO: check that polarity/label are valid
# TODO: check that the output is a complete sentence

In [None]:


# Manual check of the aspect-tag validator on a sentence with a nested <aspect-term>.
# The helper imported at the top of the notebook is check_valid_aspect_xml;
# the previously used name is_valid_aspect_xml is never imported and raises NameError on a fresh kernel.
test_string = '<input>Damit war die <aspect-term aspect="FOOD" polarity="NEGATIVE"><aspect-term aspect="FOOD" polarity="NEGATIVE">Sauce</aspect-term></aspect-term> in Summe ein wenig zu salzig und hat die <aspect-term aspect="FOOD" polarity="NEUTRAL">Steinpilze</aspect-term> komplett überdeckt.</input>'
print(check_valid_aspect_xml(test_string))


{'tags': [{'text': 'Preis-Leistungsverhältnis', 'start': 150, 'end': 175, 'tag_with_polarity': 'FOOD-NEGATIVE', 'tag_with_polarity_and_type': 'FOOD-NEGATIVE-explicit', 'type': 'label-explicit', 'label': 'FOOD', 'polarity': 'NEGATIVE'}, {'text': 'Verkauf', 'start': 138, 'end': 145, 'tag_with_polarity': 'PRICE-NEGATIVE', 'tag_with_polarity_and_type': 'PRICE-NEGATIVE-explicit', 'type': 'label-explicit', 'label': 'PRICE', 'polarity': 'NEGATIVE'}, {'text': 'Service', 'start': 26, 'end': 33, 'tag_with_polarity': 'SERVICE-NEUTRAL', 'tag_with_polarity_and_type': 'SERVICE-NEUTRAL-explicit', 'type': 'label-explicit', 'label': 'SERVICE', 'polarity': 'NEUTRAL'}], 'text': 'Wir haben uns vorher über Service informiert und mögen es ganz gut, auch wenn ein Stück gebrannte Kartoffeln zu viel sind. Es hat einfach Verkauf und Preis-Leistungsverhältnis gemacht.', 'id': UUID('ae0e71f6-6c79-481a-b035-eaecd3853563'), 'model': 'Llama70B', 'split': 0}
{'tags': [{'text': 'Fragebögen', 'start': 36, 'end': 46, 'tag_with_polarity': 'PRICE-POSITIVE', 'tag_with_polarity_and_type': 'PRICE-POSITIVE-explicit', 'type': 'label-explicit', 'label': 'PRICE', 'polarity': 'POSITIVE'}, {'text': None, 'start': 0, 'end': 0, 'tag_with_polarity': 'GENERAL-IMPRESSION-NEGATIVE', 'tag_with_polarity_and_type': 'GENERAL-IMPRESSION-NEGATIVE-implicit', 'type': 'label-implicit', 'label': 'GENERAL-IMPRESSION', 'polarity': 'NEGATIVE'}], 'text': 'Es kommt darauf an, was man für die Fragebögen zahlt.', 'id': UUID('8b031c07-0fa9-4af5-bb37-671dd635d890'), 'model': 'Llama70B', 'split': 0}