# Notebook: Split Dataset in folds

## Packages

In [1]:
from helper import divide_equally, get_examples_for_aspects_in_label, get_examples_as_text, xml_to_json, is_valid_xml, check_valid_aspect_xml, count_sentences_in_text
from IPython.display import clear_output
from llama_cpp import Llama
import numpy as np
import itertools
import warnings
import random
import json
import uuid
import os

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nils_hellwig/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Settings

In [2]:
SPLIT = 0

# Select Model for Synthesis
MODEL_ID = 0

# Number of Examples to be generated with the LLM
N_SYNTH = 500

# Setup Classes/Polarities for Synthesis
CLASSES  = ["GENERAL-IMPRESSION", "FOOD", "SERVICE", "AMBIENCE", "PRICE"]
POLARITY = ["POSITIVE", "NEUTRAL", "NEGATIVE"]

## Parameters

In [3]:
DATASET_PATH = f'../03 dataset split/real/real_{SPLIT}.json'
MAX_TOKENS = 200
CONTEXT_SIZE = 2048
SEED = 43

In [4]:
random.seed(SEED)

In [5]:
MODELS = ["Llama70B", "GPT-3"]
MODEL_PATHS = {"Llama70B": "llama-2-13b.Q4_0.gguf"}
MODEL_NAME = MODELS[MODEL_ID]

## Code

### Create Labels for Synthesis

#### Calculate Number Examples with 1, 2 or 3 Aspects and total number of Aspects

In [6]:
n_for_n_aspects = divide_equally(500, random)
n_aspects_synth = sum([n_for_n_aspects[0], n_for_n_aspects[1] * 2, n_for_n_aspects[2] * 3])

In [7]:
n_aspects_synth, n_for_n_aspects

(1001, [166, 167, 167])

#### Generate Random Tuples

Each combination of aspect-category + polarity is equally frequent

In [8]:
combinations = list(itertools.product(CLASSES, POLARITY))
total_combinations = len(combinations)
desired_count_per_combination = n_aspects_synth // total_combinations
remaining_tuples = n_aspects_synth % total_combinations
tuples = []
for combination in combinations:
    for _ in range(desired_count_per_combination):
        tuples.append(combination)

if remaining_tuples > 0:
    tuples.extend(combinations[:remaining_tuples])
    
random.shuffle(tuples)

In [9]:
len(tuples), tuples[:10]

(1001,
 [('GENERAL-IMPRESSION', 'POSITIVE'),
  ('SERVICE', 'POSITIVE'),
  ('SERVICE', 'POSITIVE'),
  ('FOOD', 'POSITIVE'),
  ('SERVICE', 'NEGATIVE'),
  ('SERVICE', 'NEGATIVE'),
  ('PRICE', 'NEGATIVE'),
  ('SERVICE', 'POSITIVE'),
  ('GENERAL-IMPRESSION', 'NEGATIVE'),
  ('SERVICE', 'NEUTRAL')])

In [14]:
from collections import Counter
counter = Counter(tuples)

for key, value in counter.items():
    print(f'Kombination: Aspekt="{key[0]}", Polarität="{key[1]}", Häufigkeit={value}')

Kombination: Aspekt="GENERAL-IMPRESSION", Polarität="POSITIVE", Häufigkeit=67
Kombination: Aspekt="SERVICE", Polarität="POSITIVE", Häufigkeit=67
Kombination: Aspekt="FOOD", Polarität="POSITIVE", Häufigkeit=67
Kombination: Aspekt="SERVICE", Polarität="NEGATIVE", Häufigkeit=67
Kombination: Aspekt="PRICE", Polarität="NEGATIVE", Häufigkeit=66
Kombination: Aspekt="GENERAL-IMPRESSION", Polarität="NEGATIVE", Häufigkeit=67
Kombination: Aspekt="SERVICE", Polarität="NEUTRAL", Häufigkeit=67
Kombination: Aspekt="FOOD", Polarität="NEGATIVE", Häufigkeit=67
Kombination: Aspekt="AMBIENCE", Polarität="NEUTRAL", Häufigkeit=67
Kombination: Aspekt="FOOD", Polarität="NEUTRAL", Häufigkeit=67
Kombination: Aspekt="PRICE", Polarität="POSITIVE", Häufigkeit=66
Kombination: Aspekt="GENERAL-IMPRESSION", Polarität="NEUTRAL", Häufigkeit=67
Kombination: Aspekt="AMBIENCE", Polarität="POSITIVE", Häufigkeit=67
Kombination: Aspekt="AMBIENCE", Polarität="NEGATIVE", Häufigkeit=66
Kombination: Aspekt="PRICE", Polarität="NEU

#### Generate Labels

In [10]:
n_aspects = 1
labels = []
idx_aspects = 0

for k in n_for_n_aspects:
    for n in range(k):
        label = []
        for aspect_idx in range(n_aspects):
            label.append(tuples[idx_aspects])
            idx_aspects += 1
        labels.append(label)
    n_aspects += 1

In [11]:
random.shuffle(labels)

In [12]:
labels

[[('FOOD', 'NEGATIVE'), ('SERVICE', 'NEUTRAL'), ('PRICE', 'NEGATIVE')],
 [('GENERAL-IMPRESSION', 'NEGATIVE'), ('PRICE', 'POSITIVE')],
 [('GENERAL-IMPRESSION', 'NEGATIVE')],
 [('PRICE', 'NEUTRAL'), ('PRICE', 'NEUTRAL')],
 [('FOOD', 'NEUTRAL'), ('SERVICE', 'NEUTRAL')],
 [('PRICE', 'NEUTRAL'), ('FOOD', 'POSITIVE'), ('FOOD', 'NEUTRAL')],
 [('AMBIENCE', 'NEUTRAL'), ('SERVICE', 'NEUTRAL')],
 [('GENERAL-IMPRESSION', 'NEGATIVE'),
  ('AMBIENCE', 'NEGATIVE'),
  ('FOOD', 'NEUTRAL')],
 [('GENERAL-IMPRESSION', 'NEGATIVE')],
 [('AMBIENCE', 'NEGATIVE'),
  ('GENERAL-IMPRESSION', 'NEUTRAL'),
  ('FOOD', 'NEGATIVE')],
 [('GENERAL-IMPRESSION', 'POSITIVE'),
  ('FOOD', 'POSITIVE'),
  ('PRICE', 'NEUTRAL')],
 [('PRICE', 'NEGATIVE')],
 [('SERVICE', 'NEUTRAL'), ('SERVICE', 'NEGATIVE')],
 [('AMBIENCE', 'NEGATIVE'), ('GENERAL-IMPRESSION', 'NEUTRAL')],
 [('FOOD', 'NEGATIVE'), ('PRICE', 'NEUTRAL')],
 [('PRICE', 'NEGATIVE')],
 [('AMBIENCE', 'NEGATIVE'), ('AMBIENCE', 'NEGATIVE')],
 [('SERVICE', 'POSITIVE'), ('AMBIENC

### Setup Prompt Template

In [12]:
with open('../prompt_template.txt', 'r') as file:
    PROMPT_TEMPLATE = file.read()

### Load Split

In [13]:
with open(DATASET_PATH, 'r', encoding='utf-8') as json_file:
    dataset = json.load(json_file)

### Setup Model

In [14]:
if MODEL_NAME == "Llama70B":
    llm = Llama(model_path=MODEL_PATHS[MODEL_NAME], seed=SEED, n_gpu_layers=1, n_ctx=CONTEXT_SIZE, verbose=False)
    clear_output(wait=False)
    def llm_model(text):
        return llm(prompt, max_tokens=MAX_TOKENS, stop=["Label:", "\n"], echo=True, top_p=1)["choices"][0]["text"][len(text):]

### Create Synthetic Samples

In [15]:
synth_dataset = []

In [16]:
for label in labels[:3]:
    
    valid_example = False
    invalid_xml_schema = 0
    invalid_xml_tags = 0
    aspect_polarity_in_text_but_not_in_label = 0
    more_than_one_sentences = 0
    
    while valid_example == False:
        unique_aspects = [aspect for aspect, _ in label if label.count((aspect, _)) == 1]
    
        ids_examples_for_aspects = get_examples_for_aspects_in_label(unique_aspects, dataset, random)
        additional_examples = [entry for entry in dataset if entry['id'] not in ids_examples_for_aspects]
        additional_examples = random.sample(additional_examples, 10-len(ids_examples_for_aspects))

        examples = additional_examples + [entry for entry in dataset if entry['id'] in ids_examples_for_aspects]
        examples_text = get_examples_as_text(examples)
    
        prompt_footer = f'\nLabel:{str(label)}\nPrediction:'
        prompt = PROMPT_TEMPLATE + examples_text + prompt_footer
    
        prediction = llm_model(prompt)
        
        if is_valid_xml(f'<input>{prediction}</input>') == False:
            invalid_xml_schema += 1
        else:
            if check_valid_aspect_xml(f'<input>{prediction}</input>') == False:
                invalid_xml_tags += 1
            else: 
                prediction_as_json = xml_to_json(prediction, label, MODEL_NAME, SPLIT)
                if prediction_as_json == "not-in-label":
                    aspect_polarity_in_text_but_not_in_label += 1
                else: 
                    if count_sentences_in_text(prediction_as_json["text"]) > 1:
                        more_than_one_sentences += 1
                    else:
                        valid_example = True
    
    prediction_as_json["llm_label"] = label
    prediction_as_json["llm_examples"] = examples
    prediction_as_json["llm_invalid_xml_schema"] = invalid_xml_schema
    prediction_as_json["llm_invalid_xml_tags"] = invalid_xml_tags
    prediction_as_json["llm_aspect_polarity_in_text_but_not_in_label"] = aspect_polarity_in_text_but_not_in_label
    prediction_as_json["llm_more_than_one_sentences"] = more_than_one_sentences
    
    synth_dataset.append(prediction_as_json)

In [17]:
# To-Do: Prüfen, ob tags valides xml :white_check_mark: 
# To-Do: Prüfen, welche Tag-Namen vorkommen :white_check_mark: 
# To-Do: Was, wenn es keinen Aspekt im Text gibt 
# To-Do: Validieren, dass Aspekt Tag valide ist " statt ', nur definierte tags
# To-Do: Prüfen, ob Polarität/Label valide :white_check_mark: 
# To-Do: Prüfen, ob es sich um einen ganzen Satz handelt white_check_mark: 

In [18]:
[ex["text"] for ex in synth_dataset]

['Wir haben uns vorher über Service informiert und der hieß, alles sei Muff.',
 'Leider habe das Essen keine gute Dauer hineingebracht und es waren zu wenig Einzelstücke im Angebot.',
 'Wir waren 4 Personen die im November letzten Jahres dort gegessen haben und alle von uns fanden das Essen total schlecht und ein Lokal im schlechtesten Sinne.']

In [19]:
json_file_path = f"synth/{MODEL_NAME}/split_{SPLIT}.json"
os.makedirs(os.path.dirname(json_file_path), exist_ok=True)

In [20]:
with open(json_file_path, "w") as outfile:
    json.dump(synth_dataset, outfile)

In [23]:
example = {'tags': [
  {'end': 34,
   'start': 27,
   'tag_with_polarity': 'PRICE-NEGATIVE',
   'tag_with_polarity_and_type': 'PRICE-NEGATIVE-explicit',
   'text': 'Verkauf',
   'type': 'label-explicit',
   'label': 'PRICE',
   'polarity': 'NEGATIVE'
  }
  ],
 'text': 'Wie überall geht es nur um Verkauf!'
}

In [30]:
# get_examples_as_text, xml_to_json
# xml_to_json(get_examples_as_text([example]), [("PRICE", "NEGATIVE")], MODEL_NAME, SPLIT)

get_examples_as_text([example])

'\nLabel:[(\'PRICE\', \'NEGATIVE\')]\nPrediction:Wie überall geht es nur um <aspect-term aspect="PRICE" polarity="NEGATIVE">Verkauf</aspect-term>'

In [34]:
#xml_to_json('Der <aspect-term aspect="PRICE" polarity="POSITIVE">Preis <aspect-term aspect="PRICE" polarity="POSITIVE">omg</aspect-term></aspect-term>', [("PRICE", "POSITIVE"), ("PRICE", "POSITIVE")], MODEL_NAME, SPLIT)
xml_to_json('<aspect-term aspect="PRICE" polarity="POSITIVE">Preis <aspect-term aspect="PRICE" polarity="POSITIVE">omg</aspect-term></aspect-term>', [("PRICE", "POSITIVE"), ("PRICE", "POSITIVE")], MODEL_NAME, SPLIT)

{'tags': [{'text': 'omg',
   'start': 6,
   'end': 9,
   'tag_with_polarity': 'PRICE-POSITIVE',
   'tag_with_polarity_and_type': 'PRICE-POSITIVE-explicit',
   'type': 'label-explicit',
   'label': 'PRICE',
   'polarity': 'POSITIVE'},
  {'text': 'Preis omg',
   'start': 0,
   'end': 9,
   'tag_with_polarity': 'PRICE-POSITIVE',
   'tag_with_polarity_and_type': 'PRICE-POSITIVE-explicit',
   'type': 'label-explicit',
   'label': 'PRICE',
   'polarity': 'POSITIVE'}],
 'text': 'Preis omg',
 'id': '973eb340-f7f1-436e-849a-b52ebc4517da',
 'model': 'Llama70B',
 'split': 0}

In [21]:
dataset[0]

{'tags': [{'end': 34,
   'start': 27,
   'tag_with_polarity': 'PRICE-NEGATIVE',
   'tag_with_polarity_and_type': 'PRICE-NEGATIVE-explicit',
   'text': 'Verkauf',
   'type': 'label-explicit',
   'label': 'PRICE',
   'polarity': 'NEGATIVE'}],
 'text': 'Wie überall geht es nur um Verkauf!',
 'aspect_available_without_judgement': False,
 'two_or_more_sentences': False,
 'id': '39945e4d-9eb3-4de3-a37a-ffb1a9917c96',
 'city': 'hamburg',
 'date': '2023-07-07',
 'title': 'Lokaiton ist toll ansonsten enttäuscht !!',
 'rating': 2.0,
 'review_id': 901244877,
 'page_index': 2,
 'author_name': 'christina k',
 'sentence_idx': 2,
 'language_code': 'de',
 'restaurant_id': 1483463,
 'author_location': '',
 'restaurant_name': 'Bullerei',
 'detected_language': 'de',
 'text_noanonymization': 'Meine Tochter , Ihr Vater und "Stiefmutter " sind nach der Abizeugnisvergabe in der Bullerei zu Essen gewesen . Ambiente toll , Essen überpreist und das man der Stiefmutter gratuliert ohne was zu hinterfragen ist sch