# Notebook: Generate Synthetic Examples with an LLM

## Packages

In [1]:
from helper import divide_equally, get_examples_for_aspects_in_label, get_examples_as_text
from IPython.display import clear_output
from llama_cpp import Llama
import numpy as np
import itertools
import warnings
import random
import json

## Settings

In [2]:
# Index of the real-data split to work on; interpolated into DATASET_PATH
# (real_{SPLIT}.json).
SPLIT = 0

# Select Model for Synthesis (the first entry of MODELS is the active one).
# NOTE(review): the Llama(...) call in the "Setup Model" section passes a
# hard-coded "llama-2-13b.Q4_0.gguf" path and does not read MODEL — confirm
# which of the two is intended.
MODELS = ["Llama70B"]
MODEL = MODELS[0]

# Number of Examples to be generated with the LLM
N_SYNTH = 500

# Aspect categories and sentiment polarities for synthesis. Their cartesian
# product yields the (category, polarity) tuples that labels are built from.
CLASSES  = ["GENERAL-IMPRESSION", "FOOD", "SERVICE", "AMBIENCE", "PRICE"]
POLARITY = ["POSITIVE", "NEUTRAL", "NEGATIVE"]

## Parameters

In [3]:
# JSON file with the real examples of the selected split.
DATASET_PATH = f'../03 dataset split/real/real_{SPLIT}.json'
# Passed as max_tokens to every LLM completion call.
MAX_TOKENS = 200
# Passed as n_ctx when the Llama model is constructed.
CONTEXT_SIZE = 2048
# Seed used both for Python's random module and for the Llama constructor.
SEED = 43

In [4]:
# Seed Python's global RNG before any shuffling/sampling happens below.
random.seed(SEED)

## Code

### Create Labels for Synthesis

#### Calculate Number of Examples with 1, 2 or 3 Aspects and Total Number of Aspects

In [5]:
# Distribute the N_SYNTH examples over the buckets "1 aspect", "2 aspects",
# "3 aspects" (use the setting instead of a hard-coded 500 so that changing
# N_SYNTH actually takes effect).
# divide_equally is a project helper; it receives the seeded `random` module —
# presumably to decide which bucket gets the remainder (TODO confirm).
n_for_n_aspects = divide_equally(N_SYNTH, random)

# Total number of (category, polarity) tuples required: every example in the
# i-th bucket (1-based) consumes i tuples. Iterating over all buckets keeps
# this in sync with the label-generation loop, which also walks every bucket.
n_aspects_synth = sum(
    aspects_per_example * n_examples
    for aspects_per_example, n_examples in enumerate(n_for_n_aspects, start=1)
)

In [6]:
# Inspect the total number of aspect tuples and the per-bucket example counts.
n_aspects_synth, n_for_n_aspects

(1001, [166, 167, 167])

#### Generate Random Tuples

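Each combination of aspect category and polarity occurs equally often; if the total is not divisible by the number of combinations, the first few combinations receive one extra tuple each.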

In [7]:
# All (aspect category, polarity) pairs, in CLASSES-major order.
combinations = list(itertools.product(CLASSES, POLARITY))
total_combinations = len(combinations)

# How often every pair appears, and how many tuples are left over after that.
desired_count_per_combination, remaining_tuples = divmod(n_aspects_synth, total_combinations)

# Repeat each pair the same number of times (grouped by pair) ...
tuples = [
    pair
    for pair in combinations
    for _repeat in range(desired_count_per_combination)
]
# ... then hand the leftover slots to the first `remaining_tuples` pairs.
tuples += combinations[:remaining_tuples]

# Randomise the order so that labels built from consecutive tuples are mixed.
random.shuffle(tuples)

In [8]:
# Inspect the number of generated tuples and a small sample of them.
len(tuples), tuples[:10]

(1001,
 [('GENERAL-IMPRESSION', 'POSITIVE'),
  ('SERVICE', 'POSITIVE'),
  ('SERVICE', 'POSITIVE'),
  ('FOOD', 'POSITIVE'),
  ('SERVICE', 'NEGATIVE'),
  ('SERVICE', 'NEGATIVE'),
  ('PRICE', 'NEGATIVE'),
  ('SERVICE', 'POSITIVE'),
  ('GENERAL-IMPRESSION', 'NEGATIVE'),
  ('SERVICE', 'NEUTRAL')])

#### Generate Labels

In [9]:
# Build one label (a list of tuples) per synthetic example by consuming the
# shuffled tuples in order: the first bucket gets 1 tuple per label, the
# second 2, the third 3, and so on.
labels = []
idx_aspects = 0   # position of the next unused tuple
n_aspects = 1     # tuples per label in the current bucket

for bucket_size in n_for_n_aspects:
    for _example in range(bucket_size):
        next_idx = idx_aspects + n_aspects
        # Explicit indexing (not slicing) so running out of tuples still raises.
        labels.append([tuples[pos] for pos in range(idx_aspects, next_idx)])
        idx_aspects = next_idx
    n_aspects += 1

In [10]:
# Mix the labels so that 1-, 2- and 3-aspect labels are no longer grouped.
# Shuffles in place: re-running this cell alone reorders `labels` again.
random.shuffle(labels)

### Setup Prompt Template

In [11]:
# Load the prompt header that precedes the few-shot examples.
# The template is German text containing umlauts, so the encoding is given
# explicitly (as for the dataset file) instead of relying on the platform
# default, which is not UTF-8 on every system.
with open('../prompt_template.txt', 'r', encoding='utf-8') as file:
    PROMPT_TEMPLATE = file.read()

### Load Split

In [12]:
# Read the real examples of the selected split from its JSON file.
with open(DATASET_PATH, mode='r', encoding='utf-8') as split_file:
    dataset = json.loads(split_file.read())

### Setup Model

In [13]:
# Load the local GGUF model through llama-cpp-python, using the shared seed
# and context size from the parameters cell.
# NOTE(review): model_path is hard-coded to a Llama-2 13B file while the
# settings cell declares MODEL = "Llama70B" — confirm which model is intended.
llm = Llama(model_path="llama-2-13b.Q4_0.gguf", seed=SEED, n_gpu_layers=1, n_ctx=CONTEXT_SIZE, verbose=False)
# Remove whatever the model loading printed to the cell output.
clear_output(wait=False)

### Create Synthetic Samples

In [14]:
# Number of few-shot examples placed in every prompt.
N_FEWSHOT_EXAMPLES = 10

# NOTE(review): only the first label is processed and the prediction is merely
# printed, not stored — this looks like a smoke test; widen the slice and
# collect the results for a full run.
for label in labels[:1]:
    # Aspect categories addressed by this label, de-duplicated in first-seen
    # order. (A plain set() of strings has a hash-seed-dependent order, which
    # would make the seeded helper below non-reproducible across kernels.)
    unique_aspects = list(dict.fromkeys(aspect for aspect, _ in label))
    # Project helper; its result is used as a collection of example ids below.
    ids_examples_for_aspects = get_examples_for_aspects_in_label(unique_aspects, random, dataset)

    # Fill the remaining few-shot slots with random other examples. The sample
    # size is clamped so random.sample cannot raise when the helper already
    # returned N_FEWSHOT_EXAMPLES or more ids, or when too few examples remain.
    additional_examples = [entry for entry in dataset if entry['id'] not in ids_examples_for_aspects]
    n_additional = max(0, N_FEWSHOT_EXAMPLES - len(ids_examples_for_aspects))
    n_additional = min(n_additional, len(additional_examples))
    additional_examples = random.sample(additional_examples, n_additional)

    # Random fillers first, then the aspect-specific examples (in dataset order).
    examples = additional_examples + [entry for entry in dataset if entry['id'] in ids_examples_for_aspects]
    examples_text = get_examples_as_text(examples)

    prompt_footer = f'\nLabel:{label}\nPrediction:'
    prompt = PROMPT_TEMPLATE + examples_text + prompt_footer

    print(prompt)
    # echo=True returns prompt + completion, so the prompt prefix is sliced off
    # again; generation stops at the first newline or at the next "Label:".
    prediction = llm(prompt, max_tokens=MAX_TOKENS, stop=["Label:", "\n"], echo=True, top_p=1)["choices"][0]["text"][len(prompt):]
    print(prediction)

Bitte erzeuge genau einen Satz einer Restaurantbewertung, die für das Training eines Modells für die Aspekt-basierte Sentiment Analyse verwendet werden kann.
Gegeben ist ein Label in Form eines Arrays, wobei für jedes Sentiment, mit dem eine Aspekt-Kategorie im Satz adressiert wird, ein Tuple (Aspekt-Kategorie, Aspekt-Sentiment) vorhanden ist.
Eine Aspekt-Kategorie kann mehrfach im Satz adressiert werden, auch mit verschiedenen Sentiment-Polaritäten.

* Aspekt-Kategorien hat nur diese Werte: ["GENERAL-IMPRESSION", "FOOD", "SERVICE", "PRICE", "AMBIENCE"]
* Sentiment nur diese Polaritäten: ["negative", "neutral", "positive"]

Auf Basis des Arrays soll genau ein deutscher Satz erzeugt werden, der die in den Tuple definierten Aspekte enthält.
Zusätzlich kann für ein im Label vorgegebenes Tuple (Aspekt-Kategorie, Aspekt-Sentiment) ein Aspekte-Term im Text vorliegen.
Der Aspekt-Term wird mithilfe eines xml-Tags markiert.

Gebe nur die Prediction zurück, ohne Kommentare oder zusätzlichen Text

In [80]:
# Scratch experiment: find <aspect-term> spans in a generated sentence using
# the third-party `regex` package (note: imported under the name `re`, which
# shadows the standard-library module of that name).
import regex as re

# Sample prediction containing a (faulty) nested aspect-term tag.
s = 'Klassisches <aspect-term aspect="AMBIENCE" polarity="NEUTRAL"><aspect-term aspect="AMBIENCE" polarity="NEUTRAL">Brauerreigasthaus</aspect-term></aspect-term> mit typischen <aspect-term aspect="AMBIENCE" polarity="NEUTRAL">Schnitzel</aspect-term>.'

# `.+` is greedy, so each match extends to the LAST closing tag in the string;
# overlapped=True additionally reports matches starting at later opening tags.
pattern = r'<aspect-term.+<\/aspect-term>'
matches = list(re.finditer(pattern, s, overlapped=True))

for match in matches:
    start, end = match.span()
    print(f"Match gefunden bei Position {start}-{end}: '{match.group()}'")



Match gefunden bei Position 12-245: '<aspect-term aspect="AMBIENCE" polarity="NEUTRAL"><aspect-term aspect="AMBIENCE" polarity="NEUTRAL">Brauerreigasthaus</aspect-term></aspect-term> mit typischen <aspect-term aspect="AMBIENCE" polarity="NEUTRAL">Schnitzel</aspect-term>'
Match gefunden bei Position 62-245: '<aspect-term aspect="AMBIENCE" polarity="NEUTRAL">Brauerreigasthaus</aspect-term></aspect-term> mit typischen <aspect-term aspect="AMBIENCE" polarity="NEUTRAL">Schnitzel</aspect-term>'
Match gefunden bei Position 172-245: '<aspect-term aspect="AMBIENCE" polarity="NEUTRAL">Schnitzel</aspect-term>'
