# Notebook: Create Synthetic Samples for a Dataset Split

## Packages

In [35]:
from helper import divide_equally, get_examples_for_aspects_in_label, get_examples_as_text, xml_to_json, is_valid_xml, check_valid_aspect_xml, count_sentences_in_text
from IPython.display import clear_output
from itertools import cycle, islice
from dotenv import load_dotenv
from llama_cpp import Llama
import numpy as np
import itertools
import warnings
import random
import openai
import json
import uuid
import os

In [36]:
# Load variables from a local .env file into the process environment;
# OPENAI_API_KEY is read later through os.getenv when the GPT-3 backend is selected.
load_dotenv()

True

## Settings

In [37]:
# Index of the dataset split to process (used in the dataset path, the seed and the output file name).
SPLIT = 0
# Index into the MODELS list that selects which language model generates the samples.
MODEL_ID = 0

## Parameters

In [38]:
# Generation limits handed to the language model backends.
MAX_TOKENS = 250
CONTEXT_SIZE = 4096

# JSON file holding the real examples of the selected split.
DATASET_PATH = f'../03 dataset split/real/real_{SPLIT}.json'

# Seed is the digits "43" followed by the split index and the model index,
# so every (split, model) combination gets its own deterministic seed.
SEED = int(f"43{SPLIT}{MODEL_ID}")

In [39]:
# Aspect categories and sentiment polarities used to build synthesis labels
CLASSES = ["GENERAL-IMPRESSION", "FOOD", "SERVICE", "AMBIENCE", "PRICE"]
POLARITIES = ["POSITIVE", "NEUTRAL", "NEGATIVE"]

# Every (aspect, polarity) pair, ordered polarity-major: all aspects for
# POSITIVE first, then NEUTRAL, then NEGATIVE.
COMBINATIONS = [
    (aspect, polarity)
    for polarity, aspect in itertools.product(POLARITIES, CLASSES)
]

In [40]:
# Stop sequences passed as `stop` to both model backends: generation ends at the
# next "Label:" marker or at the first newline.
STOP_CRITERIA = ["Label:", "\n"]

In [41]:
# Seed Python's `random` module, which drives the label shuffling and the
# example sampling further down.
random.seed(SEED)

In [42]:
# Supported generators; MODEL_ID indexes into this list.
MODELS = ["Llama13B", "Llama70B", "Falcon40B", "GPT-3"]
# 175B, 70B and 40B
# NOTE(review): presumably the parameter counts of GPT-3, Llama70B and Falcon40B;
# Llama13B is not covered by this note -- confirm.
# GGUF weight files for the models run locally through llama.cpp ("GPT-3" has no entry).
MODEL_PATHS = {"Llama13B": "llama-2-13b.Q4_0.gguf", "Llama70B": "llama-2-70b.Q4_0.gguf", "Falcon40B": "falcon-40b-Q4_K_S.gguf"}
MODEL_NAME = MODELS[MODEL_ID]

## Code

### Create Labels for Synthesis

In [43]:
# Number of labels per label size (key = number of aspect-polarity pairs in a
# label, as a string): 10% / 40% / 50% of 500 labels.
label_ratio = {
    str(n_pairs): int(share * 500)
    for n_pairs, share in ((1, 0.1), (2, 0.4), (3, 0.5))
}

In [44]:
# Display how many labels are created for each label size.
label_ratio

{'1': 50, '2': 200, '3': 250}

### Calculate how many Aspect-Polarity Pairs should be used

In [45]:
# Total number of aspect-polarity pairs needed: each label of size k
# contributes k pairs, and there are label_ratio[k] labels of that size.
n_tuples = sum(int(size) * count for size, count in label_ratio.items())
n_tuples

1200

### Create List of balanced aspect-polarity tuples

In [46]:
def get_500_labels():
    """Build one shuffled batch of labels with balanced aspect-polarity pairs.

    A pool of `n_tuples` pairs is drawn by cycling through COMBINATIONS and then
    shuffled. The pool is consumed in consecutive segments, one per entry of
    `label_ratio`; each segment is cut into labels holding `int(key)` pairs.
    With the notebook's `label_ratio` this yields 50 + 200 + 250 = 500 labels.

    Returns:
        list: labels in random order, each a list of (aspect, polarity) tuples.
    """
    pair_pool = list(islice(cycle(COMBINATIONS), n_tuples))
    random.shuffle(pair_pool)

    grouped_labels = []
    offset = 0
    for size_key, n_labels in label_ratio.items():
        size = int(size_key)
        segment = pair_pool[offset: offset + n_labels * size]

        # Only complete groups are kept; a trailing partial group is dropped.
        n_complete = len(segment) // size
        grouped_labels.extend(
            segment[pos:pos + size] for pos in range(0, n_complete * size, size)
        )

        offset += size * int(n_labels)

    random.shuffle(grouped_labels)
    return grouped_labels

# Concatenate four independently shuffled batches of labels, in call order.
labels = [label for _batch in range(4) for label in get_500_labels()]
len(labels)

2000

### Setup Prompt Template

In [47]:
# Read the static prompt header that precedes the few-shot examples.
# The encoding is set explicitly (matching the dataset read below) so the
# template decodes the same way regardless of the platform's default encoding.
with open('../prompt_template.txt', 'r', encoding='utf-8') as file:
    PROMPT_TEMPLATE = file.read()

### Load Split

In [48]:
# Parse the real examples of the selected split from its JSON file.
with open(DATASET_PATH, 'r', encoding='utf-8') as json_file:
    raw_json = json_file.read()
    dataset = json.loads(raw_json)

### Setup Model

In [49]:
# Local llama.cpp backends. All three models share the same loader settings;
# only Llama70B additionally passes n_gqa=8.
if MODEL_NAME in ("Llama13B", "Llama70B", "Falcon40B"):
    llama_options = dict(
        model_path=MODEL_PATHS[MODEL_NAME],
        seed=SEED,
        n_gpu_layers=1,
        n_ctx=CONTEXT_SIZE,
        verbose=False,
    )
    if MODEL_NAME == "Llama70B":
        llama_options["n_gqa"] = 8

    llm = Llama(**llama_options)
    # Clear whatever the model loader printed into the cell output.
    clear_output(wait=False)

    def llm_model(text):
        """Return the model's continuation of `text`.

        The completion is requested with echo=True, so the returned text begins
        with the prompt itself; the first len(text) characters are sliced off.

        The prompt sent to the model is the `text` argument. Previously the
        global `prompt` variable was sent instead, which only worked because
        the caller happened to pass that same global.
        """
        completion = llm(text, max_tokens=MAX_TOKENS, stop=STOP_CRITERIA, echo=True, top_p=1)
        return completion["choices"][0]["text"][len(text):]

In [50]:
# Hosted backend: the "GPT-3" option calls the OpenAI chat completion endpoint
# with the gpt-3.5-turbo model.
if MODEL_NAME == "GPT-3":
    openai.api_key = os.getenv("OPENAI_API_KEY")

    def llm_model(text):
        """Send `text` as a single user message and return the stripped reply."""
        chat_messages = [{"role": "user", "content": text}]
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=chat_messages,
            max_tokens=MAX_TOKENS,
            temperature=0.7,
            stop=STOP_CRITERIA,
        )
        first_choice = response.choices[0]
        return first_choice.message.content.strip()

### Create Synthetic Samples

In [51]:
# Accumulates the accepted synthetic examples; re-running this cell resets it.
synth_dataset = []

In [52]:
# Generate one accepted synthetic example per label, retrying until the model
# output passes every validation step. There is no retry limit.
# NOTE(review): only the first five labels are processed here -- looks like a
# trial-run limit; confirm before generating a full split.
for label in labels[:5]:

    # Rejection counters, one per validation step.
    invalid_xml_schema = 0
    invalid_xml_tags = 0
    aspect_polarity_in_text_but_not_in_label = 0
    more_than_one_sentences = 0
    valid_example = False

    while not valid_example:
        # Aspects whose exact (aspect, polarity) pair occurs once in the label.
        unique_aspects = [
            aspect for aspect, polarity in label
            if label.count((aspect, polarity)) == 1
        ]

        # Ids of dataset entries chosen by the helper for these aspects; the
        # prompt is then filled up to ten examples with random other entries.
        ids_examples_for_aspects = get_examples_for_aspects_in_label(unique_aspects, dataset, random)
        remaining_entries = [entry for entry in dataset if entry['id'] not in ids_examples_for_aspects]
        additional_examples = random.sample(remaining_entries, 10 - len(ids_examples_for_aspects))

        aspect_examples = [entry for entry in dataset if entry['id'] in ids_examples_for_aspects]
        examples = additional_examples + aspect_examples
        examples_text = get_examples_as_text(examples)

        prompt_footer = f'\nLabel:{str(label)}\nPrediction:'
        prompt = PROMPT_TEMPLATE + examples_text + prompt_footer

        prediction = llm_model(prompt)
        wrapped_prediction = f'<input>{prediction}</input>'

        # Each failed check bumps its counter and triggers a fresh attempt.
        if is_valid_xml(wrapped_prediction) == False:
            invalid_xml_schema += 1
            continue

        if check_valid_aspect_xml(wrapped_prediction) == False:
            invalid_xml_tags += 1
            continue

        prediction_as_json = xml_to_json(prediction, label, MODEL_NAME, SPLIT)
        if prediction_as_json == "not-in-label":
            aspect_polarity_in_text_but_not_in_label += 1
            continue

        if count_sentences_in_text(prediction_as_json["text"]) > 1:
            more_than_one_sentences += 1
            continue

        valid_example = True

    # Attach the generation metadata of the accepted attempt.
    prediction_as_json.update({
        "llm_label": label,
        "llm_examples": examples,
        "llm_invalid_xml_schema": invalid_xml_schema,
        "llm_invalid_xml_tags": invalid_xml_tags,
        "llm_aspect_polarity_in_text_but_not_in_label": aspect_polarity_in_text_but_not_in_label,
        "llm_more_than_one_sentences": more_than_one_sentences,
    })

    synth_dataset.append(prediction_as_json)

In [53]:
# Output location: one JSON file per split inside a folder named after the model.
json_file_path = f"synth/{MODEL_NAME}/split_{SPLIT}.json"
# Create the model folder if it does not exist yet.
os.makedirs(os.path.dirname(json_file_path), exist_ok=True)

In [54]:
# Write all accepted synthetic examples to disk, overwriting any existing file.
with open(json_file_path, "w") as outfile:
    json.dump(synth_dataset, outfile)

In [55]:
# Inspect each result: generated text, requested label, and the texts of the
# few-shot examples that were shown to the model.
[
    (
        sample["text"],
        sample["llm_label"],
        [shot["text"] for shot in sample["llm_examples"]],
    )
    for sample in synth_dataset
]

[('Mitten im Chalets und Hütten der Gegend',
  [('FOOD', 'NEUTRAL'), ('AMBIENCE', 'NEUTRAL')],
  ['Käsespätzle sind halt nicht mit einer Alm zu vergleichen.',
   'Käsespätzle sind halt nicht mit einer Alm zu vergleichen.',
   'LOC, eine beeindruckende Kuchen- und Tortenauswahl und ein Interior Design, das als Filmkulisse für eine Geschischte dienen könnte, die in den goldenen 1920er Jahren spielt.',
   'Suppe war nicht heiß, kümmerte den Service aber nicht.',
   'War zwar viel los, dennoch hat das Personal nicht an Höflichkeit und Spaß verloren.',
   'Niemand hatte den Tisch im Blick, keiner fühlte sich zuständig.',
   'Irgendwann kam unser Kellner nicht mehr, wir mussten andere Kellner ansprechen, damit sie ihn holten, um noch Getränke bestellen zu können.',
   'Käsespätzle sind halt nicht mit einer Alm zu vergleichen.',
   'LOC, eine beeindruckende Kuchen- und Tortenauswahl und ein Interior Design, das als Filmkulisse für eine Geschischte dienen könnte, die in den goldenen 1920er Jah