# Notebook: Generate Synthetic Examples for a Dataset Split

## Packages

In [1]:
from helper import divide_equally, get_examples_for_aspects_in_label, get_examples_as_text, xml_to_json, is_valid_xml, check_valid_aspect_xml, count_sentences_in_text, german_language_detected
from IPython.display import clear_output
from itertools import cycle, islice
from dotenv import load_dotenv
from llama_cpp import Llama
import numpy as np
import itertools
import warnings
import random
import openai
import json
import uuid
import os

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nils_hellwig/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load environment variables from a .env file; OPENAI_API_KEY is read via os.getenv further below.
load_dotenv()

True

## Settings

In [3]:
# Index of the real-data split; it selects the input file and is part of the seed and output path.
SPLIT = 4
# Index into MODELS choosing which LLM generates the samples.
MODEL_ID = 0
# Few-shot condition: "fixed" reuses one example set for all prompts, any other value resamples per prompt.
FEW_SHOTS = "fixed" # "fixed" or "random"

## Parameters

In [4]:
# Input file: the real examples of the selected split, used as the few-shot pool.
DATASET_PATH = f'../03 dataset split/real/real_{SPLIT}.json'

# Generation settings handed to the LLM backends.
MAX_TOKENS = 250
CONTEXT_SIZE = 4096
TEMPERATURE = 0.7

# Seed is the digit concatenation "43" + SPLIT + MODEL_ID, so each split/model pair gets its own seed.
SEED = int(f"43{SPLIT}{MODEL_ID}")

In [5]:
# Aspect categories and sentiment polarities used for synthesis
CLASSES = ["GENERAL-IMPRESSION", "FOOD", "SERVICE", "AMBIENCE", "PRICE"]
POLARITIES = ["POSITIVE", "NEUTRAL", "NEGATIVE"]
# Every (aspect, polarity) pair, ordered polarity-major: all aspects for POSITIVE first, then NEUTRAL, then NEGATIVE.
COMBINATIONS = [
    (aspect, polarity)
    for polarity, aspect in itertools.product(POLARITIES, CLASSES)
]

In [6]:
# Stop sequences passed to both backends: generation ends at a newline or at the next "Label:" marker.
STOP_CRITERIA = ["Label:", "\n"]

In [7]:
# Seed Python's global RNG, which drives label shuffling and few-shot example sampling below.
random.seed(SEED)

In [8]:
# Available generators; MODEL_ID indexes into this list.
MODELS = ["Llama13B", "Llama70B", "Falcon40B", "GPT-3"]
# 175B, 70B and 40B (presumably the parameter counts of the larger models)
# Weight files for the locally run models; "GPT-3" has no entry because it is called through the OpenAI API.
MODEL_PATHS = {
    "Llama13B": "llama-2-13b.Q4_0.gguf",
    "Llama70B": "llama-2-70b.Q4_0.gguf",
    "Falcon40B": "falcon-40b-Q4_K_S.gguf",
}
MODEL_NAME = MODELS[MODEL_ID]

In [9]:
# Output file for the generated samples, one file per model / few-shot condition / split.
SYNTH_PATH = f"synth/{MODEL_NAME}/{FEW_SHOTS}/split_{SPLIT}.json"

## Code

### Create Labels for Synthesis

In [10]:
# Number of labels per label size (key = number of aspect-polarity pairs in a label),
# expressed as shares of a batch of 500 labels.
label_ratio = {
    str(n_pairs): int(share * 500)
    for n_pairs, share in ((1, 0.1), (2, 0.4), (3, 0.5))
}

In [11]:
# Display the per-size label counts as a quick visual check.
label_ratio

{'1': 50, '2': 200, '3': 250}

### Calculate how many Aspect-Polarity Pairs should be used

In [12]:
# Total number of aspect-polarity pairs needed: (pairs per label) x (number of such labels), summed over sizes.
n_tuples = sum(int(n_pairs) * n_labels for n_pairs, n_labels in label_ratio.items())
n_tuples

1200

### Create List of balanced aspect-polarity tuples

In [13]:
def get_500_labels(combinations=None, ratio=None, rng=None):
    """Build one shuffled batch of labels with balanced aspect-polarity pairs.

    A label is a list of (aspect, polarity) tuples. The pool of tuples is
    created by cycling through ``combinations`` until exactly as many tuples
    exist as the batch needs, so every combination occurs (almost) equally
    often. The shuffled pool is then cut into labels of the requested sizes.

    Args:
        combinations: sequence of (aspect, polarity) tuples to cycle through.
            Defaults to the module-level ``COMBINATIONS``.
        ratio: mapping ``{label size (as str or int): number of labels}``.
            Defaults to the module-level ``label_ratio`` (500 labels in total).
        rng: object providing ``shuffle`` (e.g. ``random.Random``). Defaults to
            the global ``random`` module, i.e. the globally seeded RNG.

    Returns:
        A shuffled list of labels; its length is the sum of the counts in
        ``ratio``.
    """
    if combinations is None:
        combinations = COMBINATIONS
    if ratio is None:
        ratio = label_ratio
    if rng is None:
        rng = random

    sizes = [(int(size), int(count)) for size, count in ratio.items()]
    # Derive the pool size from the ratio itself instead of relying on a
    # separately computed global that can go stale when the ratio changes.
    total_tuples = sum(size * count for size, count in sizes)

    pool = list(islice(cycle(combinations), total_tuples))
    rng.shuffle(pool)

    tuples_list = []
    start = 0
    for size, count in sizes:
        stop = start + size * count
        # Cut this size's share of the pool into consecutive chunks of `size` tuples.
        tuples_list.extend(pool[k:k + size] for k in range(start, stop, size))
        start = stop

    # Mix labels of different sizes so they are not grouped by size.
    rng.shuffle(tuples_list)
    return tuples_list

# Concatenate four independently shuffled batches of 500 labels each.
labels = [label for _batch in range(4) for label in get_500_labels()]
len(labels)

2000

### Setup Prompt Template

In [14]:
# Read the prompt header that precedes the few-shot examples.
# The encoding is set explicitly (as for the dataset file) so the result does not
# depend on the platform's default encoding.
with open('../prompt_template.txt', 'r', encoding='utf-8') as file:
    PROMPT_TEMPLATE = file.read()

### Load Split

In [15]:
# Parse the real examples of the selected split; they serve as the few-shot pool.
with open(DATASET_PATH, mode='r', encoding='utf-8') as dataset_file:
    dataset = json.load(dataset_file)

### Setup Model

In [16]:
if MODEL_NAME in ("Llama13B", "Llama70B", "Falcon40B"):
    # Only the Llama70B load passes n_gqa=8; the other local models use the loader defaults.
    extra_load_kwargs = {"n_gqa": 8} if MODEL_NAME == "Llama70B" else {}
    llm = Llama(model_path=MODEL_PATHS[MODEL_NAME], seed=SEED, n_gpu_layers=1, n_ctx=CONTEXT_SIZE, verbose=False, **extra_load_kwargs)
    clear_output(wait=False)

    def llm_model(text):
        """Return the local model's continuation of `text`.

        The prompt is taken from the `text` argument (not from a notebook
        global), so the function gives the right answer for any caller.
        """
        completion = llm(text, max_tokens=MAX_TOKENS, stop=STOP_CRITERIA, echo=True, top_p=1, temperature=TEMPERATURE)
        # With echo=True the returned text starts with the prompt; strip it off.
        return completion["choices"][0]["text"][len(text):]

In [17]:
if MODEL_NAME == "GPT-3":
    # The API key is taken from the environment.
    openai.api_key = os.getenv("OPENAI_API_KEY")

    def llm_model(text):
        """Send `text` as a single user message to gpt-3.5-turbo and return the stripped reply."""
        chat_messages = [{"role": "user", "content": text}]
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=chat_messages,
            max_tokens=MAX_TOKENS,
            temperature=TEMPERATURE,
            stop=STOP_CRITERIA,
        )
        first_choice = response.choices[0]
        return first_choice.message.content.strip()

### Create Fixed Examples

For the "fixed" condition, the same 10 few-shot examples are used in every prompt; they are selected at random once, before generation starts.

In [18]:
# In the "fixed" condition the few-shot examples are chosen once here and reused for every prompt.
# NOTE(review): the trailing 2 is presumably "examples per aspect" (5 classes x 2 = 10) — confirm in helper.
if FEW_SHOTS == "fixed":
    fixed_examples = get_examples_for_aspects_in_label(CLASSES, dataset, random, 2)

### Create Synthetic Samples

In [19]:
# Accumulates the accepted synthetic samples; re-run this cell before re-running the generation loop.
synth_dataset = []

In [20]:
# Running count of accepted samples; only used in the progress output of the generation loop.
idx = 0

In [21]:
# Names of the acceptance checks, in the order they are applied. Each name is also
# the suffix of the "llm_<name>" counter stored with every accepted sample.
REJECTION_REASONS = (
    "invalid_xml_schema",
    "invalid_xml_tags",
    "aspect_polarity_in_text_but_not_in_label",
    "more_than_one_sentences",
    "no_german_language",
)


def _validate_prediction(prediction, label):
    """Run the acceptance checks on one raw model output.

    Returns a ``(rejection_reason, parsed)`` pair. ``rejection_reason`` is the
    name of the first failed check (one of ``REJECTION_REASONS``) or ``None``
    if every check passed; ``parsed`` is the value returned by ``xml_to_json``
    for an accepted prediction and ``None`` otherwise.
    """
    wrapped = f'<input>{prediction}</input>'
    if is_valid_xml(wrapped) == False:
        return "invalid_xml_schema", None
    if check_valid_aspect_xml(wrapped) == False:
        return "invalid_xml_tags", None
    parsed = xml_to_json(prediction, label, MODEL_NAME, SPLIT)
    if parsed == "not-in-label":
        return "aspect_polarity_in_text_but_not_in_label", None
    if count_sentences_in_text(parsed["text"]) > 1:
        return "more_than_one_sentences", None
    if german_language_detected(parsed["text"]) == False:
        return "no_german_language", None
    return None, parsed


# NOTE(review): only the first 5 labels are processed — looks like a leftover
# from a trial run; confirm before generating the full dataset.
for label in labels[:5]:

    # How often each check rejected a candidate for this label.
    rejection_counts = dict.fromkeys(REJECTION_REASONS, 0)

    # Both values depend only on the label, so they are computed once per label
    # rather than on every retry.
    # NOTE(review): this keeps the aspect of every (aspect, polarity) tuple that
    # occurs exactly once in the label, so an aspect can appear twice (different
    # polarities) or be missing (repeated tuple) — confirm this is intended.
    unique_aspects = [aspect for aspect, polarity in label if label.count((aspect, polarity)) == 1]
    prompt_footer = f'\nLabel:{str(label)}\nPrediction:'

    valid_example = False
    n_retry = 0
    # Rejection sampling: query the model until one output passes every check.
    while not valid_example:
        # "fixed": reuse the examples chosen up front.
        # otherwise: draw 2 examples for each aspect of the label and fill up to 10 with other random examples.
        if FEW_SHOTS == "fixed":
            ids_examples_for_aspects = fixed_examples
            additional_examples = []
        else:
            ids_examples_for_aspects = get_examples_for_aspects_in_label(unique_aspects, dataset, random, 2)
            additional_examples = [entry for entry in dataset if entry['id'] not in ids_examples_for_aspects]
            additional_examples = random.sample(additional_examples, 10-len(ids_examples_for_aspects))

        examples = additional_examples + [entry for entry in dataset if entry['id'] in ids_examples_for_aspects]
        random.shuffle(examples)
        examples_text = get_examples_as_text(examples)

        prompt = PROMPT_TEMPLATE + examples_text + prompt_footer

        prediction = llm_model(prompt)

        rejection_reason, parsed_prediction = _validate_prediction(prediction, label)
        if rejection_reason is None:
            prediction_as_json = parsed_prediction
            valid_example = True
        else:
            rejection_counts[rejection_reason] += 1

        print(f"current index: {idx}, n_retry: {n_retry}, text: {prediction}")
        n_retry += 1

    # Store the generation metadata next to the accepted sample.
    prediction_as_json["llm_label"] = label
    prediction_as_json["llm_examples"] = examples
    for reason, count in rejection_counts.items():
        prediction_as_json[f"llm_{reason}"] = count

    synth_dataset.append(prediction_as_json)
    idx += 1

current index: 0, n_retry: 0, text: Viel zu teuer für die Mahlzeiten!<aspect-term aspect="GENERAL-IMPRESSION" polarity="NEGATIVE">Schönes Umfeld</aspect-term> und <aspect-term aspect="GENERAL-IMPRESSION" polarity="POSITIVE">Freundliches Personal</aspect-term>.
current index: 1, n_retry: 0, text: Nun ja, wer dafür bezahlt, kann auch so was haben.
current index: 2, n_retry: 0, text: <aspect-term aspect="FOOD" polarity="POSITIVE">Burger</aspect-term> gut, <aspect-term aspect="PRISE" polarity="NEGATIVE">Preis</aspect-term> teuer, <aspect-term aspect="FOOD" polarity="NEUTRAL">Sauce</aspect-term> auch nicht so fantastisch.
current index: 2, n_retry: 1, text: Mit <aspect-term aspect="FOOD" polarity="POSITIVE">Sepp</aspect-term> und dem <aspect-term aspect="PRICE" polarity="NEGATIVE">Preis</aspect-term> habe ich mich nicht beeindruckt, sondern eher enttäuscht gefühlt.
current index: 3, n_retry: 0, text: Das Essen war sehr sauber und frisch.
current index: 4, n_retry: 0, text: Kurze Wartezeit, 

In [22]:
# Create the output directory (synth/<model>/<few-shot condition>/) if it does not exist yet.
os.makedirs(os.path.dirname(SYNTH_PATH), exist_ok=True)

In [23]:
# Persist all accepted samples of this run as a single JSON array.
with open(SYNTH_PATH, mode="w") as synth_file:
    json.dump(synth_dataset, fp=synth_file)