# Notebook: Create Examples for Prompts

* TODO: Check how long the resulting prompt would be
* TODO: Adapt the classifier to the new dataset sizes
* TODO: Verify that new examples are actually drawn each time

## Packages

In [45]:
from helper_few_shot_generation import get_examples_for_fixed_label, get_random_examples
from itertools import cycle, islice
import numpy as np
import random
import json

## Parameters

In [32]:
# Seed Python's global `random` module, which is used for all shuffling below
# and is also passed to the helper functions
SEED = 43
random.seed(SEED)
# Number of alternative few-shot sets stored per label (used as fallbacks in case synthesis doesn't work)
N_RETRY_SETS = 25

In [33]:
# Setup Classes/Polarities for Synthesis
CLASSES  = ["GENERAL-IMPRESSION", "FOOD", "SERVICE", "AMBIENCE", "PRICE"]
POLARITIES = ["POSITIVE", "NEUTRAL", "NEGATIVE"]
# All (aspect, polarity) pairs; polarity is the outer loop, so the list is grouped by polarity
COMBINATIONS = [(aspect, polarity) for polarity in POLARITIES for aspect in CLASSES]

## Code

### Create Function to get Labels

In [34]:
def get_n_labels(n_labels):
    """Create `n_labels` labels, each a list of 1-3 (aspect, polarity) tuples.

    50% of the labels contain one tuple, 30% two tuples and 20% three tuples.
    The tuples are taken from a shuffled pool that cycles through
    COMBINATIONS, so every combination is used about equally often.

    Args:
        n_labels: Total number of labels to create. The 50/30/20 split must
            yield whole numbers that add up to `n_labels`.

    Returns:
        A shuffled list with `n_labels` entries; each entry is a list of
        (aspect, polarity) tuples.

    Raises:
        ValueError: If the truncated 50/30/20 counts do not sum to `n_labels`.
    """
    # Maps "tuples per label" -> "number of labels with that many tuples"
    labels_per_size = {1: int(0.5 * n_labels),
                       2: int(0.3 * n_labels),
                       3: int(0.2 * n_labels)}
    if n_labels != sum(labels_per_size.values()):
        raise ValueError("n_labels != sum(label_ratio.values())")

    # Total number of tuples needed to fill all labels
    n_tuples = sum(size * count for size, count in labels_per_size.items())

    tuple_pool = list(islice(cycle(COMBINATIONS), n_tuples))
    random.shuffle(tuple_pool)

    tuples_list = []
    idx_start = 0
    for size, count in labels_per_size.items():
        # Cut this size group's slice of the pool into consecutive chunks of `size`
        idx_end = idx_start + size * count
        for chunk_start in range(idx_start, idx_end, size):
            tuples_list.append(tuple_pool[chunk_start:chunk_start + size])
        idx_start = idx_end

    # Mix the size groups so label length does not depend on position
    random.shuffle(tuples_list)
    return tuples_list

### Code to Count Prompt Size

In [35]:
from helper import get_examples_as_text
import tiktoken

# Read the prompt template once. Decode explicitly as UTF-8 (like the dataset
# files) so the result does not depend on the platform's default encoding.
with open('../prompt_template.txt', 'r', encoding='utf-8') as file:
    PROMPT_TEMPLATE = file.read()

def num_tokens_from_string(string: str) -> int:
    """Return how many tokens `string` has under the gpt-3.5-turbo tokenizer."""
    tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
    return len(tokenizer.encode(string))

def check_prompt_size(dataset, examples, label):
    """Return the token count of the full prompt for one label.

    The prompt is PROMPT_TEMPLATE, followed by the text of all dataset
    entries whose 'id' is contained in `examples`, followed by a footer
    that carries the label.
    """
    # Entries are collected in dataset order, not in the order of `examples`
    selected_entries = []
    for entry in dataset:
        if entry['id'] in examples:
            selected_entries.append(entry)

    # Assemble the prompt: template, few-shot examples, label footer
    prompt_parts = [
        PROMPT_TEMPLATE,
        get_examples_as_text(selected_entries),
        f'\nLabel:{label!s}\nPrediction:',
    ]
    return num_tokens_from_string("".join(prompt_parts))

### Create Fixed Examples

For the condition in which the same 10 few-shot examples are always used, the examples are selected at random once per split and only reshuffled afterwards.

In [39]:
# Collects the token counts of the prompts measured for the fixed-examples condition
prompt_sizes = []

In [40]:
# NOTE(review): the random-examples section uses [500, 500, 1000] — confirm that 490 is intended here.
condition_size = [490, 500, 1000]
few_shot_examples_fixed = {}

for split_id in range(5):
    # Add index for split
    split_entry = few_shot_examples_fixed[f"split_{split_id}"] = {}

    # Load real examples
    with open(f"../07 train classifier/real/split_{split_id}.json", 'r', encoding='utf-8') as json_file:
        dataset = json.load(json_file)

    # Define fixed labels for split: one label per example to synthesize
    labels = []
    for size in condition_size:
        labels += get_n_labels(size)

    split_entry["labels_for_prediction"] = labels

    # Draw the fixed few-shot examples once per split
    # (presumably 2 per class, i.e. 10 in total — confirm in helper_few_shot_generation)
    fixed_examples = get_examples_for_fixed_label(CLASSES, dataset, random, 2)

    # For every label store N_RETRY_SETS reshuffled copies of the fixed
    # examples -> In case synthesis doesn't work
    few_shot_ids = split_entry["few_shot_ids"] = {}
    for i, label in enumerate(labels):
        few_shot_ids[i] = {}

        for k in range(N_RETRY_SETS):
            shuffled_fixed_examples = fixed_examples.copy()
            random.shuffle(shuffled_fixed_examples)
            few_shot_ids[i][k] = shuffled_fixed_examples

        # Calculate prompt size once per label. check_prompt_size selects
        # entries by id membership, so the shuffles do not change the prompt;
        # only the label does. Measuring here, rather than once after the
        # loops, makes the avg/max statistics cover every label.
        prompt_sizes.append(check_prompt_size(dataset, fixed_examples, label))

In [52]:
# Report the mean and the maximum of the measured prompt sizes (in tokens)
print(f"Average Fixed Examples Prompt Size | avg: {np.mean(prompt_sizes)!s} | {np.max(prompt_sizes)!s}")

Average Fixed Examples Prompt Size: 938.2


In [None]:
# Sanity check for split 0: number of few-shot id entries vs. number of labels
len(few_shot_examples_fixed["split_0"]["few_shot_ids"]), len(few_shot_examples_fixed["split_0"]["labels_for_prediction"])

(1, 1990)

In [None]:
# Persist the fixed-examples configuration. JSON turns the integer dict keys
# into strings and the label tuples into lists.
with open("few_shot_examples/few_shot_examples_fixed.json", 'w') as out_file:
    json.dump(few_shot_examples_fixed, out_file)

### Create Random Examples

In [53]:
# Reset the collector so the statistics below cover only the random-examples condition
prompt_sizes = []

In [54]:
condition_size = [500, 500, 1000]
few_shot_examples_random = {}

for split_id in range(5):
    # Register the split first, then fill its entry in place
    split_entry = few_shot_examples_random[f"split_{split_id}"] = {}

    # Load the real examples of this split
    with open(f"../07 train classifier/real/split_{split_id}.json", 'r', encoding='utf-8') as json_file:
        dataset = json.load(json_file)

    # Build the labels for this split: one label per example to synthesize
    labels = []
    for size in condition_size:
        labels.extend(get_n_labels(size))
    split_entry["labels_for_prediction"] = labels

    # For every label draw N_RETRY_SETS independent random example sets
    # -> In case synthesis doesn't work
    ids_per_label = split_entry["few_shot_ids"] = {}
    for i in range(sum(condition_size)):
        # Coarse progress output
        if i % 1000 == 0:
            print(split_id, i)
        retry_sets = ids_per_label[i] = {}
        for k in range(N_RETRY_SETS):
            random_examples = get_random_examples(
                10, labels[i], dataset, random, CLASSES)
            retry_sets[k] = random_examples
            # Record the prompt size of this example set / label combination
            prompt_sizes.append(
                check_prompt_size(dataset, random_examples, labels[i]))

0 0
0 100
0 200
0 300
0 400
0 500
0 600
0 700
0 800
0 900
0 1000
0 1100
0 1200
0 1300
0 1400
0 1500
0 1600
0 1700
0 1800
0 1900
1 0
1 100
1 200
1 300
1 400
1 500
1 600
1 700
1 800
1 900
1 1000
1 1100
1 1200
1 1300
1 1400
1 1500
1 1600
1 1700
1 1800
1 1900
2 0
2 100
2 200
2 300
2 400
2 500
2 600
2 700
2 800
2 900
2 1000
2 1100
2 1200
2 1300
2 1400
2 1500
2 1600
2 1700
2 1800
2 1900
3 0
3 100
3 200
3 300
3 400
3 500
3 600
3 700
3 800
3 900
3 1000
3 1100
3 1200
3 1300
3 1400
3 1500
3 1600
3 1700
3 1800
3 1900
4 0
4 100
4 200
4 300
4 400
4 500
4 600
4 700
4 800
4 900
4 1000
4 1100
4 1200
4 1300
4 1400
4 1500
4 1600
4 1700
4 1800
4 1900


In [55]:
# Report the mean and the maximum of the measured prompt sizes (in tokens).
# The message names the random-examples condition; it previously said "Fixed".
print("Average Random Examples Prompt Size | avg: " + str(np.mean(prompt_sizes)) + " | " + str(np.max(prompt_sizes)))

Average Fixed Examples Prompt Size | avg: 926.099176 | 1579


In [None]:
# Sanity check for split 0: number of few-shot id entries vs. number of labels
len(few_shot_examples_random["split_0"]["few_shot_ids"]), len(few_shot_examples_random["split_0"]["labels_for_prediction"])

(2000, 2000)

In [None]:
# Persist the random-examples configuration. JSON turns the integer dict keys
# into strings and the label tuples into lists.
with open("few_shot_examples/few_shot_examples_random.json", 'w') as out_file:
    json.dump(few_shot_examples_random, out_file)

In [None]:
# Inspect the first label of split 0 (a list of (aspect, polarity) tuples)
few_shot_examples_random["split_0"]["labels_for_prediction"][0]

[('GENERAL-IMPRESSION', 'NEGATIVE'), ('GENERAL-IMPRESSION', 'NEGATIVE')]