# Notebook: Create Examples for Prompts

## Packages

In [1]:
from helper_few_shot_generation import get_examples_for_classes, get_label_ratio_fixed, get_label_ratio_random
from helper_synthesis import get_examples_as_text
from itertools import cycle, islice
import numpy as np
import tiktoken
import random
import json

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nils_hellwig/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Parameters

In [2]:
# Number of few-shot example sets stored per label index.
N_RETRY_SETS = 25

# Seed Python's global RNG so the random.shuffle calls in this notebook are repeatable.
SEED = 43
random.seed(SEED)

In [3]:
# Aspect categories and sentiment polarities used for synthesis.
CLASSES = [
    "GENERAL-IMPRESSION",
    "FOOD",
    "SERVICE",
    "AMBIENCE",
    "PRICE",
]
POLARITIES = ["POSITIVE", "NEUTRAL", "NEGATIVE"]

# Every (aspect, polarity) pair. The polarity varies slowest, so all aspects
# are listed for one polarity before moving on to the next polarity.
COMBINATIONS = [
    (category, sentiment)
    for sentiment in POLARITIES
    for category in CLASSES
]

## Code

### Create Function to get Labels

In [4]:
def get_n_labels(n_labels, label_ratio, combinations=None):
    """Build a randomly ordered list of labels made of (aspect, polarity) tuples.

    The required number of tuples is taken by cycling through ``combinations``,
    shuffled, and then cut into groups whose sizes follow ``label_ratio``.

    Args:
        n_labels: Total number of labels to create. Must equal the sum of the
            values of ``label_ratio``.
        label_ratio: Mapping from the number of tuples per label (anything
            accepted by ``int()``, e.g. ``"2"``) to the number of labels of
            that size.
        combinations: Sequence of tuples to draw from. Defaults to the
            module-level ``COMBINATIONS``.

    Returns:
        A shuffled list of labels; each label is a list containing as many
        tuples as its size key in ``label_ratio`` specifies.

    Raises:
        ValueError: If ``n_labels`` does not match ``label_ratio``, if a size
            key is not positive, or if ``combinations`` is empty although
            tuples are required.
    """
    if combinations is None:
        combinations = COMBINATIONS

    # Convert keys and counts to ints once so slicing and offsets agree.
    sizes_and_counts = [(int(key), int(value)) for key, value in label_ratio.items()]

    total_labels = sum(count for _, count in sizes_and_counts)
    if n_labels != total_labels:
        raise ValueError(
            f"n_labels != sum(label_ratio.values()) ({n_labels} != {total_labels})")

    for size, _ in sizes_and_counts:
        if size <= 0:
            raise ValueError(f"label_ratio keys must be positive integers, got {size}")

    n_tuples = sum(size * count for size, count in sizes_and_counts)
    if n_tuples > 0 and len(combinations) == 0:
        raise ValueError("combinations must not be empty when tuples are required")

    # Repeat the combinations as often as needed, then randomise their order.
    aspect_polarity_tuples_list = list(islice(cycle(combinations), n_tuples))
    random.shuffle(aspect_polarity_tuples_list)

    tuples_list = []
    idx_start = 0
    for size, count in sizes_and_counts:
        idx_end = idx_start + size * count
        # Cut this size's share of the pool into consecutive groups of `size`.
        tuples_list.extend(
            aspect_polarity_tuples_list[start:start + size]
            for start in range(idx_start, idx_end, size)
        )
        idx_start = idx_end

    # Mix labels of different sizes so they are not grouped by size.
    random.shuffle(tuples_list)
    return tuples_list

### Code to Count Prompt Size

In [5]:
# Load the prompt template that is placed in front of the few-shot examples.
# The encoding is stated explicitly (as for the dataset files) so the text read
# does not depend on the platform's default encoding.
with open('../prompt_template.txt', 'r', encoding='utf-8') as file:
    PROMPT_TEMPLATE = file.read()

def num_tokens_from_string(string: str) -> int:
    """Return the number of tokens `string` has under the gpt-3.5-turbo tokenizer."""
    tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
    return len(tokenizer.encode(string))

def check_prompt_size(dataset, examples, label):
    """Return the token count of the prompt built from the given few-shot examples.

    Args:
        dataset: Iterable of entries, each providing an 'id' key.
        examples: Ids of the dataset entries to use as few-shot examples.
        label: Label inserted into the prompt footer via ``str()``.

    Returns:
        Number of tokens of template + examples text + footer.
    """
    # A set makes each membership test constant-time; this function is called
    # once per generated example set. Assumes the ids are hashable.
    example_ids = set(examples)

    # Entries are collected in dataset order, not in the order of `examples`.
    few_shot_examples = [entry for entry in dataset if entry['id'] in example_ids]

    # Build Prompt
    examples_text = get_examples_as_text(few_shot_examples)
    prompt_footer = f'\nLabel:{label!s}\nPrediction:'
    prompt = PROMPT_TEMPLATE + examples_text + prompt_footer
    return num_tokens_from_string(prompt)

### Create Fixed Examples

For the condition that always uses the same 25 few-shot examples, these examples are selected at random once per split.

In [6]:
# Collects the token count of every prompt measured in the next cell.
prompt_sizes = list()

In [7]:
# Number of labels generated per condition; their sum is the number of labels
# (and few-shot id entries) stored for each split.
condition_size = [490, 500, 1000]
few_shot_examples_fixed = {}

for split_id in range(5):
    # Create the container for this split
    split_key = f"split_{split_id}"
    few_shot_examples_fixed[split_key] = {}

    # Read the real examples of this split
    real_path = f"../07 train classifier/real/split_{split_id}.json"
    with open(real_path, 'r', encoding='utf-8') as json_file:
        dataset = json.load(json_file)

    # Build the labels to predict, one batch per condition size
    labels = []
    for size in condition_size:
        labels.extend(get_n_labels(size, get_label_ratio_fixed(size)))

    few_shot_examples_fixed[split_key]["labels_for_prediction"] = labels

    # Get 25 random examples
    fixed_examples = get_examples_for_classes(CLASSES, dataset, random, 5)

    # Save the dataset entries that belong to those examples
    with open(f"../07 train classifier/real_fixed/split_{split_id}.json", 'w') as json_file:
        json.dump(
            [example for example in dataset if example["id"] in fixed_examples],
            json_file,
        )

    # For every label store N_RETRY_SETS differently ordered copies of the fixed examples
    few_shot_ids = {}
    few_shot_examples_fixed[split_key]["few_shot_ids"] = few_shot_ids
    for label_idx in range(sum(condition_size)):
        retry_sets = {}
        few_shot_ids[label_idx] = retry_sets

        for retry_idx in range(N_RETRY_SETS):
            shuffled_fixed_examples = fixed_examples.copy()
            random.shuffle(shuffled_fixed_examples)
            retry_sets[retry_idx] = shuffled_fixed_examples

            # Record the token count of the resulting prompt
            size_in_tokens = check_prompt_size(
                dataset, shuffled_fixed_examples, labels[label_idx])
            prompt_sizes.append(size_in_tokens)

In [8]:
# Report the mean and the maximum token count of all fixed-example prompts.
print(f"Average Fixed Examples Prompt Size | avg: {np.mean(prompt_sizes)!s} | {np.max(prompt_sizes)!s}")

Average Fixed Examples Prompt Size | avg: 1765.3859296482412 | 1956


In [9]:
# Sanity check for split 0: number of few-shot id entries and number of labels.
tuple(
    len(few_shot_examples_fixed["split_0"][field])
    for field in ("few_shot_ids", "labels_for_prediction")
)

(1990, 1990)

In [10]:
# Store the fixed few-shot configuration of all splits in a single JSON file.
with open("few_shot_examples/few_shot_examples_fixed.json", mode='w') as out_file:
    json.dump(few_shot_examples_fixed, out_file)

### Create Random Examples

In [11]:
# Start a fresh list of token counts for the random-example prompts.
prompt_sizes = list()

In [12]:
# Number of labels generated per condition; their sum is the number of labels
# (and few-shot id entries) stored for each split.
condition_size = [500, 500, 500]
few_shot_examples_random = {}

for split_id in range(5):
    # Create the container for this split
    split_key = f"split_{split_id}"
    few_shot_examples_random[split_key] = {}

    # Read the real examples of this split
    real_path = f"../07 train classifier/real/split_{split_id}.json"
    with open(real_path, 'r', encoding='utf-8') as json_file:
        dataset = json.load(json_file)

    # Build the labels to predict, one batch per condition size
    labels = []
    for size in condition_size:
        labels.extend(get_n_labels(size, get_label_ratio_random(size, dataset)))
    few_shot_examples_random[split_key]["labels_for_prediction"] = labels

    # For every label draw N_RETRY_SETS example sets -> In case synthesis doesn't work
    few_shot_ids = {}
    few_shot_examples_random[split_key]["few_shot_ids"] = few_shot_ids
    for label_idx in range(sum(condition_size)):
        # Progress output every 500 labels
        if label_idx % 500 == 0:
            print(split_id, label_idx)

        retry_sets = {}
        few_shot_ids[label_idx] = retry_sets
        for retry_idx in range(N_RETRY_SETS):
            random_examples = get_examples_for_classes(CLASSES, dataset, random, 5)
            retry_sets[retry_idx] = random_examples

            # Record the token count of the resulting prompt
            size_in_tokens = check_prompt_size(
                dataset, random_examples, labels[label_idx])
            prompt_sizes.append(size_in_tokens)

0 0
0 500
0 1000
1 0
1 500
1 1000
2 0
2 500
2 1000
3 0
3 500
3 1000
4 0
4 500
4 1000


In [13]:
# Report the mean and the maximum token count of all random-example prompts.
# The label names the random condition, since prompt_sizes was reset and refilled
# by the random-example loop before this cell.
print("Average Random Examples Prompt Size | avg: " + str(np.mean(prompt_sizes)) + " | " + str(np.max(prompt_sizes)))

Average Fixed Examples Prompt Size | avg: 1808.934544 | 2709


In [14]:
# Sanity check for split 0: number of few-shot id entries and number of labels.
tuple(
    len(few_shot_examples_random["split_0"][field])
    for field in ("few_shot_ids", "labels_for_prediction")
)

(1500, 1500)

In [15]:
# Store the random few-shot configuration of all splits in a single JSON file.
with open("few_shot_examples/few_shot_examples_random.json", mode='w') as out_file:
    json.dump(few_shot_examples_random, out_file)

In [16]:
# Inspect the first label generated for split 0.
few_shot_examples_random["split_0"]["labels_for_prediction"][0]

[('GENERAL-IMPRESSION', 'POSITIVE')]