# Notebook: Format Label Studio Output of Annotated Synthetic Examples

## Packages

In [58]:
import json
import sys
import os
sys.path.append(os.path.abspath('../02 dataset split/'))
from format_labelstudio_json import format_json

## Code

### Load Examples

In [59]:
with open(f"annotation_datasets/synth_annotation_labelstudio_output/annotation_Llama70B_random_800.json", 'r') as json_file:
    synthetic_data_llama_random_600 = format_json(json.load(json_file))    

In [60]:
synthetic_data_llama_random_600 = synthetic_data_llama_random_600[0:120] + synthetic_data_llama_random_600[160:280] + \
    synthetic_data_llama_random_600[320:440] + \
    synthetic_data_llama_random_600[480:600] + synthetic_data_llama_random_600[640:760]

In [61]:
with open(f"annotation_datasets/synth_annotation_labelstudio_output/annotation_Llama70B_fixed_600.json", 'r') as json_file:
    synthetic_data_llama_fixed_600 = format_json(json.load(json_file))     

In [62]:
with open(f"annotation_datasets/synth_annotation_labelstudio_output/annotation_GPT-3_random_600.json", 'r') as json_file:
    synthetic_data_GPT_3_random_600 = format_json(json.load(json_file))     

In [63]:
with open(f"annotation_datasets/synth_annotation_labelstudio_output/annotation_GPT-3_fixed_600.json", 'r') as json_file:
    synthetic_data_GPT_3_fixed_600 = format_json(json.load(json_file))     

In [64]:
with open(f"annotation_datasets/synth_annotation_labelstudio_output/additional_annotations_for_fold_5.json", 'r') as json_file:
    synthetic_data_additional_annotations_fold_5 = format_json(json.load(json_file))   

In [65]:
annotated_dataset = synthetic_data_llama_random_600 + synthetic_data_llama_fixed_600 + \
    synthetic_data_GPT_3_random_600 + synthetic_data_GPT_3_fixed_600 + \
    synthetic_data_additional_annotations_fold_5

len(annotated_dataset)

2880

In [66]:
def filter_dataset(annotated_dataset):
    filtered_dataset = []
    examples_per_combination = 100
    combinations_count = {}

    for example in annotated_dataset:
        model = example['model']
        split = example['split']
        few_shot_condition = example['few_shot_condtion']

        key = (model, split, few_shot_condition)

        if key not in combinations_count:
            combinations_count[key] = 0

        if combinations_count[key] < examples_per_combination:
            filtered_dataset.append(example)
            combinations_count[key] += 1

    return filtered_dataset

annotated_dataset = filter_dataset(annotated_dataset)
len(annotated_dataset)

2400

In [67]:
len(list(set([ex["id"] for ex in annotated_dataset])))


2400

### Exclude / Count Examples without annotated Aspects

In [68]:
for llm in ["Llama70B", "GPT-3"]:
    for fs in ["fixed", "random"]:
        samples = [entry for entry in annotated_dataset if entry["model"] == llm and entry["few_shot_condtion"] == fs]
        samples_filtered = [entry for entry in samples if not entry['tags']]

        entries_with_empty_tags = [entry for entry in samples_filtered if not entry['tags']]
        print(llm, fs, len(entries_with_empty_tags), "von 720")

Llama70B fixed 36 von 720
Llama70B random 37 von 720
GPT-3 fixed 1 von 720
GPT-3 random 0 von 720


In [69]:
annotated_dataset = [entry for entry in annotated_dataset if entry['tags']]
len(annotated_dataset)

2326

### Remove Samples with Conflict

In [70]:
for llm in ["Llama70B", "GPT-3"]:
    for fs in ["fixed", "random"]:
        samples = [entry for entry in annotated_dataset if any(tag.get('polarity') == 'CONFLICT' for tag in entry['tags']) and entry["model"] == llm and entry["few_shot_condtion"] == fs]

        print(llm, fs, len(samples), "von", 720)

Llama70B fixed 7 von 720
Llama70B random 10 von 720
GPT-3 fixed 4 von 720
GPT-3 random 3 von 720


In [71]:
annotated_dataset = [entry for entry in annotated_dataset if not any(tag.get('polarity') == 'CONFLICT' for tag in entry['tags'])]
len(annotated_dataset)

2302

In [72]:
for llm in ["Llama70B", "GPT-3"]:
    for fs in ["fixed", "random"]:
        samples = [entry for entry in annotated_dataset if entry["model"] == llm and entry["few_shot_condtion"] == fs]
        print(llm, fs, len(samples), "of", 720)



Llama70B fixed 557 of 720
Llama70B random 553 of 720
GPT-3 fixed 595 of 720
GPT-3 random 597 of 720


### Save Examples

In [73]:
with open(f"annotation_datasets/annotated_synth_dataset.json", 'w', encoding='utf-8') as file:
    json.dump(annotated_dataset, file, ensure_ascii=False)