# Notebook: Format Label Studio Output of Annotated Synthetic Examples

## Packages

In [94]:
import json
import sys
import os
sys.path.append(os.path.abspath('../02 dataset split/'))
from format_labelstudio_json import format_json
import random
import pandas as pd
import numpy as np

In [95]:
# Seed Python's global RNG so the random.shuffle-based sampling further down is reproducible.
random.seed(a=43)

## Code

### Load Examples

In [96]:
# Load the Label Studio export for the Llama3 70B / random few-shot condition and
# pass it through the project helper `format_json`.
# The encoding is given explicitly: JSON text is UTF-8, whereas open() otherwise
# falls back to the platform's default encoding.
with open("annotation_datasets/synth_annotation_labelstudio_output/annotation_Llama3_70B_random_600.json", 'r', encoding='utf-8') as json_file:
    synthetic_data_llama_random_600 = format_json(json.load(json_file))

In [97]:
# Load the Label Studio export for the Llama3 70B / fixed few-shot condition and
# pass it through the project helper `format_json`.
# The encoding is given explicitly: JSON text is UTF-8, whereas open() otherwise
# falls back to the platform's default encoding.
with open("annotation_datasets/synth_annotation_labelstudio_output/annotation_Llama3_70B_fixed_600.json", 'r', encoding='utf-8') as json_file:
    synthetic_data_llama_fixed_600 = format_json(json.load(json_file))

In [98]:
# Load the Label Studio export for the GPT-3 / random few-shot condition and
# pass it through the project helper `format_json`.
# The encoding is given explicitly: JSON text is UTF-8, whereas open() otherwise
# falls back to the platform's default encoding.
with open("annotation_datasets/synth_annotation_labelstudio_output/annotation_GPT-3_random_600.json", 'r', encoding='utf-8') as json_file:
    synthetic_data_GPT_3_random_600 = format_json(json.load(json_file))

In [99]:
# Load the Label Studio export for the GPT-3 / fixed few-shot condition and
# pass it through the project helper `format_json`.
# The encoding is given explicitly: JSON text is UTF-8, whereas open() otherwise
# falls back to the platform's default encoding.
with open("annotation_datasets/synth_annotation_labelstudio_output/annotation_GPT-3_fixed_600.json", 'r', encoding='utf-8') as json_file:
    synthetic_data_GPT_3_fixed_600 = format_json(json.load(json_file))

In [100]:
# Load the additional fold-5 annotations exported from Label Studio and
# pass them through the project helper `format_json`.
# The encoding is given explicitly: JSON text is UTF-8, whereas open() otherwise
# falls back to the platform's default encoding.
with open("annotation_datasets/synth_annotation_labelstudio_output/additional_annotations_for_fold_5.json", 'r', encoding='utf-8') as json_file:
    synthetic_data_additional_annotations_fold_5 = format_json(json.load(json_file))

In [101]:
# Keep only the additional fold-5 examples whose "model" field is "GPT-3".
synthetic_data_additional_annotations_fold_5 = list(
    filter(
        lambda example: example["model"] == "GPT-3",
        synthetic_data_additional_annotations_fold_5,
    )
)

In [102]:
# Merge all five annotated sources into one flat list, in this order:
# Llama random, Llama fixed, GPT-3 random, GPT-3 fixed, fold-5 extras.
annotated_dataset = [
    *synthetic_data_llama_random_600,
    *synthetic_data_llama_fixed_600,
    *synthetic_data_GPT_3_random_600,
    *synthetic_data_GPT_3_fixed_600,
    *synthetic_data_additional_annotations_fold_5,
]

len(annotated_dataset)

2640

In [103]:
# Load the two second-annotator confirmation sheets.
confirmations = pd.read_csv('annotation_datasets/annotation_jakob_0_confirmation.csv')
confirmations_llama = pd.read_csv('annotation_datasets/annotation_jakob_llama3_confirmation.csv')

# Stack the rows of both sheets. `+` on two DataFrames is index-aligned,
# element-wise addition (cell values are summed / string-joined and become NaN
# wherever either side is missing); it does not append rows, so pd.concat is
# required to obtain one table containing every confirmation.
confirmations = pd.concat([confirmations, confirmations_llama], ignore_index=True)

# Map example id -> second annotator's comment (NaN for cells pandas read as empty).
annotation_dict = dict(zip(confirmations['id'], confirmations['second_annotator_comment']))

In [104]:
def is_nan(value):
    """Return whether ``value`` is NaN according to ``np.isnan``.

    Inputs for which ``np.isnan`` raises ``TypeError`` (for example strings
    or ``None``) are reported as not-NaN by returning ``False``.
    """
    try:
        result = np.isnan(value)
    except TypeError:
        # np.isnan does not support this input type, so it cannot be NaN.
        result = False
    return result

In [105]:
def check_ids_in_annotation_dict(filtered_dataset, annotation_dict):
    """Check that every annotated id is present in ``filtered_dataset``.

    Args:
        filtered_dataset: Iterable of example dicts, each with an ``"id"`` key.
        annotation_dict: Mapping of example id -> annotation value. Entries
            whose value is NaN (as judged by ``is_nan``) are ignored.

    Returns:
        ``True`` if every id with a non-NaN annotation occurs among the
        dataset ids (vacuously ``True`` when there is no such id),
        ``False`` as soon as one is missing.
    """
    # Set membership is O(1) per lookup instead of scanning a list for every id.
    dataset_ids = {example["id"] for example in filtered_dataset}

    return all(
        example_id in dataset_ids
        for example_id, annotation in annotation_dict.items()
        if not is_nan(annotation)
    )


In [106]:
def filter_dataset(annotated_dataset, examples_per_combination=100, required_annotations=None):
    """Randomly down-sample the dataset to a fixed quota per condition.

    Examples are grouped by ``(model, split, few_shot_condtion)``. The data is
    shuffled with the global ``random`` module and the first
    ``examples_per_combination`` examples of each group are kept. The draw is
    repeated until the sample contains every id that has a non-NaN entry in
    ``required_annotations`` (checked via ``check_ids_in_annotation_dict``).

    Args:
        annotated_dataset: List of example dicts with the keys ``'model'``,
            ``'split'``, ``'few_shot_condtion'`` and ``'id'``. It is not
            modified; shuffling happens on an internal copy.
        examples_per_combination: Maximum number of examples kept per group.
        required_annotations: Mapping of example id -> annotation value.
            Defaults to the notebook-level ``annotation_dict``.

    Returns:
        A new list with the sampled examples, in shuffled order.

    Raises:
        ValueError: If an id with a non-NaN annotation does not occur in
            ``annotated_dataset`` at all, because then no sample could ever
            satisfy the requirement and the retry loop would never terminate.
    """
    if required_annotations is None:
        # Fall back to the mapping defined at notebook level.
        required_annotations = annotation_dict

    # Work on a private copy so the caller's list keeps its order. The same
    # copy is re-shuffled on every retry, which consumes the RNG exactly as
    # shuffling the input in place would.
    pool = list(annotated_dataset)

    # Fail fast instead of looping forever on an unsatisfiable requirement.
    if not check_ids_in_annotation_dict(pool, required_annotations):
        raise ValueError(
            "At least one id with a non-NaN annotation is missing from the "
            "dataset; no sample can contain all required ids."
        )

    while True:
        random.shuffle(pool)

        filtered_dataset = []
        combinations_count = {}
        for example in pool:
            key = (example['model'], example['split'], example['few_shot_condtion'])
            taken = combinations_count.get(key, 0)
            if taken < examples_per_combination:
                filtered_dataset.append(example)
                combinations_count[key] = taken + 1

        # Accept the draw only if it retains every required id.
        if check_ids_in_annotation_dict(filtered_dataset, required_annotations):
            return filtered_dataset


# Down-sample to the per-condition quota, then display the resulting size.
annotated_dataset = filter_dataset(annotated_dataset=annotated_dataset)
len(annotated_dataset)

2400

In [107]:
# Number of distinct example ids; equals the dataset size when no id occurs twice.
len({example["id"] for example in annotated_dataset})


2400

### Count Examples without Annotated Aspects

In [108]:
# Per model and few-shot condition: how many examples have an empty 'tags' field.
for llm in ("Llama3_70B", "GPT-3"):
    for fs in ("fixed", "random"):
        samples = [
            entry
            for entry in annotated_dataset
            if entry["model"] == llm and entry["few_shot_condtion"] == fs
        ]
        # A falsy 'tags' value means nothing was annotated for this example.
        entries_with_empty_tags = [entry for entry in samples if not entry['tags']]
        print(llm, fs, len(entries_with_empty_tags), "von 600")

Llama3_70B fixed 10 von 600
Llama3_70B random 15 von 600
GPT-3 fixed 1 von 600
GPT-3 random 0 von 600


In [109]:
# annotated_dataset = [entry for entry in annotated_dataset if entry['tags']]
# len(annotated_dataset)

### Samples with Conflict

In [110]:
# Per model and few-shot condition: how many examples carry at least one tag
# whose 'polarity' is 'CONFLICT'.
for llm in ("Llama3_70B", "GPT-3"):
    for fs in ("fixed", "random"):
        samples = []
        for entry in annotated_dataset:
            has_conflict = any(tag.get('polarity') == 'CONFLICT' for tag in entry['tags'])
            if has_conflict and entry["model"] == llm and entry["few_shot_condtion"] == fs:
                samples.append(entry)

        print(llm, fs, len(samples), "von", 600)

Llama3_70B fixed 8 von 600
Llama3_70B random 7 von 600
GPT-3 fixed 5 von 600
GPT-3 random 4 von 600


In [111]:
# annotated_dataset = [entry for entry in annotated_dataset if not any(tag.get('polarity') == 'CONFLICT' for tag in entry['tags'])]
# len(annotated_dataset)

In [112]:
# Per model and few-shot condition: number of examples in the dataset.
for llm in ("Llama3_70B", "GPT-3"):
    for fs in ("fixed", "random"):
        samples = list(
            filter(
                lambda entry: entry["model"] == llm and entry["few_shot_condtion"] == fs,
                annotated_dataset,
            )
        )
        print(llm, fs, len(samples), "of", 600)



Llama3_70B fixed 600 of 600
Llama3_70B random 600 of 600
GPT-3 fixed 600 of 600
GPT-3 random 600 of 600


### Save Examples

In [113]:
# Write the final dataset as UTF-8 JSON; ensure_ascii=False keeps non-ASCII
# characters readable instead of escaping them as \uXXXX.
with open("annotation_datasets/annotated_synth_dataset.json", mode='w', encoding='utf-8') as out_file:
    json.dump(annotated_dataset, out_file, ensure_ascii=False)