# Notebook: Format Label Studio Output of Annotated Synthetic Examples

## Packages

In [111]:
import json
import sys
import os
sys.path.append(os.path.abspath('../02 dataset split/'))
from format_labelstudio_json import format_json

## Code

### Load Examples

In [112]:
# Raw Label Studio export for the Llama70B / random few-shot examples.
# The file name suggests 800 annotated entries; the "_600" in the variable name
# matches the 5 x 120 entries kept by the slicing in the following cell.
# The file is decoded explicitly as UTF-8 (the annotated texts are German) so the
# result does not depend on the platform's default encoding; the final dataset
# is written as UTF-8 as well.
with open("annotation_datasets/synth_annotation_labelstudio_output/annotation_Llama70B_random_800.json", 'r', encoding='utf-8') as json_file:
    synthetic_data_llama_random_600 = format_json(json.load(json_file))

In [113]:
# Keep the first 120 entries of every consecutive 160-entry chunk
# (chunk offsets 0, 160, 320, 480 and 640), i.e. at most 5 x 120 = 600 entries.
# NOTE: this overwrites its own input, so re-running the cell without reloading
# the file would slice the already reduced data again.
selected_entries = synthetic_data_llama_random_600[0:0]  # empty slice of the same sequence type
for chunk_start in range(0, 800, 160):
    chunk_end = chunk_start + 120
    selected_entries = selected_entries + synthetic_data_llama_random_600[chunk_start:chunk_end]
synthetic_data_llama_random_600 = selected_entries

In [114]:
# Label Studio export for the Llama70B / fixed few-shot examples.
# Decoded explicitly as UTF-8 so that reading does not depend on the platform's
# default encoding (the final dataset is written as UTF-8 as well).
with open("annotation_datasets/synth_annotation_labelstudio_output/annotation_Llama70B_fixed_600.json", 'r', encoding='utf-8') as json_file:
    synthetic_data_llama_fixed_600 = format_json(json.load(json_file))

In [115]:
# Label Studio export for the GPT-3 / random few-shot examples.
# Decoded explicitly as UTF-8 so that reading does not depend on the platform's
# default encoding (the final dataset is written as UTF-8 as well).
with open("annotation_datasets/synth_annotation_labelstudio_output/annotation_GPT-3_random_600.json", 'r', encoding='utf-8') as json_file:
    synthetic_data_GPT_3_random_600 = format_json(json.load(json_file))

In [116]:
# Label Studio export for the GPT-3 / fixed few-shot examples.
# Decoded explicitly as UTF-8 so that reading does not depend on the platform's
# default encoding (the final dataset is written as UTF-8 as well).
with open("annotation_datasets/synth_annotation_labelstudio_output/annotation_GPT-3_fixed_600.json", 'r', encoding='utf-8') as json_file:
    synthetic_data_GPT_3_fixed_600 = format_json(json.load(json_file))

In [117]:
# Merge the four model / few-shot-condition subsets into one dataset, in the
# order: Llama random, Llama fixed, GPT-3 random, GPT-3 fixed.
annotated_dataset = synthetic_data_llama_random_600 + synthetic_data_llama_fixed_600
annotated_dataset = annotated_dataset + synthetic_data_GPT_3_random_600
annotated_dataset = annotated_dataset + synthetic_data_GPT_3_fixed_600

# Total number of merged examples.
len(annotated_dataset)

2400

In [118]:
# Display the first merged entry to check the structure of a formatted record.
annotated_dataset[0]

{'tags': [{'end': 0,
   'start': 0,
   'tag_with_polarity': 'GENERAL-IMPRESSION-POSITIVE',
   'tag_with_polarity_and_type': 'GENERAL-IMPRESSION-POSITIVE-no-phrase-implicit',
   'text': 'NULL',
   'type': 'label-implicit',
   'label': 'GENERAL-IMPRESSION',
   'polarity': 'POSITIVE'}],
 'text': 'Wir waren sehr zufrieden.',
 'two_or_more_sentences': False,
 'aspect_available_without_judgement': False,
 'id': 'a49a6f01-1ecc-4da0-b76b-f283f518fc60',
 'model': 'Llama70B',
 'split': 0,
 'few_shot_condtion': 'random'}

### Count and Exclude Examples without Annotated Aspects

In [119]:
# Report, per model and few-shot condition, how many examples have no tags.
for llm in ["Llama70B", "GPT-3"]:
    for fs in ["random", "fixed"]:
        # All examples of this group. Its size is used as the denominator, so the
        # report stays correct even if a group does not hold exactly 600 entries.
        # "few_shot_condtion" (sic) is the key as it appears in the formatted entries.
        samples = [entry for entry in annotated_dataset
                   if entry["model"] == llm and entry["few_shot_condtion"] == fs]

        # An empty tag list means no aspect was annotated for the example.
        entries_with_empty_tags = [entry for entry in samples if not entry['tags']]
        print(llm, fs, len(entries_with_empty_tags), "von", len(samples))

Llama70B random 41 von 600
Llama70B fixed 38 von 600
GPT-3 random 0 von 600
GPT-3 fixed 1 von 600


In [120]:
# Keep only the examples whose tag list is non-empty.
annotated_dataset = list(filter(lambda entry: entry['tags'], annotated_dataset))
len(annotated_dataset)

2320

### Count and Remove Samples with a CONFLICT Polarity

In [121]:
for llm in ["Llama70B", "GPT-3"]:
    for fs in ["random", "fixed"]:
        # Collect the examples of this group that have at least one tag whose
        # polarity is 'CONFLICT'.
        samples = []
        for entry in annotated_dataset:
            has_conflict = False
            for tag in entry['tags']:
                if tag.get('polarity') == 'CONFLICT':
                    has_conflict = True
                    break
            if has_conflict and entry["model"] == llm and entry["few_shot_condtion"] == fs:
                samples.append(entry)

        print(llm, fs, len(samples), "von", 600)

Llama70B random 11 von 600
Llama70B fixed 8 von 600
GPT-3 random 4 von 600
GPT-3 fixed 4 von 600


In [122]:
# Keep only examples in which every tag has a polarity other than 'CONFLICT'.
annotated_dataset = [
    entry
    for entry in annotated_dataset
    if all(tag.get('polarity') != 'CONFLICT' for tag in entry['tags'])
]
len(annotated_dataset)

2293

In [123]:
# Remaining examples per model / few-shot condition after both filters.
# The 600 is the hard-coded nominal group size (4 x 600 = 2400 merged entries).
for llm, fs in [("Llama70B", "random"), ("Llama70B", "fixed"),
                ("GPT-3", "random"), ("GPT-3", "fixed")]:
    samples = [
        entry
        for entry in annotated_dataset
        if entry["model"] == llm and entry["few_shot_condtion"] == fs
    ]
    print(llm, fs, len(samples), "of", 600)



Llama70B random 548 of 600
Llama70B fixed 554 of 600
GPT-3 random 596 of 600
GPT-3 fixed 595 of 600


### Save Examples

In [124]:
# Write the filtered dataset to disk as UTF-8 JSON; ensure_ascii=False stores
# non-ASCII characters as-is instead of \uXXXX escapes.
output_path = "annotation_datasets/annotated_synth_dataset.json"
with open(output_path, 'w', encoding='utf-8') as file:
    json.dump(annotated_dataset, file, ensure_ascii=False)