# Notebook: Convert Annotations from Label Studio to .json

## Packages

In [28]:
import json

## Parameters

In [29]:
ANNOTATIONS_PATH = "dataset_total/raw_annotations.json"

## Code

### Convert to json

In [30]:
# Read the raw annotation export from disk into `data`.
with open(ANNOTATIONS_PATH, encoding='utf-8') as annotations_file:
    data = json.load(annotations_file)

In [31]:
def format_json(data):
    """Flatten an annotation export into one dict per annotated sample.

    Args:
        data: Iterable of task dicts. Every task must provide
            ``"annotations"`` (a list of dicts, each with a ``"result"``
            list) and ``"data"`` (a dict containing at least ``"text"``).

    Returns:
        A list with one dict per task. Each dict holds ``"tags"`` (the list
        of label spans), ``"text"``, the boolean flags
        ``"aspect_available_without_judgement"`` and
        ``"two_or_more_sentences"``, and every field of the task's ``data``
        dict whose key does not start with ``"Unnamed"``.

    Raises:
        KeyError: If a task or a label result lacks a required key.

    The input is not modified: ``Unnamed*`` fields are skipped while
    copying instead of being removed from the task's ``data`` dict.
    """
    formatted_data = []

    for raw_entry in data:
        tags = []
        aspect_available_without_judgement = False
        two_or_more_sentences = False

        for annotation in raw_entry["annotations"]:
            for result in annotation["result"]:
                value = result["value"]
                from_name = result.get("from_name", "")

                if "labels" in value:
                    label_name = value["labels"][0]
                    tag = {
                        "end": value["end"],
                        "start": value["start"],
                        "tag_with_polarity": label_name,
                        # Built from the unmodified label name plus
                        # from_name without its first five characters.
                        "tag_with_polarity_and_type": label_name + from_name[5:],
                        "text": value["text"],
                        "type": from_name,
                    }

                    if from_name == "label-implicit":
                        # Implicit aspects get no span: offsets are zeroed and
                        # the text is replaced by the "NULL" placeholder.
                        tag["end"] = 0
                        tag["start"] = 0
                        tag["text"] = "NULL"
                        # Drops the last 10 characters of the label name
                        # (presumably an implicit-marker suffix; confirm
                        # against the labeling config).
                        tag["tag_with_polarity"] = label_name[:-10]

                    # Split "<label>-<polarity>" at the last hyphen; a name
                    # without a hyphen yields an empty polarity.
                    tag_parts = tag["tag_with_polarity"].rsplit("-", 1)
                    if len(tag_parts) == 2:
                        tag["label"], tag["polarity"] = tag_parts
                    else:
                        tag["label"] = tag_parts[0]
                        tag["polarity"] = ""

                    tags.append(tag)

                if "choices" in value:
                    # The presence of a choice result sets the flag; the
                    # selected choice values themselves are not inspected.
                    if from_name == "aspect_available_without_judgement":
                        aspect_available_without_judgement = True
                    if from_name == "two_or_more_sentences":
                        two_or_more_sentences = True

        source_fields = raw_entry["data"]
        entry = {
            "tags": tags,
            "text": source_fields["text"],
            "aspect_available_without_judgement": aspect_available_without_judgement,
            "two_or_more_sentences": two_or_more_sentences,
        }

        # Copy the remaining task fields, skipping "Unnamed*" keys, without
        # touching the caller's dict.
        for field_name, field_value in source_fields.items():
            if not field_name.startswith("Unnamed"):
                entry[field_name] = field_value

        formatted_data.append(entry)

    return formatted_data

# Convert the loaded export into the flat per-sample structure.
formatted_json = format_json(data)

### Save Raw Annotations

In [32]:
# Persist the converted dataset before any filtering is applied.
with open('dataset_total/raw_dataset.json', 'w', encoding='utf-8') as out_file:
    json.dump(
        formatted_json,
        out_file,
        ensure_ascii=False,
    )

### Remove Samples without Aspects

In [33]:
# Samples whose tag list is empty, shown as "<without tags> von <total>".
entries_with_empty_tags = [
    sample
    for sample in formatted_json
    if not sample['tags']
]
len(entries_with_empty_tags), "von", len(formatted_json)

(14, 'von', 32)

In [34]:
# Keep only the samples that carry at least one tag.
formatted_json = list(filter(lambda sample: sample['tags'], formatted_json))
len(formatted_json)

18

### Remove Samples with Conflict

In [35]:
# Samples in which at least one tag has the polarity 'CONFLICT'.
entries_with_polarity_conflict = [
    sample
    for sample in formatted_json
    if 'CONFLICT' in (tag.get('polarity') for tag in sample['tags'])
]

len(entries_with_polarity_conflict)

1

In [36]:
# Keep only the samples in which no tag has the polarity 'CONFLICT'.
formatted_json = [
    sample
    for sample in formatted_json
    if all(tag.get('polarity') != 'CONFLICT' for tag in sample['tags'])
]
len(formatted_json)

17

### Remove Samples with #Sentences >= 2

In [37]:
# Samples flagged as consisting of two or more sentences.
entries_with_two_or_more_sentences = list(
    filter(lambda sample: sample['two_or_more_sentences'], formatted_json)
)
entries_with_two_or_more_sentences

[]

In [38]:
# Keep only the samples that are not flagged as multi-sentence.
formatted_json = [
    sample
    for sample in formatted_json
    if not sample['two_or_more_sentences']
]
len(formatted_json)

17

### --> Delete Later: Enlarge Dataset

In [39]:
# Temporary cell: pad the dataset to a fixed size by repeating the samples in
# order. The repeated entries are references to the same dicts, not copies.
desired_length = 2500
if not formatted_json:
    # Without this guard the division below fails with an opaque
    # ZeroDivisionError when every sample has been filtered out.
    raise ValueError("Cannot enlarge the dataset: no samples are left after filtering.")
# Number of whole repetitions, and how many leading samples fill the rest.
copies_needed, remaining_length = divmod(desired_length, len(formatted_json))
duplicated_list = formatted_json * copies_needed + formatted_json[:remaining_length]

In [40]:
# From here on, work with the enlarged list instead of the filtered one.
formatted_json = duplicated_list

### Save Formatted Dataset

In [41]:
# Write the final list of samples to the filtered dataset file.
with open('dataset_total/filtered_dataset.json', 'w', encoding='utf-8') as out_file:
    json.dump(
        formatted_json,
        out_file,
        ensure_ascii=False,
    )