# Notebook: Convert Label Studio Annotations to JSON

## Packages

In [1]:
import json

## Parameters

In [2]:
ANNOTATIONS_PATH = "dataset_total/raw_annotations.json"

## Code

### Convert to json

In [3]:
# Parse the annotation file at ANNOTATIONS_PATH (UTF-8 encoded JSON) into `data`.
with open(ANNOTATIONS_PATH, mode='r', encoding='utf-8') as annotations_file:
    data = json.load(annotations_file)

In [4]:
# Suffix carried by implicit labels, e.g. "SERVICE-POSITIVE-no-phrase".
_IMPLICIT_LABEL_SUFFIX = "-no-phrase"
# `from_name` of the control that produces implicit (phrase-less) tags.
_IMPLICIT_TYPE = "label-implicit"
# `from_name` values of the "choices" controls that act as boolean flags.
_CHOICE_FLAGS = ("aspect_available_without_judgement", "two_or_more_sentences")


def _build_tag(result):
    """Turn one result that carries ``value["labels"]`` into a flat tag dict.

    Implicit tags (``from_name == "label-implicit"``) get ``start = end = 0``
    and ``text = "NULL"``; their span keys are never read. The "-no-phrase"
    suffix is stripped from ``tag_with_polarity`` only when it is present.

    Raises:
        KeyError: if an explicit result lacks ``start``, ``end`` or ``text``.
        IndexError: if ``value["labels"]`` is empty.
    """
    value = result["value"]
    tag_type = result.get("from_name", "")
    raw_label = value["labels"][0]
    is_implicit = tag_type == _IMPLICIT_TYPE

    tag_with_polarity = raw_label
    if is_implicit and raw_label.endswith(_IMPLICIT_LABEL_SUFFIX):
        # Strip the marker suffix instead of blindly cutting 10 characters,
        # which would corrupt any label that does not carry it.
        tag_with_polarity = raw_label[:-len(_IMPLICIT_LABEL_SUFFIX)]

    # Key order is kept stable so the serialized JSON layout does not change.
    tag = {
        "end": 0 if is_implicit else value["end"],
        "start": 0 if is_implicit else value["start"],
        "tag_with_polarity": tag_with_polarity,
        # [5:] drops the first five characters of from_name
        # ("label-explicit" -> "-explicit").
        "tag_with_polarity_and_type": raw_label + tag_type[5:],
        "text": "NULL" if is_implicit else value["text"],
        "type": tag_type,
    }

    # Split the tag at its last "-" into aspect label and polarity.
    tag_parts = tag_with_polarity.rsplit("-", 1)
    if len(tag_parts) == 2:
        tag["label"], tag["polarity"] = tag_parts
    else:
        tag["label"] = tag_parts[0]
        tag["polarity"] = ""

    return tag


def format_json(data):
    """Flatten annotation entries into one dict per sample.

    Args:
        data: iterable of entries, each with an ``"annotations"`` list (every
            annotation holding a ``"result"`` list) and a ``"data"`` dict that
            contains at least ``"text"``. Presumably a Label Studio JSON
            export -- verify against the export settings.

    Returns:
        A list of dicts with the keys ``tags``, ``text``,
        ``aspect_available_without_judgement`` and ``two_or_more_sentences``,
        followed by every ``"data"`` key that does not start with "Unnamed".
        Data keys that collide with the keys above overwrite them.

    Raises:
        KeyError: if a required key is missing from an entry or result.

    The input is not modified.
    """
    formatted_data = []

    for raw_entry in data:
        tags = []
        flags = dict.fromkeys(_CHOICE_FLAGS, False)

        for annotation in raw_entry["annotations"]:
            for result in annotation["result"]:
                value = result["value"]

                if "labels" in value:
                    tags.append(_build_tag(result))

                # A flag becomes True as soon as a "choices" result exists for
                # its control; the chosen values themselves are not inspected.
                if "choices" in value:
                    from_name = result.get("from_name", "")
                    if from_name in flags:
                        flags[from_name] = True

        sample_data = raw_entry["data"]
        entry = {
            "tags": tags,
            "text": sample_data["text"],
            "aspect_available_without_judgement": flags["aspect_available_without_judgement"],
            "two_or_more_sentences": flags["two_or_more_sentences"],
        }

        # Copy the remaining data fields without touching raw_entry itself.
        # "Unnamed*" keys are skipped (presumably leftover unnamed columns
        # from a tabular import -- TODO confirm).
        for key, field_value in sample_data.items():
            if not key.startswith("Unnamed"):
                entry[key] = field_value

        formatted_data.append(entry)

    return formatted_data

# Flatten the loaded annotations into one dict per sample.
formatted_json = format_json(data)

### Save Unfiltered (Raw) Dataset

In [5]:
# Persist the complete, still unfiltered sample list; ensure_ascii=False keeps
# non-ASCII characters (e.g. umlauts) readable in the file.
with open('dataset_total/raw_dataset.json', mode='w', encoding='utf-8') as raw_dataset_file:
    json.dump(formatted_json, raw_dataset_file, ensure_ascii=False)

### Remove Samples without Aspects

In [6]:
# Collect the samples whose tag list is empty and show how many there are
# relative to the total.
entries_with_empty_tags = [
    sample
    for sample in formatted_json
    if not sample['tags']
]
(len(entries_with_empty_tags), "von", len(formatted_json))

(4, 'von', 17)

In [7]:
# Keep only the samples that carry at least one tag.
formatted_json = [
    sample
    for sample in formatted_json
    if sample['tags']
]
len(formatted_json)

13

### Remove Samples with Conflict

In [8]:
# Samples in which at least one tag has the polarity 'CONFLICT'.
entries_with_polarity_conflict = [
    sample
    for sample in formatted_json
    if 'CONFLICT' in (tag.get('polarity') for tag in sample['tags'])
]

len(entries_with_polarity_conflict)

0

In [9]:
# Keep only the samples in which no tag has the polarity 'CONFLICT'.
formatted_json = [
    sample
    for sample in formatted_json
    if all(tag.get('polarity') != 'CONFLICT' for tag in sample['tags'])
]
len(formatted_json)

13

### Remove Samples with Two or More Sentences

In [10]:
# Samples flagged by the annotators as containing two or more sentences.
entries_with_two_or_more_sentences = [
    sample
    for sample in formatted_json
    if sample['two_or_more_sentences']
]
entries_with_two_or_more_sentences

[{'tags': [{'end': 0,
    'start': 0,
    'tag_with_polarity': 'SERVICE-POSITIVE',
    'tag_with_polarity_and_type': 'SERVICE-POSITIVE-no-phrase-implicit',
    'text': 'NULL',
    'type': 'label-implicit',
    'label': 'SERVICE',
    'polarity': 'POSITIVE'},
   {'end': 0,
    'start': 0,
    'tag_with_polarity': 'GENERAL-IMPRESSION-POSITIVE',
    'tag_with_polarity_and_type': 'GENERAL-IMPRESSION-POSITIVE-no-phrase-implicit',
    'text': 'NULL',
    'type': 'label-implicit',
    'label': 'GENERAL-IMPRESSION',
    'polarity': 'POSITIVE'},
   {'end': 80,
    'start': 75,
    'tag_with_polarity': 'AMBIENCE-NEGATIVE',
    'tag_with_polarity_and_type': 'AMBIENCE-NEGATIVE-explicit',
    'text': 'Lokal',
    'type': 'label-explicit',
    'label': 'AMBIENCE',
    'polarity': 'NEGATIVE'}],
  'text': 'Freundlichkeit und Qualität.Aber wenn Sie als 2 Personen in das halb volle Lokal mit mehreren unbesetzten Tischen kommen, werden nur „Notplätze“ angeboten.',
  'aspect_available_without_judgement': 

In [11]:
# Keep only the samples that are not flagged as having two or more sentences.
formatted_json = [
    sample
    for sample in formatted_json
    if not sample['two_or_more_sentences']
]
len(formatted_json)

12

### TODO (delete later): Enlarge Dataset by Duplicating Samples

In [12]:
# Show the ids of the samples that survived all filters.
[sample["id"] for sample in formatted_json]

['8f5a5889-8cce-40c4-a9d4-ec68c9ad3447',
 'dfa4a582-b715-4fe5-ba4a-f143441b97bb',
 '69f59b51-d6f8-497e-a1da-b78bb5ca1236',
 '58ac65fd-3680-4c27-9a7d-edec844c8a95',
 '6175b364-b0b9-4c9c-8630-74ff8b77ddb2',
 'a56c16ea-da9b-40cf-b3f4-d4aecd9a6a66',
 '19ebfe0f-3cac-46cf-b9cb-a2939cba656c',
 '83a022dd-ce94-4e2e-82c4-eb793ea8407d',
 'b6e3dc4e-40fb-435a-94c1-b971e702ca3e',
 '91b66d74-8997-479a-b3d7-30ce2c96cfa4',
 '91b3b176-0e70-43c1-ac5c-206e973e77da',
 '3cbfda42-767f-4370-83de-d703f66e0e16']

In [13]:
# Repeat the filtered samples cyclically until exactly `desired_length`
# entries exist: whole copies first, then the leading remainder.
desired_length = 2500
copies_needed, remaining_length = divmod(desired_length, len(formatted_json))
duplicated_list = formatted_json * copies_needed + formatted_json[:remaining_length]

In [14]:
import uuid

# Rebuild the dataset from the duplicated samples, giving every entry a fresh
# random UUID as "id". Each entry is a shallow copy, so nested objects such as
# the "tags" list are still shared between duplicates.
formatted_json = [
    {**sample, "id": str(uuid.uuid4())}
    for sample in duplicated_list
]

In [15]:
# Sanity check: the enlarged list should contain `desired_length` entries.
len(formatted_json)

2500

### Save Filtered Dataset

In [16]:
# Write the filtered (and enlarged) sample list to disk; ensure_ascii=False
# keeps non-ASCII characters readable in the file.
with open('dataset_total/filtered_dataset.json', mode='w', encoding='utf-8') as filtered_dataset_file:
    json.dump(formatted_json, filtered_dataset_file, ensure_ascii=False)