# Notebook: Convert Annotations from Label Studio to .json

## Packages

In [56]:
from format_labelstudio_json import format_json
import json

## Parameters

In [57]:
# Location of the raw annotations JSON file that the "Convert to json" section loads.
ANNOTATIONS_PATH = "dataset_total/raw_annotations.json"

## Code

### Convert to json

In [58]:
# Read the raw annotation file as UTF-8 text and parse it into Python objects.
with open(ANNOTATIONS_PATH, mode='r', encoding='utf-8') as annotations_file:
    data = json.loads(annotations_file.read())

In [59]:
# Run the project helper `format_json` over the loaded export. The cells below
# treat the result as a list of entries exposing the keys 'two_or_more_sentences',
# 'tags' and 'id' (the schema itself is defined in format_labelstudio_json, not here).
formatted_json = format_json(data)

### Save Raw Annotations

In [60]:
# Persist the formatted (but not yet filtered) entries; ensure_ascii=False keeps
# non-ASCII characters readable in the file instead of \uXXXX escapes.
with open('dataset_total/raw_dataset.json', mode='w', encoding='utf-8') as raw_dataset_file:
    json.dump(obj=formatted_json, fp=raw_dataset_file, ensure_ascii=False)

### Remove Samples with Two or More Sentences

In [61]:
# Collect the entries whose 'two_or_more_sentences' flag is truthy, then show how many there are.
entries_with_two_or_more_sentences = list(
    filter(lambda sample: sample['two_or_more_sentences'], formatted_json)
)
len(entries_with_two_or_more_sentences)

97

In [62]:
# Keep only the entries whose 'two_or_more_sentences' flag is falsy and show the new size.
formatted_json = list(
    filter(lambda sample: not sample['two_or_more_sentences'], formatted_json)
)
len(formatted_json)

3903

### Remove Samples without Aspects

In [63]:
# Gather the entries that carry no tags (falsy 'tags' value) and report
# their count next to the current dataset size.
entries_with_empty_tags = list(
    filter(lambda sample: not sample['tags'], formatted_json)
)
(len(entries_with_empty_tags), "von", len(formatted_json))

(3374, 'von', 3903)

In [64]:
# Drop every entry without tags; only entries with a truthy 'tags' value remain.
formatted_json = list(
    filter(lambda sample: sample['tags'], formatted_json)
)
len(formatted_json)

529

### Remove Samples with Conflict

In [65]:
# Collect entries in which at least one tag has polarity 'CONFLICT'.
entries_with_polarity_conflict = []
for sample in formatted_json:
    for tag in sample['tags']:
        if tag.get('polarity') == 'CONFLICT':
            # One conflicting tag is enough; record the entry once and move on.
            entries_with_polarity_conflict.append(sample)
            break

len(entries_with_polarity_conflict)

7

In [66]:
# Keep an entry only if none of its tags has polarity 'CONFLICT'.
formatted_json = [
    sample
    for sample in formatted_json
    if all(tag.get('polarity') != 'CONFLICT' for tag in sample['tags'])
]
len(formatted_json)

522

### TODO (Delete Later): Enlarge Dataset

In [12]:
# Display the id of every remaining entry, in dataset order.
list(map(lambda sample: sample["id"], formatted_json))

['29201856-9893-4eec-a44a-82174cbd867c',
 '5ed24454-faec-4944-b968-b84b9fa5cb6e',
 '294fe7a8-c829-4706-89f7-cf88341c4828',
 '81343575-4711-4a44-bcb1-932dc3c2e0c0',
 'b654d8c6-c609-48d6-9bbb-cac33caeff87',
 'a573f6a9-6567-4d77-8e6d-c5cfdf78dc74',
 '1972c356-75bc-4bbc-9932-495d713c6b51',
 '2d528a49-8af5-42a5-80d1-20c3a4e1cf2d',
 'cde1e8fb-a4f8-49a6-bc3e-34ddadb670d7',
 '23c4c4af-0b35-4256-8c05-fac8dff02e2b',
 'fa1c9d79-d4db-4aed-a549-9f349d466620',
 '8e0f1396-e617-4cc9-b53b-1928f11590e1',
 '30520dd4-df42-488a-bfe8-9dd184564053',
 '3eb34534-c9e0-4586-92bd-cb727eb4087c',
 '10066923-8b41-4ea5-9693-6ac859cbe6dd']

In [13]:
# Temporary step: pad the dataset to a fixed size by repeating the filtered
# entries in order (whole repetitions first, then a partial one to fill the rest).
# NOTE(review): the repeated entries are exact duplicates of the originals, so a
# later train/test split of this data can place copies of one sample on both
# sides -- confirm this is acceptable before relying on the enlarged dataset.
desired_length = 2500

if not formatted_json:
    # Without this guard the division below fails with an unexplained ZeroDivisionError.
    raise ValueError(
        "Cannot enlarge an empty dataset: formatted_json contains no entries."
    )

# Quotient = number of full repetitions, remainder = size of the trailing partial repetition.
copies_needed, remaining_length = divmod(desired_length, len(formatted_json))
duplicated_list = formatted_json * copies_needed + formatted_json[:remaining_length]

assert len(duplicated_list) == desired_length

In [14]:
import uuid

# Rebuild formatted_json from the padded list, giving every element a fresh
# random UUID so that repeated entries no longer share an 'id'.
# `.copy()` keeps the element in duplicated_list untouched when 'id' is replaced;
# presumably a shallow dict copy, so nested values would still be shared -- TODO confirm.
formatted_json = []

for source_entry in duplicated_list:
    relabelled_entry = source_entry.copy()
    relabelled_entry["id"] = str(uuid.uuid4())
    formatted_json.append(relabelled_entry)

In [15]:
# Sanity check: one entry was appended per element of duplicated_list,
# so this should equal desired_length (2500).
len(formatted_json)

2500

### Save Formatted Dataset

In [16]:
# Write the final entries to disk as UTF-8 JSON; ensure_ascii=False keeps
# non-ASCII characters unescaped in the output file.
with open('dataset_total/filtered_dataset.json', mode='w', encoding='utf-8') as filtered_dataset_file:
    json.dump(obj=formatted_json, fp=filtered_dataset_file, ensure_ascii=False)