# Notebook: Convert Annotations from Label Studio to .json

## Packages

In [1]:
from format_labelstudio_json import format_json
import json

## Parameters

In [2]:
# Path of the raw annotation JSON file that is loaded and converted below.
ANNOTATIONS_PATH = "dataset_total/raw_annotations.json"

## Code

### Convert to json

In [3]:
# Read the raw annotation file as UTF-8 text and parse it into Python objects.
with open(ANNOTATIONS_PATH, mode='r', encoding='utf-8') as annotations_file:
    data = json.loads(annotations_file.read())

In [4]:
# Convert the loaded annotation data with the project helper `format_json`.
# The cells below treat the result as a list of dicts and read the keys
# 'two_or_more_sentences' and 'tags' from each entry.
formatted_json = format_json(data)

In [5]:
# Number of samples returned by format_json, before any filtering.
len(formatted_json)

5000

### Save Raw (Unfiltered) Dataset

In [6]:
# Persist the complete, still unfiltered dataset; non-ASCII characters are
# written as-is instead of being escaped.
with open('dataset_total/raw_dataset.json', mode='w', encoding='utf-8') as raw_out:
    json.dump(formatted_json, fp=raw_out, ensure_ascii=False)

### Remove Samples with Two or More Sentences

In [7]:
# Collect every sample whose 'two_or_more_sentences' flag is truthy and
# display how many there are.
entries_with_two_or_more_sentences = list(
    filter(lambda sample: sample['two_or_more_sentences'], formatted_json)
)
len(entries_with_two_or_more_sentences)

589

In [8]:
# Keep only the samples whose 'two_or_more_sentences' flag is falsy, then
# display the remaining sample count.
formatted_json = [
    sample
    for sample in formatted_json
    if not sample['two_or_more_sentences']
]
len(formatted_json)

4411

### Remove Samples without Aspects

In [9]:
# Collect the samples whose 'tags' value is empty/falsy and display that
# count next to the current dataset size.
entries_with_empty_tags = list(
    filter(lambda sample: not sample['tags'], formatted_json)
)
(len(entries_with_empty_tags), "von", len(formatted_json))

(1291, 'von', 4411)

In [10]:
# Drop every sample that has no tags, then display the remaining sample count.
formatted_json = [
    sample
    for sample in formatted_json
    if sample['tags']
]
len(formatted_json)

3120

### Remove Samples with Conflicting Polarity

In [11]:
# Collect the samples in which at least one tag has the polarity 'CONFLICT'
# (tags without a 'polarity' key never match) and display how many there are.
entries_with_polarity_conflict = [
    sample
    for sample in formatted_json
    if any(
        tag.get('polarity') == 'CONFLICT'
        for tag in sample['tags']
    )
]

len(entries_with_polarity_conflict)

42

In [12]:
# Keep only the samples in which no tag has the polarity 'CONFLICT', then
# display the remaining sample count.
formatted_json = [
    sample
    for sample in formatted_json
    if all(
        tag.get('polarity') != 'CONFLICT'
        for tag in sample['tags']
    )
]
len(formatted_json)

3078

In [13]:
# Persist the dataset as it stands after the sentence, tag and polarity
# filters; non-ASCII characters are written as-is instead of being escaped.
with open('dataset_total/dataset_filtered.json', mode='w', encoding='utf-8') as filtered_out:
    json.dump(formatted_json, fp=filtered_out, ensure_ascii=False)

### Limit to a maximum of 3000 Examples

In [14]:
# Keep only the first 3000 samples in their existing order (a plain slice:
# no shuffling or sampling), then display the resulting sample count.
formatted_json = formatted_json[:3000]
len(formatted_json)

3000

### Save Formatted Dataset

In [15]:
# Persist the final dataset, capped at 3000 samples; non-ASCII characters
# are written as-is instead of being escaped.
with open('dataset_total/dataset_filtered_3000.json', mode='w', encoding='utf-8') as final_out:
    json.dump(formatted_json, fp=final_out, ensure_ascii=False)