#### Importing required libraries

In [60]:
import json
from sklearn.model_selection import train_test_split

In [61]:
# Load the records in NER_TRAIN_JUDGEMENT.json. JSON text is UTF-8 by
# specification, so decode it explicitly instead of relying on the platform's
# default encoding (which would mis-decode non-ASCII characters on some
# systems and shift the character offsets used for labelling).
with open('NER_TRAIN_JUDGEMENT.json', encoding='utf-8') as file:
    data = json.load(file)

In [66]:
# Load the records in NER_TEST_JUDGEMENT.json. JSON text is UTF-8 by
# specification, so decode it explicitly instead of relying on the platform's
# default encoding.
with open('NER_TEST_JUDGEMENT.json', encoding='utf-8') as file:
    test_data = json.load(file)

In [63]:
# Hold out 15% of the loaded records as a validation set; the fixed
# random_state makes the split reproducible from run to run.
train, val = train_test_split(
    data,
    test_size=0.15,
    random_state=42,
)

#### BIO Encoding of data

In [64]:
def bio_label_encoding(text, annotations):
    """Return one BIO tag per space-separated token of ``text``.

    The text is tokenized with ``text.split(' ')``, so consecutive spaces
    yield empty tokens that still receive a tag. Every entry of
    ``annotations[0]['result']`` supplies character offsets
    (``value['start']`` and ``value['end']``) and a label
    (``value['labels'][0]``, upper-cased). The first token of a span is
    tagged ``'B_<LABEL>'``, the remaining tokens of the span ``'I_<LABEL>'``,
    and all other tokens stay ``'O'``. When spans share tokens, the span that
    appears later in the list wins.

    Args:
        text: The sentence to tokenize.
        annotations: A list whose first element is a dict with a ``'result'``
            list of span dicts; only that first element is read. An empty
            list (or ``None``) means "no entities".

    Returns:
        A list of tag strings with the same length as ``text.split(' ')``.

    Spans that cannot be mapped onto the tokens (start offset beyond the end
    of the text, or an end offset that falls in an earlier token than the
    start offset) are skipped instead of raising.
    """
    from bisect import bisect_right

    tokens = text.split(' ')
    labels = ['O'] * len(tokens)
    if not annotations:
        # Unannotated record: every token is outside any entity.
        return labels

    # boundaries[i] is the character offset just past token i and the single
    # space that follows it, i.e. where token i + 1 begins. The sequence is
    # strictly increasing, so it is computed once and binary-searched for
    # every span.
    boundaries = []
    offset = 0
    for token in tokens:
        offset += len(token) + 1
        boundaries.append(offset)

    last_index = len(tokens) - 1
    for annotation in annotations[0]['result']:
        value = annotation['value']
        start = value['start']
        end = value['end']
        label = value['labels'][0].upper()

        # First token whose boundary lies strictly after the start offset.
        first = bisect_right(boundaries, start)
        if first > last_index:
            # The span starts beyond the end of the text; nothing to tag.
            continue

        # First token whose boundary lies strictly after the end offset;
        # a span running past the text is clamped to the final token.
        last = min(bisect_right(boundaries, end), last_index)
        if last < first:
            # End offset precedes the start token: malformed span.
            continue

        labels[first] = 'B_' + label
        for i in range(first + 1, last + 1):
            labels[i] = 'I_' + label
    return labels

#### Iterating over each sentence and encoding it

In [65]:
def convert_to_bio(data):
    """Map each record's id to its text and BIO labels.

    For every record in ``data`` this reads ``record['id']``,
    ``record['annotations']`` and ``record['data']['text']``, and labels the
    text with ``bio_label_encoding``.

    Returns:
        A dict keyed by record id whose values are
        ``{'text': <text>, 'labels': <list of BIO tags>}``. If two records
        share an id, the later one replaces the earlier one.
    """
    encoded = {}
    for record in data:
        record_id = record['id']
        spans = record['annotations']
        sentence = record['data']['text']
        encoded[record_id] = {
            'text': sentence,
            'labels': bio_label_encoding(sentence, spans),
        }
    return encoded

In [68]:
# Encode the train, validation and test splits with the same BIO scheme.
processed_train, processed_val, processed_test = map(
    convert_to_bio, (train, val, test_data)
)

In [69]:
# Write each processed split to its own JSON file, indented by 4 spaces.
for out_path, split in (
    ('NER_TRAIN_JUDGEMENT_PROCESSED.json', processed_train),
    ('NER_VAL_JUDGEMENT_PROCESSED.json', processed_val),
    ('NER_TEST_JUDGEMENT_PROCESSED.json', processed_test),
):
    with open(out_path, 'w') as file:
        json.dump(split, file, indent=4)