#### Importing required libraries

In [142]:
import json
from sklearn.model_selection import train_test_split

In [143]:
# JSON text is UTF-8 by specification; passing the encoding explicitly keeps
# the load from depending on the platform's default locale encoding.
with open('NER_TRAIN_JUDGEMENT.json', encoding='utf-8') as file:
    data = json.load(file)

In [144]:
# JSON text is UTF-8 by specification; passing the encoding explicitly keeps
# the load from depending on the platform's default locale encoding.
with open('NER_TEST_JUDGEMENT.json', encoding='utf-8') as file:
    test_data = json.load(file)

In [145]:
# Hold out 15% of the training records for validation; the fixed seed makes
# the split reproducible across runs.
train, val = train_test_split(
    data,
    test_size=0.15,
    random_state=42,
)

#### Collecting the unique entity labels

In [146]:
# Gather the first label of every annotated span across the whole training file.
unique_labels = {
    annotation['value']['labels'][0]
    for record in data
    for annotation in record['annotations'][0]['result']
}
print(unique_labels)

{'CASE_NUMBER', 'PETITIONER', 'JUDGE', 'GPE', 'DATE', 'WITNESS', 'COURT', 'RESPONDENT', 'ORG', 'OTHER_PERSON', 'STATUTE', 'PRECEDENT', 'PROVISION'}


#### Cleaning Data - Replacing control and other special characters with spaces

In [147]:
def clean_text(text):
    """Return ``text`` with each unwanted control/special character turned into a space.

    Every listed character is a single code point and is replaced by exactly
    one space, so the length of the string is unchanged.
    """
    unwanted = ['\x05', '\t', '\n', '\x0c', '\x11', '\x12', '\x13', '\x14', '\x16', '\x1a', '\x80', '\x9d', '\xa0', '\xad', '\uf076']
    # One translation pass instead of one ``replace`` call per character.
    to_space = {ord(ch): ' ' for ch in unwanted}
    return text.translate(to_space)

#### Storing all the label boundaries in a list

In [148]:
def border_index(annotations):
    """Collect the labelled spans of the first annotation set, ordered by start offset.

    Reads ``annotations[0]['result']`` and returns a list of mutable
    ``[start, end, LABEL]`` entries, where ``LABEL`` is the first label of the
    span converted to upper case. Entries with equal ``start`` keep their
    original relative order.
    """
    spans = [
        [
            entry['value']['start'],
            entry['value']['end'],
            entry['value']['labels'][0].upper(),
        ]
        for entry in annotations[0]['result']
    ]
    return sorted(spans, key=lambda span: span[0])

#### Adding spaces on the boundaries of labels where there is no space

In [149]:
def border_spacing(text, border_indices):
    """Insert a space at every entity boundary that is glued to neighbouring text.

    A boundary (the start or the exclusive end offset of a span) receives a
    space only when it falls between two non-space characters. All insertion
    points are determined from the original text first and the offsets are
    remapped afterwards, so no boundary can be skipped while the text grows.

    Args:
        text: The string the offsets refer to.
        border_indices: List of mutable ``[start, end, label]`` entries with
            ``end`` exclusive. The offsets are updated in place.

    Returns:
        ``(spaced_text, border_indices)``: the text with the spaces inserted
        and the same list object with remapped offsets. For spans that do not
        overlap one another, ``spaced_text[start:end]`` is the same entity
        text as before.
    """
    text_length = len(text)

    # Original positions that need a space inserted in front of them. A set
    # ensures a boundary shared by two adjacent spans yields a single space.
    insert_positions = set()
    for border in border_indices:
        for position in (border[0], border[1]):
            if (0 < position < text_length
                    and text[position - 1] != ' '
                    and text[position] != ' '):
                insert_positions.add(position)
    insert_positions = sorted(insert_positions)

    # Cut the text at each insertion point and rejoin the pieces with a space.
    pieces = []
    previous = 0
    for position in insert_positions:
        pieces.append(text[previous:position])
        previous = position
    pieces.append(text[previous:])
    spaced_text = ' '.join(pieces)

    for border in border_indices:
        start, end = border[0], border[1]
        # A space inserted exactly at `start` sits before the entity and
        # therefore shifts the start; one inserted exactly at `end` sits after
        # the entity and must not shift the end.
        border[0] = start + sum(1 for p in insert_positions if p <= start)
        border[1] = end + sum(1 for p in insert_positions if p < end)

    return spaced_text, border_indices

#### Performing Tokenization by space and BIO Encoding

In [150]:
def bio_encoding(text, border_indices):
    """Assign a BIO tag to every token produced by ``text.split(' ')``.

    The character span of each token is derived directly from the split, so
    the token index always stays aligned with the text, including when runs
    of spaces produce empty tokens.

    Args:
        text: The (already boundary-spaced) text.
        border_indices: Iterable of ``[start, end, label]`` entries with
            ``end`` exclusive, referring to offsets in ``text``.

    Returns:
        A list with one tag per element of ``text.split(' ')``. Empty tokens
        are always ``'O'``. For each span, the first non-empty token that
        overlaps it is tagged ``'B_' + LABEL`` and the remaining overlapping
        tokens ``'I_' + LABEL`` (label upper-cased). Spans are applied in the
        given order, so a later span overwrites tags written by an earlier
        overlapping one.
    """
    tokens = text.split(' ')
    labels = ['O'] * len(tokens)

    # Character span [token_start, token_end) of every token in `text`.
    token_spans = []
    offset = 0
    for token in tokens:
        token_spans.append((offset, offset + len(token)))
        offset += len(token) + 1  # step over the single separating space

    for annotation in border_indices:
        start = annotation[0]
        end = annotation[1]
        label = annotation[2].upper()
        if end <= start:
            # A zero-length or inverted span covers no text.
            continue
        is_first = True
        for token_index, (token_start, token_end) in enumerate(token_spans):
            if token_start >= end:
                # Tokens are in text order, so nothing further can overlap.
                break
            if token_end <= start or token_start == token_end:
                # Token lies before the span, or is an empty token.
                continue
            labels[token_index] = ('B_' if is_first else 'I_') + label
            is_first = False
    return labels

#### BIO Encoding of data

In [151]:
def convert_to_bio(data):
    """Run the clean -> boundary-spacing -> BIO-tagging pipeline over every record.

    Each record supplies ``'id'``, ``'annotations'`` and ``['data']['text']``.
    Returns a dict keyed by record id whose values are
    ``{'text': spaced_text, 'labels': bio_labels}``.
    """
    processed_data = {}
    for record in data:
        record_id = record['id']
        record_annotations = record['annotations']
        cleaned_text = clean_text(record['data']['text'])
        spans = border_index(record_annotations)
        spaced_text, spans = border_spacing(cleaned_text, spans)
        processed_data[record_id] = {
            'text': spaced_text,
            'labels': bio_encoding(spaced_text, spans),
        }
    return processed_data

#### Converting the splits and saving the processed data

In [152]:
# Convert the train, validation and test splits, in that order.
processed_train, processed_val, processed_test = map(
    convert_to_bio, (train, val, test_data)
)

In [153]:
# Write each processed split to its own pretty-printed JSON file.
for out_path, processed_split in (
    ('NER_TRAIN_JUDGEMENT_PROCESSED.json', processed_train),
    ('NER_VAL_JUDGEMENT_PROCESSED.json', processed_val),
    ('NER_TEST_JUDGEMENT_PROCESSED.json', processed_test),
):
    with open(out_path, 'w') as file:
        json.dump(processed_split, file, indent=4)