In [2]:
import json
from bs4 import BeautifulSoup

# Read the annotation export. JSON text is UTF-8 by specification, so the
# encoding is pinned instead of relying on the platform default.
with open('covenants.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

output_file_path = 'output.txt'

# Defined up front so both names exist even when `data` is empty; they are
# re-created for every document inside the loop so nothing leaks between
# documents.
all_labels = []
relations = {}

for document in data:
    html_content = document['data']['html']
    soup = BeautifulSoup(html_content, 'html.parser')
    full_text = soup.get_text()

    # Per-document state:
    #   all_labels -> (start, end, label, text, result id) for each labelled span
    #   relations  -> from_id -> list of to_ids ("continuation links")
    all_labels = []
    relations = {}

    # A document without annotations is treated as having no results, which
    # makes the whole text a single "O" span further down.
    annotations = document.get('annotations') or []
    results = (annotations[0].get('result') or []) if annotations else []

    for result in results:
        try:
            value = result['value']
            span = (
                value['globalOffsets']['start'],
                value['globalOffsets']['end'],
                value['hypertextlabels'][0],
                value['text'],
                result['id'],
            )
        except (KeyError, IndexError):
            # Not a complete labelled span. If it carries both link ids it is
            # a continuation link; any other kind of result is skipped rather
            # than crashing with an unrelated KeyError.
            if 'from_id' in result and 'to_id' in result:
                relations.setdefault(result['from_id'], []).append(result['to_id'])
            continue

        all_labels.append(span)

    # Process spans in order of their start offset (stable for equal starts).
    all_labels.sort(key=lambda x: x[0])

    # Look-up from result id to its span tuple, used to resolve link targets.
    all_labels_dict = {tup[4]: tup for tup in all_labels}

    # Add intermediate "O" labels so the spans cover the text between entities.
    # The slicing below assumes the offsets index into `full_text`.
    complete_labels = []
    current_position = 0
    for start, end, label, string, label_id in all_labels:
        if current_position < start:
            complete_labels.append((current_position, start, "O", full_text[current_position:start]))

        complete_labels.append((start, end, label, string))

        current_position = end

        # NOTE(review): a linked target is itself an entry of `all_labels`, so
        # its text is emitted here AND where the loop visits the target span.
        # Advancing `current_position` by the target's length also shortens
        # the next "O" gap by that many characters. Behaviour is kept as-is
        # until the intended output for continuation links is confirmed.
        for to_id in relations.get(label_id, []):
            target = all_labels_dict.get(to_id)
            if target is None:
                # The link points at a result that produced no labelled span.
                print(f"Skipping continuation link to unknown span id {to_id!r}")
                continue
            to_id_len = target[1] - target[0]
            to_id_str = target[3]
            complete_labels.append((current_position, current_position + to_id_len, label, to_id_str))
            current_position += to_id_len

    # Trailing text after the last span.
    if current_position < len(full_text):
        complete_labels.append((current_position, len(full_text), "O", full_text[current_position:len(full_text)]))

    # Perform IOB tagging: whitespace-split every span into words. The first
    # word of an entity span gets "B-" unless the span directly before it in
    # `complete_labels` carried the same label; every other entity word gets
    # "I-".
    iob_tags = []
    prev_label = "O"

    for start, end, label, string in complete_labels:
        words = string.split()
        if label == "O":
            iob_tags.extend([(word, "O") for word in words])
        else:
            for i, word in enumerate(words):
                if i == 0 and label != prev_label:
                    iob_tags.append((word, f"B-{label}"))
                else:
                    iob_tags.append((word, f"I-{label}"))

        prev_label = label

    # One "word<TAB>tag" line per token; a blank line is written after any
    # word that ends with a period.
    with open(output_file_path, 'w', encoding='utf-8') as out_file:
        for word, tag in iob_tags:
            out_file.write(f"{word}\t{tag}\n")
            if word.endswith('.'):
                out_file.write("\n")

    print(f"Processed string has been written to {output_file_path}")

    # Only the first document is processed. NOTE(review): the output file is
    # opened in 'w' mode above, so removing this `break` would overwrite the
    # file once per document — switch to a per-document path or a single write
    # after the loop before doing so.
    break


Processed string has been written to output.txt
