# In [7]:
import json

def _labelled_spans(results):
    """Return ``(start, end, label)`` tuples for span results, sorted by start.

    Result entries that do not carry a character span with at least one
    label (no ``value``, missing ``start``/``end``, empty ``labels``) are
    ignored instead of raising.
    """
    spans = []
    for result in results:
        value = result.get('value') or {}
        labels = value.get('labels')
        if labels and 'start' in value and 'end' in value:
            # Only the first label of a span is used.
            spans.append((value['start'], value['end'], labels[0]))
    # list.sort is stable, so spans sharing a start keep their input order.
    spans.sort(key=lambda span: span[0])
    return spans


def _outside_lines(fragment):
    """Return one 'O'-tagged output line per whitespace-separated token."""
    return [f"{token} -X- _ O\n" for token in fragment.split()]


def convert_to_conll(json_data):
    """Convert annotated tasks into a CoNLL-style, BIO-tagged string.

    Args:
        json_data: Iterable of task dicts. Each task provides the raw text
            under ``item['data']['text']`` and a list under
            ``item['annotations']``; only the first annotation's ``result``
            list is read. Span results carry
            ``value = {'start', 'end', 'labels'}`` with character offsets
            into the text.

    Returns:
        A string starting with the ``-DOCSTART-`` header, followed by one
        ``"<token> -X- _ <tag>"`` line per token and a blank line after
        each task.

    Tagging rules:
        * Text outside every span is split on whitespace and tagged ``O``.
        * Each span is split on whitespace as well; its first token gets
          ``B-<label>`` and the following tokens ``I-<label>``, so a
          multi-word entity never yields a token containing a space.
        * A span that follows a span of the *same* label with only
          whitespace in between continues it with ``I-``.
        * A span starting inside an already emitted span is skipped,
          because flat BIO tags cannot express overlapping entities.
    """
    output_lines = ["-DOCSTART- -X- O O\n\n"]

    for item in json_data:
        text = item['data']['text']
        results = item['annotations'][0]['result'] if item['annotations'] else []

        current_pos = 0
        prev_type = None  # label of the entity token emitted last, if any

        for start, end, label in _labelled_spans(results):
            if start < current_pos:
                # Overlaps text that was already written; emitting it again
                # would duplicate tokens in the output.
                continue

            # Text between the previous span and this one is outside any entity.
            outside = _outside_lines(text[current_pos:start])
            if outside:
                output_lines.extend(outside)
                prev_type = None  # an O token ends the running entity

            for token in text[start:end].split():
                # Compare the exact label; a suffix match would confuse
                # e.g. "GEOLOC" with "LOC".
                prefix = 'I-' if prev_type == label else 'B-'
                output_lines.append(f"{token} -X- _ {prefix}{label}\n")
                prev_type = label

            current_pos = end

        # Whatever follows the last span is outside text too.
        output_lines.extend(_outside_lines(text[current_pos:]))

        # Blank line between sentences
        output_lines.append("\n")

    return "".join(output_lines)

def main(input_file, output_file):
    """Read a JSON export, convert it, and save the CoNLL text.

    Args:
        input_file: Path of the UTF-8 encoded JSON file to load.
        output_file: Path the converted text is written to (overwritten
            if it already exists).
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        tasks = json.load(source)

    # Convert before the output file is opened, so a failing conversion
    # never leaves a truncated output file behind.
    converted = convert_to_conll(tasks)

    with open(output_file, 'w', encoding='utf-8') as sink:
        sink.write(converted)

# In [8]:
if __name__ == "__main__":
    # Script entry point: convert the hard-coded export file and write the
    # result to "output.conll" in the current working directory.
    main(
        input_file=r"c:\Users\Sakib Ahmed\Desktop\Untitled-1.json",
        output_file="output.conll",
    )