In [15]:
import re
import json

In [16]:
def normalize_text(text):
    """
    Return ``text`` with all whitespace noise flattened.

    Newlines and non-breaking spaces are turned into plain spaces, every run
    of whitespace is collapsed to a single space, and leading/trailing
    whitespace is removed.
    """
    # Swap the explicitly targeted characters for an ordinary space first.
    for noisy_char in ("\n", "\u00a0"):
        text = text.replace(noisy_char, " ")

    # Collapse any remaining whitespace run into one space.
    collapsed = re.sub(r'\s+', ' ', text)

    # Drop the (at most one) space left at either edge.
    return collapsed.strip()

In [17]:
def _build_offset_map(original_text):
    """
    Map every index of ``original_text`` to its index after whitespace cleanup.

    The returned list has ``len(original_text) + 1`` entries so that exclusive
    end offsets (including ``len(original_text)`` itself) can be translated.
    A whitespace character that gets collapsed away maps to the position of
    the next character that is kept.
    """
    offset_map = []
    new_index = 0
    # Start as if a space had just been emitted: leading whitespace is then
    # dropped, which mirrors stripping the text after collapsing whitespace.
    previous_was_space = True
    for char in original_text:
        offset_map.append(new_index)
        if char.isspace():
            # Every whitespace character (tab, newline, NBSP, ...) counts as a
            # space; only the first one of a run survives.
            if previous_was_space:
                continue
            previous_was_space = True
        else:
            previous_was_space = False
        new_index += 1
    # Sentinel entry for an end offset that points one past the last character.
    offset_map.append(new_index)
    return offset_map


def adjust_entity_offsets(original_text, cleaned_text, entities):
    """
    Adjust entity offsets to match the cleaned text.

    Args:
        original_text: Text the entity offsets currently refer to.
        cleaned_text: Whitespace-normalized version of ``original_text``
            (whitespace runs collapsed to one space, edges stripped).
        entities: Iterable of dicts with ``'start'``, ``'end'``, ``'type'``
            and ``'text'`` keys. ``'end'`` is treated as an exclusive offset,
            as it is used for slicing.

    Returns:
        A new list of dicts with ``'start'``, ``'end'``, ``'type'`` and
        ``'text'`` keys whose offsets index into ``cleaned_text``. ``'text'``
        is the slice of ``cleaned_text`` when it is consistent with the
        entity's own text, otherwise the entity's original text is kept.
        Entities whose offsets are not integers within
        ``0..len(original_text)`` are omitted.

    Raises:
        KeyError: If an entity lacks one of the required keys.
    """
    offset_map = _build_offset_map(original_text)
    cleaned_length = len(cleaned_text)

    def translate(position):
        # Out-of-range or non-integer offsets cannot be mapped.
        if not isinstance(position, int) or not 0 <= position < len(offset_map):
            return None
        # Clamp: a single trailing whitespace is counted by the map but is
        # stripped from the cleaned text.
        return min(offset_map[position], cleaned_length)

    adjusted_entities = []
    for entity in entities:
        adjusted_start = translate(entity['start'])
        adjusted_end = translate(entity['end'])
        if adjusted_start is None or adjusted_end is None:
            continue

        # Extract the text based on adjusted offsets
        adjusted_text = cleaned_text[adjusted_start:adjusted_end]

        # Validate against the entity text both as given (stripped) and with
        # its whitespace collapsed, so an entity spanning a newline or a
        # double space still matches its cleaned counterpart.
        entity_text = entity['text']
        candidates = (entity_text.strip(), " ".join(entity_text.split()))
        is_consistent = any(
            candidate in adjusted_text or adjusted_text in candidate
            for candidate in candidates
        )

        adjusted_entities.append({
            'start': adjusted_start,
            'end': adjusted_end,
            'type': entity['type'],
            # Fallback to the original entity text if mismatch occurs
            'text': adjusted_text if is_consistent else entity_text,
        })
    return adjusted_entities

In [18]:
def clean_and_process_json(input_path, output_path):
    """
    Read annotated records from ``input_path``, normalize them, and write the
    result to ``output_path``.

    For every annotation of every record, the annotation's ``'text'`` is
    replaced by its normalized form and its ``'result'`` list is replaced by
    the offset-adjusted entities built from each result's ``'value'``.
    Only the ``'value'`` part of each result entry is carried over.
    """
    with open(input_path, 'r', encoding='utf-8') as source:
        records = json.load(source)

    # Walk all annotations lazily, record by record.
    all_annotations = (
        ann for record in records for ann in record['annotations']
    )
    for ann in all_annotations:
        original = ann.get('text', '')
        spans = [entry['value'] for entry in ann['result']]

        normalized = normalize_text(original)

        # Store the cleaned text, then re-anchor the spans against it.
        ann['text'] = normalized
        ann['result'] = adjust_entity_offsets(original, normalized, spans)

    # Persist the updated records as pretty-printed, non-ASCII-preserving JSON.
    with open(output_path, 'w', encoding='utf-8') as sink:
        json.dump(records, sink, indent=4, ensure_ascii=False)

In [19]:
# Source and destination files, given relative to the notebook's working directory.
input_json_path = '../Labeled_data/gold_labels/filtered_annotations.json'
output_json_path = '../Labeled_data/gold_labels/cleaned_annotations.json'

# Process the JSON file: read the input, normalize every annotation's text and
# entity offsets, and write the result to the output path.
clean_and_process_json(input_json_path, output_json_path)

# Confirm where the cleaned file was written.
print(f"Cleaned and normalized JSON saved to {output_json_path}")

Cleaned and normalized JSON saved to ../Labeled_data/gold_labels/cleaned_annotations.json
