In [1]:
import re
import json

In [8]:
def _resolve_offset(offset_map, offset, original_length, cleaned_length):
    """Translate one original character offset into the cleaned text; None if out of range."""
    mapped = offset_map.get(offset)
    if mapped is not None:
        return mapped

    # Anything that is not an in-range integer offset cannot be recovered.
    if not isinstance(offset, int) or not 0 <= offset <= original_length:
        return None

    # The offset may point at a character removed during cleaning, or be an
    # exclusive end equal to len(text); neither is guaranteed to be a key of
    # the map. Fall forward to the next surviving character, whose cleaned
    # position is exactly where the removed run collapsed to.
    for probe in range(offset, original_length):
        if probe in offset_map:
            return offset_map[probe]
    return cleaned_length


def clean_text_and_adjust_entities(data):
    """
    Cleans the text and adjusts entity offsets to match the cleaned text.

    Each item's text is cleaned with ``clean_and_map_offsets`` and every
    entity's ``start``/``end`` is translated into the cleaned text. Offsets
    that fall on a removed character, or an exclusive ``end`` equal to the
    length of the original text, are snapped forward to the next surviving
    position instead of causing the entity to be dropped. Entities whose
    offsets lie outside the original text, or whose non-empty span collapses
    to nothing after cleaning, are skipped.

    The stored entity ``text`` is the slice of the cleaned text; if that slice
    and the stripped original entity text do not contain one another, the
    original entity text is kept as a fallback.

    :param data: List of dictionaries containing 'text' and 'entities'; each
        entity is a dictionary with 'start', 'end', 'type' and 'text'.
    :return: New list of dictionaries with the cleaned 'text' and the adjusted
        'entities'. The input is not modified.
    """
    cleaned_data = []

    for item in data:
        original_text = item["text"]

        # Clean the text and generate an offset map
        cleaned_text, offset_map = clean_and_map_offsets(original_text)
        original_length = len(original_text)
        cleaned_length = len(cleaned_text)

        adjusted_entities = []
        for entity in item["entities"]:
            adjusted_start = _resolve_offset(
                offset_map, entity["start"], original_length, cleaned_length
            )
            adjusted_end = _resolve_offset(
                offset_map, entity["end"], original_length, cleaned_length
            )
            if adjusted_start is None or adjusted_end is None:
                continue

            # A non-empty span made up only of removed characters has nothing
            # left to annotate in the cleaned text.
            if adjusted_start == adjusted_end and entity["start"] != entity["end"]:
                continue

            adjusted_text = cleaned_text[adjusted_start:adjusted_end]

            # Validate adjusted text against the original entity text
            expected_text = entity["text"].strip()
            if not (expected_text in adjusted_text or adjusted_text in expected_text):
                # Fallback: use the original entity text if a mismatch occurs
                adjusted_text = entity["text"]

            adjusted_entities.append({
                "start": adjusted_start,
                "end": adjusted_end,
                "type": entity["type"],
                "text": adjusted_text
            })

        # Append cleaned text and adjusted entities to the result
        cleaned_data.append({
            "text": cleaned_text,
            "entities": adjusted_entities
        })

    return cleaned_data

In [9]:
def clean_and_map_offsets(text):
    """
    Cleans the text and creates a mapping of original to new offsets.

    Newlines, non-breaking spaces and backslashes are replaced by a plain
    space. Leading whitespace is removed, and any whitespace character that
    directly follows a space in the cleaned output is removed as well.

    :param text: Original text.
    :return: Tuple of the cleaned text and a dictionary mapping every original
        offset in ``0..len(text)`` (inclusive) to its offset in the cleaned
        text. A removed character maps to the position of the next surviving
        character, and ``len(text)`` maps to the length of the cleaned text,
        so exclusive end offsets can be looked up directly.
    """
    # The last entry is a single backslash; it must be written escaped.
    replaced_chars = ("\n", "\u00a0", "\\")
    offset_map = {}
    cleaned_chars = []

    for original_index, char in enumerate(text):
        if char in replaced_chars:  # Replace specific characters
            char = " "

        # len(cleaned_chars) is where this character lands if it is kept, and
        # where the next kept character lands if it is dropped.
        offset_map[original_index] = len(cleaned_chars)

        # Avoid leading whitespace and multiple consecutive spaces
        if char.isspace() and (not cleaned_chars or cleaned_chars[-1] == " "):
            continue

        cleaned_chars.append(char)

    # Exclusive end offsets may equal len(text).
    offset_map[len(text)] = len(cleaned_chars)

    return "".join(cleaned_chars), offset_map


In [10]:
input_file_path = '../Labeled_data/gold_labels/reconstructed_gold_labels.json'
output_file_path = '../Labeled_data/gold_labels/cleaned_gold_labels.json'

# Read the reconstructed gold labels from disk
with open(input_file_path, mode='r', encoding='utf-8') as source_handle:
    gold_labels = json.load(source_handle)

# Normalize the text of every record and realign its entity offsets
fixed_gold_labels = clean_text_and_adjust_entities(gold_labels)

# Persist the cleaned records as human-readable, non-ASCII-escaped JSON
with open(output_file_path, mode='w', encoding='utf-8') as target_handle:
    json.dump(
        fixed_gold_labels,
        target_handle,
        ensure_ascii=False,
        indent=4,
    )

print(f"Fixed gold labels saved to {output_file_path}")

Fixed gold labels saved to ../Labeled_data/gold_labels/cleaned_gold_labels.json
