In [None]:
import re
import json

def preprocess_text(text, entities):
    """Normalise *text* and keep entity spans aligned with the result.

    The following rewrites are applied in order:

    1. ``#`` and round/square brackets are removed.
    2. Standalone runs of five or more digits are split into hyphen-separated
       groups of four (``12345678`` -> ``1234-5678``).
    3. ``@`` becomes `` at ``.
    4. A space is inserted between an ASCII letter and an adjacent digit
       (both orders).
    5. The standalone words ``Im``/``Ive``/``Id``/``Ill`` receive their
       apostrophe.
    6. ``I_C`` and ``ic`` (as whole words) become ``I C``.
    7. ``ppb`` becomes ``,``.

    Args:
        text: The string to normalise.
        entities: Iterable of ``(start, end, label)`` triples indexing into
            *text*.  Negative indices are interpreted relative to the end of
            *text*, like Python slice indices.

    Returns:
        ``(new_text, new_entities)`` where ``new_entities`` is a fresh list of
        ``[start, end, label]`` lists sorted by start index, with offsets
        adjusted for every rewrite above.  The input ``entities`` is not
        modified.
    """
    text_length = len(text)
    # Resolve negative indices, then order by start (stable for equal starts).
    spans = sorted(
        (
            [
                start if start >= 0 else text_length + start,
                end if end >= 0 else text_length + end,
                label,
            ]
            for start, end, label in entities
        ),
        key=lambda span: span[0],
    )

    text = _substitute(r'[#\(\)\[\]]', '', text, spans)
    # This pass changes the text length, so it must go through _substitute
    # like every other rewrite; otherwise later entity offsets go stale.
    text = _substitute(r'\b\d{5,}\b', _group_digits, text, spans)
    text = _substitute(r'@', ' at ', text, spans)
    # NOTE: an earlier revision also ran a per-digit "'-'.join" pass over
    # r'\d'.  Joining a single character yields that same character, so the
    # pass never changed the text and is intentionally not reproduced here.
    text = _substitute(r'([A-Za-z])(\d)', r'\1 \2', text, spans)
    text = _substitute(r'(\d)([A-Za-z])', r'\1 \2', text, spans)

    abbreviations = {
        'Im': "I'm",
        'Ive': "I've",
        'Id': "I'd",
        'Ill': "I'll",
    }
    for abbr, full_form in abbreviations.items():
        # Word boundaries keep words such as "Image" or "Idea" intact.
        text = _substitute(r'\b' + re.escape(abbr) + r'\b', full_form, text, spans)

    text = _substitute(r'\bI_C\b', 'I C', text, spans)
    text = _substitute(r'\bic\b', 'I C', text, spans)
    # A separate "(ppb)" rule is unnecessary: the parentheses were already
    # stripped by the first pass, leaving a bare "ppb".
    text = _substitute(r'ppb', ',', text, spans)

    return text, spans


def _group_digits(match):
    """Return the matched digit run split into '-'-separated groups of four."""
    number = match.group()
    return '-'.join(number[i:i + 4] for i in range(0, len(number), 4))


def _minimal_edit(start, old, new):
    """Describe replacing *old* (located at *start*) by *new* as a minimal edit.

    Characters shared at the beginning and end of both strings are left out,
    so e.g. ``"a1" -> "a 1"`` is reported as a one-character insertion between
    the letter and the digit.  Returns ``(edit_start, edit_end, new_len)``
    where ``[edit_start, edit_end)`` is the replaced range in the old text and
    ``new_len`` is the length of what replaces it.
    """
    limit = min(len(old), len(new))
    prefix = 0
    while prefix < limit and old[prefix] == new[prefix]:
        prefix += 1
    suffix = 0
    # The suffix may not overlap the prefix that was already consumed.
    while suffix < limit - prefix and old[-1 - suffix] == new[-1 - suffix]:
        suffix += 1
    return start + prefix, start + len(old) - suffix, len(new) - prefix - suffix


def _remap_boundary(pos, edits, is_end):
    """Translate an entity boundary from old-text to new-text coordinates.

    *edits* are ``(edit_start, edit_end, new_len)`` triples in ascending,
    non-overlapping order.  Text inserted exactly at a boundary stays outside
    the entity: a start boundary moves past it, an end boundary stays before
    it.  A boundary that falls strictly inside a replaced range snaps to the
    start (for starts) or the end (for ends) of the replacement.
    """
    shift = 0
    for edit_start, edit_end, new_len in edits:
        if is_end:
            if pos <= edit_start:
                break
            if pos < edit_end:
                return edit_start + shift + new_len
        elif pos < edit_end:
            if pos <= edit_start:
                break
            return edit_start + shift
        shift += new_len - (edit_end - edit_start)
    return pos + shift


def _substitute(pattern, repl, text, spans):
    """Apply one regex rewrite to *text* and update *spans* in place.

    *repl* is either a replacement template (as accepted by ``re.sub``) or a
    callable taking the match object.  Returns the rewritten text.
    """
    edits = []
    pieces = []
    cursor = 0
    for match in re.finditer(pattern, text):
        old = match.group(0)
        new = repl(match) if callable(repl) else match.expand(repl)
        pieces.append(text[cursor:match.start()])
        pieces.append(new)
        cursor = match.end()
        if new != old:
            edits.append(_minimal_edit(match.start(), old, new))
    if not edits:
        return text
    pieces.append(text[cursor:])
    for span in spans:
        span[0] = _remap_boundary(span[0], edits, False)
        span[1] = _remap_boundary(span[1], edits, True)
    return ''.join(pieces)
def preprocess_jsonl(input_path, output_path):
    """Run ``preprocess_text`` over every record of a JSON Lines file.

    Each non-blank line of *input_path* must be a JSON object with a
    ``"text"`` and an ``"entities"`` key.  One JSON object per line, holding
    only the rewritten ``"text"`` and ``"entities"``, is written to
    *output_path*; any other keys of the input record are not carried over.

    Args:
        input_path: Path of the JSONL file to read.
        output_path: Path of the JSONL file to write (overwritten).

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``"text"`` or ``"entities"``.
    """
    preprocessed_data = []

    # Explicit UTF-8 so decoding does not depend on the platform locale.
    with open(input_path, 'r', encoding='utf-8') as infile:
        for line in infile:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            item = json.loads(line)
            new_text, new_entities = preprocess_text(item["text"], item["entities"])
            preprocessed_data.append({"text": new_text, "entities": new_entities})

    # The output is opened only after every record has been processed, so a
    # failure above never truncates an existing output file.
    with open(output_path, 'w', encoding='utf-8') as outfile:
        outfile.writelines(json.dumps(item) + '\n' for item in preprocessed_data)
# Preprocess the merged transcript records and store the result under the
# same file name in the "preprocessed" directory.
input_path = 'transcript_output/merged_output/merged_data_true.jsonl'
output_path = 'transcript_output/preprocessed/merged_data_true.jsonl'
preprocess_jsonl(input_path=input_path, output_path=output_path)