## Converting annotated documents exported from the INCEpTION tool into BIO-tagged NER labels and mask IDs

In [None]:
import json
import os

# Define the label combination mapping
label_combination_mapping = {
    'Address': ['City', 'Place', 'Territory', 'Postcode', 'Country', 'Other:address', 'Unresolved address'],
    'Amount': ['Value', 'Unit'],
    'Other:ID': ['Document ID'],
    'Person': ['Family name', 'Given name', 'Initial name', 'Role', 'Profession', 'Unresolved name', 'Other name', 'Title', 'Given name-male', 'Given name-female'],
    'Date': ['Day', 'Month', 'Year', 'Standard abbreviation', 'Day of Week', 'Other:date', 'Unresolved:date', 'Calander event'],
    'Relation': ['Family relation', 'Professional relation', 'Social relation'],
    # Add other combinations as needed
}

def map_label(label):
    for combined_label, sub_labels in label_combination_mapping.items():
        if label in sub_labels:
            return combined_label
    return label  # Return the original label if no combination is found

def extract_tokens_tags_and_custom_ids(data, annotator):
    document_text = next((item for item in data['%FEATURE_STRUCTURES'] if item['%TYPE'] == 'uima.cas.Sofa'), {}).get('sofaString', '')

    tokens = []
    sentences = []
    named_entities = []
    custom_masking = []

    # Extract tokens, sentences, and custom masking information
    for item in data['%FEATURE_STRUCTURES']:
        if item['%TYPE'] == 'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token':
            tokens.append({
                'begin': item['begin'],
                'end': item['end'],
                'text': document_text[item['begin']:item['end']],
                'ner_tag': 'O',  # Default NER tag
                'mask_id': 'O'  # Default for tokens not matching any mask
            })
        elif item['%TYPE'] == 'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence':
            sentences.append({
                'begin': item['begin'],
                'end': item['end'],
                'tokens': []
            })
        elif item['%TYPE'] == 'de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity':
            value = item.get('value')
            if value:
                parts = value.split(':')
                if len(parts) > 2:
                    english_tag = ':'.join(parts[:2]).strip()
                else:
                    english_tag = parts[0].strip()
                combined_tag = map_label(english_tag)
                named_entities.append({
                    'begin': item['begin'],
                    'end': item['end'],
                    'value': combined_tag
                })
        elif item['%TYPE'] == 'webanno.custom.Masking':
            identifiers = item.get('identifiers')
            if identifiers:
                custom_masking.append({
                    'begin': item['begin'],
                    'end': item['end'],
                    'identifiers': identifiers
                })

    # Sort named entities by begin position and then by end position descending
    named_entities.sort(key=lambda x: (x['begin'], -x['end']))

    # Assign tokens to sentences
    for token in tokens:
        for sentence in sentences:
            if sentence['begin'] <= token['begin'] < sentence['end']:
                sentence['tokens'].append(token)
                break

    # Assign BIO tags to tokens for named entities
    for sentence in sentences:
        for token in sentence['tokens']:
            for ne in named_entities:
                if ne['begin'] <= token['begin'] < ne['end']:
                    prefix = "B-" if token['begin'] == ne['begin'] else "I-"
                    token['ner_tag'] = f"{prefix}{ne['value']}"
                    break

    # Assign custom identifiers to tokens using BIO tag format
    for token in tokens:
        matched = False
        for mask in custom_masking:
            if mask['begin'] <= token['begin'] < mask['end']:
                prefix = "B-" if token['begin'] == mask['begin'] else "I-"
                if mask['identifiers'] == 'Direct_id' or mask['identifiers'] == 'Indirect_id':
                    token['mask_id'] = f"{prefix}mask"
                else:
                    token['mask_id'] = f"{prefix}nomask"
                matched = True
                break
        if not matched:
            token['mask_id'] = 'O'  # Default to 'O' if no masks match

    # Debug: Print token information to check BIO tag assignments
    # for token in tokens:
    #     print(f"Token: {token['text']}, NER Tag: {token['ner_tag']}, Mask ID: {token['mask_id']}")

    # Prepare final output structure
    output_data = []
    for sentence in sentences:
        words = [token['text'] for token in sentence['tokens']]
        ner_tags = [token['ner_tag'] for token in sentence['tokens']]
        mask_ids = [token['mask_id'] for token in sentence['tokens']]
        output_data.append({'words': words, 'ner': ner_tags, 'mask_ids': mask_ids, 'annotator': annotator})

    return output_data

def process_files_in_directory(input_directory, output_directory):
    # Ensure output directory exists
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    # Walk through all files in the input directory
    for root, dirs, files in os.walk(input_directory):
        for file in files:
            annotator = 'unknown'
            if 'gold_standard' in root and file.endswith('.json'):
                annotator = 'gold_standard'
            elif 'annotator_1' in root and file.endswith('harshal.vilas.tarmale.json'):
                annotator = '1'
            elif 'annotator_2' in root and file.endswith('harshith.srinivas.json'):
                annotator = '2'
            else:
                continue  # Skip files that do not match the criteria

            input_path = os.path.join(root, file)
            with open(input_path, 'r', encoding='utf-8') as infile:
                data = json.load(infile)
                output_data = extract_tokens_tags_and_custom_ids(data, annotator)

            # Prepare output path keeping the relative path
            relative_path = os.path.relpath(root, input_directory)
            output_file_directory = os.path.join(output_directory, relative_path)
            if not os.path.exists(output_file_directory):
                os.makedirs(output_file_directory)

            output_path = os.path.join(output_file_directory, file.replace('.json', '_processed.json'))
            with open(output_path, 'w', encoding='utf-8') as outfile:
                json.dump(output_data, outfile, ensure_ascii=False, indent=4)

            print(f"Processed {input_path} and saved to {output_path}")

# Define your input and output directories
input_dir = '/content/drive/MyDrive/NLPMasterthesis/Tesseract-OCR/Documents/final_documents_from_inception'
output_dir = '/content/drive/MyDrive/NLPMasterthesis/Tesseract-OCR/Documents/final_documents_clubbed_with_gold_std_ner_and_mask_ids_in_BIO'

# Process all JSON files in the nested directory
process_files_in_directory(input_dir, output_dir)


Processed /content/drive/MyDrive/NLPMasterthesis/Tesseract-OCR/Documents/final_documents_from_inception/gold_standard/DOC_1.txt/CURATION_USER.json and saved to /content/drive/MyDrive/NLPMasterthesis/Tesseract-OCR/Documents/final_documents_clubbed_with_gold_std_ner_and_mask_ids_in_BIO/gold_standard/DOC_1.txt/CURATION_USER_processed.json
Processed /content/drive/MyDrive/NLPMasterthesis/Tesseract-OCR/Documents/final_documents_from_inception/gold_standard/DOC_2.txt/CURATION_USER.json and saved to /content/drive/MyDrive/NLPMasterthesis/Tesseract-OCR/Documents/final_documents_clubbed_with_gold_std_ner_and_mask_ids_in_BIO/gold_standard/DOC_2.txt/CURATION_USER_processed.json
Processed /content/drive/MyDrive/NLPMasterthesis/Tesseract-OCR/Documents/final_documents_from_inception/gold_standard/DOC_3.txt/CURATION_USER.json and saved to /content/drive/MyDrive/NLPMasterthesis/Tesseract-OCR/Documents/final_documents_clubbed_with_gold_std_ner_and_mask_ids_in_BIO/gold_standard/DOC_3.txt/CURATION_USER_p