In [1]:
import re
import json
from collections import defaultdict, Counter

# Custom Mapping

In [4]:
def conll_to_json(input_file, output_file, class_mapping_file, custom_mapping):
    """Convert a CoNLL-style token file to JSON Lines and save the tag mapping.

    Each non-blank input line is split on whitespace; the first column is the
    token and the fourth column is the NER tag (columns two and three are
    ignored). Blank lines and lines starting with ``-DOCSTART-`` end the
    current sentence. Sentences with no tokens are never emitted.

    Args:
        input_file: Path of the CoNLL file to read.
        output_file: Path of the JSON Lines file to write; one object per
            sentence with keys ``id`` (string counter starting at "0"),
            ``tokens`` (list of str) and ``ner_tags`` (list of ids).
        class_mapping_file: Path of a text file that receives the mapping
            rendered as a ``tag_mapping = {...}`` literal.
        custom_mapping: Dict mapping label id -> tag string.

    Returns:
        None. Results are written to the two output files and a summary is
        printed.

    Notes:
        * A tag missing from ``custom_mapping`` triggers a warning and is
          encoded with the id of ``'O'`` in the mapping (0 if the mapping has
          no ``'O'`` entry); it is counted under ``'O'``.
        * Lines with fewer than four columns cannot be parsed; they are
          skipped and reported in a single summary warning.
    """
    # Invert id -> tag into tag -> id for lookup while parsing.
    tag_dict = {tag: idx for idx, tag in custom_mapping.items()}
    # Fallback id for unknown tags: wherever 'O' actually lives in the mapping.
    outside_id = tag_dict.get('O', 0)

    data = []
    sentence_id = 0
    current_sentence = {"id": str(sentence_id), "tokens": [], "ner_tags": []}
    tag_counter = Counter()
    skipped_lines = 0

    # 'utf-8-sig' reads plain UTF-8 unchanged but drops a leading BOM, which
    # would otherwise stick to the first line and defeat the -DOCSTART- check.
    with open(input_file, 'r', encoding='utf-8-sig') as f:
        for line in f:
            line = line.strip()

            # Sentence boundary: flush the sentence collected so far, if any.
            if line == '' or line.startswith('-DOCSTART-'):
                if current_sentence["tokens"]:
                    data.append(current_sentence)
                    sentence_id += 1
                    current_sentence = {"id": str(sentence_id), "tokens": [], "ner_tags": []}
                continue

            parts = line.split()
            if len(parts) < 4:
                # Not enough columns to locate the tag; count it so the loss
                # is reported instead of happening silently.
                skipped_lines += 1
                continue

            token, ner_tag = parts[0], parts[3]
            current_sentence["tokens"].append(token)
            if ner_tag in tag_dict:
                current_sentence["ner_tags"].append(tag_dict[ner_tag])
                tag_counter[ner_tag] += 1
            else:
                print(f"Warning: Tag '{ner_tag}' not found in mapping. Using {outside_id} (O) instead.")
                current_sentence["ner_tags"].append(outside_id)
                tag_counter['O'] += 1

    # The file may end without a trailing blank line.
    if current_sentence["tokens"]:
        data.append(current_sentence)

    if skipped_lines:
        print(f"Warning: skipped {skipped_lines} line(s) with fewer than 4 columns.")

    # Write to JSON file (one JSON object per line)
    with open(output_file, 'w', encoding='utf-8') as f:
        for item in data:
            json.dump(item, f, ensure_ascii=False)
            f.write('\n')

    # Write class mapping to a separate file
    with open(class_mapping_file, 'w', encoding='utf-8') as f:
        f.write("tag_mapping = {\n")
        for idx, tag in sorted(custom_mapping.items()):
            f.write(f"    {idx}: '{tag}',\n")
        f.write("}\n")

    print(f"Processed {len(data)} sentences.")
    print(f"Found {len(custom_mapping)} unique NER tags: {', '.join(custom_mapping.values())}")
    print(f"Tag counts: {dict(tag_counter)}")
    print("Tag Mapping:")
    for idx, tag in sorted(custom_mapping.items()):
        print(f"    {idx}: '{tag}',")
    print(f"Data saved to {output_file}")
    print(f"Class mapping saved to {class_mapping_file}")

# Custom mapping: label id -> tag string.
# Id 0 is the 'O' tag. Every entity type listed below then contributes two
# consecutive ids, in list order: an odd id for its 'B-' tag followed by the
# next even id for its 'I-' tag (Address -> 1/2, Authority -> 3/4, ...).
custom_map = {
    0: 'O',
    **{
        2 * position + offset: f'{prefix}-{entity}'
        for position, entity in enumerate((
            'Address',
            'Authority',
            'Birth_Place',
            'Class',
            'Country',
            'DD',
            'DL_Class',
            'Document_Number',
            'Endorsement',
            'Eyes',
            'First_Name',
            'Hair',
            'Height',
            'Issuance_Number',
            'Last_Name',
            'License_Number',
            'License_Type',
            'Other',
            'Restrictions',
            'Sex',
            'State',
            'Weight',
        ))
        for offset, prefix in ((1, 'B'), (2, 'I'))
    },
}

In [5]:
# Usage: run the conversion with the custom mapping defined above.
# NOTE: the input path is an absolute, machine-specific location; point it at
# your own copy of the file before re-running this cell.
conll_to_json(
    input_file=r'c:\Users\Sakib Ahmed\Desktop\dev_combined.conll',
    output_file='dataset_custom.json',
    class_mapping_file='tag_mapping_custom.txt',
    custom_mapping=custom_map,
)

Processed 181 sentences.
Found 45 unique NER tags: O, B-Address, I-Address, B-Authority, I-Authority, B-Birth_Place, I-Birth_Place, B-Class, I-Class, B-Country, I-Country, B-DD, I-DD, B-DL_Class, I-DL_Class, B-Document_Number, I-Document_Number, B-Endorsement, I-Endorsement, B-Eyes, I-Eyes, B-First_Name, I-First_Name, B-Hair, I-Hair, B-Height, I-Height, B-Issuance_Number, I-Issuance_Number, B-Last_Name, I-Last_Name, B-License_Number, I-License_Number, B-License_Type, I-License_Type, B-Other, I-Other, B-Restrictions, I-Restrictions, B-Sex, I-Sex, B-State, I-State, B-Weight, I-Weight
Tag counts: {'O': 8375, 'B-State': 159, 'B-License_Number': 180, 'B-Endorsement': 104, 'B-Class': 113, 'B-Restrictions': 95, 'B-Sex': 132, 'B-Height': 131, 'I-Height': 277, 'B-Weight': 76, 'B-Eyes': 122, 'B-Last_Name': 184, 'B-First_Name': 184, 'B-Address': 208, 'I-Address': 1048, 'B-Document_Number': 58, 'B-Other': 64, 'B-Hair': 40, 'B-DD': 85, 'I-DD': 79, 'B-Country': 110, 'I-License_Number': 212, 'I-First