In [1]:
import re
import json
from collections import defaultdict, Counter

# Custom Mapping

In [4]:
def conll_to_json(input_file, output_file, class_mapping_file, custom_mapping):
    """Convert a CoNLL-style file to JSON Lines and save the tag mapping.

    Each non-blank input line is split on whitespace; the first column is
    taken as the token and the fourth column as the NER tag. Blank lines and
    lines starting with ``-DOCSTART-`` end the current sentence.

    Args:
        input_file: Path of the UTF-8 CoNLL file to read.
        output_file: Path of the JSON Lines file to write. Each line is an
            object with the keys ``id`` (string sentence index), ``tokens``
            and ``ner_tags`` (integer ids).
        class_mapping_file: Path of the text file that receives the mapping
            rendered as a ``tag_mapping = {...}`` literal.
        custom_mapping: Dict mapping integer class id -> tag string.

    Returns:
        None. Results are written to the two output files and a summary is
        printed.

    Notes:
        * Tags missing from ``custom_mapping`` are replaced by the id that the
          mapping assigns to ``'O'`` (0 if the mapping has no ``'O'``) and are
          counted under ``'O'``. A warning is printed once per distinct tag.
        * Lines with fewer than four columns are skipped; the number of
          skipped lines is reported after reading.
    """
    # Invert the id -> tag mapping so tags can be looked up directly.
    tag_dict = {tag: idx for idx, tag in custom_mapping.items()}
    # Do not assume 'O' is id 0: take its id from the mapping when present.
    outside_id = tag_dict.get('O', 0)

    data = []
    tokens, ner_tags = [], []
    tag_counter = Counter()
    warned_tags = set()   # unknown tags already reported
    skipped_lines = 0     # lines with too few columns

    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()

            # Sentence boundary: flush the sentence collected so far, if any.
            if line == '' or line.startswith('-DOCSTART-'):
                if tokens:
                    data.append({"id": str(len(data)), "tokens": tokens, "ner_tags": ner_tags})
                    tokens, ner_tags = [], []
                continue

            parts = line.split()
            if len(parts) < 4:
                skipped_lines += 1
                continue

            token, ner_tag = parts[0], parts[3]
            tokens.append(token)
            if ner_tag in tag_dict:
                ner_tags.append(tag_dict[ner_tag])
                tag_counter[ner_tag] += 1
            else:
                # Warn only once per tag so large files do not flood the output.
                if ner_tag not in warned_tags:
                    warned_tags.add(ner_tag)
                    print(f"Warning: Tag '{ner_tag}' not found in mapping. Using {outside_id} (O) instead.")
                ner_tags.append(outside_id)
                tag_counter['O'] += 1

    # The file may not end with a blank line; keep the trailing sentence.
    if tokens:
        data.append({"id": str(len(data)), "tokens": tokens, "ner_tags": ner_tags})

    if skipped_lines:
        print(f"Warning: skipped {skipped_lines} line(s) with fewer than 4 columns.")

    # Write one compact JSON object per line.
    with open(output_file, 'w', encoding='utf-8') as f:
        for item in data:
            json.dump(item, f, ensure_ascii=False, separators=(',', ':'))
            f.write('\n')

    # The same rendered mapping lines are written to disk and echoed below.
    mapping_lines = [f"    {idx}: '{tag}'," for idx, tag in sorted(custom_mapping.items())]

    with open(class_mapping_file, 'w', encoding='utf-8') as f:
        f.write("tag_mapping = {\n")
        for mapping_line in mapping_lines:
            f.write(mapping_line + "\n")
        f.write("}\n")

    print(f"Processed {len(data)} sentences.")
    print(f"Found {len(custom_mapping)} unique NER tags: {', '.join(custom_mapping.values())}")
    print(f"Tag counts: {dict(tag_counter)}")
    print("Tag Mapping:")
    for mapping_line in mapping_lines:
        print(mapping_line)
    print(f"Data saved to {output_file}")
    print(f"Class mapping saved to {class_mapping_file}")

# Integer class id -> NER label. Ids are assigned by position in the list
# below, so 'O' is 0 and every B-/I- pair occupies two consecutive ids.
custom_map = dict(enumerate([
    'O',
    'B-PER', 'I-PER',
    'B-ORG', 'I-ORG',
    'B-LOC', 'I-LOC',
    'B-MISC', 'I-MISC',
]))

In [5]:
# Convert the combined dev split: writes the JSON Lines dataset and the
# tag-mapping text file using the id -> label mapping in custom_map.
conll_to_json(
    input_file=r'c:\Users\Sakib Ahmed\Desktop\dev_combined.conll',
    output_file='dataset_custom.json',
    class_mapping_file='tag_mapping_custom.txt',
    custom_mapping=custom_map,
)


Processed 3960 sentences.
Found 9 unique NER tags: O, B-PER, I-PER, B-ORG, I-ORG, B-LOC, I-LOC, B-MISC, I-MISC
Tag counts: {'O': 92746, 'B-ORG': 3216, 'B-LOC': 2299, 'B-MISC': 835, 'I-MISC': 1192, 'I-ORG': 2098, 'I-LOC': 878}
Tag Mapping:
    0: 'O',
    1: 'B-PER',
    2: 'I-PER',
    3: 'B-ORG',
    4: 'I-ORG',
    5: 'B-LOC',
    6: 'I-LOC',
    7: 'B-MISC',
    8: 'I-MISC',
Data saved to train - No Per.json
Class mapping saved to mapping_train.txt

