In [18]:
import json
import pandas as pd
from transformers import AutoTokenizer

# Load the gold-label annotations. The encoding is pinned to UTF-8 so the
# decoded text does not depend on the platform's locale default; a
# mis-decoded character would also shift the entity offsets used downstream.
with open('../labeled_data/gold_labels/reconstructed_gold_labels.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Each record must provide a 'text' and an 'entities' key (a missing key
# raises KeyError here rather than later). One DataFrame row per record.
texts = [item['text'] for item in data]
entities = [item['entities'] for item in data]
df = pd.DataFrame({'text': texts, 'entities': entities})

In [20]:
# Checkpoint whose tokenizer is used for every text in the frame.
NER_CHECKPOINT = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(NER_CHECKPOINT)


def _encode_with_offsets(raw_text):
    """Tokenize a single text and ask for the per-token offset mapping."""
    return tokenizer(raw_text, return_offsets_mapping=True, truncation=True, padding=True)


# Each text is encoded on its own; the resulting encoding object is stored
# as-is in the 'tokenized' column for the later alignment and export steps.
df['tokenized'] = df['text'].apply(_encode_with_offsets)

In [22]:
def align_labels_to_tokens(text, entities, tokenized):
    """Turn character-level entity spans into one BIO tag per token.

    Args:
        text: The source text. Unused by the alignment itself; kept so the
            call signature stays stable for existing callers.
        entities: Iterable of mappings with 'start', 'end' and 'type' keys.
            'start'/'end' are compared directly against the token offsets,
            so they must be in the same units as ``offset_mapping``.
        tokenized: Mapping exposing ``'offset_mapping'``: a sequence of
            ``(token_start, token_end)`` pairs, one per token, assumed to be
            in text order.

    Returns:
        A list of tags, the same length as ``offset_mapping``. A token lying
        entirely inside an entity span gets ``B-<type>`` if it is the first
        such token of that entity and ``I-<type>`` otherwise; every other
        token is ``"O"``. If spans overlap, the entity listed later wins.
    """
    offset_mapping = tokenized['offset_mapping']
    labels = ["O"] * len(offset_mapping)  # default: outside any entity

    for entity in entities:
        start, end, label_type = entity['start'], entity['end'], entity['type']
        # Becomes True once this entity's first token has been tagged, so the
        # B- tag does not depend on a token starting exactly at `start`
        # (the annotated span may begin on whitespace or mid-token).
        entity_opened = False

        for idx, (token_start, token_end) in enumerate(offset_mapping):
            if token_start is None or token_end is None:
                continue
            # Zero-width offsets carry no text. Fast Hugging Face tokenizers
            # report special tokens ([CLS], [SEP], padding) as (0, 0), which
            # would otherwise be tagged by any entity that starts at 0.
            if token_start == token_end:
                continue
            if token_start >= start and token_end <= end:
                prefix = "I" if entity_opened else "B"
                labels[idx] = f"{prefix}-{label_type}"
                entity_opened = True

    return labels

# Compute one label list per row by walking the three columns in lockstep.
# This avoids DataFrame.apply(axis=1), which builds a Series for every row and
# whose return type depends on the results and on the frame being non-empty.
# Wrapping the lists in an object Series keyed by df.index keeps row alignment
# and also works when the frame has no rows.
df['labels'] = pd.Series(
    [
        align_labels_to_tokens(row_text, row_entities, row_encoding)
        for row_text, row_entities, row_encoding in zip(df['text'], df['entities'], df['tokenized'])
    ],
    index=df.index,
    dtype=object,
)

In [23]:
# Build one record per row: the token strings recovered from the encoding's
# input ids, paired with the labels computed for that row.
output_data = [
    {
        'tokens': tokenizer.convert_ids_to_tokens(row_encoding['input_ids']),
        'labels': row_labels,
    }
    for row_encoding, row_labels in zip(df['tokenized'], df['labels'])
]

# Write all records to a single indented JSON file.
with open('tokenized_ner_data.json', 'w') as out_file:
    json.dump(output_data, out_file, indent=4)