# In [1]:
import pandas as pd
import json
from tqdm import tqdm
import re

# In [3]:
# Vocabulary for the binary tagging scheme: tag string -> integer id.
# 'ignore' is given the id -100.
label2id_binary = {'O': 0, 'I-hallucination': 1, 'ignore': -100}

# Reverse lookup (integer id -> tag string), in the same insertion order.
id2label_binary = dict(zip(label2id_binary.values(), label2id_binary.keys()))

# Show both mappings, one per line.
print(label2id_binary, id2label_binary, sep='\n')

def tags_to_ids(tags, label2id):
    """Translate a sequence of tag strings into their ids.

    Args:
        tags: Iterable of tags to look up.
        label2id: Mapping from tag to id.

    Returns:
        A list with one id per tag, in the same order.

    Raises:
        KeyError: If a tag is missing from ``label2id``.
    """
    ids = []
    for tag in tags:
        ids.append(label2id[tag])
    return ids

def convert_to_binary_labels(entry):
    """Return a 0/1 label for every tag in ``entry['tags']``.

    A tag equal to 'O' becomes 0; every other tag becomes 1. Note that
    this includes 'ignore'-style tags, which are therefore labelled 1 here.

    Args:
        entry: Mapping with a 'tags' sequence.

    Returns:
        A list of ints (0 or 1), one per tag, in order.
    """
    flags = []
    for tag in entry['tags']:
        if tag != 'O':
            flags.append(1)
        else:
            flags.append(0)
    return flags

def convert_to_binary_tags(entry):
    """Collapse every tag in ``entry['tags']`` to 'O' or 'hallucination'.

    A tag equal to 'O' is kept as 'O'; any other tag (whatever its prefix
    or type name) is replaced by 'hallucination'.

    Args:
        entry: Mapping with a 'tags' sequence.

    Returns:
        A list of strings, one per tag, in order.
    """
    collapsed = []
    for tag in entry['tags']:
        if tag != 'O':
            collapsed.append('hallucination')
        else:
            collapsed.append('O')
    return collapsed

def convert_to_binary_bio_tags(entry):
    """Map each tag in ``entry['tags']`` onto the binary tag vocabulary.

    The rules are applied in this order:
      * tags starting with 'ignore' become 'ignore';
      * tags starting with 'I-' become 'I-hallucination';
      * everything else becomes 'O' (this includes 'O' itself and tags
        with any other prefix, e.g. 'B-').

    Args:
        entry: Mapping with a 'tags' sequence of strings.

    Returns:
        A list of strings, one per tag, in order.
    """
    def _collapse(tag):
        # The 'ignore' check comes first, as in the rule order above.
        if tag.startswith('ignore'):
            return 'ignore'
        return 'I-hallucination' if tag.startswith('I-') else 'O'

    return [_collapse(tag) for tag in entry['tags']]

def add_binary_labels_and_tags(input_file, output_file, mappings_file='label_mappings.json'):
    """Add binary hallucination labels to a tagged JSON dataset and save it.

    The input JSON is loaded and iterated; every entry must provide a
    'tags' sequence of strings. Four fields are added to each entry:

      * 'labels_binary'     - from ``convert_to_binary_labels``
      * 'tags_binary'       - from ``convert_to_binary_tags``
      * 'bio_tags_binary'   - from ``convert_to_binary_bio_tags``
      * 'bio_labels_binary' - 'bio_tags_binary' mapped through
        ``label2id_binary``

    The augmented data is written to ``output_file`` and the
    label2id/id2label dictionaries are written to ``mappings_file``.

    Args:
        input_file: Path of the JSON dataset to read.
        output_file: Path the augmented JSON dataset is written to
            (overwritten if it exists).
        mappings_file: Path the label mappings are written to
            (overwritten if it exists). Defaults to 'label_mappings.json'
            in the current working directory.

    Raises:
        KeyError: If an entry has no 'tags' key.
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If ``input_file`` is not valid JSON.
    """
    # Load the processed dataset
    print(f'Reading {input_file}')

    # Open with an explicit UTF-8 encoding so decoding does not depend on
    # the platform's locale default.
    with open(input_file, 'r', encoding='utf-8') as infile:
        data = json.load(infile)

    # Add binary labels and tags to each entry
    for entry in tqdm(data, desc='Processing'):
        entry['labels_binary'] = convert_to_binary_labels(entry)
        entry['tags_binary'] = convert_to_binary_tags(entry)
        entry['bio_tags_binary'] = convert_to_binary_bio_tags(entry)
        entry['bio_labels_binary'] = tags_to_ids(entry['bio_tags_binary'], label2id_binary)

    # Save the updated dataset to a new JSON file
    print(f'Writing the data in  {output_file}')

    with open(output_file, 'w', encoding='utf-8') as outfile:
        json.dump(data, outfile, indent=4)

    # Save label2id and id2label dictionaries.
    # NOTE: json.dump turns the integer keys of id2label into strings.
    mappings = {
        'label2id': label2id_binary,
        'id2label': id2label_binary
    }
    with open(mappings_file, 'w', encoding='utf-8') as mapping_file:
        json.dump(mappings, mapping_file, indent=4)

    print("Binary labels, tags, and BIO tags added and saved successfully.")
    print(f"Label mappings saved to {mappings_file}.")


# Produce the binary-label version of both dataset splits. Inputs are read
# from `file_directory`; outputs are written without a directory prefix.
file_directory = 'data_v1'

for split in ('train', 'test'):
    input_file = f'{file_directory}/{split}_all_final.json'
    output_file = f'{split}_all_binary.json'
    add_binary_labels_and_tags(input_file, output_file)
