# In [1]:
import json
import os

from sklearn.model_selection import train_test_split

# Read the annotated records from annotated_output.json
with open('../veridion-5/data/annotated_output.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Keep only the records whose text_content is present and non-empty
data_with_text_content = list(filter(lambda record: record.get('text_content'), data))

# Swap each record's raw text_content for its whitespace-split tokens, stored under 'words'
for entry in data_with_text_content:
    entry['words'] = entry.pop('text_content').split()

# Hold out 10% of the records for testing; the fixed seed keeps the split reproducible
train_data, test_data = train_test_split(
    data_with_text_content,
    test_size=0.1,
    random_state=42,
)

def save_to_json(data, filename):
    """Write NER examples to ``filename`` as a pretty-printed JSON array.

    Each entry of ``data`` is reduced to three keys: ``url`` and ``words`` are
    copied as-is, and the entry's ``tags`` value is stored under ``ner_tags``.
    Any other keys on the entry are dropped.

    Args:
        data: Iterable of mappings, each providing ``url``, ``words`` and
            ``tags``.
        filename: Destination path. Missing parent directories are created.

    Raises:
        KeyError: If an entry lacks ``url``, ``words`` or ``tags``.
    """
    # Build the whole payload first so a malformed entry fails before any
    # directory or file is created.
    formatted_data = [
        {
            "url": entry["url"],
            "words": entry["words"],
            "ner_tags": entry["tags"],
        }
        for entry in data
    ]

    # Create the output directory up front so a missing data/ folder does not
    # make open() fail with FileNotFoundError.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(formatted_data, f, ensure_ascii=False, indent=2)

# Write each split to its own JSON file
for split_records, output_path in (
    (train_data, 'data/train_data.json'),
    (test_data, 'data/test_data.json'),
):
    save_to_json(split_records, output_path)

