In [3]:
import json
import re
from collections import defaultdict

def parse_txt_to_json(file_path):
    """Convert a PubTator-style annotation file into a JSON file of documents.

    Each input line is classified as one of the following (``<TAB>`` is a
    literal tab character):

    * ``<docid>|t|<title>`` -- document title
    * ``<docid>|a|<abstract>`` -- document abstract
    * ``<docid><TAB><start><TAB><end><TAB><mention><TAB><type><TAB><id>`` --
      entity mention (any further tab-separated columns are ignored)
    * ``<docid><TAB>CID<TAB><subj_id><TAB><obj_id>`` -- relation

    Lines that match none of these shapes, including blank lines, are skipped.

    The result is a JSON list with one object per document, in order of first
    appearance, holding the keys ``docid``, ``title``, ``abstract``,
    ``entity`` and ``relation``.  It is written next to the input: a trailing
    ``.txt`` is replaced by ``_converted.json``; if the path does not end in
    ``.txt`` the suffix is appended instead, so the input is never overwritten.

    Args:
        file_path: Path of the annotation file to read (UTF-8 text).

    Returns:
        None. The JSON file is written to disk and its path is printed.
    """
    data = defaultdict(lambda: {
        "title": "",
        "abstract": "",
        "entity": [],
        "relation": []
    })

    # Anchored at the start of the line (and forbidding tabs in the docid) so a
    # "|t|" or "|a|" that merely occurs inside the text or inside an entity
    # mention cannot misclassify the line.
    header_re = re.compile(r"([^|\t]+)\|([ta])\|(.*)")

    with open(file_path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                continue

            header = header_re.match(line)
            if header:
                docid, marker, text = header.groups()
                data[docid]["title" if marker == "t" else "abstract"] = text
                continue

            # Annotation rows are tab-separated. Counting whitespace-separated
            # tokens instead would drop every mention that contains a space.
            fields = line.split("\t")

            if (len(fields) >= 6
                    and fields[1].isdecimal() and fields[2].isdecimal()):
                docid, start, end, mention, entity_type, entity_id = fields[:6]
                data[docid]["entity"].append({
                    "start": int(start),
                    "end": int(end),
                    "mention": mention,
                    "type": entity_type,
                    "id": entity_id
                })
            elif len(fields) == 4 and fields[1] == "CID":
                docid, _, subj_id, obj_id = fields
                data[docid]["relation"].append({
                    "type": "chem_disease:related",
                    "subj": subj_id,
                    "obj": obj_id
                })

    # Convert defaultdict to list of dicts for JSON output
    output = [
        {
            "docid": docid,
            "title": details["title"],
            "abstract": details["abstract"],
            "entity": details["entity"],
            "relation": details["relation"],
        }
        for docid, details in data.items()
    ]

    # Only a trailing ".txt" is swapped out. A blanket str.replace would also
    # rewrite ".txt" inside directory names and, when the path has no ".txt" at
    # all, would make the output path equal to the input path.
    path_text = str(file_path)
    suffix = ".txt"
    stem = path_text[:-len(suffix)] if path_text.endswith(suffix) else path_text
    output_path = stem + "_converted.json"

    with open(output_path, 'w', encoding='utf-8') as json_file:
        json.dump(output, json_file, indent=4)

    print(f"Data successfully converted and saved to {output_path}")

# Input annotation file for this run; the converter derives the output
# location from this path and reports it when it finishes.
file_path = "/Users/kavithakamarthy/Downloads/SSR-PU/dataset/CDR_DevelopmentSet.PubTator.txt"
parse_txt_to_json(file_path=file_path)


Data successfully converted and saved to /Users/kavithakamarthy/Downloads/SSR-PU/dataset/CDR_DevelopmentSet.PubTator_converted.json
