In [2]:
# Load the raw ICD-10-CM tabular text. The context manager guarantees the
# file handle is closed as soon as the contents have been read.
with open('/data/icd_data/icd10cm_tabular_2023.txt', 'r') as icd_file:
    icd_text = icd_file.read()

In [3]:
import re

def get_disease_types(icd_text):
    """
    Collects the dotted disease codes and their names from ICD text.

    A line contributes an entry only when it begins (at column 0) with a
    token of the form ``<letters/digits>.<letters/digits>``; the rest of
    the line, stripped of surrounding whitespace, is taken as the name.

    Args:
        icd_text (str): The input ICD text, one entry per line.

    Returns:
        tuple: ``(disease_codes, disease_names)``, two parallel lists.
            disease_codes (list): The matched codes, in file order.
            disease_names (list): The text following each matched code.
    """
    code_pattern = re.compile(r'[A-Z0-9]+\.[A-Z0-9]+')

    disease_codes = []
    disease_names = []

    for row in icd_text.split('\n'):
        hit = code_pattern.match(row)
        if hit is None:
            # Not a dotted-code line (category header, note, blank, ...).
            continue
        disease_codes.append(hit.group(0))
        disease_names.append(row[hit.end():].strip())

    return disease_codes, disease_names

In [4]:
# Parse the loaded text into parallel lists of dotted codes and their names.
disease_codes, disease_names = get_disease_types(icd_text)

In [5]:
# Show the extracted codes first, then the matching names.
for extracted in (disease_codes, disease_names):
    print(extracted)

['A00.0', 'A00.1', 'A00.9', 'A01.0', 'A01.00', 'A01.01', 'A01.02', 'A01.03', 'A01.04', 'A01.05', 'A01.09', 'A01.1', 'A01.2', 'A01.3', 'A01.4', 'A02.0', 'A02.1', 'A02.2', 'A02.20', 'A02.21', 'A02.22', 'A02.23', 'A02.24', 'A02.25', 'A02.29', 'A02.8', 'A02.9', 'A03.0', 'A03.1', 'A03.2', 'A03.3', 'A03.9', 'A04.0', 'A04.1', 'A04.2', 'A04.3', 'A04.4', 'A04.5', 'A04.6', 'A04.7', 'A04.71', 'A04.72', 'A04.8', 'A04.9', 'A05.0', 'A05.1', 'A05.2', 'A05.3', 'A05.4', 'A05.8', 'A05.9', 'A06.0', 'A06.1', 'A06.2', 'A06.3', 'A06.4', 'A06.5', 'A06.6', 'A06.7', 'A06.8', 'A06.81', 'A06.82', 'A06.89', 'A06.9', 'A07.0', 'A07.1', 'A07.2', 'A07.3', 'A07.4', 'A07.8', 'A07.9', 'A08.0', 'A08.1', 'A08.11', 'A08.19', 'A08.2', 'A08.3', 'A08.31', 'A08.32', 'A08.39', 'A08.4', 'A08.8', 'A15.0', 'A15.4', 'A15.5', 'A15.6', 'A15.7', 'A15.8', 'A15.9', 'A17.0', 'A17.1', 'A17.8', 'A17.82', 'A17.83', 'A17.89', 'A17.9', 'A18.0', 'A18.01', 'A18.02', 'A18.03', 'A18.09', 'A18.1', 'A18.10', 'A18.11', 'A18.12', 'A18.13', 'A18.14', 

In [6]:
def get_parent_codes(disease_codes):
    """
    Extracts unique parent codes from a list of disease codes.

    The parent code is the portion of a disease code before its first
    ".". A code that contains no "." is treated as its own parent.

    Args:
        disease_codes (list): List of disease codes.

    Returns:
        list: Unique parent codes, in order of first appearance.
    """
    # dict.fromkeys de-duplicates in a single pass while keeping insertion
    # order, avoiding a linear membership scan of a list for every code.
    return list(dict.fromkeys(icd.split(".")[0] for icd in disease_codes))

In [7]:
# Derive the parent codes from the dotted codes before displaying them;
# no visible cell assigns `parent_codes` otherwise.
parent_codes = get_parent_codes(disease_codes)
print(parent_codes)

['A00', 'A01', 'A02', 'A03', 'A04', 'A05', 'A06', 'A07', 'A08', 'A15', 'A17', 'A18', 'A19', 'A20', 'A21', 'A22', 'A23', 'A24', 'A25', 'A26', 'A27', 'A28', 'A30', 'A31', 'A32', 'A36', 'A37', 'A38', 'A39', 'A40', 'A41', 'A42', 'A43', 'A44', 'A48', 'A49', 'A50', 'A51', 'A52', 'A53', 'A54', 'A56', 'A59', 'A60', 'A63', 'A66', 'A67', 'A68', 'A69', 'A71', 'A74', 'A75', 'A77', 'A79', 'A80', 'A81', 'A82', 'A83', 'A84', 'A85', 'A87', 'A88', 'A92', 'A93', 'A95', 'A96', 'A98', 'B00', 'B01', 'B02', 'B05', 'B06', 'B07', 'B08', 'B10', 'B15', 'B16', 'B17', 'B18', 'B19', 'B25', 'B26', 'B27', 'B30', 'B33', 'B34', 'B35', 'B36', 'B37', 'B38', 'B39', 'B40', 'B41', 'B42', 'B43', 'B44', 'B45', 'B46', 'B47', 'B48', 'B50', 'B51', 'B52', 'B53', 'B55', 'B56', 'B57', 'B58', 'B60', 'B65', 'B66', 'B67', 'B68', 'B69', 'B70', 'B71', 'B73', 'B74', 'B76', 'B77', 'B78', 'B81', 'B82', 'B83', 'B85', 'B87', 'B88', 'B90', 'B94', 'B95', 'B96', 'B97', 'B99', 'C00', 'C02', 'C03', 'C04', 'C05', 'C06', 'C08', 'C09', 'C10', 'C11'

In [9]:
def extract_parent_diseases(icd_text, parent_codes):
    """
    Extracts parent diseases from a given ICD (International Classification of Diseases) text based on specified parent codes.

    A line is selected when its first whitespace-separated token is one of
    ``parent_codes``. The disease name is everything after that token,
    stripped of surrounding whitespace (an empty string when the line holds
    only the code). Blank lines are ignored.

    Args:
        icd_text (str): The input ICD text containing disease codes and names.
        parent_codes (list): List of parent codes to filter the diseases.

    Returns:
        list: List of extracted parent diseases, in the order their lines
        appear in ``icd_text``.
    """
    # A set gives constant-time membership tests for every line of the file.
    wanted_codes = set(parent_codes)
    parent_disease = []

    for line in icd_text.split('\n'):
        # Split off only the leading token. Splitting the line on the code
        # itself would cut the name short if the code text recurs in it.
        tokens = line.split(maxsplit=1)
        if not tokens:
            continue

        if tokens[0] in wanted_codes:
            remainder = tokens[1] if len(tokens) > 1 else ''
            parent_disease.append(remainder.strip())

    return parent_disease

In [10]:
# Look up the name of each parent code before displaying them; no visible
# cell assigns `parent_disease` otherwise.
parent_disease = extract_parent_diseases(icd_text, parent_codes)
print(parent_disease)

['Cholera', 'Typhoid and paratyphoid fevers', 'Other salmonella infections', 'Shigellosis', 'Other bacterial intestinal infections', 'Other bacterial foodborne intoxications, not elsewhere classified', 'Amebiasis', 'Other protozoal intestinal diseases', 'Viral and other specified intestinal infections', 'Respiratory tuberculosis', 'Tuberculosis of nervous system', 'Tuberculosis of other organs', 'Miliary tuberculosis', 'Plague', 'Tularemia', 'Anthrax', 'Brucellosis', 'Glanders and melioidosis', 'Rat-bite fevers', 'Erysipeloid', 'Leptospirosis', 'Other zoonotic bacterial diseases, not elsewhere classified', "Leprosy [Hansen's disease] Includes: infection due to Mycobacterium leprae", 'Infection due to other mycobacteria', 'Listeriosis', 'Diphtheria', 'Whooping cough', 'Scarlet fever', 'Meningococcal infection', 'Streptococcal sepsis', 'Other sepsis', 'Actinomycosis', 'Nocardiosis', 'Bartonellosis', 'Other bacterial diseases, not elsewhere classified', 'Bacterial infection of unspecified

In [11]:

def get_chapter_blocks(icd_text):
    """
    Reads the block listing that follows the first
    "This chapter contains the following blocks:" line of the ICD text.

    The listing runs from the line after that header up to (not including)
    the next blank line. Only the first such header is used. Each listed
    line that starts with a code or code range (e.g. ``A00-A09`` or
    ``B10``) yields one code and one name.

    Args:
        icd_text (str): The input ICD text containing chapter information.

    Returns:
        tuple: ``(chapter_codes, chapter_names)``, two parallel lists.
            chapter_codes (list): The code or code range of each block.
            chapter_names (list): The text following each code.
    """
    header = "This chapter contains the following blocks:"
    all_lines = icd_text.split('\n')

    listing = []
    for position, raw in enumerate(all_lines):
        if raw.strip() != header:
            continue
        for following in all_lines[position + 1:]:
            cleaned = following.strip()
            if cleaned == '':
                break
            listing.append(cleaned)
        # Stop after the first header; later listings are not read.
        break

    block_pattern = re.compile(r'([A-Z]+\d+(-[A-Z]+\d+)?)')
    chapter_codes = []
    chapter_names = []

    for entry in listing:
        found = block_pattern.match(entry)
        if found is None:
            continue
        chapter_codes.append(found.group(1))
        chapter_names.append(entry[found.end():].strip())

    return chapter_codes, chapter_names

In [12]:
# Extract the block codes and names from the first block listing in the text.
chapter_codes, chapter_names = get_chapter_blocks(icd_text)


In [13]:
# Label every block as "<name> (<code range>)". If the two lists are not the
# same length, nothing is labelled and the result is an empty list.
if len(chapter_codes) == len(chapter_names):
    chapter_blocks = [
        block_name + " (" + block_code + ")"
        for block_name, block_code in zip(chapter_names, chapter_codes)
    ]
else:
    chapter_blocks = []
print(chapter_blocks)

['Intestinal infectious diseases (A00-A09)', 'Tuberculosis (A15-A19)', 'Certain zoonotic bacterial diseases (A20-A28)', 'Other bacterial diseases (A30-A49)', 'Infections with a predominantly sexual mode of transmission (A50-A64)', 'Other spirochetal diseases (A65-A69)', 'Other diseases caused by chlamydiae (A70-A74)', 'Rickettsioses (A75-A79)', 'Viral and prion infections of the central nervous system (A80-A89)', 'Arthropod-borne viral fevers and viral hemorrhagic fevers (A90-A99)', 'Viral infections characterized by skin and mucous membrane lesions (B00-B09)', 'Other human herpesviruses (B10)', 'Viral hepatitis (B15-B19)', 'Human immunodeficiency virus [HIV] disease (B20)', 'Other viral diseases (B25-B34)', 'Mycoses (B35-B49)', 'Protozoal diseases (B50-B64)', 'Helminthiases (B65-B83)', 'Pediculosis, acariasis and other infestations (B85-B89)', 'Sequelae of infectious and parasitic diseases (B90-B94)', 'Bacterial and viral infectious agents (B95-B97)', 'Other infectious diseases (B99)'

In [14]:
def create_hierarchy(code, name, disease_codes, disease_names):
    """
    Recursively creates a hierarchy of disease subtypes based on the given code and name.

    A code is a descendant of ``code`` when it starts with ``code`` and is
    not equal to it. Only *direct* children are placed in ``subtypes``: a
    descendant is skipped at this level when some other descendant is a
    proper prefix of it, because it will be nested under that closer
    ancestor by the recursive call. This keeps each code from showing up
    once per ancestor level.

    Args:
        code (str): The disease code.
        name (str): The disease name.
        disease_codes (list): List of all disease codes.
        disease_names (list): List of all disease names.

    Returns:
        dict: A dictionary representing the hierarchy of disease subtypes,
        with keys "name", "icd_code" and "subtypes" (a list of dictionaries
        of the same shape).
    """
    # Everything nested anywhere below `code`, in the original list order.
    descendants = [
        (sub_code, sub_name)
        for sub_code, sub_name in zip(disease_codes, disease_names)
        if sub_code.startswith(code) and sub_code != code
    ]
    descendant_codes = {sub_code for sub_code, _ in descendants}

    def has_closer_ancestor(sub_code):
        # True when a longer prefix than `code` is itself a listed code.
        return any(
            sub_code[:end] in descendant_codes
            for end in range(len(code) + 1, len(sub_code))
        )

    # Children of a descendant are themselves descendants of `code`, so the
    # recursion only needs to search this narrowed set.
    narrowed_codes = [sub_code for sub_code, _ in descendants]
    narrowed_names = [sub_name for _, sub_name in descendants]

    subtypes = [
        create_hierarchy(sub_code, sub_name, narrowed_codes, narrowed_names)
        for sub_code, sub_name in descendants
        if not has_closer_ancestor(sub_code)
    ]
    return {
        "name": name,
        "icd_code": code,
        "subtypes": subtypes
    }

def generate_disease_hierarchy(parent_codes, parent_disease, disease_codes, disease_names):
    """
    Builds one top-level entry per parent code, each holding its subtype tree.

    Parent codes and names are paired positionally. For every parent, the
    first-level subtypes are the disease codes that are exactly five
    characters long, start with the parent code and differ from it; each
    one is expanded with ``create_hierarchy``.

    Args:
        parent_codes (list): List of parent disease codes.
        parent_disease (list): List of parent disease names.
        disease_codes (list): List of all disease codes.
        disease_names (list): List of all disease names.

    Returns:
        list: One dictionary per parent with keys "disease", "icd_code"
        and "subtypes".
    """
    hierarchy = []

    for top_code, top_name in zip(parent_codes, parent_disease):
        children = []
        for candidate_code, candidate_name in zip(disease_codes, disease_names):
            if not candidate_code.startswith(top_code):
                continue
            if candidate_code == top_code or len(candidate_code) != 5:
                continue
            children.append(
                create_hierarchy(candidate_code, candidate_name, disease_codes, disease_names)
            )

        hierarchy.append({
            "disease": top_name,
            "icd_code": top_code,
            "subtypes": children
        })

    return hierarchy


In [19]:
import json

def write_to_json(data, output_file_path):
    """
    Serializes ``data`` as JSON into the file at ``output_file_path``.

    The file is opened in write mode, so any existing content is replaced.

    Args:
        data: The JSON-serializable object to store.
        output_file_path (str): Destination path of the JSON file.
    """
    with open(output_file_path, 'w') as json_handle:
        json.dump(data, json_handle)

output_file_path = '/insuranceLLM/data_extract/output/data_icd.json'
# Build the hierarchy that is written out; no visible cell assigns `result`
# otherwise.
result = generate_disease_hierarchy(parent_codes, parent_disease, disease_codes, disease_names)
write_to_json(result, output_file_path)
