In [4]:
import xmltodict

# Ignore the DrugBank namespace: mapping its URI to None makes xmltodict
# emit element names without a namespace prefix.
namespaces = {
    'http://www.drugbank.ca': None
}

# Read the whole XML document into memory as UTF-8 text.
with open('data/drugbank.xml', 'r', encoding="utf-8") as file:
    xml_data = file.read()

# Convert the XML to a dict
xml_dict = xmltodict.parse(xml_data, process_namespaces=True, namespaces=namespaces)

In [5]:
import json

# Pull the drug records out of the parsed document.
# NOTE(review): the list[dict] annotation assumes the file holds more than one
# <drug> element; xmltodict presumably yields a single dict otherwise - confirm.
drugs: list[dict] = xml_dict.get('drugbank').get('drug')
# Keep the first record as a sample and serialise it to JSON for inspection.
drug_test = drugs[0]
json_test = json.dumps(drug_test)
# print(json_test)

In [None]:
import re

"""
classification: 分类  ——尚未解析
general-references: 文献引用  ——尚未解析
synonyms: 同义词 ——尚未解析
products: 产品 ——尚未解析
    product:
        name（名称）: 药物产品的名称
        labeller（生产厂商）: 药物产品的生产厂商
        ndc-id（NDC ID）: 药物产品的国家药品代码（NDC ID）
        ndc-product-code（NDC产品代码）: 药物产品的国家药品代码产品代码
        dpd-id（加拿大药物产品数据库的DPD ID）: 药物产品在加拿大药物产品数据库中的DPD ID。仅在产品在加拿大上市时出现。DPD ID from the Canadian Drug Product Database. Only present for drugs that are marketed in Canada.
        ema-product-code（欧洲药品管理局数据库的EMA产品代码）: 药物产品在欧洲药品管理局数据库中的EMA产品代码。仅在产品在欧洲联盟市场上市时出现。EMA product code from the European Medicines Agency Database. Only present for products that are authorised by central procedure for marketing in the European Union.
        ema-ma-number（欧洲药品管理局数据库的EMA营销授权号）: 药物产品在欧洲药品管理局数据库中的EMA营销授权号。仅在产品在欧洲联盟市场上市时出现。EMA marketing authorisation number from the European Medicines Agency Database.
                    Only present for products that are authorised by central procedure for marketing in the European Union.
        started-marketing-on（上市日期）: 药物产品上市的日期。
        ended-marketing-on（停售日期）: 药物产品停售的日期。
        dosage-form（剂型）: 药物产品的剂型。
        strength（剂量）: 药物产品的剂量。
        route（给药途径）: 药物产品的给药途径。
        fda-application-number（FDA申请号）: 药物产品的FDA申请号。
        generic（是否为仿制药）: 药物产品是否为仿制药。
        over-the-counter（是否为非处方药）: 药物产品是否为非处方药。
        approved（是否获得批准上市）: 药物产品是否已获得批准上市。
        country（国家）: 药物产品的来源国家。"US" (美国), "Canada" (加拿大), 和 "EU" (欧联) Drug products are currently only imported from the U.S. (FDA) and Canada (Canadian Drug Product Database, or DPD).
        source（信息来源）: 药物产品信息的来源。"FDA NDC" (美国FDA国家药品代码), "DPD" (加拿大药物产品数据库), 和 "EMA" (欧洲药品管理局数据库) Drug products are currently only imported from the FDA and the Canadian Drug Product Database, or DPD.
"""


def process_drug(drug: dict) -> None:
    """Normalise a raw drug record in place.

    The record is handed to the ID step first and the group step second;
    each step mutates ``drug`` directly and nothing is returned.
    """
    for step in (process_ids, process_groups):
        step(drug)


def process_ids(drug: dict) -> None:
    """Replace ``drug['drugbank-id']`` with one key per ID, in place.

    An entry that is not a plain string (xmltodict produces a dict for an
    element carrying attributes) has its ``'#text'`` value stored under
    ``'id'``. A plain string ID such as ``'BTD00024'`` is stored under its
    lower-cased alphabetic prefix (``'btd'``). The ``'drugbank-id'`` key is
    removed afterwards.

    A record without a ``'drugbank-id'`` key is left untouched, so calling
    this twice on the same record is harmless.

    Raises:
        ValueError: if a string ID is not letters followed by digits.
    """
    entries = drug.get('drugbank-id')
    if entries is None:
        # Nothing to do (missing key, or the record was already processed).
        drug.pop('drugbank-id', None)
        return
    # xmltodict only builds a list for repeated elements; a drug with a single
    # <drugbank-id> arrives as a bare dict or str, which must not be iterated
    # key-by-key / character-by-character.
    if not isinstance(entries, list):
        entries = [entries]

    for entry in entries:
        if not isinstance(entry, str):
            drug['id'] = entry.get('#text')
            continue
        match = re.match(r'([A-Za-z]+)(\d+)', entry)
        if match is None:
            raise ValueError(f"Unrecognised drugbank-id format: {entry!r}")
        drug[match.group(1).lower()] = entry
    del drug['drugbank-id']


def process_groups(drug: dict) -> None:
    """Flatten ``drug['groups']`` into a plain list of group names, in place.

    The raw record holds ``{'group': ...}`` under ``'groups'``. The inner
    value is a list when there are several groups but a bare string when
    there is only one, so it is always normalised to a list. A missing or
    empty ``'groups'`` entry becomes ``[]``. A value that is already a list
    is left as is, so the function can be re-run on the same record.
    """
    wrapper = drug.get('groups')
    if isinstance(wrapper, list):
        # Already flattened by an earlier call.
        return
    names = wrapper.get('group') if wrapper else None
    if names is None:
        names = []
    elif not isinstance(names, list):
        # Single <group> child: wrap it so callers always get a list.
        names = [names]
    drug['groups'] = names


In [None]:
from typing import Dict, List


def iterate_references_to_module(drug: dict, reference_list: List[dict]) -> None:
    """Append one flat row per general reference of ``drug`` to ``reference_list``.

    Reads ``drug['general-references']`` and walks its four sections
    (articles, textbooks, links, attachments). Every row has the keys
    ``drug_id`` (taken from ``drug['id']``), ``type``, ``ref-id``,
    ``core_sign`` and ``citation``. ``core_sign`` holds the pubmed-id of an
    article, the isbn of a textbook, or the title of a link/attachment;
    ``citation`` holds the citation text, or the url for links/attachments.

    Sections that are missing or empty are skipped, as is a record with no
    ``general-references`` at all. Rows are appended to ``reference_list``
    only after the whole record has been read.

    Args:
        drug: drug record to read; it is not modified.
        reference_list: output list, extended in place.
    """
    # (section key, child key / row type, field for core_sign, field for citation)
    sections = (
        ('articles', 'article', 'pubmed-id', 'citation'),
        ('textbooks', 'textbook', 'isbn', 'citation'),
        ('links', 'link', 'title', 'url'),
        ('attachments', 'attachment', 'title', 'url'),
    )

    general = drug.get('general-references')
    if not general:
        return

    drug_id = drug.get('id')
    rows: List[dict] = []
    for section_key, item_key, sign_field, citation_field in sections:
        section = general.get(section_key)
        if not section:
            continue
        items = section.get(item_key)
        if items is None:
            continue
        # A section with a single child is parsed as a bare dict rather than
        # a one-element list; wrap it so the loop sees records, not dict keys.
        if not isinstance(items, list):
            items = [items]
        for item in items:
            rows.append({
                'drug_id': drug_id,
                'type': item_key,
                'ref-id': item.get('ref-id'),
                'core_sign': item.get(sign_field),
                'citation': item.get(citation_field),
            })
    reference_list.extend(rows)

In [None]:
import box

# Iterate through drugs and print a one-line summary of each record.
for drug_dict in drugs:
    # Wrap the raw dict in a Box before reading fields from it.
    drug = box.Box(drug_dict)

    # Access specific information about each drug
    # NOTE(review): process_ids() deletes 'drugbank-id' from a record, so this
    # lookup presumably fails for records already passed through it - confirm
    # the intended cell order.
    drugbank_id = drug['drugbank-id']
    name = drug['name']
    description = drug['description']

    # Print or process the information as needed
    print(f"Drugbank ID: {drugbank_id}, Name: {name}, Description: {description}")
