In [2]:
import xmltodict

# Ignore the DrugBank XML namespace: the URI is mapped to None and handed to
# xmltodict.parse below together with process_namespaces=True.
namespaces = {
    'http://www.drugbank.ca': None
}

# Read the whole XML dump into memory as UTF-8 text.
with open('data/drugbank.xml', 'r', encoding="utf-8") as file:
    xml_data = file.read()

# Convert the XML document into nested dicts.
xml_dict = xmltodict.parse(xml_data, process_namespaces=True, namespaces=namespaces)

In [62]:
import copy

# All entries stored under drugbank -> drug in the parsed document.
drugs: list[dict] = xml_dict.get('drugbank').get('drug')
# Work on a small slice (indices 231..249) instead of the full list.
drug_test= drugs[231:250]
# Independent deep copy of the slice — presumably kept as an unmodified
# reference, since the processing functions mutate the entries in place.
drug_source = copy.deepcopy(drug_test)

In [58]:
import re

def process_drug(drug: dict) -> None:
    """Apply every in-place normalisation step to one drug entry.

    The steps run in a fixed order: metadata attributes, identifiers, then
    single-property lists. The dictionary is modified in place.
    """
    for step in (process_meta, process_ids, process_list):
        step(drug)

def process_meta(drug: dict) -> None:
    """Move the ``@type``, ``@created`` and ``@updated`` entries to plain keys.

    Each value is stored again under the same name without the leading ``@``
    and the original ``@``-prefixed key is removed. The dictionary is modified
    in place.

    Raises:
        KeyError: if one of the ``@``-prefixed keys is absent.
    """
    for field in ('type', 'created', 'updated'):
        attribute_key = '@' + field
        drug[field] = drug.get(attribute_key)
        del drug[attribute_key]
    
def process_ids(drug: dict) -> None:
    """Replace the ``drugbank-id`` entry with one flat key per identifier.

    Non-string entries (mappings carrying the identifier under ``#text``) are
    stored as ``drug['id']``. String entries are stored under
    ``<letters>_id``, where ``<letters>`` is the lower-cased alphabetic prefix
    of the identifier (``'BTD00024'`` -> ``drug['btd_id']``). If two
    identifiers share a prefix, the later one overwrites the earlier one.

    A drug with a single identifier is handled too: the parser then yields
    one bare value instead of a list, and it is wrapped before iterating. A
    missing ``drugbank-id`` entry leaves the dictionary unchanged.

    Args:
        drug: Parsed drug entry; modified in place.

    Raises:
        AttributeError: if a string identifier does not start with letters
            followed by digits.
    """
    ids = drug.get('drugbank-id')
    if ids is None:
        ids = []
    elif not isinstance(ids, list):
        # A lone <drugbank-id> element is not wrapped in a list; iterating a
        # dict would walk its keys and iterating a str its characters.
        ids = [ids]
    for drugbank_id in ids:
        if not isinstance(drugbank_id, str):
            drug['id'] = drugbank_id.get('#text')
            continue
        column = re.match(r'([A-Za-z]+)(\d+)', drugbank_id).group(1).lower() + '_id'
        drug[column] = drugbank_id
    drug.pop('drugbank-id', None)

def process_list(drug: dict) -> None:
    """Flatten single-property wrapper mappings into plain lists.

    ``drug['groups']`` (wrapping ``group``) and ``drug['affected-organisms']``
    (wrapping ``affected-organism``) are each replaced by a list of the
    wrapped values. A single wrapped value becomes a one-element list; a
    wrapper without the inner key becomes an empty list.

    Entries that are missing, empty, or not a mapping (for example a value
    that was already flattened by an earlier call) are left untouched, so the
    function can safely be run more than once on the same drug.

    Args:
        drug: Parsed drug entry; modified in place.
    """
    wrappers = (
        ('groups', 'group'),
        ('affected-organisms', 'affected-organism'),
    )
    for container_key, item_key in wrappers:
        container = drug.get(container_key)
        if not isinstance(container, dict):
            continue
        items = container.get(item_key)
        if items is None:
            drug[container_key] = []
        elif isinstance(items, list):
            drug[container_key] = items
        else:
            drug[container_key] = [items]

In [59]:
from typing import Dict, List


def iterate_references_to_module(drug: dict, reference_list: List[dict]) -> None:
    """Move a drug's ``general-references`` into flat reference rows.

    One row is produced per article, textbook, link and attachment, in that
    order. Every row has the keys ``drug_id``, ``type``, ``ref-id``,
    ``core_sign`` and ``citation``; which source field feeds ``core_sign``
    and ``citation`` depends on the reference type (see the table below).
    The rows are appended to ``reference_list`` and the ``general-references``
    entry is removed from ``drug``.

    A missing or empty ``general-references`` entry, or an empty category,
    simply contributes no rows.

    Args:
        drug: Parsed drug entry; modified in place. ``drug['id']`` is copied
            into every row as ``drug_id``.
        reference_list: Output list that receives the new rows.
    """
    # (category key, item key / row type, field -> core_sign, field -> citation)
    reference_kinds = (
        ('articles', 'article', 'pubmed-id', 'citation'),
        ('textbooks', 'textbook', 'isbn', 'citation'),
        ('links', 'link', 'title', 'url'),
        ('attachments', 'attachment', 'title', 'url'),
    )
    references: List[dict] = []
    drug_id = drug.get('id')
    origin_references = drug.get('general-references') or {}
    for category_key, item_key, sign_field, citation_field in reference_kinds:
        category = origin_references.get(category_key)
        if not category:
            continue
        entries = category.get(item_key)
        if entries is None:
            continue
        if not isinstance(entries, list):
            # A category with a single child is not wrapped in a list.
            entries = [entries]
        for entry in entries:
            references.append({
                'drug_id': drug_id,
                'type': item_key,
                'ref-id': entry.get('ref-id'),
                'core_sign': entry.get(sign_field),
                'citation': entry.get(citation_field),
            })
    # Extend only after every row was built, so a failure adds nothing.
    reference_list.extend(references)
    drug.pop('general-references', None)

def iterate_source_to_module(drug: dict, source_list: List[dict])->None:
    """Move a drug's packagers and manufacturers into flat source rows.

    Every row has the keys ``drug_id``, ``type``, ``name``, ``generic`` and
    ``url``. Packager rows come first and carry an empty string in
    ``generic``; manufacturer rows carry ``1`` when the ``@generic`` attribute
    equals ``'true'`` and ``0`` otherwise. The rows are appended to
    ``source_list`` and both entries are removed from ``drug``.

    Missing or empty ``packagers`` / ``manufacturers`` entries contribute no
    rows. A manufacturer given as a plain string (an element without
    attributes) is used as the name, with ``generic`` 0 and ``url`` None.

    Args:
        drug: Parsed drug entry; modified in place. ``drug['id']`` is copied
            into every row as ``drug_id``.
        source_list: Output list that receives the new rows.
    """
    sources: List[dict] = []
    drug_id = drug.get('id')

    packagers = drug.get('packagers')
    if packagers:
        entries = packagers.get('packager')
        if entries is None:
            entries = []
        elif not isinstance(entries, list):
            entries = [entries]
        for packager in entries:
            sources.append({
                'drug_id': drug_id,
                'type': 'packager',
                'name': packager.get('name'),
                'generic': '',
                'url': packager.get('url')
            })

    manufacturers = drug.get('manufacturers')
    if manufacturers:
        entries = manufacturers.get('manufacturer')
        if entries is None:
            entries = []
        elif not isinstance(entries, list):
            entries = [entries]
        for manufacturer in entries:
            if isinstance(manufacturer, str):
                # Element without attributes: only its text is available.
                manufacturer = {'#text': manufacturer}
            sources.append({
                'drug_id': drug_id,
                'type': 'manufacturer',
                'name': manufacturer.get('#text'),
                'generic': 1 if manufacturer.get('@generic') == 'true' else 0,
                'url': manufacturer.get('@url')
            })

    source_list.extend(sources)
    drug.pop('packagers', None)
    drug.pop('manufacturers', None)

def iterate_price_to_module(drug: dict, price_list: List[dict])->None:
    """Move a drug's ``prices`` into flat price rows.

    Every row has the keys ``drug_id``, ``cost``, ``currency``, ``unit`` and
    ``description``. ``cost`` and ``currency`` are read from the ``#text`` and
    ``@currency`` entries of the price's ``cost`` value. The rows are appended
    to ``price_list`` and the ``prices`` entry is removed from ``drug``.

    A missing or empty ``prices`` entry contributes no rows. A ``cost`` given
    as a plain string (no currency attribute) is used as the cost with
    ``currency`` None; an absent ``cost`` yields None for both.

    Args:
        drug: Parsed drug entry; modified in place. ``drug['id']`` is copied
            into every row as ``drug_id``.
        price_list: Output list that receives the new rows.
    """
    prices: List[dict] = []
    drug_id = drug.get('id')
    drug_prices = drug.get('prices')
    if drug_prices:
        entries = drug_prices.get('price')
        if entries is None:
            entries = []
        elif not isinstance(entries, list):
            entries = [entries]
        for drug_price in entries:
            cost = drug_price.get('cost')
            if not isinstance(cost, dict):
                # Plain text or missing cost: there is no attribute mapping.
                cost = {'#text': cost}
            prices.append({
                'drug_id': drug_id,
                'cost': cost.get('#text'),
                'currency': cost.get('@currency'),
                'unit': drug_price.get('unit'),
                'description': drug_price.get('description')
            })
    price_list.extend(prices)
    drug.pop('prices', None)

In [66]:
from typing import Tuple

def copy_dict(drug:dict, academic:dict, property: str) -> None:
    """Copy ``drug[property]`` into ``academic``; store None when the key is absent."""
    academic[property] = drug.get(property, None)

def separate_academic(drug: dict) -> Tuple[dict,dict]:
    """Split a drug entry into its basic part and its academic part.

    ``name``, ``id`` and ``type`` are copied into the academic dictionary and
    stay in ``drug``. Every field listed in ``moved_fields`` is moved: its
    value is stored in the academic dictionary and the key is removed from
    ``drug``. A field that is absent from ``drug`` is stored as None instead
    of raising, so entries lacking some elements can still be split.

    Args:
        drug: Parsed drug entry; modified in place.

    Returns:
        ``(drug, academic)`` — the same ``drug`` object, now without the moved
        fields, and the newly built academic dictionary.
    """
    # Copied: these stay in the basic record as well.
    shared_fields = ('name', 'id', 'type')
    # Moved out of the basic record, in output order.
    moved_fields = (
        'cas-number',
        'unii',
        'average-mass',
        'monoisotopic-mass',
        'synthesis-reference',         # references for the drug's synthesis
        'pharmacodynamics',            # pharmacology, principle of action
        'mechanism-of-action',         # mechanism of action
        'toxicity',                    # toxicity
        'metabolism',                  # metabolism
        'absorption',                  # absorption
        'half-life',                   # half-life
        'protein-binding',             # protein binding
        'route-of-elimination',        # route of elimination
        'volume-of-distribution',      # volume of distribution
        'clearance',                   # renal plasma clearance
        'ahfs-codes',                  # AHFS codes
        'pdb-entries',                 # Protein Data Bank (PDB) entries
        'transporters',                # transporter proteins
        'carriers',                    # carriers
        'enzymes',                     # enzymes
        'targets',                     # targets
        'snp-adverse-drug-reactions',  # SNP adverse drug reactions
        'snp-effects',                 # SNP effects
        'reactions',                   # chemical reactions
        'pathways',                    # pathways
        'external-links',              # external links
        'external-identifiers',        # other external identifiers
        'experimental-properties',     # experimental properties (melting/boiling point, ...)
        'calculated-properties',       # calculated properties (molecular weight, ...)
        'msds',                        # material safety data sheet
        'salts',                       # salts
        'mixtures',                    # mixtures
        'dosages',                     # dosages
        'fda-label',                   # FDA drug label
        'parents',
    )
    academic = {field: drug.get(field) for field in shared_fields}
    for field in moved_fields:
        # pop with a default: not every entry is guaranteed to carry every field.
        academic[field] = drug.pop(field, None)
    return drug, academic

In [63]:
import csv
import json
def module_to_csv(file_path: str, data: List[dict]):
    """Write a list of flat row dictionaries to a UTF-8 CSV file.

    The header is taken from the keys of the first row, so every row is
    expected to use that same set of keys. When ``data`` is empty the file is
    still created (or truncated) but left empty, instead of failing on the
    missing first row.

    Args:
        file_path: Destination path; an existing file is overwritten.
        data: Rows to write.
    """
    with open(file_path, 'w', newline='', encoding='utf-8') as f:
        if not data:
            # No first row to derive the header from: leave an empty file so
            # stale output from an earlier run does not survive.
            return
        writer = csv.DictWriter(f, fieldnames=list(data[0].keys()))
        writer.writeheader()
        writer.writerows(data)
def module_to_json(file_path: str, data: List[dict]):
    """Serialise ``data`` to ``file_path`` as UTF-8 JSON.

    The output is indented by four spaces and non-ASCII characters are written
    verbatim rather than escaped. An existing file is overwritten.
    """
    with open(file_path, mode='w', encoding='utf-8') as handle:
        json.dump(data, handle, indent=4, ensure_ascii=False)

In [64]:
# Accumulators for the flat modules extracted from every drug in the sample.
references: List[dict] = []
sources: List[dict] = []
prices: List[dict] = []
# Drug entries left over after the modules above have been split off.
drug_basic: List[dict] = []
# NOTE(review): never filled in this cell — separate_academic() is not called here.
drug_academic: List[dict] = []
for drug_example in drug_test:
    # Each step mutates drug_example in place; the iterate_* helpers also
    # append their rows to the matching accumulator list.
    process_drug(drug_example)
    iterate_references_to_module(drug_example,references)
    iterate_source_to_module(drug_example,sources)
    iterate_price_to_module(drug_example,prices)
    drug_basic.append(drug_example)
# Persist the three flat modules as CSV.
# NOTE(review): drug_basic is collected above but not written out in this cell.
module_to_csv('data/prices.csv',prices)
module_to_csv('data/references.csv',references)
module_to_csv('data/source.csv',sources)

In [ ]:
"""
classification: 分类  ——尚未解析
general-references: 文献引用  ——尚未解析
synonyms: 同义词 ——尚未解析
products: 产品 ——尚未解析
    product:
        name（名称）: 药物产品的名称
        labeller（生产厂商）: 药物产品的生产厂商
        ndc-id（NDC ID）: 药物产品的国家药品代码（NDC ID）
        ndc-product-code（NDC产品代码）: 药物产品的国家药品代码产品代码
        dpd-id（加拿大药物产品数据库的DPD ID）: 药物产品在加拿大药物产品数据库中的DPD ID。仅在产品在加拿大上市时出现。DPD ID from the Canadian Drug Product Database. Only present for drugs that are marketed in Canada.
        ema-product-code（欧洲药品管理局数据库的EMA产品代码）: 药物产品在欧洲药品管理局数据库中的EMA产品代码。仅在产品在欧洲联盟市场上市时出现。EMA product code from the European Medicines Agency Database. Only present for products that are authorised by central procedure for marketing in the European Union.
        ema-ma-number（欧洲药品管理局数据库的EMA营销授权号）: 药物产品在欧洲药品管理局数据库中的EMA营销授权号。仅在产品在欧洲联盟市场上市时出现。EMA marketing authorisation number from the European Medicines Agency Database.
                    Only present for products that are authorised by central procedure for marketing in the European Union.
        started-marketing-on（上市日期）: 药物产品上市的日期。
        ended-marketing-on（停售日期）: 药物产品停售的日期。
        dosage-form（剂型）: 药物产品的剂型。
        strength（剂量）: 药物产品的剂量。
        route（给药途径）: 药物产品的给药途径。
        fda-application-number（FDA申请号）: 药物产品的FDA申请号。
        generic（是否为仿制药）: 药物产品是否为仿制药。
        over-the-counter（是否为非处方药）: 药物产品是否为非处方药。
        approved（是否获得批准上市）: 药物产品是否已获得批准上市。
        country（国家）: 药物产品的来源国家。"US" (美国), "Canada" (加拿大), 和 "EU" (欧盟)。Drug products are currently only imported from the U.S. (FDA) and Canada (Canadian Drug Product Database, or DPD).
        source（信息来源）: 药物产品信息的来源。"FDA NDC" (美国FDA国家药品代码), "DPD" (加拿大药物产品数据库), 和 "EMA" (欧洲药品管理局数据库) Drug products are currently only imported from the FDA and the Canadian Drug Product Database, or DPD.
"""