# 1. DrugBank 数据集处理
## 1.1 数据集介绍
## 1.2 数据集解析

In [2]:
import xmltodict

# Ignore the DrugBank namespace: mapping its URI to None asks xmltodict to
# drop the namespace from element names instead of prefixing every key.
namespaces = {
    'http://www.drugbank.ca': None
}

# Read the whole XML export into memory as UTF-8 text.
with open('data/drugbank.xml', 'r', encoding="utf-8") as file:
    xml_data = file.read()

# Convert the XML text into a nested dict structure.
xml_dict = xmltodict.parse(xml_data, process_namespaces=True, namespaces=namespaces)

## 1.3 数据集切片测试

In [62]:
import copy

# All drug entries sit under the top-level 'drugbank' -> 'drug' keys.
drugs: list[dict] = xml_dict.get('drugbank').get('drug')
# Small slice (indices 231 through 249) used to exercise the pipeline.
drug_test= drugs[231:250]
# Independent deep copy of the slice; presumably kept as an untouched
# reference, since the processing functions below mutate drug_test in place.
drug_source = copy.deepcopy(drug_test)

## 1.4 数据集基础信息处理

In [58]:
import re

def process_drug(drug: dict) -> None:
    """Apply the basic normalisation steps to ``drug`` in place.

    The steps run in this order: metadata attributes, DrugBank ids, then
    single-property lists.
    """
    for step in (process_meta, process_ids, process_list):
        step(drug)

def process_meta(drug: dict) -> None:
    """Move the ``@type``/``@created``/``@updated`` attributes to plain keys.

    Each value is copied to the key without the leading ``@`` and the
    original attribute key is then deleted. ``drug`` is modified in place.

    Raises:
        KeyError: if one of the attribute keys is absent (the plain key has
            already been set to ``None`` at that point).
    """
    renames = (
        ('@type', 'type'),
        ('@created', 'created'),
        ('@updated', 'updated'),
    )
    for attribute_key, plain_key in renames:
        drug[plain_key] = drug.get(attribute_key)
        del drug[attribute_key]
    
def process_ids(drug: dict) -> None:
    """Replace the raw ``drugbank-id`` entry with one flat key per id.

    ``drug['drugbank-id']`` may hold a list of ids or, when the drug has
    only one id, a single bare item. Each item is handled as follows:

    * a non-string item (a mapping carrying the id under ``'#text'``) is
      stored as ``drug['id']``;
    * a string item such as ``'BTD00024'`` is stored under a key built from
      its leading letters, lower-cased, plus ``'_id'`` (``'btd_id'``). A
      later id with the same letter prefix overwrites an earlier one.

    The ``'drugbank-id'`` key is removed afterwards. A missing or ``None``
    entry is treated as "no ids". ``drug`` is modified in place.

    Raises:
        AttributeError: if a string id does not start with letters followed
            by digits.
    """
    raw_ids = drug.get('drugbank-id')
    if raw_ids is None:
        raw_ids = []
    elif not isinstance(raw_ids, list):
        # A lone id is not wrapped in a list; iterating it directly would
        # walk dict keys or string characters instead of ids.
        raw_ids = [raw_ids]

    for drugbank_id in raw_ids:
        if not isinstance(drugbank_id, str):
            drug['id'] = drugbank_id.get('#text')
            continue
        prefix = re.match(r'([A-Za-z]+)(\d+)', drugbank_id).group(1)
        drug[prefix.lower() + '_id'] = drugbank_id

    drug.pop('drugbank-id', None)

def _flatten_child_list(drug: dict, parent_key: str, child_key: str) -> None:
    """Replace ``drug[parent_key]`` by the list of its ``child_key`` items."""
    wrapper = drug.get(parent_key)
    if not wrapper or not isinstance(wrapper, dict):
        # Missing, empty (None) or already flattened: leave untouched.
        return
    children = wrapper.get(child_key)
    drug[parent_key] = children if isinstance(children, list) else [children]


def process_list(drug: dict) -> None:
    """Flatten single-property wrappers into plain lists, in place.

    ``drug['groups']`` (``{'group': ...}``) and ``drug['affected-organisms']``
    (``{'affected-organism': ...}``) are each replaced by a list of the inner
    values; a single inner value becomes a one-element list.

    Both properties are handled the same way: when the property is missing,
    empty, or already a list (for example after a previous call) it is left
    as it is instead of raising.
    """
    _flatten_child_list(drug, 'groups', 'group')
    _flatten_child_list(drug, 'affected-organisms', 'affected-organism')

## 1.5 数据集部分属性独立处理

In [59]:
from typing import Dict, List


# One entry per reference kind, in output order:
# (container key, item key / 'type' label, field copied to 'core_sign',
#  field copied to 'citation').
_REFERENCE_KINDS = (
    ('articles', 'article', 'pubmed-id', 'citation'),
    ('textbooks', 'textbook', 'isbn', 'citation'),
    ('links', 'link', 'title', 'url'),
    ('attachments', 'attachment', 'title', 'url'),
)


def iterate_references_to_module(drug: dict, reference_list: List[dict]) -> None:
    """Move the drug's general references into a flat list of rows.

    Every article, textbook, link and attachment found under
    ``drug['general-references']`` becomes one dict with the keys
    ``drug_id``, ``type``, ``ref-id``, ``core_sign`` and ``citation``.
    ``core_sign`` holds the PubMed id (articles), ISBN (textbooks) or title
    (links, attachments); ``citation`` holds the citation text (articles,
    textbooks) or the URL (links, attachments).

    Args:
        drug: Drug dict; ``'id'`` is read and ``'general-references'`` is
            removed from it.
        reference_list: List extended in place with the new rows, ordered
            articles, textbooks, links, attachments.

    A missing or empty ``'general-references'`` entry contributes no rows.
    """
    drug_id = drug.get('id')
    # The entry may be None (e.g. an empty element) or absent altogether.
    grouped = drug.get('general-references') or {}

    rows: List[dict] = []
    for container_key, item_key, sign_field, citation_field in _REFERENCE_KINDS:
        container = grouped.get(container_key)
        if not container:
            continue
        items = container.get(item_key)
        if not isinstance(items, list):
            # A single child is not wrapped in a list by the parser.
            items = [items]
        rows.extend(
            {
                'drug_id': drug_id,
                'type': item_key,
                'ref-id': item.get('ref-id'),
                'core_sign': item.get(sign_field),
                'citation': item.get(citation_field),
            }
            for item in items
        )

    # Extend only once everything parsed, so a failure adds no partial rows.
    reference_list.extend(rows)
    drug.pop('general-references', None)

def iterate_source_to_module(drug: dict, source_list: List[dict])->None:
    """Move the drug's packagers and manufacturers into ``source_list``.

    Each packager and manufacturer becomes one dict with the keys
    ``drug_id``, ``type``, ``name``, ``generic`` and ``url``. Packagers get
    an empty string for ``generic``; manufacturers get ``1`` when their
    ``@generic`` attribute equals ``'true'`` and ``0`` otherwise.

    ``source_list`` is extended in place (packagers first), and the
    ``'packagers'`` and ``'manufacturers'`` keys are deleted from ``drug``.
    """
    drug_id = drug.get('id')
    packager_node = drug.get('packagers')
    manufacturer_node = drug.get('manufacturers')
    rows: List[dict] = []

    if packager_node:
        entries = packager_node.get('packager')
        # A single entry is not wrapped in a list.
        entries = entries if type(entries) is list else [entries]
        rows += [
            {
                'drug_id': drug_id,
                'type': 'packager',
                'name': entry.get('name'),
                'generic': '',
                'url': entry.get('url'),
            }
            for entry in entries
        ]

    if manufacturer_node:
        entries = manufacturer_node.get('manufacturer')
        entries = entries if type(entries) is list else [entries]
        rows += [
            {
                'drug_id': drug_id,
                'type': 'manufacturer',
                'name': entry.get('#text'),
                'generic': 1 if entry.get('@generic') == 'true' else 0,
                'url': entry.get('@url'),
            }
            for entry in entries
        ]

    source_list.extend(rows)
    for consumed_key in ('packagers', 'manufacturers'):
        del drug[consumed_key]

def iterate_price_to_module(drug: dict, price_list: List[dict])->None:
    """Move the drug's price entries into ``price_list``.

    Each entry under ``drug['prices']['price']`` becomes one dict with the
    keys ``drug_id``, ``cost``, ``currency``, ``unit`` and ``description``;
    ``cost`` and ``currency`` come from the entry's ``cost`` mapping
    (``'#text'`` and ``'@currency'``).

    ``price_list`` is extended in place and the ``'prices'`` key is deleted
    from ``drug``.
    """
    drug_id = drug.get('id')
    price_node = drug.get('prices')
    rows: List[dict] = []

    if price_node:
        entries = price_node.get('price')
        # A single entry is not wrapped in a list.
        if type(entries) is not list:
            entries = [entries]
        rows = [
            {
                'drug_id': drug_id,
                'cost': cost_node.get('#text'),
                'currency': cost_node.get('@currency'),
                'unit': entry.get('unit'),
                'description': entry.get('description'),
            }
            for entry in entries
            # Bind the nested cost mapping once per entry.
            for cost_node in (entry.get('cost'),)
        ]

    price_list.extend(rows)
    del drug['prices']

## 1.6 数据集属性分离(基础信息与学术性信息)

In [66]:
from typing import Tuple

def copy_dict(drug:dict, academic:dict, property: str) -> None:
    """Copy ``drug[property]`` into ``academic`` under the same key.

    When ``drug`` has no such key, ``academic[property]`` is set to ``None``.
    ``drug`` itself is not modified.
    """
    # dict.get already yields None for an absent key.
    academic[property] = drug.get(property)

def separate_academic(drug: dict) -> Tuple[dict,dict]:
    """Split a drug dict into its basic part and its academic part.

    Every property named in the academic list below is moved out of ``drug``
    into a new ``academic`` dict. A property the drug does not have is stored
    as ``None`` rather than raising. ``name``, ``id`` and ``type`` are then
    copied (not moved) into ``academic`` so both parts carry them.

    Args:
        drug: Drug dict; modified in place by removing the academic
            properties.

    Returns:
        ``(drug, academic)``: the same ``drug`` object, now holding only the
        remaining properties, and the newly built academic dict.
    """
    academic_properties = (
        'cas-number', 'unii', 'average-mass', 'monoisotopic-mass',
        'synthesis-reference', 'pharmacodynamics',
        'mechanism-of-action', 'toxicity', 'metabolism',
        'absorption', 'half-life', 'protein-binding',
        'route-of-elimination', 'volume-of-distribution',
        'clearance', 'ahfs-codes', 'pdb-entries', 'transporters',
        'carriers', 'enzymes', 'targets',
        'snp-adverse-drug-reactions', 'snp-effects',
        'reactions', 'pathways', 'external-links',
        'external-identifiers', 'experimental-properties',
        'calculated-properties', 'msds', 'salts', 'mixtures',
        'dosages', 'fda-label', 'parents',
    )
    shared_properties = ('name', 'id', 'type')

    academic = {}
    for academic_property in academic_properties:
        # pop with a default: a drug lacking the property must not abort
        # the split with a KeyError.
        academic[academic_property] = drug.pop(academic_property, None)

    for shared_property in shared_properties:
        academic[shared_property] = drug.get(shared_property)

    return drug, academic

## 1.7 数据集属性输出(csv 或 json)

In [63]:
import csv
import json
def module_to_csv(file_path: str, data: List[dict]):
    """Write a list of row dicts to ``file_path`` as UTF-8 CSV.

    The header is taken from the keys of the first row, so all rows are
    expected to share those keys. When ``data`` is empty there is no row to
    take a header from; the file is still created (or truncated) and left
    empty, so it never holds stale content from an earlier run.

    Args:
        file_path: Destination path; an existing file is overwritten.
        data: Rows to write, one dict per CSV line.
    """
    # newline='' lets the csv module control line endings itself.
    with open(file_path, 'w', newline='', encoding='utf-8') as f:
        if not data:
            return
        writer = csv.DictWriter(f, fieldnames=list(data[0].keys()))
        writer.writeheader()
        writer.writerows(data)
def module_to_json(file_path: str, data: List[dict]):
    """Serialise ``data`` to ``file_path`` as UTF-8 JSON.

    The output is indented by four spaces and non-ASCII characters are
    written as-is rather than escaped. An existing file is overwritten.
    """
    with open(file_path, mode='w', encoding='utf-8') as out_file:
        json.dump(data, out_file, indent=4, ensure_ascii=False)

## 1.8 数据集处理结果测试

In [64]:
# Accumulators for the rows split out of each drug.
references: List[dict] = []
sources: List[dict] = []
prices: List[dict] = []
# Per-drug results of the basic/academic split.
drug_basic: List[dict] = []
drug_academic: List[dict] = []
for drug_example in drug_test:
    # Normalise first: the iterate_* functions read drug['id'], which
    # process_drug sets via process_ids.
    process_drug(drug_example)
    # Each iterate_* call appends rows to its list and removes the
    # corresponding property from the drug.
    iterate_references_to_module(drug_example,references)
    iterate_source_to_module(drug_example,sources)
    iterate_price_to_module(drug_example,prices)
    # Split what is left into the basic part and the academic part.
    basic_drug, academic_drug = separate_academic(drug_example)
    drug_basic.append(basic_drug)
    drug_academic.append(academic_drug)
# Only the three row lists are written in this cell; drug_basic and
# drug_academic are collected but not exported here.
module_to_csv('data/prices.csv',prices)
module_to_csv('data/references.csv',references)
module_to_csv('data/source.csv',sources)