In [7]:
# Parse the DrugBank full-database XML export and extract drug-protein interactions (targets, carriers, enzymes, transporters).

import xml.etree.ElementTree as ET
import pandas as pd

def parse_drugbank_xml(file_path):
    """Collect drug-protein interaction rows from a DrugBank XML document.

    For every ``<drug>`` element directly under the document root, the
    target, carrier, enzyme and transporter children are each handed to
    ``process_interaction``; every non-empty result becomes one row.

    Args:
        file_path: Anything ``xml.etree.ElementTree.parse`` accepts
            (a filename or a file object).

    Returns:
        pandas.DataFrame: one row per interaction for which
        ``process_interaction`` returned a record. Rows keep document
        order; within a drug the order is targets, carriers, enzymes,
        transporters. When nothing is found, an empty frame carrying the
        columns 'DrugBank ID', 'UniProt ID', 'Name', 'Gene Name', 'Type'
        is returned so that column lookups downstream do not fail.

    Notes:
        ``ET.parse`` builds the whole tree in memory before iteration starts.
        Drugs without a ``<drugbank-id primary="true">`` element (or with an
        empty one) are skipped instead of raising ``AttributeError``.
    """
    ns = {'db': 'http://www.drugbank.ca'}

    # (XPath relative to a <drug> element, label stored in the 'Type' column).
    # The tuple order fixes the per-drug row order.
    interaction_paths = (
        ('db:targets/db:target', 'Target'),
        ('db:carriers/db:carrier', 'Carrier'),
        ('db:enzymes/db:enzyme', 'Enzyme'),
        ('db:transporters/db:transporter', 'Transporter'),
    )
    # Only used to give an empty result a usable schema.
    empty_columns = ['DrugBank ID', 'UniProt ID', 'Name', 'Gene Name', 'Type']

    root = ET.parse(file_path).getroot()
    data = []

    for drug in root.findall('db:drug', ns):
        primary_id = drug.find('db:drugbank-id[@primary="true"]', ns)
        if primary_id is None or not primary_id.text:
            # No usable key to attach interactions to; skip this drug.
            continue
        drugbank_id = primary_id.text

        for xpath, interaction_type in interaction_paths:
            for interaction in drug.findall(xpath, ns):
                row = process_interaction(drugbank_id, interaction, ns, interaction_type)
                if row:
                    data.append(row)

    if not data:
        return pd.DataFrame(columns=empty_columns)
    return pd.DataFrame(data)

def process_interaction(drugbank_id, interaction, ns, interaction_type):
    """Build one result record for a single interaction element.

    Args:
        drugbank_id: Identifier stored under 'DrugBank ID' in the record.
        interaction: Element whose ``db:polypeptide`` child is inspected.
        ns: Prefix-to-URI mapping used to resolve the ``db:`` prefix.
        interaction_type: Label stored under 'Type' in the record.

    Returns:
        dict with keys 'DrugBank ID', 'UniProt ID', 'Name', 'Gene Name' and
        'Type', or ``None`` when the interaction has no polypeptide or the
        polypeptide has no non-empty 'UniProtKB' external identifier.
        'Name' and 'Gene Name' are ``None`` when the corresponding element
        is missing or empty.

    Notes:
        Only the first ``db:polypeptide`` child is read; any further
        polypeptides under the same interaction are ignored.
        External-identifier entries lacking a ``db:resource`` or
        ``db:identifier`` child are tolerated rather than raising
        ``AttributeError``.
    """
    polypeptide = interaction.find('db:polypeptide', ns)
    if polypeptide is None:
        return None

    # Use the first external identifier whose resource is UniProtKB.
    uniprot_id = None
    for ext_id in polypeptide.findall('db:external-identifiers/db:external-identifier', ns):
        if ext_id.findtext('db:resource', namespaces=ns) == 'UniProtKB':
            uniprot_id = ext_id.findtext('db:identifier', namespaces=ns)
            break

    if not uniprot_id:
        return None

    # findtext yields '' for an element without text; normalise that to None
    # so missing and empty elements look the same to callers.
    return {
        'DrugBank ID': drugbank_id,
        'UniProt ID': uniprot_id,
        'Name': polypeptide.findtext('db:name', namespaces=ns) or None,
        'Gene Name': polypeptide.findtext('db:gene-name', namespaces=ns) or None,
        'Type': interaction_type
    }

# Usage
# Machine-specific location of the DrugBank XML export; edit this one
# constant when running the notebook elsewhere.
DRUGBANK_XML_PATH = r"C:\PhD\Data\Drugbank_target\drugbank_all_full_database.xml\full database.xml"

df = parse_drugbank_xml(DRUGBANK_XML_PATH)
# Trailing expression instead of print(): the notebook renders the frame
# with its rich (truncated) table view.
df

# Save to CSV
#df.to_csv('drugbank_interactions.csv', index=False)

      DrugBank ID UniProt ID  \
0         DB00001     P00734   
1         DB00002     P00533   
2         DB00002     O75015   
3         DB00002     P02745   
4         DB00002     P02746   
...           ...        ...   
29354     DB17992     E0W492   
29355     DB18236     Q07912   
29356     DB18704     Q99062   
29357     DB18716     P37321   
29358     DB18716     P05181   

                                                    Name Gene Name    Type  
0                                            Prothrombin        F2  Target  
1                       Epidermal growth factor receptor      EGFR  Target  
2      Low affinity immunoglobulin gamma Fc region re...    FCGR3B  Target  
3                  Complement C1q subcomponent subunit A      C1QA  Target  
4                  Complement C1q subcomponent subunit B      C1QB  Target  
...                                                  ...       ...     ...  
29354              Gaba-gated chloride channel, putative   8239847  Target  

In [12]:
# Write the interaction table to the working directory, omitting the row index.
df.to_csv(path_or_buf='drugbank_interactions.csv', index=False)

In [16]:
# Number of distinct UniProt IDs in df. Note that df holds every interaction
# type (Target, Carrier, Enzyme, Transporter), so this is not a targets-only
# count; filter on df["Type"] == "Target" first if that is what is wanted.
df["UniProt ID"].nunique(dropna=False)

5056

In [19]:
# All interaction rows recorded for DrugBank entry DB00006.
df.loc[df["DrugBank ID"].eq("DB00006")]

Unnamed: 0,DrugBank ID,UniProt ID,Name,Gene Name,Type
21,DB00006,P00734,Prothrombin,F2,Target
22,DB00006,P05164,Myeloperoxidase,MPO,Enzyme
