In [None]:
! pip install pandas lxml

In [2]:
import pandas as pd
import lxml.etree as ET

In [3]:
def parse_drugbank_xml(xml_file_path):
    tree = ET.parse(xml_file_path)
    root = tree.getroot()

    ns = {'db': 'http://www.drugbank.ca'}

    drugs = []
    atc_codes = []

    for drug in root.findall('db:drug', ns):
        drug_name = drug.find('db:name', ns).text if drug.find('db:name', ns) is not None else 'No Name'
        
        atc_code_elements = drug.findall('db:atc-codes/db:atc-code', ns)
        codes = [code.get('code') for code in atc_code_elements] if atc_code_elements else ['No ATC Code']

        drugs.append(drug_name)
        atc_codes.append(', '.join(codes))
    
    return pd.DataFrame({'Drug': drugs, 'ATC Codes': atc_codes})

In [16]:
def parse_drugbank_xml(xml_file_path):
    tree = ET.parse(xml_file_path)
    root = tree.getroot()

    ns = {'db': 'http://www.drugbank.ca'}

    drugs = []
    atc_codes = []
    drugbank_ids = []
    pathways = []

    for drug in root.findall('db:drug', ns):
        drug_name = drug.find('db:name', ns).text if drug.find('db:name', ns) is not None else 'No Name'
        drugbank_id = drug.find('db:drugbank-id', ns).text if drug.find('db:drugbank-id', ns) is not None else 'No ID'
        
        atc_code_elements = drug.findall('db:atc-codes/db:atc-code', ns)
        codes = [code.get('code') for code in atc_code_elements] if atc_code_elements else ['No ATC Code']
        
        pathway_elements = drug.findall('db:pathways/db:pathway/db:name', ns)
        drug_pathways = [pathway.text for pathway in pathway_elements] if pathway_elements else ['No Pathways']

        drugs.append(drug_name)
        drugbank_ids.append(drugbank_id)
        atc_codes.append(', '.join(codes))
        pathways.append(', '.join(drug_pathways))
    
    return pd.DataFrame({'Drug': drugs, 'DrugBank ID': drugbank_ids, 'ATC Codes': atc_codes, 'Pathways': pathways})


In [17]:
xml_path = 'full_database.xml'
df = parse_drugbank_xml(xml_path)

df.to_csv('drugbank_atc_codes_out.csv', index=False)
print("CSV file has been created.")

CSV file has been created.


In [15]:
count = 0
for row in df.itertuples():
    if row._2 == 'No ATC Code':
        count+=1

print(f"Number of drugs without ATC codes: {count}")
print(f"Total number of drugs: {len(df)}")
print(f"Percentage of drugs without ATC codes: {count/len(df)*100:.2f}%")

Number of drugs without ATC codes: 13111
Total number of drugs: 16581
Percentage of drugs without ATC codes: 79.07%


In [None]:
# from ./download/drugbank_drug_targets.csv which is a dataframe consisting of 2 columsn called 'DrugBank ID', 'target'
# load drugbank_drug_targets as a pandas dataframe
# from /full_database/drugbank_atc_codes_out.csv
# load drugbank_atc_codes_out as a pandas dataframe
# for every drug entry in drugbank_drug_targets, find the corresponding drug entry in drugbank_atc_codes_out and append the entire row to the dataframe
# the new dataframe should look like 'DrugBank ID', 'target', 'Drug', 'DrugBank ID', 'ATC Codes', and 'Pathways'
# then drop the 'DrugBank ID' 4th column
# now sort the dataframe by the 'Pathways' in descending order
# convert the dataframe to a csv


In [25]:
def merge_and_sort_data(drug_targets_path, atc_codes_path, output_csv_path):
    # Load drugbank_drug_targets.csv and drugbank_atc_codes_out.csv as pandas dataframes
    drug_targets_df = pd.read_csv(drug_targets_path)
    atc_codes_df = pd.read_csv(atc_codes_path)

    drug_targets_df = drug_targets_df.rename(columns={'drug': 'DrugBank ID'})

    # Merge drug_targets_df with atc_codes_df on 'DrugBank ID' column
    merged_df = pd.merge(drug_targets_df, atc_codes_df, on='DrugBank ID', how='left')
    
    # Sort the merged dataframe by 'Pathways' column in descending order
    merged_df.sort_values(by='Pathways', ascending=False, inplace=True)

    # Drop the duplicate 'DrugBank ID' column
    merged_df.drop(merged_df.columns[3], axis=1, inplace=True)
    
    # Convert the dataframe to CSV
    merged_df.to_csv(output_csv_path, index=False)
    
    # Print the first 10 and last 10 elements of the dataframe
    print("First 10 elements:")
    print(merged_df.head(10))
    print("\nLast 10 elements:")
    print(merged_df.tail(10))

In [None]:
merge_and_sort_data('../download/drugbank_drug_targets.csv',
                    'drugbank_atc_codes_out.csv',
                    'merged_data.csv')

In [None]:
df = pd.read_csv('merged_data.csv')

split_df = df.assign(Pathways=df['Pathways'].str.split(', ')).explode('Pathways')
split_df = split_df.rename(columns={'Pathways' : 'Pathway'})

print(split_df.head(10))
print(split_df.tail(10))

In [33]:
sorted_df = split_df.sort_values(by='Pathway', ascending=False)
print(split_df.head)
print(split_df.tail)

<bound method NDFrame.head of       DrugBank ID     target          Drug   
0         DB03088  BE0002942  Pidolic acid  \
0         DB03088  BE0002942  Pidolic acid   
0         DB03088  BE0002942  Pidolic acid   
0         DB03088  BE0002942  Pidolic acid   
0         DB03088  BE0002942  Pidolic acid   
...           ...        ...           ...   
13673     DB08679  BE0001594           NaN   
13674     DB08696  BE0000779           NaN   
13675     DB08696  BE0004382           NaN   
13676     DB08700  BE0000915           NaN   
13677     DB08764  BE0002411           NaN   

                                       Pathway  
0      gamma-Glutamyltranspeptidase Deficiency  
0         gamma-Glutamyltransferase Deficiency  
0                    5-Oxoprolinase Deficiency  
0                       Glutathione Metabolism  
0                              5-Oxoprolinuria  
...                                        ...  
13673                                      NaN  
13674                    

In [34]:
print("Number of unique pathways:", sorted_df['Pathway'].nunique())

Number of unique pathways: 856


In [38]:
valid_pathways_df = sorted_df.dropna(subset=['Pathway'])

num_unique_drugs_with_pathways = valid_pathways_df['DrugBank ID'].nunique()

print("Total number of unique drugs with non-NaN pathways:", num_unique_drugs_with_pathways)

drug_targets_df = pd.read_csv("../download/drugbank_drug_targets.csv")
print("Total number of unique drugs:", drug_targets_df['drug'].nunique())
#print("Total number of unique targets:", drug_targets_df['target'].nunique())


Total number of unique drugs with non-NaN pathways: 5928
Total number of unique drugs: 6029
