In [1]:
! pip install pandas lxml

^C


In [9]:
import pandas as pd
import lxml.etree as ET

In [16]:
def parse_drugbank_xml(xml_file_path):
    tree = ET.parse(xml_file_path)
    root = tree.getroot()

    ns = {'db': 'http://www.drugbank.ca'}

    drugs = []
    atc_codes = []

    for drug in root.findall('db:drug', ns):
        drug_name = drug.find('db:name', ns).text if drug.find('db:name', ns) is not None else 'No Name'
        
        atc_code_elements = drug.findall('db:atc-codes/db:atc-code', ns)
        codes = [code.get('code') for code in atc_code_elements] if atc_code_elements else ['No ATC Code']

        drugs.append(drug_name)
        atc_codes.append(', '.join(codes))
    
    return pd.DataFrame({'Drug': drugs, 'ATC Codes': atc_codes})

In [4]:
def parse_drugbank_xml(xml_file_path):
    tree = ET.parse(xml_file_path)
    root = tree.getroot()

    ns = {'db': 'http://www.drugbank.ca'}

    drugs = []
    atc_codes = []
    drugbank_ids = []
    pathways = []

    for drug in root.findall('db:drug', ns):
        drug_name = drug.find('db:name', ns).text if drug.find('db:name', ns) is not None else 'No Name'
        drugbank_id = drug.find('db:drugbank-id', ns).text if drug.find('db:drugbank-id', ns) is not None else 'No ID'
        
        atc_code_elements = drug.findall('db:atc-codes/db:atc-code', ns)
        codes = [code.get('code') for code in atc_code_elements] if atc_code_elements else ['No ATC Code']
        
        pathway_elements = drug.findall('db:pathways/db:pathway/db:name', ns)
        drug_pathways = [pathway.text for pathway in pathway_elements] if pathway_elements else ['No Pathways']

        drugs.append(drug_name)
        drugbank_ids.append(drugbank_id)
        atc_codes.append(', '.join(codes))
        pathways.append(', '.join(drug_pathways))
    
    return pd.DataFrame({'Drug': drugs, 'DrugBank ID': drugbank_ids, 'ATC Codes': atc_codes, 'Pathways': pathways})

In [18]:
def parse_drugbank_xml(xml_file_path):
    tree = ET.parse(xml_file_path)
    root = tree.getroot()

    ns = {'db': 'http://www.drugbank.ca'}

    atc_codes = []
    drugbank_ids = []

    for drug in root.findall('db:drug', ns):
        drugbank_id = drug.find('db:drugbank-id', ns).text if drug.find('db:drugbank-id', ns) is not None else 'No ID'
        
        atc_code_elements = drug.findall('db:atc-codes/db:atc-code', ns)
        codes = [code.get('code') for code in atc_code_elements] if atc_code_elements else ['No ATC Code']

        drugbank_ids.append(drugbank_id)
        atc_codes.append(', '.join(codes))
    
    return pd.DataFrame({'DrugBank ID': drugbank_ids, 'ATC Codes': atc_codes})

In [19]:
xml_path = 'full_database.xml'
df = parse_drugbank_xml(xml_path)
print(df.head())
#df.to_csv('drugbank_atc_codes_out.csv', index=False)
#print("CSV file has been created.")

  DrugBank ID ATC Codes
0     DB00001   B01AE02
1     DB00002   L01FE01
2     DB00003   R05CB13
3     DB00004   L01XX29
4     DB00005   L04AB01


In [5]:
df = pd.read_csv('drugbank_atc_codes_out.csv')

df = df.drop(['Drug', 'Pathways'], axis=1)

df['ATC Codes'] = df['ATC Codes'].apply(lambda x: x if x == 'No ATC Code' else x.split(',')[0][0])

df = df.rename(columns={'ATC Codes': 'First letter ATC Codes'})

print(df.head())
print(df.tail())

  DrugBank ID First letter ATC Codes
0     DB00001                      B
1     DB00002                      L
2     DB00003                      R
3     DB00004                      L
4     DB00005                      L
      DrugBank ID First letter ATC Codes
16576     DB18713            No ATC Code
16577     DB18714            No ATC Code
16578     DB18715            No ATC Code
16579     DB18716            No ATC Code
16580     DB18717            No ATC Code


In [6]:
df.to_csv('drugbank_drug_and_first_letter_atc_codes.csv', index=False)

In [7]:
def assign_groups(df, column_name):
    atc_to_group = {}
    group_id = 0
    
    for index, row in df.iterrows():
        pathways = row[column_name]
        
        if pathways not in atc_to_group:
            atc_to_group[pathways] = group_id
            group_id += 1
        
        df.at[index, 'group'] = atc_to_group[pathways]
    
    return df

In [8]:
df = pd.read_csv('drugbank_drug_and_first_letter_atc_codes.csv')
df = assign_groups(df, 'First letter ATC Codes')
print(df.head())
print(df.tail())

  DrugBank ID First letter ATC Codes  group
0     DB00001                      B    0.0
1     DB00002                      L    1.0
2     DB00003                      R    2.0
3     DB00004                      L    1.0
4     DB00005                      L    1.0
      DrugBank ID First letter ATC Codes  group
16576     DB18713            No ATC Code    5.0
16577     DB18714            No ATC Code    5.0
16578     DB18715            No ATC Code    5.0
16579     DB18716            No ATC Code    5.0
16580     DB18717            No ATC Code    5.0


In [12]:
group_counts = df['group'].value_counts(normalize=True) * 100

all_groups = pd.Series(range(15))
group_percentages = all_groups.map(group_counts).fillna(0)

for group, percentage in group_percentages.items():
    print(f"Group {group}: {percentage:.2f}%")

Group 0: 1.03%
Group 1: 2.58%
Group 2: 1.30%
Group 3: 1.15%
Group 4: 0.40%
Group 5: 79.07%
Group 6: 2.49%
Group 7: 1.11%
Group 8: 0.98%
Group 9: 1.15%
Group 10: 2.13%
Group 11: 3.08%
Group 12: 0.72%
Group 13: 2.21%
Group 14: 0.60%


In [13]:
df_tsv = pd.read_csv('drug-mappings.tsv', sep='\t')
df_mappings = df_tsv[['drugbankId', 'chembl_id']]
print(df_mappings.head())

  drugbankId      chembl_id
0    DB13088  CHEMBL3545177
1    DB13089   CHEMBL230006
2    DB13082   CHEMBL260829
3    DB13083   CHEMBL459505
4    DB13080            NaN


In [14]:
df2 = pd.merge(df, df_mappings, left_on='DrugBank ID', right_on='drugbankId', how='left')

# drop the extra 'drugbankId' column if it exists after merging
df2.drop('drugbankId', axis=1, inplace=True)

print(df2.head())

  DrugBank ID First letter ATC Codes  group      chembl_id
0     DB00001                      B    0.0  CHEMBL1201666
1     DB00002                      L    1.0  CHEMBL1201577
2     DB00003                      R    2.0  CHEMBL1201431
3     DB00004                      L    1.0  CHEMBL1201550
4     DB00005                      L    1.0  CHEMBL1201572


In [16]:
df2.to_csv('drugbank_id_ATC_group_chembl_id.csv', index=False)

In [15]:
count = 0
for row in df.itertuples():
    if row._2 == 'No ATC Code':
        count+=1

print(f"Number of drugs without ATC codes: {count}")
print(f"Total number of drugs: {len(df)}")
print(f"Percentage of drugs without ATC codes: {count/len(df)*100:.2f}%")

Number of drugs without ATC codes: 13111
Total number of drugs: 16581
Percentage of drugs without ATC codes: 79.07%


from ./download/drugbank_drug_targets.csv which is a dataframe consisting of 2 columsn called 'DrugBank ID', 'target'
load drugbank_drug_targets as a pandas dataframe
from /full_database/drugbank_atc_codes_out.csv
load drugbank_atc_codes_out as a pandas dataframe
for every drug entry in drugbank_drug_targets, find the corresponding drug entry in drugbank_atc_codes_out and append the entire row to the dataframe
the new dataframe should look like 'DrugBank ID', 'target', 'Drug', 'DrugBank ID', 'ATC Codes', and 'Pathways'
then drop the 'DrugBank ID' 4th column
now sort the dataframe by the 'Pathways' in descending order
convert the dataframe to a csv

In [11]:
def merge_and_sort_data(drug_targets_path, atc_codes_path, output_csv_path):
    drug_targets_df = pd.read_csv(drug_targets_path)
    atc_codes_df = pd.read_csv(atc_codes_path)

    drug_targets_df = drug_targets_df.rename(columns={'drug': 'DrugBank ID'})

    merged_df = pd.merge(drug_targets_df, atc_codes_df, on='DrugBank ID', how='left')
    
    merged_df.sort_values(by='Pathways', ascending=False, inplace=True)

    # drop the duplicate 'DrugBank ID' column
    merged_df.drop(merged_df.columns[3], axis=1, inplace=True)
    
    merged_df.to_csv(output_csv_path, index=False)
    
    print("First 10 elements:")
    print(merged_df.head(10))
    print("\nLast 10 elements:")
    print(merged_df.tail(10))

In [None]:
merge_and_sort_data('../download/drugbank_drug_targets.csv',
                    'drugbank_atc_codes_out.csv',
                    'merged_data.csv')

In [6]:
df = pd.read_csv('merged_data.csv')

split_df = df.assign(Pathways=df['Pathways'].str.split(', ')).explode('Pathways')
split_df = split_df.rename(columns={'Pathways' : 'Pathway'})

print(split_df.head())
print(split_df.tail())

  DrugBank ID     target          Drug   
0     DB03088  BE0002942  Pidolic acid  \
0     DB03088  BE0002942  Pidolic acid   
0     DB03088  BE0002942  Pidolic acid   
0     DB03088  BE0002942  Pidolic acid   
0     DB03088  BE0002942  Pidolic acid   
0     DB03088  BE0002942  Pidolic acid   
1     DB03088  BE0001671  Pidolic acid   
1     DB03088  BE0001671  Pidolic acid   
1     DB03088  BE0001671  Pidolic acid   
1     DB03088  BE0001671  Pidolic acid   

                                   Pathway  
0  gamma-Glutamyltranspeptidase Deficiency  
0     gamma-Glutamyltransferase Deficiency  
0                5-Oxoprolinase Deficiency  
0                   Glutathione Metabolism  
0                          5-Oxoprolinuria  
0        Glutathione Synthetase Deficiency  
1  gamma-Glutamyltranspeptidase Deficiency  
1     gamma-Glutamyltransferase Deficiency  
1                5-Oxoprolinase Deficiency  
1                   Glutathione Metabolism  
      DrugBank ID     target Drug Pathway


In [7]:
sorted_df = split_df.sort_values(by='Pathway', ascending=False)
print(split_df.head)
print(split_df.tail)

<bound method NDFrame.head of       DrugBank ID     target          Drug   
0         DB03088  BE0002942  Pidolic acid  \
0         DB03088  BE0002942  Pidolic acid   
0         DB03088  BE0002942  Pidolic acid   
0         DB03088  BE0002942  Pidolic acid   
0         DB03088  BE0002942  Pidolic acid   
...           ...        ...           ...   
13673     DB08679  BE0001594           NaN   
13674     DB08696  BE0000779           NaN   
13675     DB08696  BE0004382           NaN   
13676     DB08700  BE0000915           NaN   
13677     DB08764  BE0002411           NaN   

                                       Pathway  
0      gamma-Glutamyltranspeptidase Deficiency  
0         gamma-Glutamyltransferase Deficiency  
0                    5-Oxoprolinase Deficiency  
0                       Glutathione Metabolism  
0                              5-Oxoprolinuria  
...                                        ...  
13673                                      NaN  
13674                    

In [8]:
print("Number of unique pathways:", sorted_df['Pathway'].nunique())

Number of unique pathways: 856


In [9]:
valid_pathways_df = sorted_df.dropna(subset=['Pathway'])

num_unique_drugs_with_pathways = valid_pathways_df['DrugBank ID'].nunique()

print("Total number of unique drugs with non-NaN pathways:", num_unique_drugs_with_pathways)

drug_targets_df = pd.read_csv("../download/drugbank_drug_targets.csv")
print("Total number of unique drugs:", drug_targets_df['drug'].nunique())
#print("Total number of unique targets:", drug_targets_df['target'].nunique())


Total number of unique drugs with non-NaN pathways: 5928
Total number of unique drugs: 6029


In [10]:
valid_pathways_df.to_csv('valid_pathways_df.csv', index=False)

In [7]:
data = pd.read_csv('merged_data.csv')
df = pd.DataFrame(data)
df = df.dropna(subset=['Pathways'])
df = assign_groups(df)
df = df.drop(columns=['Drug', 'Pathways'])
print(df)

      DrugBank ID     target  group
0         DB03088  BE0002942    0.0
1         DB03088  BE0001671    0.0
2         DB03088  BE0002941    0.0
3         DB03088  BE0002940    0.0
4         DB03088  BE0002939    0.0
...           ...        ...    ...
13455     DB01536  BE0000194  505.0
13456     DB01536  BE0000051  505.0
13457     DB01536  BE0001638  505.0
13458     DB00624  BE0000132  505.0
13459     DB07447  BE0003610  506.0

[13460 rows x 3 columns]


In [8]:
df.to_csv('groups.csv', index=False)


In [2]:
! pip install requests



In [3]:
import requests

In [7]:
def get_drugbank_id_from_chembl(full_chembl_id):
    chembl_id = full_chembl_id.split(':')[-1]
    
        unichem_url = f"https://www.ebi.ac.uk/unichem/rest/src_compound_id/{chembl_id}/1/2"
    
    try:
        response = requests.get(unichem_url)
        if response.status_code == 200:
            data = response.json()
            return [x['src_compound_id'] for x in data] if data else "No DrugBank ID found for this ChEMBL ID."
        else:
            return f"Failed to retrieve data from UniChem, status code: {response.status_code}"
    except Exception as e:
        return f"An error occurred: {str(e)}"
    
full_chembl_id = "CHEMBL.COMPOUND:CHEMBL1200531"
drugbank_ids = get_drugbank_id_from_chembl(full_chembl_id)
print(drugbank_ids)


Failed to retrieve data from UniChem, status code: 404


In [None]:
df = pd.read_csv('../../pairs.csv', header=None) 

list = df[1].tolist()

for id in list:
    drugbank_ids = get_drugbank_id_from_chembl(id)
    print(drugbank_ids)

In [45]:
groups = {'Fold 0': [(7, 402)], 'Fold 1': [(15, 11)], 'Fold 2': [(2, 1026)], 'Fold 3': [(13, 130)], 'Fold 4': [(3, 764), (10, 207), (11, 37), (12, 408)], 'Fold 5': [(0, 5588), (5, 1247)], 'Fold 6': [(4, 1948), (14, 77)], 'Fold 7': [(1, 4898)], 'Fold 8': [(6, 167), (8, 259)], 'Fold 9': [(9, 385)]}


fold_index = 0
df = pd.DataFrame
for fold, group_data in groups.items():
#group_data = groups[f"Fold {fold_index}"]
    list = []
    for (x,y) in group_data:
        entry = ('Group ', x, ' with ', y,' instances')
        list.append(entry)

#print('Group data', ''.join(str(x) for x in group_data))
    for x in list:
        print(''.join(map(str, x)))
    #print(''.join(map(str, list[0])))

Group 7 with 402 instances
Group 15 with 11 instances
Group 2 with 1026 instances
Group 13 with 130 instances
Group 3 with 764 instances
Group 10 with 207 instances
Group 11 with 37 instances
Group 12 with 408 instances
Group 0 with 5588 instances
Group 5 with 1247 instances
Group 4 with 1948 instances
Group 14 with 77 instances
Group 1 with 4898 instances
Group 6 with 167 instances
Group 8 with 259 instances
Group 9 with 385 instances


In [56]:
import pandas as pd


groups = {'Fold 0': [(7, 402)], 'Fold 1': [(15, 11)], 'Fold 2': [(2, 1026)], 'Fold 3': [(13, 130)], 'Fold 4': [(3, 764), (10, 207), (11, 37), (12, 408)], 'Fold 5': [(0, 5588), (5, 1247)], 'Fold 6': [(4, 1948), (14, 77)], 'Fold 7': [(1, 4898)], 'Fold 8': [(6, 167), (8, 259)], 'Fold 9': [(9, 385)]}

fold_number = 4

fold = groups.get(f"Fold {fold_number}", None)

list = []
for (x,y) in fold:
    entry = ('Group ', x, ' with ', y,' instances')
    list.append(entry)

sentence = 'Group partitions: '
for x in list:
   sentence += ''.join(map(str, x)) + ', '

print(sentence)

def get_group_partitions_string(groups, fold_number):
    fold = groups.get(f"Fold {fold_number}", None)

    list = []
    for (x,y) in fold:
        entry = ('Group ', x, ' with ', y,' instances')
        list.append(entry)

    sentence = 'Group partitions: '
    for x in list:
        sentence += ''.join(map(str, x)) + ', '
    
    return sentence

print(get_group_partitions_string(groups, 2))


Group partitions: Group 3 with 764 instances, Group 10 with 207 instances, Group 11 with 37 instances, Group 12 with 408 instances, 
Group partitions: Group 2 with 1026 instances, 


In [6]:

import pandas as pd

test_set_groups_and_counts = [[(1, 1004)], [(5, 66), (9, 384)], [(0, 1178), (14, 35)], [(6, 765)], [(3, 256)], [(12, 1959)], [(13, 412), (15, 162)], [(2, 5610), (4, 4953), (11, 175)], [(7, 16), (10, 201)], [(8, 378)]]

test_set_groups_and_counts_2 = [[(1, 963)], [(3, 245), (8, 435)], [(7, 17)], [(9, 412)], [(11, 146), (14, 42), (15, 161)], [(5, 66), (13, 402)], [(6, 740), (12, 1953)], [(4, 4939), (10, 222)], [(2, 5616)], [(0, 1195)]]

groups_df = pd.DataFrame()

group_strings = [str(fold) for fold in test_set_groups_and_counts]
df = pd.DataFrame(group_strings, columns=['(Group number, Group instance count)'])
groups_df = pd.concat([groups_df, df], ignore_index=True)
group_strings = [str(fold) for fold in test_set_groups_and_counts_2]
df = pd.DataFrame(group_strings, columns=['(Group number, Group instance count)'])
groups_df = pd.concat([groups_df, df], ignore_index=True)

groups_df.to_csv("groups_and_counts_test.csv", sep=",", index=False)