# Import Libraries

In [None]:
from bs4 import BeautifulSoup
import requests
import csv
import time
import numpy as np
import re
import pandas as pd
import json
import joblib

from selenium import webdriver
from selenium.webdriver import Firefox, Chrome, Edge
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options

# Web Scraping

## DrugBank Online
Site: https://go.drugbank.com/drugs

## Retrieve Data from All Pages

In [None]:
%%time
# Drug Names
drug_names = []
# DrugBank Accession Numbers
drug_bans = []

# Pages whose completion is announced on stdout (1, 21, 41, 61, 81).
progress_pages = range(1, 101, 20)

# One session reuses the HTTP connection across all listing pages.
with requests.Session() as session:
    for page in range(1, 111):
        url = f"https://go.drugbank.com/drugs?approved=1&c=name&d=up&page={page}"
        # A timeout keeps one stalled response from hanging the whole run;
        # raise_for_status stops on HTTP errors instead of silently parsing
        # an error page that contains no drug rows.
        req = session.get(url, timeout=30)
        req.raise_for_status()
        soup = BeautifulSoup(req.text, "html.parser")

        drugs = soup.find_all(
            'td', {'class': 'name-value text-sm-center drug-name'})

        for dr in drugs:
            link = dr.find('a')
            href = link.get("href") if link is not None else None
            if not href:
                # Skip the whole row so names and accession numbers stay
                # aligned index-for-index.
                continue
            # The accession number is the third '/'-separated piece of the href.
            ban = href.split('/')[2]
            drug_names.append(dr.text)
            drug_bans.append(ban)

        if page in progress_pages:
            print("Completed Scraping:", page)

## Convert to Pandas DataFrame

In [None]:
# Stack the two parallel lists as labelled rows, then transpose so every drug
# becomes one row with a name column and an accession-number column.
test_df = pd.DataFrame(
    [drug_names, drug_bans],
    index=['Name', 'DrugBankAccessionNumber'],
).T
test_df

# Execution


## Function: Get Browser

In [None]:
 def get_driver():
     """Start a Chrome WebDriver with page-load and implicit-wait timeouts of 10.

     Returns:
         The configured ``webdriver.Chrome`` instance; the caller is
         responsible for calling ``quit()`` on it.
     """
     chrome = webdriver.Chrome()
     chrome.implicitly_wait(10)
     chrome.set_page_load_timeout(10)
     return chrome

## Function: Table Multi-page

In [None]:
def _table_header_keys(table_content):
    """Return snake_case column keys built from the ``<th>`` cells."""
    keys = []
    for th in table_content.find_all('th'):
        key = th.text.replace('\n', '').lower().replace(' ', '_').replace('-', '_')
        # This header is excluded from the columns (presumably a promotional
        # banner cell rather than real data - confirm against the page).
        if key != 'integrate_drug_drug_interactions_in_your_software':
            keys.append(key)
    return keys


def _rows_to_records(keys, tbody):
    """Map every ``<tr>`` of *tbody* to a dict pairing *keys* with cell texts.

    Pairing is positional. Extra cells are ignored, and a row with fewer
    cells than keys yields a shorter dict instead of raising IndexError.
    """
    return [
        dict(zip(keys, (cell.text for cell in row.find_all('td'))))
        for row in tbody.find_all('tr')
    ]


def table_multiple_page(dr_number:str, table_id:str, table_content,section_header):
    """Collect all rows of a drug-page table as a list of dicts.

    Args:
        dr_number: Accession number interpolated into the drug page URL.
        table_id: HTML id of the table. When falsy, the rows are read
            directly from ``table_content`` without opening a browser.
        table_content: Parsed element that contains the table (its ``<th>``
            cells provide the dict keys).
        section_header: URL fragment appended after ``#`` when the page is
            opened in the browser.

    Returns:
        One dict per table row, keyed by the normalised header names.
    """
    keys = _table_header_keys(table_content)

    if not table_id:
        # No id to address the table in a live page: parse the static markup.
        return _rows_to_records(keys, table_content.find('table').find('tbody'))

    # Imported here so the static path above works without a browser stack.
    from selenium.common.exceptions import WebDriverException

    records = []
    next_xpath = f'//*[@id="{table_id}_next"]/a'
    progress_pages = range(1, 101, 10)  # announce pages 1, 11, ..., 91

    browser = Chrome()
    try:
        browser.maximize_window()
        wait = WebDriverWait(browser, timeout=10)
        browser.get(f"https://go.drugbank.com/drugs/{dr_number}#{section_header}")

        page = 1
        while True:
            # Give the table time to (re)draw before reading the DOM.
            time.sleep(1.5)
            # Name the parser explicitly so results do not depend on which
            # parser bs4 happens to pick as its default.
            page_soup = BeautifulSoup(browser.page_source, 'lxml')

            table_body = page_soup.find('table', {'id': f'{table_id}'}).find('tbody')

            if page in progress_pages:
                print('Page:', page)
            info = page_soup.find('div', {'class': 'dataTables_info', 'id': f'{table_id}_info'})
            if info:
                print(info.text)

            records.extend(_rows_to_records(keys, table_body))

            # Stop explicitly on a disabled "next" control instead of relying
            # on the click raising. Assumes the DataTables convention of a
            # 'disabled' class on the exhausted button - confirm on live markup.
            next_item = page_soup.find(id=f'{table_id}_next')
            if next_item is not None and 'disabled' in (next_item.get('class') or []):
                break

            try:
                next_element = wait.until(
                    EC.visibility_of_element_located((By.XPATH, next_xpath)))
                next_element.click()
            except WebDriverException:
                # Missing or unclickable "next" control: no further pages.
                break
            page += 1
    finally:
        # Always release the browser, even when parsing fails mid-way.
        browser.quit()

    return records

## Function: Image Multi-Page

In [None]:
def image_multiple_page(dr_number:str, content, section_header):
    """Collect the distinct image URLs shown in a drug page's product carousel.

    Args:
        dr_number: Accession number interpolated into the drug page URL.
        content: Parsed carousel element; the number of ``<li>`` elements it
            contains is used as the number of slides to visit.
        section_header: URL fragment appended after ``#`` when the page is
            opened in the browser.

    Returns:
        ``src`` values of the carousel images, de-duplicated, in the order
        they were first seen.
    """
    # Imported here because the file's import cell does not provide it.
    from selenium.common.exceptions import WebDriverException

    # NOTE(review): the original hard-coded li[2] and then clicked an
    # undefined variable, so it never advanced. Clicking indicator N+1 to
    # reach the next slide is the assumed intent - confirm on the live page.
    indicator_xpath = '//*[@id="product-carousel"]/ol/li[{}]'
    slide_count = len(content.find_all('li'))
    product_images = []

    driver = get_driver()
    try:
        driver.get(f"https://go.drugbank.com/drugs/{dr_number}#{section_header}")
        time.sleep(2)

        for slide in range(1, slide_count + 1):
            time.sleep(1)
            # Re-read the DOM on every pass; a soup parsed once before the
            # loop can never show images loaded after a click.
            inner_html = driver.execute_script('return document.body.innerHTML')
            soup = BeautifulSoup(inner_html, 'lxml')
            carousel = soup.find('div', {'id': 'product-carousel-row'})
            if carousel is None:
                break

            for img in carousel.find_all('img'):
                src = img.get('src')
                if src and src not in product_images:
                    product_images.append(src)

            if slide == slide_count:
                # Last slide collected; nothing left to click.
                break
            try:
                driver.find_element(By.XPATH, indicator_xpath.format(slide + 1)).click()
            except WebDriverException:
                # Indicator missing or not clickable: keep what was gathered.
                break
    finally:
        # Close the browser on every exit path, including errors.
        driver.quit()

    return product_images

## Execution 

In [None]:
%%time
def _parse_dd_value(v, number, header_section):
    """Convert one ``<dd>`` element of a drug page into a plain Python value.

    Args:
        v: The parsed ``<dd>`` element.
        number: Accession number of the drug being scraped; forwarded to the
            table and image helpers, which open the page in a browser.
        header_section: ``id`` of the enclosing ``<h2>``, used by those
            helpers as the URL fragment.

    Returns:
        A dict, list, string or ``np.nan`` depending on what the element
        contains. The checks run in a fixed priority order.
    """
    # Fields carrying a "locked" link expose only a title/subtitle teaser.
    if v.find('a', attrs={'class': 'locked-incopy track-link'}) is not None:
        title = v.find('div', attrs={'class': 'title'})
        subtitle = v.find('div', attrs={'class': 'subtitle'})
        if title and subtitle:
            return {'title': title.text, 'subtitle': subtitle.text}
        if title:
            return title.text
        return np.nan

    table = v.find('table')
    if table:
        table_id = table.get('id')
        paragraph = v.find('p')
        rows = table_multiple_page(number, table_id, v, header_section)
        if paragraph:
            return {'description': paragraph.text, table_id: rows}
        return {table_id: rows}

    carousel = v.find('div', {'id': 'product-carousel-row'})
    if carousel:
        return image_multiple_page(number, carousel, header_section)

    image = v.find('img')
    if image:
        # The src is joined onto the site root, i.e. treated as site-relative.
        return 'https://go.drugbank.com' + image.get('src')

    if v.find('dl'):
        terms = [dt.text for dt in v.find_all('dt')]
        definitions = [dd.text for dd in v.find_all('dd')]
        return dict(zip(terms, definitions))

    if v.find('a'):
        link_texts = [a.text for a in v.find_all('a')]
        item_texts = [li.text for li in v.find_all('li')]
        return dict(zip(link_texts, item_texts))

    if v.find('ul'):
        # NOTE(review): the original had a dedicated branch for
        # 'list-unstyled metabolite-pathway' lists placed after this check,
        # where it could never run. It is dropped here; restoring it ahead of
        # the 'a'/'ul' checks would change the stored schema.
        return [li.text for li in v.find_all('li')]

    return v.text


drug_cat = []
# Set index of start drug scraping
drug_index = 286

for number in test_df.DrugBankAccessionNumber[drug_index::]:

    url = f"https://go.drugbank.com/drugs/{number}"
    driver = get_driver()
    try:
        driver.get(url)
        innerHTML = driver.execute_script('return document.body.innerHTML')
    finally:
        # The HTML snapshot is all that is needed from this browser; without
        # quit() one Chrome process per drug was left running.
        driver.quit()
    main_soup = BeautifulSoup(innerHTML, 'lxml')

    time.sleep(1)

    # Drug information: only <dl> lists without a class attribute are paired
    # with the section headings below.
    dls = [dl for dl in main_soup.find_all('dl') if dl.get('class') is None]
    drug_name = main_soup.find('h1').text

    print('-'*100,f'\n({drug_index}) {drug_name} ({number})')

    main_keys = ['Name','DrugBankAccessionNumber']
    main_values = [drug_name,number]
    time.sleep(1)

    h2s = [h2 for h2 in main_soup.find_all('h2') if h2.get('id')]

    h3s = [h3.text for h3 in main_soup.find_all('h3')]

    # Sections: the n-th <h2> with an id is paired with the n-th class-less
    # <dl> by position.
    for h2_index, h2 in enumerate(h2s):
        section = dls[h2_index]
        h2keys = [dt.get('id') for dt in section.find_all('dt')]
        h2values = [
            _parse_dd_value(dd, number, h2.get('id'))
            for dd in section.find_all('dd')
        ]

        main_keys.append(h2.text)
        main_values.append(dict(zip(h2keys, h2values)))

    # Bond lists: one entry per 'bond-list-container' div, keyed by its id.
    for container in main_soup.find_all('div',{'class':'bond-list-container'}):
        header = container.get('id')

        d_bond_cards = []
        for card in container.find_all('div',{'class':'bond card'}):
            sub_header = card.find('strong').text
            h3keys = [dt.text.replace(' ', '_').lower() for dt in card.find_all('dt')]
            h3values = [dd.text for dd in card.find_all('dd')]
            d_bond_cards.append({sub_header: dict(zip(h3keys, h3values))})

        main_keys.append(header)
        main_values.append(d_bond_cards)

    # Extra progress marker at indexes 1, 101, 201, ...
    if drug_index in range(1, len(test_df), 100):
        print(drug_index)
    print('Main Keys:',len(main_keys))
    print('Main Values:',len(main_values))
    drug_dict = dict(zip(main_keys,main_values))

    drug_cat.append(drug_dict)
    # Rewrite the pickle after every drug so an interrupted run keeps what
    # has been scraped so far.
    joblib.dump(drug_cat, 'drug_list.pkl')
    drug_index += 1

print('Completed Scraping')

### Read CSV of Previously Scraped Data

In [None]:
# Load the drug records stored in the CSV file.
path = 'drugBank.csv'
df1 = pd.read_csv(filepath_or_buffer=path)

### Load Variable

In [None]:
# Restore the list of per-drug dicts written by the scraping loop and put it
# into a DataFrame (one row per drug).
path = 'drug_list.pkl'
drug_temp_list = joblib.load(filename=path)
df2 = pd.DataFrame(data=drug_temp_list)
df2

### Concatenation df1 + df2

### Concatenation

In [None]:
# Append the newly scraped rows below the stored ones and renumber the index.
frames = [df1, df2]
drugs_concat = pd.concat(frames, ignore_index=True)
drugs_concat

Unnamed: 0,Name,DrugBankAccessionNumber,Identification,Pharmacology,Interactions,Products,Categories,Chemical Identifiers,References,Clinical Trials,Pharmacoeconomics,Properties,Spectra,targets,enzymes,carriers,transporters
0,1-Palmitoyl-2-oleoyl-sn-glycero-3-(phospho-rac...,DB11331,{'summary': '1-Palmitoyl-2-oleoyl-sn-glycero-3...,{'indication': {'title': 'Reduce drug developm...,{'drug-interactions': 'This information should...,{None: {'title': 'Drug product information fro...,"{'drug-categories': 'Not Available', 'classifi...","{'unii': {}, 'cas-number': '185435-28-3', 'inc...","{'general-references': 'Not Available', 'exter...","{'clinical-trials': {None: [{'phase': '2', 'st...","{'manufacturers': 'Not Available', 'packagers'...","{'state': 'Not Available', 'experimental-prope...","{'mass-spec': 'Not Available', 'spectra': 'Not...",,,,
1,"1,2-Benzodiazepine",DB12537,"{'generic-name': '1,2-Benzodiazepine', 'drugba...",{'indication': {'title': 'Reduce drug developm...,{'drug-interactions': {'drug-interactions-tabl...,,{'drug-categories': {'Benzazepines': 'Benzazep...,"{'unii': {}, 'cas-number': '264-60-8', 'inchi-...","{'general-references': 'Not Available', 'exter...","{'clinical-trials': {None: [{'phase': '4', 'st...","{'manufacturers': 'Not Available', 'packagers'...","{'state': 'Not Available', 'experimental-prope...","{'mass-spec': 'Not Available', 'spectra': {'sp...",[{'1. GABA(A) Receptor (Protein Group)': {'kin...,[{'1. Cytochrome P450 3A4': {'kind': 'Protein'...,,
2,"1,2-Distearoyllecithin",DB14099,"{'brand-names': 'Lumason', 'generic-name': '1,...",{'indication': {'title': 'Reduce drug developm...,{'drug-interactions': 'This information should...,{None: {'title': 'Drug product information fro...,{'drug-categories': {'Glycerophosphates': 'Gly...,"{'unii': {}, 'cas-number': '4539-70-2', 'inchi...","{'general-references': 'Not Available', 'exter...","{'clinical-trials': {None: [{'phase': '4', 'st...","{'manufacturers': 'Not Available', 'packagers'...","{'state': 'Solid', 'experimental-properties': ...","{'mass-spec': 'Not Available', 'spectra': 'Not...",,,,
3,"1,2-icosapentoyl-sn-glycero-3-phosphoserine",DB14096,"{'brand-names': 'EnBrace HR, EnLyte', 'generic...",{'indication': {'title': 'Reduce drug developm...,{'drug-interactions': 'This information should...,{None: {'title': 'Drug product information fro...,"{'drug-categories': 'Not Available', 'classifi...","{'unii': {}, 'cas-number': 'Not Available', 'i...","{'general-references': 'Not Available', 'exter...","{'clinical-trials': {None: [{'phase': '4', 'st...","{'manufacturers': 'Not Available', 'packagers'...","{'state': 'Solid', 'experimental-properties': ...","{'mass-spec': 'Not Available', 'spectra': 'Not...",,,,
4,2-mercaptobenzothiazole,DB11496,"{'generic-name': '2-mercaptobenzothiazole', 'd...",{'indication': {'title': 'Reduce drug developm...,{'drug-interactions': 'This information should...,{None: {'title': 'Drug product information fro...,{'drug-categories': {'Anti-Infective Agents': ...,"{'unii': {}, 'cas-number': '149-30-4', 'inchi-...",{'general-references': {'Article': 'Agnusdei C...,{'clinical-trials': {None: []}},"{'manufacturers': 'Not Available', 'packagers'...","{'state': 'Not Available', 'experimental-prope...","{'mass-spec': 'Not Available', 'spectra': {'sp...","[{'1. Thyroid peroxidase': {'kind': 'Protein',...",,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454,Cefpodoxime,DB01416,{'summary': 'Cefpodoxime is a third-generation...,{'indication': {'title': 'Reduce drug developm...,{'drug-interactions': {'drug-interactions-tabl...,{None: {'title': 'Drug product information fro...,{'atc-codes': {'J01DD13 — Cefpodoxime': 'J01DD...,"{'unii': {}, 'cas-number': '80210-62-4', 'inch...","{'synthesis-reference': {}, 'general-reference...","{'clinical-trials': {None: [{'phase': '4', 'st...","{'manufacturers': 'Not Available', 'packagers'...","{'state': 'Solid', 'experimental-properties': ...","{'mass-spec': 'Not Available', 'spectra': {'sp...",[{'1. Peptidoglycan synthase FtsI': {'kind': '...,,,
455,Cefprozil,DB01150,{'summary': 'Cefprozil is a cephalosporin anti...,{'indication': {'title': 'Reduce drug developm...,{'drug-interactions': {'drug-interactions-tabl...,{None: {'title': 'Drug product information fro...,{'atc-codes': {'J01DC10 — Cefprozil': 'J01DC —...,"{'unii': {}, 'cas-number': '92665-29-7', 'inch...","{'synthesis-reference': {}, 'general-reference...","{'clinical-trials': {None: [{'phase': '1', 'st...","{'manufacturers': 'Not Available', 'packagers'...","{'state': 'Solid', 'experimental-properties': ...","{'mass-spec': 'Not Available', 'spectra': {'sp...",[{'1. Penicillin-binding protein 1A': {'kind':...,,,
456,Cefradine,DB01333,{'summary': 'Cefradine is a first-generation c...,{'indication': {'title': 'Reduce drug developm...,{'drug-interactions': {'drug-interactions-tabl...,{None: {'title': 'Drug product information fro...,{'atc-codes': {'J01DB09 — Cefradine': 'J01DB —...,"{'unii': {}, 'cas-number': '38821-53-3', 'inch...","{'synthesis-reference': {}, 'general-reference...",{'clinical-trials': {None: []}},"{'manufacturers': 'Not Available', 'packagers'...","{'state': 'Solid', 'experimental-properties': ...","{'mass-spec': 'Not Available', 'spectra': {'sp...",[{'1. Penicillin-binding protein 1A': {'kind':...,[{'1. Cytochrome P450 3A4': {'kind': 'Protein'...,,[{'1. Solute carrier family 22 member 5': {'ki...
457,Ceftaroline fosamil,DB06590,{'summary': 'Ceftaroline fosamil is an antibac...,{'indication': {'title': 'Reduce drug developm...,{'drug-interactions': {'drug-interactions-tabl...,{None: {'title': 'Drug product information fro...,{'atc-codes': {'J01DI02 — Ceftaroline fosamil'...,"{'unii': {}, 'cas-number': '229016-73-3', 'inc...",{'synthesis-reference': 'http://www.google.com...,"{'clinical-trials': {None: [{'phase': '4', 'st...","{'manufacturers': 'Not Available', 'packagers'...","{'state': 'Solid', 'experimental-properties': ...","{'mass-spec': 'Not Available', 'spectra': {'sp...",,,,


# Export to CSV

In [None]:
# Write the combined table to the CSV file without the index column.
drugs_concat.to_csv(path_or_buf='drugBank.csv', index=False)