In [1]:
# Libraries
import pandas as pd
# Show up to 40 columns and allow 2000-character-wide output when printing frames
pd.set_option('display.max_columns', 40)
pd.set_option('display.width', 2000)

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"

# HTML parsing and HTTP download used by the scraping functions below
from bs4 import BeautifulSoup
import requests

### Functions

In [34]:
def parse(drug_name, url_string):
    """Scrape the indication texts shown on a single drug page.

    Parameters
    ----------
    drug_name : str
        Label written to the ``drug_name`` column of every returned row.
    url_string : str
        Full URL of the drug page to download.

    Returns
    -------
    pandas.DataFrame
        Columns ``drug_name`` and ``indication``, one row per indication
        ``<span>`` found on the page. The frame is empty (but still has both
        columns) when no matching span is found.

    Raises
    ------
    requests.exceptions.RequestException
        If the download fails or exceeds the 5 second timeout.
    """
    columns = ['drug_name', 'indication']

    # The HTTP status is not checked here: whatever body comes back is parsed,
    # and a page without indication spans simply contributes no rows.
    response = requests.get(url_string, timeout=5)
    page_content = BeautifulSoup(response.content, 'lxml')
    containers = page_content.find_all(
        'span',
        {'class': 'IndicationsAndDoseContent-module--indicationText--iUQa1'})

    rows = []
    for item in containers:
        # An empty <span> has no children; skip it rather than raise IndexError.
        if not item.contents:
            continue
        # Only the first child node of the span is kept as the indication text.
        rows.append((drug_name, item.contents[0]))

    # Build the frame once instead of concatenating one row per iteration.
    return pd.DataFrame(rows, columns=columns)

In [35]:
def drug_indication_scraping(drug_list, base_url):
    """Scrape the indications of several drugs into one DataFrame.

    Parameters
    ----------
    drug_list : iterable of str
        Drug names in the form used in the page URLs; each one is appended
        directly to ``base_url``.
    base_url : str
        URL prefix shared by all drug pages (the drug name is concatenated
        with no separator, so include any trailing ``/``).

    Returns
    -------
    pandas.DataFrame
        Columns ``drug_name`` and ``indication`` with a fresh 0..n-1 index,
        holding the rows returned by ``parse`` for every drug, in input order.
        Empty (but with both columns) when ``drug_list`` is empty.
    """
    # Collect the per-drug frames first and concatenate once at the end;
    # concatenating inside the loop re-copies all earlier rows every iteration.
    frames = [parse(drug, base_url + drug) for drug in drug_list]

    if not frames:
        # pd.concat rejects an empty list, so return the empty schema directly.
        return pd.DataFrame(columns=['drug_name', 'indication'])

    return pd.concat(frames, ignore_index=True)

In [77]:
def get_all_bnf_drugs(index_url='https://bnf.nice.org.uk/drugs/',
                      first_drug="Abacavir",
                      last_drug="Zuclopenthixol decanoate"):
    """Return the drug names listed on the drug index page, in URL form.

    The page's links are read in document order, and the slice running from
    ``first_drug`` to ``last_drug`` (both inclusive) is taken as the drug
    list. Each name is then rewritten so it can be appended to the drug base
    URL: trailing whitespace is removed, ``/`` and spaces become ``-``,
    commas and parentheses are dropped, and ``é`` becomes ``e``.

    Parameters
    ----------
    index_url : str, optional
        URL of the page that lists all drugs.
    first_drug, last_drug : str, optional
        Exact link texts (before any rewriting) that mark the first and last
        entries of the drug list on the page.

    Returns
    -------
    list of str
        URL-ready drug names in page order.

    Raises
    ------
    requests.exceptions.RequestException
        If the download fails or exceeds the 5 second timeout.
    ValueError
        If ``first_drug`` or ``last_drug`` is not among the link texts, for
        example because the page layout or the drug list has changed.
    """
    response = requests.get(index_url, timeout=5)
    page_content = BeautifulSoup(response.content, 'lxml')

    # Extract the first child of every link, keeping it only when it is text.
    # Links with no children (which would raise IndexError) or whose first
    # child is a nested tag (which cannot be cleaned as a string) are skipped.
    link_texts = []
    for anchor in page_content.find_all('a'):
        if anchor.contents and isinstance(anchor.contents[0], str):
            link_texts.append(anchor.contents[0])

    # Find the location of the first and last drug in the whole list of links.
    try:
        start = link_texts.index(first_drug)
        end = link_texts.index(last_drug)
    except ValueError as exc:
        raise ValueError(
            f"Could not locate the drug list between {first_drug!r} and "
            f"{last_drug!r} on {index_url} "
            f"(HTTP status {response.status_code})"
        ) from exc

    # Applied in this order after stripping trailing whitespace, so that the
    # resulting names match the site's drug page links.
    replacements = (
        ("/", "-"),
        (",", ""),
        ("(", ""),
        (")", ""),
        (" ", "-"),
        ("é", "e"),
    )

    filtered_drug_list = []
    for name in link_texts[start:end + 1]:
        slug = name.rstrip()
        for old, new in replacements:
            slug = slug.replace(old, new)
        filtered_drug_list.append(slug)

    return filtered_drug_list

### Run

In [None]:
# Collect the indications of every drug listed on the BNF site
base_url = 'https://bnf.nice.org.uk/drugs/'
drug_list = get_all_bnf_drugs()
final_drug_df = drug_indication_scraping(drug_list=drug_list, base_url=base_url)

In [80]:
# Preview the scraped drug/indication table
final_drug_df

Unnamed: 0,drug_name,indication
0,Abacavir,HIV infection in combination with other antire...
1,Abacavir-with-dolutegravir-and-lamivudine,HIV infection
2,Abacavir-with-lamivudine,HIV infection in combination with other antire...
3,Abacavir-with-lamivudine-and-zidovudine,HIV infection (use only if patient is stabilis...
4,Abatacept,Moderate-to-severe active rheumatoid arthritis...
...,...,...
5474,Zuclopenthixol,Schizophrenia and other psychoses
5475,Zuclopenthixol-acetate,"Short-term management of acute psychosis,"
5476,Zuclopenthixol-acetate,"Short-term management of mania,"
5477,Zuclopenthixol-acetate,Short-term management of exacerbation of chron...


In [83]:
# Distinct drugs with at least one scraped indication, then drugs requested
final_drug_df['drug_name'].nunique()
len(drug_list)

1695

1701

In [84]:
# Drugs from drug_list with no rows in final_drug_df - mainly entries that
# only interact with drugs, or whose link did not work
set(drug_list) - set(final_drug_df.drug_name.unique().tolist())

{'Anti-D-Rh',
 'Cranberry',
 'Dairy-products',
 'Enteral-feeds',
 'Grapefruit',
 "St-John's-wort"}

In [87]:
# Save
#final_drug_df.to_csv('bnf_drug_indications.csv', index=False)