# Hospital Names Worldwide via Wiki
**Website:** [List of Hospitals in the World via Wikipedia](https://en.wikipedia.org/wiki/Lists_of_hospitals)
***

In [15]:
import re
from bs4 import BeautifulSoup
from requests import get

In [16]:
# Prefixes of scraped strings that are not hospital names (false positives).
# Kept as a tuple so it can be passed directly to str.startswith() when the
# scrapers filter their results.
list_of_noise = ("Template", "List","Category", "Not", "Guid", " ", "\n", "You", "Find", "Support", "Visit", "About", "How",
                 "Permanent", "More", "Information", "Text", "Wiki", "Feature", "wmf", "History", "Since", "<", "ALERT")

## Helper Functions

In [17]:
# Checks if the link has no page
def check_content(soup):
    """
    Checks whether the page that was returned has content.

    Scans every ``div`` in ``soup`` and looks at the first entry of its
    ``class`` attribute. A first class name that starts with
    ``noarticletext`` marks the page as having no content.

    Parameters:
        soup: parsed page exposing ``find_all``; each returned element must
            support ``get("class")``.

    Returns:
        False if any ``div`` has a first class starting with
        ``noarticletext``, True otherwise.
    """

    for section in soup.find_all("div"):
        # A div may have no class attribute, or an empty one; skip those
        # explicitly instead of relying on a catch-all except clause.
        classes = section.get("class") or []
        if classes and str(classes[0]).startswith("noarticletext"):
            return False
    return True

In [18]:
def retrieve_list_format(name_list, new_soup, new_content_list):
    """
    Retrieves all the list tags and extracts all list elements.

    For every ``li`` found inside the given lists, only its first child is
    inspected:

    * If it renders as an anchor (``<a href`` / ``<a class``), its ``title``
      attribute is read. A title starting with "List" is stored as-is;
      otherwise the anchor's first child (its text) is stored. Anchors
      without a ``title`` are skipped.
    * If it is not an anchor and its rendering contains no "<", it is
      stored as plain text.

    Nothing is extracted when ``check_content(new_soup)`` reports an empty
    page.

    Parameters:
        name_list: list that extracted names are appended to (mutated in place).
        new_soup: parsed page, used only for the empty-page check.
        new_content_list: iterable of list elements exposing ``find_all("li")``.

    Returns:
        The same ``name_list`` object that was passed in.
    """

    # Checks if the wiki-page is not empty
    if not check_content(new_soup):
        return name_list

    # Iterate through all unordered lists and their list elements
    for ul in new_content_list:
        for li in ul.find_all("li"):
            try:
                first_child = li.contents[0]
                markup = str(first_child)
                if markup.startswith(("<a href", "<a class")):
                    # The title is looked up first on purpose: an anchor
                    # without a title raises KeyError and is skipped.
                    title = first_child["title"]
                    if title.startswith("List"):
                        name_list.append(str(title))
                    else:
                        name_list.append(str(first_child.contents[0]))
                elif "<" not in markup:
                    name_list.append(markup)
            except (AttributeError, IndexError, KeyError, TypeError):
                # Empty <li>, anchor without title/text, or unexpected node
                # type: skip this element and keep going.
                continue
    return name_list

In [19]:
def retrieve_table_format(name_list, new_soup, content_list):
    """
    Retrieves all the table tags and extracts the first element of each row.

    The first row of every table is skipped. For each remaining row the
    first child of its first ``td`` is inspected: if it renders as an anchor
    (``<a href`` / ``<a class``) the first child of the cell's first anchor
    is stored, otherwise the child itself is stored as a string.

    Parameters:
        name_list: list that extracted names are appended to (mutated in place).
        new_soup: parsed page; its navboxes are removed before extraction.
        content_list: iterable of table elements exposing ``find_all("tr")``.

    Returns:
        The same ``name_list`` object that was passed in.
    """

    remove_navbox(new_soup)

    for table in content_list:
        # tr_list[0] is skipped (treated as the header row)
        for tr in table.find_all("tr")[1:]:
            try:
                first_cell_item = tr.td.contents[0]
                if re.match("<a href|<a class", str(first_cell_item)):
                    name_list.append(str(tr.td.a.contents[0]))
                else:
                    name_list.append(str(first_cell_item))
            except (AttributeError, IndexError):
                # Row without a <td> (tr.td is None) or an empty cell/anchor:
                # nothing to extract from this row.
                continue
    return name_list

In [20]:
def retrieve_html(url, timeout=30):
    """
    Retrieves html from webpage and returns content of interest.

    Parameters:
        url: address of the page to download.
        timeout: seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the scrape forever.

    Returns:
        A ``(soup, mw_output)`` tuple: the parsed page and its first
        ``div`` with class ``mw-parser-output`` (``None`` if the page has
        no such element).

    Note:
        The HTTP status is deliberately not checked; the page body is parsed
        whatever the status code is.
    """

    response = get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    mw_output = soup.find("div", class_="mw-parser-output")

    return soup, mw_output

In [21]:
def num_hospitals(hospitals):
    """Print how many distinct entries ``hospitals`` contains."""
    # Duplicates are collapsed before counting
    unique_count = len(set(hospitals))
    print("Number of Hospitals: {}".format(unique_count))

In [22]:
def print_hospitals(hospitals):
    """Print every hospital name, each followed by an empty line."""
    for name in hospitals:
        # print() adds its own newline, so each name ends with a blank line
        padded_name = name + "\n"
        print(padded_name)

In [23]:
def remove_navbox(soup):
    """
    Strips every navigation box out of the page.

    Each ``div`` with class ``navbox`` is decomposed, which modifies
    ``soup`` in place. Nothing is returned.
    """
    navboxes = soup.find_all("div", class_="navbox")
    for box in navboxes:
        box.decompose()

In [24]:
def extract_from_US(name_list, new_soup, new_mw_output):
    """
    Extracts content from the US page, which is separated by state.

    The lists inside the first table of the page are read to find the
    per-state pages; every entry starting with "List" is then downloaded and
    scraped in both list and table format.

    Parameters:
        name_list: list of names collected so far (extended in place).
        new_soup: parsed US overview page.
        new_mw_output: content element of the US overview page.

    Returns:
        ``name_list`` with the names found on the visited pages appended.
    """
    # Only the first table of the overview page is used as the state index
    index_lists = new_mw_output.find_all("table")[0].find_all("ul")

    # NOTE: retrieve_list_format appends to and returns name_list itself, so
    # `states` also holds every name collected before this call.
    states = retrieve_list_format(name_list, new_soup, index_lists)

    # Snapshot the page names now; name_list keeps growing inside the loop
    state_dir = [entry.replace(" ", "_") for entry in states]

    for state in state_dir:
        if not state.startswith("List"):
            continue

        state_soup, state_output = retrieve_html("https://en.wikipedia.org/wiki/" + state)
        remove_navbox(state_soup)

        # Names presented as bullet lists
        name_list = retrieve_list_format(name_list, state_soup, state_output.find_all("ul"))

        # Names presented as tables, gathered separately and then merged
        state_tables = state_output.find_all("table")
        if len(state_tables) > 0:
            name_list += retrieve_table_format([], state_soup, state_tables)

    return name_list

## Multi-Page Web Scraper

In [25]:
def multi_webScrapper(url):
    """
    Scrapes through multiple pages, starting from the page
    containing all the countries within a continent.

    Every ``li`` of every ``ul`` on the start page is read as a link; when
    the link text starts with "List" the matching Wikipedia page is
    downloaded and scraped in list and table format. The United States page
    is handed to ``extract_from_US``. Processing of a ``ul`` stops at its
    first link whose text does not start with "List".

    Scraping is best-effort: an item that fails to download or parse is
    skipped.

    Parameters:
        url: address of the continent overview page.

    Returns:
        A set of names longer than three characters that do not start with
        any prefix in ``list_of_noise``.
    """
    soup, mw_output = retrieve_html(url)

    # Removes all navbox within the page
    remove_navbox(soup)

    # Stores all Hospital Names
    name_list = []

    for links in mw_output.find_all("ul"):
        for link in links.find_all("li"):
            try:
                country_dir = link.contents[0].contents[0].replace(" ", "_")
                if not country_dir.startswith("List"):
                    # First non-"List" link ends this list
                    break

                country_page = "https://en.wikipedia.org/wiki/" + country_dir
                new_soup, new_mw_output = retrieve_html(country_page)

                # Removes all navbox within the page
                remove_navbox(new_soup)

                if country_dir == "List_of_hospitals_in_the_United_States":
                    name_list = extract_from_US(name_list, new_soup, new_mw_output)
                else:
                    new_content_list = new_mw_output.find_all("ul")
                    name_list = retrieve_list_format(name_list, new_soup, new_content_list)

                    # Table content is collected separately, then merged
                    table_format_content = new_mw_output.find_all("table")
                    if len(table_format_content) > 0:
                        name_list += retrieve_table_format([], new_soup, table_format_content)
            except Exception:
                # Deliberately broad (network and parsing errors are skipped),
                # but unlike a bare except it lets KeyboardInterrupt and
                # SystemExit stop a long-running scrape.
                continue

    # Filters out noise
    return {name for name in name_list
            if not name.startswith(list_of_noise) and len(name) > 3}

## Single-Page Web Scraper

In [26]:
def single_webScrapper(url):
    """
    Scrapes a single wiki page to retrieve all hospital names.

    Table content is collected first, then list content. The result keeps
    only names longer than three characters that do not start with any
    prefix in ``list_of_noise``.

    Returns:
        A list of the remaining names, in the order they were collected.
    """
    soup, mw_output = retrieve_html(url)
    remove_navbox(soup)

    list_blocks = mw_output.find_all("ul")
    tables = mw_output.find_all("table")

    # Both helpers append straight into this list
    name_list = []

    # Retrieves content if in table format
    if tables:
        retrieve_table_format(name_list, soup, tables)

    # Retrieves content if in list format
    retrieve_list_format(name_list, soup, list_blocks)

    # Filters out noise
    return [name for name in name_list
            if len(name) > 3 and not name.startswith(list_of_noise)]

#single_webScrapper("https://en.wikipedia.org/wiki/List_of_hospitals_in_Argentina")

## North America

In [27]:
# Scrapes all North American hospitals (multi_webScrapper returns a set of names)
na_hospitals = multi_webScrapper("https://en.wikipedia.org/wiki/Lists_of_hospitals_in_North_America")

In [28]:
# Prints the number of distinct hospital names found for North America
num_hospitals(na_hospitals)

Number of Hospitals: 7491


## Europe

In [29]:
# Scrapes all European hospitals (multi_webScrapper returns a set of names)
europe_hospitals = multi_webScrapper("https://en.wikipedia.org/wiki/Lists_of_hospitals_in_Europe")

In [42]:
# Prints the number of distinct hospital names found for Europe
num_hospitals(europe_hospitals)

Number of Hospitals: 3065


## Asia

In [31]:
# Scrapes all Asian hospitals (multi_webScrapper returns a set of names)
asia_hospitals = multi_webScrapper("https://en.wikipedia.org/wiki/Lists_of_hospitals_in_Asia")

In [32]:
# Prints the number of distinct hospital names found for Asia
num_hospitals(asia_hospitals)

Number of Hospitals: 6714


## Oceania

In [33]:
# Scrapes all Oceanian hospitals (multi_webScrapper returns a set of names)
oceania_hospitals = multi_webScrapper("https://en.wikipedia.org/wiki/Lists_of_hospitals_in_Oceania")

In [34]:
# Prints the number of distinct hospital names found for Oceania
num_hospitals(oceania_hospitals)

Number of Hospitals: 1250


## Africa

In [35]:
# Scrapes all African hospitals (multi_webScrapper returns a set of names)
africa_hospitals = multi_webScrapper("https://en.wikipedia.org/wiki/Lists_of_hospitals_in_Africa")

In [36]:
# Prints the number of distinct hospital names found for Africa
num_hospitals(africa_hospitals)

Number of Hospitals: 1658


## South America

In [37]:
# Scrapes all South American hospitals (multi_webScrapper returns a set of names)
sa_hospitals = multi_webScrapper("https://en.wikipedia.org/wiki/Lists_of_hospitals_in_South_America")

In [38]:
# Prints the number of distinct hospital names found for South America
num_hospitals(sa_hospitals)

Number of Hospitals: 593


## Write to File

In [39]:
def write_2_file(textfile, entity_list):
    """
    Writes every entity on its own tab-indented line, then closes the file.

    Newlines embedded in an entity are removed first so that each entity
    occupies exactly one line of output.

    Parameters:
        textfile: writable text file object; it is closed before returning,
            even if a write fails.
        entity_list: iterable of strings to write.
    """
    try:
        for entity in entity_list:
            # str.replace returns a new string; the result must be kept
            entity = entity.replace("\n", "")
            textfile.write("\t" + entity + "\n")
    finally:
        textfile.close()

In [40]:
# Write the North American hospital names to NA_Hospitals.txt
# (write_2_file closes the file handle when it is done)
na_file = open("NA_Hospitals.txt", "w", encoding = "utf8")
write_2_file(na_file, na_hospitals)

In [41]:
# Write the European hospital names to Europe_Hospitals.txt
# (write_2_file closes the file handle when it is done)
europe_file = open("Europe_Hospitals.txt", "w", encoding = "utf8")
write_2_file(europe_file, europe_hospitals)

In [43]:
# Write the Asian hospital names to Asia_Hospitals.txt
# (write_2_file closes the file handle when it is done)
asia_file = open("Asia_Hospitals.txt", "w", encoding = "utf8")
write_2_file(asia_file, asia_hospitals)

In [44]:
# Write the Oceanian hospital names to Oceania_Hospitals.txt
# (write_2_file closes the file handle when it is done)
oceania_file = open("Oceania_Hospitals.txt", "w", encoding = "utf8")
write_2_file(oceania_file, oceania_hospitals)

In [45]:
# Write the African hospital names to Africa_Hospitals.txt
# (write_2_file closes the file handle when it is done)
africa_file = open("Africa_Hospitals.txt", "w", encoding = "utf8")
write_2_file(africa_file, africa_hospitals)

In [46]:
# Write the South American hospital names to SA_Hospitals.txt
# (write_2_file closes the file handle when it is done)
sa_file = open("SA_Hospitals.txt", "w", encoding = "utf8")
write_2_file(sa_file, sa_hospitals)