## For assistance updating or fixing this notebook, please contact:
### Redefining Default LLC
www.redefining-default.com
<br>
kirsten@redefining-default.com
<br>
571-510-0139

In [27]:
from selenium import webdriver #to navigate to url
from bs4 import BeautifulSoup #to parse html
import re #regex to search by wildcards using re.compile within html
import pandas as pd #to make dataframe and write to csv
import datetime #to create a timestamp on CSV exports for version control
import unicodedata #to fix character encoding errors to improve human readability
import numpy as np #to get unique list values
import glob #to find all CSV files in current directory
import os #to get filepath of CSV file

In [28]:
#URL of the lab locations page that this notebook scrapes
base_url = "https://www.urmc.rochester.edu/urm-labs/service-centers.aspx"

In [29]:
#Open Firefox and get website
#Start a Firefox session controlled by selenium
driver = webdriver.Firefox()
#Set the implicit wait used by element lookups to 30 (seconds, per the Selenium API)
driver.implicitly_wait(30)
#Load the page to be scraped; its rendered HTML is read from driver.page_source afterwards
driver.get(base_url)

In [30]:
#Parse the HTML currently loaded in the browser into a BeautifulSoup tree
page_html = driver.page_source
basesoup = BeautifulSoup(page_html, "html.parser")

In [31]:
#Extract Lab IDs as common key between selectors
#Each lab lives in a <div class="location-details">; the id attribute of its <h2> is the Lab ID
all_div = basesoup.find_all("div", {"class": "location-details"})
labs = [div.find("h2").get("id") for div in all_div]  #List of Lab IDs in page order

In [32]:
#Unit Test: How many Lab IDs are there?
#Baseline the messages below compare against: the number of labs when the notebook was originally set up
EXPECTED_LAB_COUNT = 30
CountLabs = len(labs)
if CountLabs < EXPECTED_LAB_COUNT:
    print('Note: ',EXPECTED_LAB_COUNT-CountLabs,' fewer lab(s) than when originally set up.')
elif CountLabs > EXPECTED_LAB_COUNT:
    print('Note: ',CountLabs-EXPECTED_LAB_COUNT, ' more lab(s) than when originally set up.')
else:
    print('Everything is as expected. No change in number of labs since originally set up.')

Everything is as expected. No change in number of labs since originally set up.


In [33]:
#Find Lab Location Names
names = {}  #Create a dictionary to store ID and Names as key:value pairs
for div in all_div:
    heading = div.find("h2")  #Look the heading up once; it carries both the Lab ID and the display name
    names[heading.get("id")] = heading.text.strip()

In [34]:
#Unit Test: Confirm one Lab Name was collected for every Lab ID
CountNames = len(names)
if CountNames == CountLabs:
    print("Everything is as expected.")
else:
    #Name the larger collection first in the warning
    greater, fewer = ('Names', 'Lab IDs') if CountNames > CountLabs else ('Lab IDs', 'Names')
    print("WARNING: There are more ",greater," than ",fewer)

Everything is as expected.


In [35]:
#Find Hospital Association
#Every <p class="legal"> holds the association text; the element right before it carries the Lab ID
all_legal = basesoup.find_all("p", {"class": "legal"})
associated = {p.previous_sibling.get("id"): p.text for p in all_legal}  #ID:Association pairs

In [36]:
#Unit Test: Confirm one Associated Hospital was collected for every Lab ID
CountHospitals = len(associated)
if CountHospitals == CountLabs:
    print("Everything is as expected.")
else:
    #Name the larger collection first in the warning
    greater, fewer = ('Hospitals', 'Lab IDs') if CountHospitals > CountLabs else ('Lab IDs', 'Hospitals')
    print("WARNING: There are more ",greater," than ",fewer)

Everything is as expected.


In [37]:
#Define Function to parse Addresses into human readable text and format
def text_with_newlines(elem):
    """Flatten an element into text, turning <br> tags into newlines.

    Walks ``elem.descendants`` in order. Every string descendant is added
    with its surrounding whitespace stripped, every descendant whose
    ``name`` is ``'br'`` adds a newline, and all other descendants are
    skipped (their own strings are still visited as descendants).

    Returns the concatenated text ('' when there are no descendants).
    """
    pieces = []
    for node in elem.descendants:
        if isinstance(node, str):
            pieces.append(node.strip())
        elif node.name == 'br':
            pieces.append('\n')
    return ''.join(pieces)

In [38]:
#Find Addresses
address = {}  #Create a dictionary to store ID and Addresses as key:value pairs
all_addresses = [p.findNext('p') for p in all_legal]  #The address is the next <p> after each "legal" <p>
for addr_p in all_addresses:
    cleaned = text_with_newlines(addr_p)
    cleaned = re.sub(r'\([^()]*\)', ' ', cleaned)  #Replace parentheses and everything inside with a space
    cleaned = cleaned.replace("Get Directions", "")  #Drop the 'Get Directions' link text
    cleaned = re.sub('\n +', ' ', cleaned)  #A line break followed by spaces becomes one space
    cleaned = re.sub(' +', ' ', cleaned)  #Collapse runs of spaces into one space
    lab_id = addr_p.previous_sibling.previous_sibling.get("id")  #Two siblings back is the element carrying the Lab ID
    address[lab_id] = cleaned

In [39]:
#Unit Test: Confirm one Address was collected for every Lab ID
CountAddresses = len(address)
if CountAddresses == CountLabs:
    print("Everything is as expected.")
else:
    #Name the larger collection first in the warning
    greater, fewer = ('Addresses', 'Lab IDs') if CountAddresses > CountLabs else ('Lab IDs', 'Addresses')
    print("WARNING: There are more ",greater," than ",fewer)

Everything is as expected.


In [40]:
#Unit Test: Have the possible combinations of Opening Days changed?
#Every day label known when the notebook was set up
current_days = ['M-F:', 'Mon - Fri:', 'F:','M-TH:','Sat:','Sat.:','Sun:','Mon & Tues:','Mon & Thu:','Wed & Fri:']
#Day labels other than the weekday ones ('M-F:', 'Mon - Fri:', 'F:'); newly detected labels are appended here
day_options1 = ['M-TH:','Sat:','Sat.:','Sun:','Mon & Tues:','Mon & Thu:','Wed & Fri:']
#Bold page notices that are not day labels and must be ignored
non_day_labels = ['Coronavirus (COVID-19):', 'Please Note:']
days = []
all_days = basesoup.find_all("strong")
for s in all_days:
    normalized = unicodedata.normalize('NFKC',s.text)  #get bold days text and normalize
    trans1 = re.sub(r'[\t\r\n]', ' ', normalized)  #replace tabs and line breaks with a space
    trans2 = re.sub(' +', ' ',trans1.strip())  #replace extra spaces with just one space
    #Skip the notices here rather than calling days.remove(), which raises ValueError once a notice leaves the page
    if trans2 not in non_day_labels:
        days.append(trans2)
new_days = np.unique(np.array(days))
#Sorted so the report and the order of appended labels are the same on every run
diff = sorted(str(d) for d in set(new_days) - set(current_days))
if diff:
    for i in diff:
        day_options1.append(i)
        print('A new day, ',i,' was detected and added to day_options1. This could indicate a larger change in the HTML structure of the page. Manual check recommended.')
else:
    print('No new days have been found. Everything is as expected.')

No new days have been found. Everything is as expected.


In [41]:
#Find Hours
def _clean_hours(raw):
    """Return raw hours text NFKC-normalized, with runs of spaces collapsed and tabs/line breaks removed."""
    normalized = unicodedata.normalize('NFKC', raw)
    single_spaced = re.sub(' +', ' ', normalized)
    return re.sub(r'[\t\r\n]', '', single_spaced)

#Weekday labels tried in this order; only the first one found is used.
#Each pair is (label searched for on the page, prefix written to the output).
weekday_labels = [('M-F:', 'M-F: '), ('Mon - Fri:', 'M-F: '), ('F:', 'F: ')]

lab_hours = {}  #Create a dictionary to store ID and Hours as key:value pairs
for lab in labs:
    hours_list = []  #Temporary list to store the hours lines for this lab
    heading = basesoup.find("h2",{"id":lab})
    #The hours are searched for in the third sibling after the lab's heading; look it up once per lab
    hours_block = heading.next_sibling.next_sibling.next_sibling

    for pattern, prefix in weekday_labels:
        label = hours_block.find(string=re.compile(pattern))
        if label:
            hours_list.append(prefix+_clean_hours(label.findNext('span').text)+'\n')
            break

    for day in day_options1:
        #Escape the label so characters such as '.' in 'Sat.:' or '(' in a newly detected label match literally
        label = hours_block.find(string=re.compile(re.escape(day)))
        if label:
            hours_list.append(day+' '+_clean_hours(label.findNext('span').text)+'\n')

    #get 'Closed for Lunch'
    if hours_block.find('span',{'class':'text-red'}):
        closed = unicodedata.normalize('NFKC', heading.findNext('span',{'class':'text-red'}).text)
        lunch = re.search(r'Closed for Lunch:\s(.*)', closed)  #get just the hours after 'Closed for Lunch:'
        if lunch:
            hours_list.append('Closed for Lunch:'+lunch.group(1))
        else:
            #A red notice that is not a lunch closure used to crash the cell; flag it and carry on
            print('WARNING: Lab ',lab,' has a red notice that does not contain "Closed for Lunch:". It was not added to the hours.')

    lab_hours[lab] = ''.join(hours_list)

In [42]:
#Unit Test: Does each Lab have Office Hours?
CountHours = len(lab_hours)
#lab_hours gets an entry for every Lab ID even when no hours were found,
#so labs whose hours text is empty are collected and reported separately
labs_without_hours = [lab for lab, hours_text in lab_hours.items() if not hours_text]
if CountHours != CountLabs:
    if CountHours > CountLabs:
        greater = 'Hours'
        fewer = 'Lab IDs'
    else:
        greater = 'Lab IDs'
        fewer = 'Hours'
    print("WARNING: There are more ",greater," than ",fewer)
elif labs_without_hours:
    print("WARNING: No hours were found for these Lab IDs: ",labs_without_hours)
else:
    print("Everything is as expected.")

Everything is as expected.


In [43]:
#Find 'Get Directions' links
links = {}  #Create a dictionary to store ID and links as key:value pairs
for lab in labs:
    link = basesoup.find("h2",{"id":lab}).findNext("a").get("href")  #href of the first link after the lab's heading
    #Format link so a spreadsheet opening the CSV recognizes it as a hyperlink.
    #The formula is written as a plain string: re.escape() is meant for regex patterns and,
    #before Python 3.7, turned the leading '=' into '\=' which breaks the formula.
    link2 = '=HYPERLINK("'+link+'","Get Directions")'
    links[lab] = link2

In [44]:
#Unit Test: Confirm one Directions Link was collected for every Lab ID
CountLinks = len(links)
if CountLinks == CountLabs:
    print("Everything is as expected.")
else:
    #Name the larger collection first in the warning
    greater, fewer = ('Links', 'Lab IDs') if CountLinks > CountLabs else ('Lab IDs', 'Links')
    print("WARNING: There are more ",greater," than ",fewer)

Everything is as expected.


In [45]:
#Combine the dictionaries on ID
#Each key becomes a column name; each value is one of the ID-keyed dictionaries built above
labsdict = {
    'Name': names,
    'Associated': associated,
    'Address': address,
    'Lab Hours': lab_hours,
    'Google Map Directions': links,
}

In [46]:
#Build the DataFrame from the combined dictionary: outer keys become columns, Lab IDs become the row index
LabLocations = pd.DataFrame(data=labsdict)

In [47]:
#Check for Changes. Check current DataFrame against last run's exported CSV file
list_of_files = glob.glob('*.csv') #Find all CSV files in current directory
if not list_of_files: #Check that a previous CSV file exists in the directory
    print('WARNING: No CSV files were located in the current directory to compare with new run.')
else:
    last_file = max(list_of_files, key=os.path.getctime) #get most recent CSV file
    lastDF = pd.read_csv(last_file,index_col = 'Unnamed: 0') #read that CSV file in as a DataFrame and set index
    try:
        #look for differences between the current DataFrame and last DataFrame
        differences = LabLocations.compare(lastDF, align_axis = 0)
    except ValueError as err:
        #compare() only accepts identically-labeled DataFrames, so it raises when a lab or column was added or removed
        print('WARNING: Could not compare with ',last_file,' because its Lab IDs or columns differ from this run: ',err)
    else:
        #Check if anything has changed since last time and flag changes
        if differences.empty:
            print('No changes from: ',last_file)
        else:
            #Rename row labels for readability ('self' is this run, 'other' is the previous CSV).
            #rename() returns a new frame; set_levels(..., inplace=True) was removed in pandas 2.0.
            differences = differences.rename(index={'self': 'New', 'other': 'Old'}, level=1)
            print("Changes from last time are:")
            for column in differences.columns:
                #Series.items() replaces iteritems(), which was removed in pandas 2.0
                for item, value in differences[column].items():
                    if pd.notnull(value):
                        print(item,": ",value)

No changes from:  2021-09-14 13:00_LabLocations.csv


In [48]:
#Export DataFrame to a CSV file
#The file name is the current local time (to the minute) followed by "_LabLocations.csv"
export_name = datetime.datetime.now().strftime("%Y-%m-%d %H:%M") + "_LabLocations.csv"
LabLocations.to_csv(export_name)

In [49]:
#Close the selenium browser
#close() closes the current browser window; quit() then shuts down the WebDriver session
#NOTE(review): quit() on its own is normally enough -- confirm whether the separate close() is needed
driver.close()
driver.quit()

## MIT License
### Copyright (c) 2021 Redefining Default LLC
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.