In [621]:
!pip install selenium

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException

from joblib import Parallel, delayed

import sys
import traceback
from timeit import default_timer as timer
from tqdm import tqdm
from time import sleep

from io import StringIO
import pandas as pd
import numpy as np
import re
import zipfile

# Options for the driver
options = Options()
options.headless = True



### TO-DO
- Figure out if [Selenium Grid](https://www.selenium.dev/documentation/grid/) can potentially improve the performance of the scraper
- Run on Google Colab/Microsoft Azure/local desktop/some other place?

In [46]:
. # Keep this cell to prevent the rest of the notebook from automatically running

SyntaxError: invalid syntax (<ipython-input-46-b2026be2226a>, line 1)

The following scripts uses the `selenium` library to, in theory, scrape every request from [San Diego's NextRequest database](https://sandiego.nextrequest.com/requests). It does so by using the fact that every request has its own unique URL i.e. the request with ID 'yy-xxxx' will be found at '.../requests/yy-xxxx'. From each request webpage, the following information is extracted:

- `id` (str): ID of the request, yy-xxxx
- `status` (str): Whether the request is opened or closed. Always takes on a value of either 'closed' or 'open'
- `desc` (str): Description of the request provided by the requester
- `date` (str): Initial request date
- `depts` (str): Current departments assigned to the request (may not be the ones the requester had initially)
- `docs` (DataFrame in CSV format): All documents attached to the request, if there are any, otherwise None. The columns are:
    - `title` (str): Title given to each document
    - `link` (str): Link to each document
- `poc` (str): Point of contact
- `msgs` (DataFrame in CSV format): All messages attached to the request. The columns are:
    - `title` (str): Title of each message
    - `item` (str): Message body
    - `time` (str): Date of each message

After a request is scraped, the next request is navigated to by clicking on an arrow, and the scraper continues to run until the arrow cannot be found, either due to a timeout or because the scraper has reached the last request in the database.

To address potential timeouts, we stop the driver every time it cannot access a request, then restart it after a short delay starting from the request that it timed out on.

In [622]:
def scrape_record_append(requests, driver, debug=0):
    '''
    Scrapes data about a given request on a NextRequest request database, appending the result
    to the given list
    '''
    request_id, status, desc, date, depts, docs, poc, events = [None] * 8 # Initialize variables 
    try: # Attempt to scrape relevant data
        request_id = driver.find_element_by_class_name('request-title-text').text.split()[1][1:] # Request ID
        status = driver.find_element_by_class_name('request-status-label').text.strip() # Request status
        desc = driver.find_element_by_class_name('request-text.row').text # Request description
        date = driver.find_element_by_class_name('request_date').text # Request date
        depts = driver.find_element_by_class_name('current-department').text # Department(s) assigned to the request
        poc = driver.find_element_by_class_name('request-detail').text # Person of contact

        # Documents attached to the request, if there are any
        public_docs = driver.find_element_by_id('public-docs') # Documents block
        if '(none)' not in public_docs.text: # Check for the presence of documents
            # Expand folders, if there are any
            folders = public_docs.find_elements_by_class_name('folder-toggle') 
            if folders:
                for folder in folders:
                    folder.click()

            doc_links = public_docs.find_elements_by_class_name('document-link') # Links to documents

            # DataFrame-converted-to-CSV consisting of all documents
            docs = pd.DataFrame({
                'title': get_webelement_text(doc_links),
                'link': remove_download_from_urls(get_webelement_link(doc_links))
                }).to_csv(index=False)

        # Messages recorded on the request page, if there are any
        event_history = driver.find_elements_by_class_name('generic-event') # All message blocks
        if event_history: # Check for presence of 
            num_events = len(event_history)

            # Titles, descriptions, and time strings for each message
            event_titles = [None] * num_events
            event_items = [None] * num_events
            time_quotes = [None] * num_events

            # Scrape information from each individual event
            for i in range(len(event_history)):
                event = event_history[i]

                event_title = event.find_element_by_class_name('event-title').text
                event_item = '\n'.join(get_webelement_text(event.find_elements_by_class_name('event-item'))) # Necessary to address the case where there are multiple event-items
                time_quote = event.find_element_by_class_name('time-quotes').text

                event_titles[i] = event_title
                event_items[i] = event_item
                time_quotes[i] = time_quote

            # DataFrame-converted-to-CSV consisting of all messages
            events = pd.DataFrame({ 
                'title': event_titles,
                'item': event_items,
                'time': time_quotes
                }).to_csv(index=False)
            
        # For testing purposes, print a message whenever a request is successfully scraped
        if debug:
            print(request_id, 'scraped')
    except NoSuchElementException: # Catch exception thrown if a specific element cannot be found, and print information about the exception
        print('NoSuchElementException encountered:')
        traceback.print_exc()
        print()
    except: # If some other exception occurs, do the same
        print('Exception encountered:')
        traceback.print_exc()
        print()
    finally: # Append the request to the list
        requests.append({
            'id': request_id,
            'status': status,
            'desc': desc,
            'date': date,
            'depts': depts,
            'docs': docs,
            'poc': poc,
            'msgs': events
            })

In [619]:
def scrape_records_sequential(requests, driver, num_requests=-1, cooldown=1, debug=0, progress=0):
    '''
    Scrapes all records on a NextRequest request database starting from the given ID and
    moving forward chronologically until the number of requests scraped reaches a given
    number. Each scraped requests is added to a given list. If num_requests is non-positive, 
    then scrape as many records as possible.
    '''
    counter = 0 # Keeps track of how many requests have been scraped
    start = timer() # Timer for progress checking purposes
    
    # Start by scraping the initial record
    
    # Only scrape a request if it was loaded properly; otherwise, stop the scraper
    if not driver.find_elements_by_class_name('nextrequest'):
        print('No requests scraped')
        return 0

    scrape_record_append(requests, driver, debug=debug) # Scrape request

    counter += 1
    
    # For positive num_requests, return the list of requests if the counter reaches the desired number
    if ((num_requests > 0) and (counter == num_requests)):
        if progress:
            end = timer()
            print('Total requests scraped:', counter, 
                  '\tAvg runtime:', str(round((end - start)/counter, 2)) + 's/request', 
                  '\tTotal runtime:', str(round(end - start, 1)) + 's')
            print('Last request scraped:', requests[-1]['id'])
            print()
        
        return counter

    # Show progress, if desired
    if progress and (counter % progress == 0):
        end = timer()
        print('Requests scraped:', counter, 
              '\tAvg runtime:', str(round((end - start)/counter, 2)) + 's/request', 
              '\tTotal runtime:', str(round(end - start, 1)) + 's')
    
    # Continue to scrape until the arrow to go to the next request is no longer present
    while driver.find_elements_by_class_name('js-next-request'): 
        driver.find_element_by_class_name('js-next-request').click() # Click on the arrow to navigate to the next request
        sleep(cooldown) # Cooldown between scraping attempts
        
        # Scrape request
        if not driver.find_elements_by_class_name('nextrequest'):
            break
        
        scrape_record_append(requests, driver, debug=debug) 
        
        counter += 1
        
        if ((num_requests > 0) and (counter == num_requests)):
            break
        
        if progress and (counter % progress == 0):
            end = timer()
            print('Requests scraped:', counter, 
                  '\tAvg runtime:', str(round((end - start)/counter, 2)) + 's/request', 
                  '\tTotal runtime:', str(round(end - start, 1)) + 's')
    
    # Final progress check
    if progress:
        end = timer()
        print('Total requests scraped:', counter, 
              '\tAvg runtime:', str(round((end - start)/counter, 2)) + 's/request', 
              '\tTotal runtime:', str(round(end - start, 1)) + 's')
        print()
        print('Last request scraped:', requests[-1]['id'])
        print()
        
    return counter

In [None]:
# Running the scraper
urls = ['https://sandiego.nextrequest.com/requests/', 'https://oaklandca.nextrequest.com/requests/'] # URLs
i = 0 # Index of URL to scrape from
current_id = sd_requests[-1]['id'] # The current ID, initialized to the ID to start scraping from
num_requests = -1 # Number of requests to scrape
cooldown = 1 # Cooldown between request accesses
timeout = 10 # Timeout wait time between scraper runs
progress = 100 # Number of requests to show progress for
num_runs = 1 # Keeps track of how many times the scraper has been run

driver = webdriver.Firefox(options=options) # Instantiate headless (non-visible) Firefox driver

# Run an intial iteration of the scraper
driver.get(urls[i] + current_id)

# Print iteration number
it_num_title = 'Iteration ' + str(num_runs)
print(it_num_title)
print('-' * len(it_num_title))

# Re-scrape the current request
print('Starting request:', sd_requests.pop()['id']) 
print()

# Scrape requests until it either reaches the end or times out
scrape_records_sequential(sd_requests, driver, 
                          num_requests=num_requests, 
                          cooldown=cooldown, 
                          progress=progress)

num_runs += 1
sleep(timeout) # Wait after a timeout

# Restart the driver at the last request scraped 
current_id = sd_requests[-1]['id']
driver.get(urls[i] + current_id)

# Continue to scrape until the arrow to go to the next request is no longer present
while driver.find_elements_by_class_name('js-next-request'):
    # Scrape requests until the next timeout or the arrow is no longer present
    it_num_title = 'Iteration ' + str(num_runs)
    print(it_num_title)
    print('-' * len(it_num_title))
    
    print('Starting request:', sd_requests.pop()['id'])
    print()
    
    scrape_records_sequential(sd_requests, driver, 
                          num_requests=num_requests, 
                          cooldown=cooldown, 
                          progress=progress)
    
    num_runs += 1
    sleep(timeout)

    current_id = sd_requests[-1]['id']
    driver.get(urls[i] + current_id)

driver.close()

# Convert to DataFrame
sd_requests = [request for request in sd_requests if (request and request['status'])]
sd_requests_df = pd.DataFrame(sd_requests).drop_duplicates()

# Create a zipped CSV file of the DataFrame
compression_opts = dict(method='zip', archive_name='sd_requests.csv')
sd_requests_df.to_csv('data/sd_requests.zip', index=False, compression=compression_opts)

Iteration 1
-----------
Starting request: 19-2477

Requests scraped: 100 	Avg runtime: 2.17s/request 	Total runtime: 216.9s


In [593]:
# Convert to DataFrame
sd_requests = [request for request in sd_requests if (request and request['status'])]
sd_requests_df = pd.DataFrame(sd_requests).drop_duplicates()

In [None]:
# Create a zipped CSV file of the DataFrame
compression_opts = dict(method='zip', archive_name='sd_requests.csv')
sd_requests_df.to_csv('data/sd_requests.zip', index=False, compression=compression_opts)

In [521]:
csv_to_df = lambda csv: pd.read_csv(StringIO(csv)) if csv else None
sd_requests_df['docs_df'] = sd_requests_df['docs'].apply(csv_to_df)
sd_requests_df['msgs_df'] = sd_requests_df['msgs'].apply(csv_to_df)
sd_requests_df.head()

Unnamed: 0,id,status,desc,date,depts,docs,poc,msgs,docs_df,msgs_df
0,15-1810,CLOSED,"Notices of Violation/Notice to Comply, fire in...","December 7, 2015 via web",Code Enforcement,"title,link\n5040 ShorehamPlace building permit...",Ginger Rodriguez,"title,item,time\n""Request Closed\nPublic"",02. ...",title \ ...,title ...
1,15-1811,CLOSED,The October 2015 monthly report for SeaWorld,"December 7, 2015 via web",Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time\n""Request Published\nPublic"",,...",,title ...
2,15-1812,CLOSED,Records related to the following BIDS: Adams ...,"December 7, 2015 via web",City Clerk,"title,link\nhttp://www.sandiego.gov/park-and-r...",Mailei Ross-Cerezo,"title,item,time\n""Request Closed\nPublic"",Stil...",...,title \ 0 ...
3,15-1813,CLOSED,Historical lease payments made by SeaWorld to ...,"December 7, 2015 via web",Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time\n""Request Closed\nPublic"",02. ...",,title ...
4,15-1814,CLOSED,"Open violations, variances, ordinances, approv...","December 7, 2015 via web",Code Enforcement,"title,link\n15-1814 Fire Responsive.pdf,https:...",Ginger Rodriguez,"title,item,time\n""Request Closed\nPublic"",02. ...",title \ 0 ...,title ...


Scraper utility functions:

In [197]:
def get_city_from_url(url):
    '''
    Finds the city name from the NextRequest URL.
    '''
    return re.match(r'(?<=https://)[a-zA-Z]*', url)[0]

def get_webelement_text(webelement):
    '''
    Gets the text of each web element in a list, if such a list exists.
    '''
    return list(map(lambda x: x.text, webelement)) if webelement else []

def get_webelement_link(webelement):
    '''
    Gets the link of each web element in a list, if such a list exists.
    '''
    return list(map(lambda x: x.get_attribute('href'), webelement)) if webelement else []

def remove_download_from_urls(urls):
    '''
    Removes '/download' from the end of a list of URLs, if the list exists.
    '''
    return list(map(lambda url: re.match(r'.*(?=/download)', url)[0], urls)) if urls else []

Previous scraper scripts:

In [403]:
def scrape_record(url, request_id, driver):
    '''
    Scrapes data about a given request on a NextRequest request database
    '''
    driver.get(url + request_id) # Attempt to access the record
#     timeout = 2 # Timeout length, in seconds
    
#     try:
#         WebDriverWait(driver, timeout).until(EC.visibility_of_all_elements_located((By.CLASS_NAME, 'nextrequest')))
#     except TimeoutException:
#         print(request_id, 'timed out')
#         pass
    
    if (request_id not in driver.title):
        return
    
    status, desc, date, depts, docs, poc, events = [None] * 7 # Initialize variables 
    try: # Attempt to scrape relevant data
        status = driver.find_element_by_class_name('request-status-label').text.strip() # Request status
        desc = driver.find_element_by_class_name('request-text.row').text # Request description
        date = driver.find_element_by_class_name('request_date').text # Request date
        depts = driver.find_element_by_class_name('current-department').text # Department(s) assigned to the request
        poc = driver.find_element_by_class_name('request-detail').text # Person of contact
        
        # Documents attached to the request, if there are any
        public_docs = driver.find_element_by_id('public-docs') # WebElement containing the documents
        if '(none)' not in public_docs.text:
            # Expand folders, if there are any
            folders = public_docs.find_elements_by_class_name('folder-toggle') 
            if folders:
                for folder in folders:
                    folder.click()

            doc_links = public_docs.find_elements_by_class_name('document-link') # Links to documents
            
            # DataFrame consisting of all documents
            docs = pd.DataFrame({
                'title': get_webelement_text(doc_links),
                'link': remove_download_from_urls(get_webelement_link(doc_links))
                })
            
        # Messages recorded on the request page, if there are any
        event_history = driver.find_elements_by_class_name('generic-event')
        if event_history:
            num_events = len(event_history)
            
            # Titles, descriptions, and time strings for each message
            event_titles = [None] * num_events
            event_items = [None] * num_events
            time_quotes = [None] * num_events
            
            # Scrape information from each individual event
            for i in range(len(event_history)):
                event = event_history[i]
                
                event_title = event.find_element_by_class_name('event-title').text
                event_item = '\n'.join(get_webelement_text(event.find_elements_by_class_name('event-item')))
                time_quote = event.find_element_by_class_name('time-quotes').text
                
                event_titles[i] = event_title
                event_items[i] = event_item
                time_quotes[i] = time_quote
                
            # DataFrame consisting of all messages
            events = pd.DataFrame({ 
                'title': event_titles,
                'item': event_items,
                'time': time_quotes
                })
    except NoSuchElementException: # Catch exception thrown if a specific element cannot be found, and silently pass
        pass
    except: # If some other exception occurs, print information about the exception
        traceback.print_exc()
    finally: # Return the request
        return {
            'id': request_id,
            'status': status,
            'desc': desc,
            'date': date,
            'depts': depts,
            'docs': docs,
            'poc': poc,
            'msgs': events
            }
    
def scrape_record_parallel(url, request_id):
    '''
    Scraper method used for parallelization
    '''
    driver = webdriver.Firefox(options=options)
    request_info = scrape_record(url, request_id, driver)
    driver.close()
    return request_info

In [514]:
# Options for scraping
earliest_year = 16 # Earliest year to search requests for
latest_year = 21 # Latest year to search request for
id_start = 1 # ID value to start from
id_range = id_max # Number of IDs to try for each year
cooldown = 0.9 # Amount of time, in seconds, to wait between website accesses

start_id = '15-1810' # The request to start scraping from
num_requests = -1 # Number of requests to scrape
progress = 100 # Display a message every 100 requests successfully scraped

# List of all request IDs
request_ids = [str(year) + '-' + str(num) for num in range(id_start, id_range + id_start) 
                                       for year in range(earliest_year, latest_year + 1)]

# URLs to extract data from
urls = ['https://sandiego.nextrequest.com/requests/', 'https://oaklandca.nextrequest.com/requests/']

In [319]:
# Iterative script
driver = webdriver.Firefox(options=options
                          ) # Headless (non-visible) WebDriver

sd_requests = [] # List of dictionaries containing information on each request
i = 0 # Index for URLs - may be useful later for scraping multiple sites

for year in range(earliest_year, latest_year + 1):
    for num in tqdm(range(id_start, id_range + id_start)):
        # NextRequest request IDs are a two-digit year and a number, with a dash in between
        request_id = str(year) + '-' + str(num)
        
        # Scrape record
        sd_requests.append(scrape_record(urls[i], request_id, driver)) 

        # Cooldown
        sleep(cooldown)
    
    # sleep(cooldown) # Cooldown
        
driver.close()

sd_requests = [x for x in sd_requests if x['status'] is not None] # Remove entries with incomplete information
sd_requests_df = pd.DataFrame(sd_requests) # Convert to DataFrame

# Create a zipped CSV file of the data
compression_opts = dict(method='zip', archive_name='sd_requests.csv')
sd_requests_df.to_csv('data/sd_requests_2.zip', index=False, compression=compression_opts)

100%|██████████| 10/10 [00:23<00:00,  2.32s/it]


Miscellaneous tests:

In [37]:
# # Test remote WebDriver (run Selenium Grid instance locally before running this cell)
# driver = webdriver.Remote(desired_capabilities=DesiredCapabilities.FIREFOX, options=options)
# driver.get('http://www.google.com')
# driver.close()

In [None]:
# # Test to make sure the driver works
# 
# driver = webdriver.Firefox(options=options) # Instantiate a headless Firefox WebDriver
# for url in urls:
#     driver.get(url)
#     print(driver.title)
#     sleep(5)
    
# driver.close()

In [None]:
# %%timeit
# # Test for retrieving message info from a specific request
# driver = webdriver.Firefox(options=options)
# driver.get('https://sandiego.nextrequest.com/requests/21-4915')
# print(driver.title)

# event_titles = driver.find_elements_by_class_name('event-title')
# event_items = driver.find_elements_by_class_name('event-item')
# times = driver.find_elements_by_class_name('time-quotes')
# for title, item, time in list(zip(event_titles, event_items, times)):
#     print(title.text)
#     print(item.text)
#     print(time.text)
#     print()

# driver.close()

In [None]:
# # Scraping documents test
# driver = webdriver.Firefox(options=options)

# for request_id in [5369, 5313, 5374]:
#     url = 'https://sandiego.nextrequest.com/requests/21-' + str(request_id)
#     driver.get(url)

#     docs = driver.find_element_by_id('public-docs')
#     folders = docs.find_elements_by_class_name('folder-toggle')
#     if folders:
#         for folder in folders:
#             folder.click()
#     else:
#         print('No folders found for', request_id)
    
#     doc_links = docs.find_elements_by_class_name('document-link')
#     display(
#         pd.DataFrame(
#             list(zip(get_webelement_text(doc_links), remove_download_from_urls(get_webelement_link(doc_links)))),
#             columns=['title', 'link']
#         )
#     )
#     display(
#         pd.DataFrame({
#             'title': get_webelement_text(doc_links),
#             'link': remove_download_from_urls(get_webelement_link(doc_links))
#         })
#     )

# driver.close()

In [235]:
# Parallelized script
scrape_request = lambda i: scrape_record_parallel(urls[0], i)
sd_requests = Parallel(n_jobs=-1, prefer='threads', verbose=10)(delayed(scrape_request)(request_id) for request_id in request_ids)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   21.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   29.2s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   43.2s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.4min


KeyboardInterrupt: 