My attempt to scrape NextRequest data using the `selenium` library:

In [35]:
!pip install selenium

from joblib import Parallel, delayed
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.firefox.options import Options
from selenium.common.exceptions import NoSuchElementException

import sys
from tqdm import tqdm
from time import sleep

import pandas as pd
import numpy as np
import re

# Options for the driver
options = Options()
options.headless = True



### TO-DO
- Generate full dataset from San Diego NextRequest database
- Figure out if [Selenium Grid](https://www.selenium.dev/documentation/grid/) can potentially improve the performance of the scraper
- Run on Google Colab?

In [46]:
. # Keep this cell to prevent the rest of the notebook from automatically running

SyntaxError: invalid syntax (<ipython-input-46-b2026be2226a>, line 1)

In [48]:
# Options for scraping
earliest_year = 16 # Earliest year to search requests for
latest_year = 21 # Latest year to search request for
id_start = 1 # ID value to start from
id_range = 9999 # Number of IDs to try for each year
cooldown = 1 # Amount of time, in seconds, to wait between website accesses

# List of all request IDs
request_ids = [str(year) + '-' + str(num) for num in range(id_start, id_range + id_start) 
                                       for year in range(earliest_year, latest_year + 1)]

# URLs to extract data from
urls = ['https://sandiego.nextrequest.com/requests/', 'https://oaklandca.nextrequest.com/requests/']

In [40]:
# Non-parallelized script
driver = webdriver.Firefox(options=optoins) # Headless (non-visible) WebDriver

sd_requests = [] # List of dictionaries containing information on each request
i = 0 # Index for URLs - may be useful later for scraping multiple sites

for year in range(earliest_year, latest_year + 1):
    for num in tqdm(range(id_start, id_range + id_start)):
        request_id = str(year) + '-' + str(num) # NextRequest request IDs are a two-digit year and a number, with a dash in between
        url = url[i] + request_id
        driver.get(url) # Attempt to access the record
        
        # print(driver.title) # For testing purposes
        
        # If the record does not exist, then move on to the next ID
        if (request_id not in driver.title):
            continue
            
        status, desc, date, depts, docs, poc, events = [None] * 7 # Initialize variables 
        try: # Attempt to scrape relevant data
            status = driver.find_element_by_class_name('request-status-label').text.strip() # Request status
            desc = driver.find_element_by_class_name('request-text.row').text # Request description
            date = driver.find_element_by_class_name('request_date').text # Request date
            depts = driver.find_element_by_class_name('current-department').text # Department(s) assigned to the request
            poc = driver.find_element_by_class_name('request-detail').text # Person of contact

            # Messages recorded on the request page, if there are any
            event_titles = get_webelement_text(driver.find_elements_by_class_name('event-title')) # Title for each message
            event_items = get_webelement_text(driver.find_elements_by_class_name('event-item')) # Description for each message
            time_quotes = get_webelement_text(driver.find_elements_by_class_name('time-quotes')) # Time string for each message
            events = pd.DataFrame({ # DataFrame consisting of all messages
                'title': event_titles,
                'item': event_items,
                'time': time_quotes
                })

            # Documents attached to the request, if there are any
            public_docs = driver.find_element_by_id('public-docs') # WebElement containing the documents
            folders = public_docs.find_elements_by_class_name('folder-toggle') # Expand folders, if there are any
            if folders:
                for folder in folders:
                    folder.click()
            doc_links = public_docs.find_elements_by_class_name('document-link') # Links to documents
            docs = pd.DataFrame({ # DataFrame consisting of all documents
                'title': get_webelement_text(doc_links),
                'link': remove_download_from_urls(get_webelement_link(doc_links))
                })
        except NoSuchElementException: # Catch exception thrown if a specific element cannot be found, and silently pass
            pass
        except: # If some other exception occurs, print information about the exception
            print('Unexpected error:', sys.exc_info()[0])
        finally: # Add request to list
            sd_requests.append({
                'request_id': request_id,
                'status': status,
                'request_desc': desc,
                'request_date': date,
                'depts': depts,
                'docs': docs,
                'poc': poc,
                'msgs': events
            })

        sleep(cooldown) # Cooldown between requests
        
driver.close()

100%|██████████| 100/100 [04:03<00:00,  2.44s/it]


In [49]:
sd_requests = [x for x in sd_requests if x['status'] is not None] # Remove entries with incomplete information
sd_requests_df = pd.DataFrame(sd_requests) # Convert to DataFrame
sd_requests_df

Unnamed: 0,request_id,status,request_desc,request_date,depts,docs,poc,msgs
0,21-1234,CLOSED,CPRA Request:\n Please provide all reports an...,"March 11, 2021 via web",City Attorney,...,Danielle Fawcett,title \ 0 Req...
1,21-1235,CLOSED,request for 8 call log reports\n 1. 3/7/2021 ...,"March 11, 2021 via web",Police,ti...,Angela Laurita,title \ 0...
2,21-1236,CLOSED,CPRA Request:\n Please provide all reports an...,"March 11, 2021 via web",City Attorney,...,Danielle Fawcett,title \ 0 Req...
3,21-1237,CLOSED,"911 recording Jan 2, 2021 in the evening betwe...","March 9, 2021 via email",Fire-Rescue,...,Angela Laurita,title \ 0 Req...
4,21-1238,CLOSED,Dear PRA Clerk: \nIn order to monitor communi...,"March 11, 2021 via web",Police,...,Angela Laurita,title \ 0 R...
5,21-1239,CLOSED,City of San Diego and San Diego Fire-Rescue De...,"March 11, 2021 via web",Fire-Rescue,...,Angela Laurita,title \ 0 Req...
6,21-1240,CLOSED,Dear PRA Clerk: \nIn order to monitor communi...,"March 11, 2021 via web",Police,...,Angela Laurita,title \ 0 Req...
7,21-1241,CLOSED,Dear PRA Clerk: \nIn order to monitor communi...,"March 11, 2021 via web",Code Enforcement,...,Angela Laurita,title \ 0...
8,21-1242,CLOSED,"I would like copies of any plans, deeds, map, ...","March 11, 2021 via web",Transportation,title ...,Ginger Rodriguez,title \ 0 Req...
9,21-1243,CLOSED,To Whom It May Concern:\nPursuant to the Calif...,"March 11, 2021 via web","Police, Public Records Administration",...,Angela Laurita,title \ 0 Req...


In [51]:
# Create a zipped CSV file of the data
compression_opts = dict(method='zip', archive_name='sd_requests.csv')
sd_requests_df.to_csv('data/sd_requests.zip', index=False, compression=compression_opts)

In [47]:
# Scraper functions
def scrape_record(url, request_id):
    '''
    Scrapes data about a given request on a NextRequest request database. For use with parallelization.
    '''
    driver = webdriver.Remote(desired_capabilities=DesiredCapabilities.FIREFOX, options=options)
    driver.get(url + request_id) # Attempt to access the record
    
    # If the record does not exist, return nothing
    if (request_id not in driver.title):
        return
    
    status, desc, date, depts, docs, poc, events = [None] * 7 # Initialize variables 
    try: # Attempt to scrape relevant data
        status = driver.find_element_by_class_name('request-status-label').text.strip() # Request status
        desc = driver.find_element_by_class_name('request-text.row').text # Request description
        date = driver.find_element_by_class_name('request_date').text # Request date
        depts = driver.find_element_by_class_name('current-department').text # Department(s) assigned to the request
        poc = driver.find_element_by_class_name('request-detail').text # Person of contact

        # Messages recorded on the request page, if there are any
        event_titles = get_webelement_text(driver.find_elements_by_class_name('event-title')) # Title for each message
        event_items = get_webelement_text(driver.find_elements_by_class_name('event-item')) # Description for each message
        time_quotes = get_webelement_text(driver.find_elements_by_class_name('time-quotes')) # Time string for each message
        events = pd.DataFrame({ # DataFrame consisting of all messages
            'title': event_titles,
            'item': event_items,
            'time': time_quotes
            })
        
        # Documents attached to the request, if there are any
        public_docs = driver.find_element_by_id('public-docs') # WebElement containing the documents
        folders = public_docs.find_elements_by_class_name('folder-toggle') # Expand folders, if there are any
        if folders:
            for folder in folders:
                folder.click()
        doc_links = public_docs.find_elements_by_class_name('document-link') # Links to documents
        docs = pd.DataFrame({ # DataFrame consisting of all documents
            'title': get_webelement_text(doc_links),
            'link': remove_download_from_urls(get_webelement_link(doc_links))
            })
    except NoSuchElementException: # Catch exception thrown if a specific element cannot be found, and silently pass
        pass
    except: # If some other exception occurs, print information about the exception
        print('Unexpected error:', sys.exc_info()[0])
    finally: # Return the request
        driver.close()

        return {
            'request_id': request_id,
            'status': status,
            'request_desc': desc,
            'request_date': date,
            'depts': depts,
            'docs': docs,
            'poc': poc,
            'msgs': events
            }
    
def get_city_from_url(url):
    '''
    Finds the city name from the NextRequest URL.
    '''
    return re.match(r'(?<=https://)[a-zA-Z]*', url)[0]

def get_webelement_text(webelement):
    '''
    Gets the text of each web element in a list, if such a list exists.
    '''
    return list(map(lambda x: x.text, webelement)) if (webelement is not None) else None

def get_webelement_link(webelement):
    '''
    Gets the link of each web element in a list, if such a list exists.
    '''
    return list(map(lambda x: x.get_attribute('href'), webelement)) if (webelement is not None) else None

def remove_download_from_urls(urls):
    '''
    Removes '/download' from the end of a list of URLs, if the list exists.
    '''
    return list(map(lambda url: re.match(r'.*(?=/download)', url)[0], urls)) if (urls is not None) else None

In [37]:
# # Test remote WebDriver (run Selenium Grid instance locally before running this cell)
# driver = webdriver.Remote(desired_capabilities=DesiredCapabilities.FIREFOX, options=options)
# driver.get('http://www.google.com')
# driver.close()

In [None]:
# # Test to make sure the driver works
# 
# driver = webdriver.Firefox(options=options) # Instantiate a headless Firefox WebDriver
# for url in urls:
#     driver.get(url)
#     print(driver.title)
#     sleep(5)
    
# driver.close()

In [None]:
# %%timeit
# # Test for retrieving message info from a specific request
# driver = webdriver.Firefox(options=options)
# driver.get('https://sandiego.nextrequest.com/requests/21-4915')
# print(driver.title)

# event_titles = driver.find_elements_by_class_name('event-title')
# event_items = driver.find_elements_by_class_name('event-item')
# times = driver.find_elements_by_class_name('time-quotes')
# for title, item, time in list(zip(event_titles, event_items, times)):
#     print(title.text)
#     print(item.text)
#     print(time.text)
#     print()

# driver.close()

In [None]:
# # Scraping documents test
# driver = webdriver.Firefox(options=options)

# for request_id in [5369, 5313, 5374]:
#     url = 'https://sandiego.nextrequest.com/requests/21-' + str(request_id)
#     driver.get(url)

#     docs = driver.find_element_by_id('public-docs')
#     folders = docs.find_elements_by_class_name('folder-toggle')
#     if folders:
#         for folder in folders:
#             folder.click()
#     else:
#         print('No folders found for', request_id)
    
#     doc_links = docs.find_elements_by_class_name('document-link')
#     display(
#         pd.DataFrame(
#             list(zip(get_webelement_text(doc_links), remove_download_from_urls(get_webelement_link(doc_links)))),
#             columns=['title', 'link']
#         )
#     )
#     display(
#         pd.DataFrame({
#             'title': get_webelement_text(doc_links),
#             'link': remove_download_from_urls(get_webelement_link(doc_links))
#         })
#     )

# driver.close()

In [None]:
# Create a dictionary showing whether a given request was scraped or not (does not account for requests that do not exist)
missing_requests = {i: (('21-' + str(i)) in sd_requests_df['request_id'].to_numpy()) for i in range(id_start, id_range + id_start)}
display(missing_requests)

In [41]:
# Parallelized script
scrape_request = lambda i: scrape_record(urls[0], i)
sd_requests = Parallel(n_jobs=-1, prefer='threads', verbose=10)(delayed(scrape_request)(request_id) for request_id in request_ids)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   33.9s


KeyboardInterrupt: 