In [35]:
!pip install selenium

from joblib import Parallel, delayed
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.firefox.options import Options
from selenium.common.exceptions import NoSuchElementException

import sys
from tqdm import tqdm
from time import sleep

import pandas as pd
import numpy as np
import re

# Options for the driver
options = Options()
options.headless = True



### TO-DO
- Figure out if [Selenium Grid](https://www.selenium.dev/documentation/grid/) can potentially improve the performance of the scraper
- Run on Google Colab/Microsoft Azure/local desktop/some other place?

In [46]:
. # Keep this cell to prevent the rest of the notebook from automatically running

SyntaxError: invalid syntax (<ipython-input-46-b2026be2226a>, line 1)

In [107]:
# Options for scraping
earliest_year = 18 # Earliest year to search requests for
latest_year = 18 # Latest year to search request for
id_start = 2511 # ID value to start from
id_range = 3 # Number of IDs to try for each year
cooldown = 1 # Amount of time, in seconds, to wait between website accesses

# List of all request IDs
request_ids = [str(year) + '-' + str(num) for num in range(id_start, id_range + id_start) 
                                       for year in range(earliest_year, latest_year + 1)]

# URLs to extract data from
urls = ['https://sandiego.nextrequest.com/requests/', 'https://oaklandca.nextrequest.com/requests/']

The following iterative script uses the `selenium` library to, in theory, scrape every request from [San Diego's NextRequest database](https://sandiego.nextrequest.com/requests). It does so by using the fact that every request has its own unique URL i.e. the request with ID 'yy-xxxx' will be found at '.../requests/yy-xxxx'. From each request webpage, the following information is extracted:
- `request_id` (str): ID of the request, yy-xxxx
- `status` (str): Whether the request is opened or closed. Always takes on a value of either 'closed' or 'open'
- `request_desc` (str): Description of the request provided by the requester
- `request_date` (str): Initial request date
- `depts` (str): Current departments assigned to the requests (may not be the ones the requester had initially)
- `docs` (DataFrame): All documents attached to the request, if there are any
    - `title` (str): Title given to each document
    - `link` (str): Link to each document
- `poc` (str): Person of contact
- `msgs` (DataFrame): All messages attached to the requests
    - `title` (str): Title of each message
    - `item` (str): Message body
    - `time` (str): Date of each message

In [108]:
# Iterative script
driver = webdriver.Firefox(options=options) # Headless (non-visible) WebDriver

sd_requests = [] # List of dictionaries containing information on each request
i = 0 # Index for URLs - may be useful later for scraping multiple sites

for year in range(earliest_year, latest_year + 1):
    for num in tqdm(range(id_start, id_range + id_start)):
        request_id = str(year) + '-' + str(num) # NextRequest request IDs are a two-digit year and a number, with a dash in between
        url = urls[i] + request_id
        driver.get(url) # Attempt to access the record
        
        # print(driver.title) # For testing purposes
        
        # If the record does not exist, then move on to the next ID
        if (request_id not in driver.title):
            sleep(cooldown)
            continue
            
        status, desc, date, depts, docs, poc, events = [None] * 7 # Initialize variables 
        try: # Attempt to scrape relevant data
            status = driver.find_element_by_class_name('request-status-label').text.strip() # Request status
            desc = driver.find_element_by_class_name('request-text.row').text # Request description
            date = driver.find_element_by_class_name('request_date').text # Request date
            depts = driver.find_element_by_class_name('current-department').text # Department(s) currently assigned to the request
            poc = driver.find_element_by_class_name('request-detail').text # Person of contact

            # Messages recorded on the request page, if there are any
            event_titles = get_webelement_text(driver.find_elements_by_class_name('event-title')) # Title for each message
            event_items = get_webelement_text(driver.find_elements_by_class_name('event-item')) # Description for each message
            time_quotes = get_webelement_text(driver.find_elements_by_class_name('time-quotes')) # Time string for each message
            events = pd.DataFrame({ # DataFrame consisting of all messages
                'title': event_titles,
                'item': event_items,
                'time': time_quotes
                })

            # Documents attached to the request, if there are any
            public_docs = driver.find_element_by_id('public-docs') # WebElement containing the documents
            folders = public_docs.find_elements_by_class_name('folder-toggle') # Expand folders, if there are any
            if folders:
                for folder in folders:
                    folder.click()
            doc_links = public_docs.find_elements_by_class_name('document-link') # Links to documents
            docs = pd.DataFrame({ # DataFrame consisting of all documents
                'title': get_webelement_text(doc_links),
                'link': remove_download_from_urls(get_webelement_link(doc_links))
                })
        except NoSuchElementException: # Catch exception thrown if a specific element cannot be found, and silently pass
            pass
        except: # If some other exception occurs, print information about the exception
            print('Unexpected error:', sys.exc_info())
        finally: # Add request to list
            sd_requests.append({
                'request_id': request_id,
                'status': status,
                'request_desc': desc,
                'request_date': date,
                'depts': depts,
                'docs': docs,
                'poc': poc,
                'msgs': events
            })

        sleep(cooldown) # Cooldown between requests
    sleep(cooldown)
        
driver.close()

# sd_requests = [x for x in sd_requests if x['status'] is not None] # Remove entries with incomplete information
# sd_requests_df = pd.DataFrame(sd_requests) # Convert to DataFrame

# # Create a zipped CSV file of the data
# compression_opts = dict(method='zip', archive_name='sd_requests.csv')
# sd_requests_df.to_csv('data/sd_requests.zip', index=False, compression=compression_opts)

 33%|███▎      | 1/3 [00:04<00:09,  4.71s/it]

Unexpected error: (<class 'ValueError'>, ValueError('arrays must all be same length'), <traceback object at 0x7fa5939b75c0>)


100%|██████████| 3/3 [00:08<00:00,  2.96s/it]


Some requests threw a `ValueError` because the length of the `event_items` array was too long, so the `msgs` DataFrame (and subsequently the `docs` DataFrame) could not be created:

In [122]:
sd_requests_df[sd_requests_df['msgs'].isna()]

Unnamed: 0,request_id,status,request_desc,request_date,depts,docs,poc,msgs
8223,18-2512,CLOSED,Rehabilitation of Anthracite Media Beds (RE-BI...,"July 2, 2018 via email",Purchasing & Contracting,,Ginger Rodriguez,
8306,18-2595,CLOSED,"July 12, 2018 ...","July 12, 2018 via web",Engineering and Capital Projects,,Ginger Rodriguez,
9268,18-3558,CLOSED,"On behalf of my publisher, I am requesting the...","September 21, 2018 via web",Police,,Angela Laurita,
9767,18-4089,CLOSED,I would like to request a copy of the San Dieg...,"October 30, 2018 via web",Animal Services,,Ginger Rodriguez,
9768,18-4090,CLOSED,"Hi, \nCan I request records for case #78924?\n...","October 30, 2018 via web",Animal Services,,Ginger Rodriguez,
10628,19-222,CLOSED,"January 16, 2019 ...","January 16, 2019 via web",Engineering and Capital Projects,,Lori Hernandez,
10694,19-288,CLOSED,re: Balboa Avenue Station Area Specific Plan /...,"January 18, 2019 via web",Planning,,Ginger Rodriguez,
11543,19-1192,CLOSED,I would like to request logs for adult arrests...,"March 19, 2019 via web",Police,,Angela Laurita,
11728,19-1378,CLOSED,911 police report #19010007515 January 5th 2019,"April 1, 2019 via web",Police,,Angela Laurita,
14132,19-3900,CLOSED,"Pursuant to the California Public Records Act,...","August 19, 2019 via web",City Attorney,,Nancy Shapiro,


More seriously, a significant proportion of requests were not included due to an undesirable behavior of the driver: if it has been running at a rate of less than 2 seconds per iteration, it will begin to rapidly skip over many consecutive IDs for a while, even if some the IDs are valid. Thus, any requests that were in one of these consecutive runs would not have been scraped.

In [119]:
len(sd_requests)

26788

As of Monday, November 1, 7 PM, there are 28817 requests in the San Diego NextRequest database, meaning that the scraper successfully scraped about 92.5\% of all the requests. We can roughly figure out the ranges of the remaining 7.5\% that were not scraped:

In [81]:
# Get all the requests the scraper managed to scrape
scraped_requests = sd_requests_df['request_id'].to_numpy()
scraped_requests

array(['16-1', '16-2', '16-3', ..., '21-5552', '21-5565', '21-5571'],
      dtype=object)

In [83]:
# Determine the highest number among all the scraped requests
id_max = max(list(map(lambda x: int(x.split('-')[1]), scraped_requests)))
id_max

6121

In [90]:
# Find (roughly) the ranges for all the requests that were not scraped
request_ids_max = [str(year) + '-' + str(num) for num in range(id_start, id_max) 
                                       for year in range(earliest_year, latest_year + 1)]
unscraped_requests = [request_id for request_id in request_ids_max if request_id not in scraped_requests]

['17-1',
 '18-1',
 '19-1',
 '20-1',
 '21-1',
 '17-2',
 '18-2',
 '19-2',
 '20-2',
 '21-2']

In [120]:
sorted(unscraped_requests)[:10]

['16-1816',
 '16-2534',
 '16-2535',
 '16-2536',
 '16-2537',
 '16-2538',
 '16-2539',
 '16-2540',
 '16-2541',
 '16-2542']

(Note that some of these requests were not scraped because they are not publically accessible. One such request is [16-1816](https://sandiego.nextrequest.com/requests/16-1816).)

We will attempt to add the excluded requests by performing a second iteration of the scraper, this time on our `unscraped_requests` list:

In [94]:
driver = webdriver.Firefox(options=options)

for request_id in tqdm(unscraped_requests):
    url = urls[i] + request_id
    driver.get(url) # Attempt to access the record

    # If the record does not exist, then move on to the next ID
    if (request_id not in driver.title):
        sleep(cooldown)
        continue

    status, desc, date, depts, docs, poc, events = [None] * 7 # Initialize variables 
    try: # Attempt to scrape relevant data
        status = driver.find_element_by_class_name('request-status-label').text.strip() # Request status
        desc = driver.find_element_by_class_name('request-text.row').text # Request description
        date = driver.find_element_by_class_name('request_date').text # Request date
        depts = driver.find_element_by_class_name('current-department').text # Department(s) assigned to the request
        poc = driver.find_element_by_class_name('request-detail').text # Person of contact

        # Messages recorded on the request page, if there are any
        event_titles = get_webelement_text(driver.find_elements_by_class_name('event-title')) # Title for each message
        event_items = get_webelement_text(driver.find_elements_by_class_name('event-item')) # Description for each message
        time_quotes = get_webelement_text(driver.find_elements_by_class_name('time-quotes')) # Time string for each message
        events = pd.DataFrame({ # DataFrame consisting of all messages
            'title': event_titles,
            'item': event_items,
            'time': time_quotes
            })

        # Documents attached to the request, if there are any
        public_docs = driver.find_element_by_id('public-docs') # WebElement containing the documents
        folders = public_docs.find_elements_by_class_name('folder-toggle') # Expand folders, if there are any
        if folders:
            for folder in folders:
                folder.click()
        doc_links = public_docs.find_elements_by_class_name('document-link') # Links to documents
        docs = pd.DataFrame({ # DataFrame consisting of all documents
            'title': get_webelement_text(doc_links),
            'link': remove_download_from_urls(get_webelement_link(doc_links))
            })
    except NoSuchElementException: # Catch exception thrown if a specific element cannot be found, and silently pass
        pass
    except: # If some other exception occurs, print information about the exception
        print('Unexpected error:', sys.exc_info())
    finally: # Add request to list
        sd_requests.append({
            'request_id': request_id,
            'status': status,
            'request_desc': desc,
            'request_date': date,
            'depts': depts,
            'docs': docs,
            'poc': poc,
            'msgs': events
        })

    sleep(cooldown) # Cooldown between requests
    
driver.close()

  1%|          | 72/9933 [03:07<7:07:41,  2.60s/it] 


KeyboardInterrupt: 

In [47]:
# Scraper functions
def scrape_record(url, request_id):
    '''
    Scrapes data about a given request on a NextRequest request database. For use with parallelization.
    '''
    driver = webdriver.Remote(desired_capabilities=DesiredCapabilities.FIREFOX, options=options)
    driver.get(url + request_id) # Attempt to access the record
    
    # If the record does not exist, return nothing
    if (request_id not in driver.title):
        return
    
    status, desc, date, depts, docs, poc, events = [None] * 7 # Initialize variables 
    try: # Attempt to scrape relevant data
        status = driver.find_element_by_class_name('request-status-label').text.strip() # Request status
        desc = driver.find_element_by_class_name('request-text.row').text # Request description
        date = driver.find_element_by_class_name('request_date').text # Request date
        depts = driver.find_element_by_class_name('current-department').text # Department(s) assigned to the request
        poc = driver.find_element_by_class_name('request-detail').text # Person of contact

        # Messages recorded on the request page, if there are any
        event_titles = get_webelement_text(driver.find_elements_by_class_name('event-title')) # Title for each message
        event_items = get_webelement_text(driver.find_elements_by_class_name('event-item')) # Description for each message
        time_quotes = get_webelement_text(driver.find_elements_by_class_name('time-quotes')) # Time string for each message
        events = pd.DataFrame({ # DataFrame consisting of all messages
            'title': event_titles,
            'item': event_items,
            'time': time_quotes
            })
        
        # Documents attached to the request, if there are any
        public_docs = driver.find_element_by_id('public-docs') # WebElement containing the documents
        folders = public_docs.find_elements_by_class_name('folder-toggle') # Expand folders, if there are any
        if folders:
            for folder in folders:
                folder.click()
        doc_links = public_docs.find_elements_by_class_name('document-link') # Links to documents
        docs = pd.DataFrame({ # DataFrame consisting of all documents
            'title': get_webelement_text(doc_links),
            'link': remove_download_from_urls(get_webelement_link(doc_links))
            })
    except NoSuchElementException: # Catch exception thrown if a specific element cannot be found, and silently pass
        pass
    except: # If some other exception occurs, print information about the exception
        print('Unexpected error:', sys.exc_info()[2])
    finally: # Return the request
        driver.close()

        return {
            'request_id': request_id,
            'status': status,
            'request_desc': desc,
            'request_date': date,
            'depts': depts,
            'docs': docs,
            'poc': poc,
            'msgs': events
            }
    
def get_city_from_url(url):
    '''
    Finds the city name from the NextRequest URL.
    '''
    return re.match(r'(?<=https://)[a-zA-Z]*', url)[0]

def get_webelement_text(webelement):
    '''
    Gets the text of each web element in a list, if such a list exists.
    '''
    return list(map(lambda x: x.text, webelement)) if webelement else None

def get_webelement_link(webelement):
    '''
    Gets the link of each web element in a list, if such a list exists.
    '''
    return list(map(lambda x: x.get_attribute('href'), webelement)) if webelement else None

def remove_download_from_urls(urls):
    '''
    Removes '/download' from the end of a list of URLs, if the list exists.
    '''
    return list(map(lambda url: re.match(r'.*(?=/download)', url)[0], urls)) if urls else None

In [37]:
# # Test remote WebDriver (run Selenium Grid instance locally before running this cell)
# driver = webdriver.Remote(desired_capabilities=DesiredCapabilities.FIREFOX, options=options)
# driver.get('http://www.google.com')
# driver.close()

In [None]:
# # Test to make sure the driver works
# 
# driver = webdriver.Firefox(options=options) # Instantiate a headless Firefox WebDriver
# for url in urls:
#     driver.get(url)
#     print(driver.title)
#     sleep(5)
    
# driver.close()

In [None]:
# %%timeit
# # Test for retrieving message info from a specific request
# driver = webdriver.Firefox(options=options)
# driver.get('https://sandiego.nextrequest.com/requests/21-4915')
# print(driver.title)

# event_titles = driver.find_elements_by_class_name('event-title')
# event_items = driver.find_elements_by_class_name('event-item')
# times = driver.find_elements_by_class_name('time-quotes')
# for title, item, time in list(zip(event_titles, event_items, times)):
#     print(title.text)
#     print(item.text)
#     print(time.text)
#     print()

# driver.close()

In [None]:
# # Scraping documents test
# driver = webdriver.Firefox(options=options)

# for request_id in [5369, 5313, 5374]:
#     url = 'https://sandiego.nextrequest.com/requests/21-' + str(request_id)
#     driver.get(url)

#     docs = driver.find_element_by_id('public-docs')
#     folders = docs.find_elements_by_class_name('folder-toggle')
#     if folders:
#         for folder in folders:
#             folder.click()
#     else:
#         print('No folders found for', request_id)
    
#     doc_links = docs.find_elements_by_class_name('document-link')
#     display(
#         pd.DataFrame(
#             list(zip(get_webelement_text(doc_links), remove_download_from_urls(get_webelement_link(doc_links)))),
#             columns=['title', 'link']
#         )
#     )
#     display(
#         pd.DataFrame({
#             'title': get_webelement_text(doc_links),
#             'link': remove_download_from_urls(get_webelement_link(doc_links))
#         })
#     )

# driver.close()

In [41]:
# Parallelized script
scrape_request = lambda i: scrape_record(urls[0], i)
sd_requests = Parallel(n_jobs=-1, prefer='threads', verbose=10)(delayed(scrape_request)(request_id) for request_id in request_ids)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   33.9s


KeyboardInterrupt: 

In [None]:
# Copy of non-parallelized script, for testing purposes

# Options for scraping
earliest_year = 18 # Earliest year to search requests for
latest_year = 18 # Latest year to search request for
id_start = 2511 # ID value to start from
id_range = 3 # Number of IDs to try for each year
cooldown = 1 # Amount of time, in seconds, to wait between website accesses

# List of all request IDs
request_ids = [str(year) + '-' + str(num) for num in range(id_start, id_range + id_start) 
                                       for year in range(earliest_year, latest_year + 1)]

# URLs to extract data from
urls = ['https://sandiego.nextrequest.com/requests/', 'https://oaklandca.nextrequest.com/requests/']

driver = webdriver.Firefox(options=options) # Headless (non-visible) WebDriver

sd_requests = [] # List of dictionaries containing information on each request
i = 0 # Index for URLs - may be useful later for scraping multiple sites

for year in range(earliest_year, latest_year + 1):
    for num in tqdm(range(id_start, id_range + id_start)):
        request_id = str(year) + '-' + str(num) # NextRequest request IDs are a two-digit year and a number, with a dash in between
        url = urls[i] + request_id
        driver.get(url) # Attempt to access the record
        
        # print(driver.title) # For testing purposes
        
        # If the record does not exist, then move on to the next ID
        if (request_id not in driver.title):
            sleep(cooldown)
            continue
            
        status, desc, date, depts, docs, poc, events = [None] * 7 # Initialize variables 
        try: # Attempt to scrape relevant data
            status = driver.find_element_by_class_name('request-status-label').text.strip() # Request status
            desc = driver.find_element_by_class_name('request-text.row').text # Request description
            date = driver.find_element_by_class_name('request_date').text # Request date
            depts = driver.find_element_by_class_name('current-department').text # Department(s) assigned to the request
            poc = driver.find_element_by_class_name('request-detail').text # Person of contact

            # Messages recorded on the request page, if there are any
            event_titles = get_webelement_text(driver.find_elements_by_class_name('event-title')) # Title for each message
            event_items = get_webelement_text(driver.find_elements_by_class_name('event-item')) # Description for each message
            time_quotes = get_webelement_text(driver.find_elements_by_class_name('time-quotes')) # Time string for each message
            events = pd.DataFrame({ # DataFrame consisting of all messages
                'title': event_titles,
                'item': event_items,
                'time': time_quotes
                })

            # Documents attached to the request, if there are any
            public_docs = driver.find_element_by_id('public-docs') # WebElement containing the documents
            folders = public_docs.find_elements_by_class_name('folder-toggle') # Expand folders, if there are any
            if folders:
                for folder in folders:
                    folder.click()
            doc_links = public_docs.find_elements_by_class_name('document-link') # Links to documents
            docs = pd.DataFrame({ # DataFrame consisting of all documents
                'title': get_webelement_text(doc_links),
                'link': remove_download_from_urls(get_webelement_link(doc_links))
                })
        except NoSuchElementException: # Catch exception thrown if a specific element cannot be found, and silently pass
            pass
        except: # If some other exception occurs, print information about the exception
            print('Unexpected error:', sys.exc_info())
        finally: # Add request to list
            sd_requests.append({
                'request_id': request_id,
                'status': status,
                'request_desc': desc,
                'request_date': date,
                'depts': depts,
                'docs': docs,
                'poc': poc,
                'msgs': events
            })

        sleep(cooldown) # Cooldown between requests
    sleep(cooldown)
        
driver.close()