My attempt to scrape NextRequest data using the `selenium` library:

In [15]:
!pip install selenium

from joblib import Parallel, delayed
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.common.exceptions import NoSuchElementException

import sys
from tqdm import tqdm
from time import sleep

import pandas as pd
import numpy as np
import re

# Options for the driver
options = Options()
options.headless = True



In [None]:
driver = webdriver.Firefox(options=options) # Instantiate a headless Firefox WebDriver

# # Test to make sure the driver works
# for url in urls:
#     driver.get(url)
#     print(driver.title)
#     sleep(5)
    
# driver.close()

In [None]:
# %%timeit
# # Test for retrieving message info from a specific request
# driver = webdriver.Firefox(options=options)
# driver.get('https://sandiego.nextrequest.com/requests/21-4915')
# print(driver.title)

# event_titles = driver.find_elements_by_class_name('event-title')
# event_items = driver.find_elements_by_class_name('event-item')
# times = driver.find_elements_by_class_name('time-quotes')
# for title, item, time in list(zip(event_titles, event_items, times)):
#     print(title.text)
#     print(item.text)
#     print(time.text)
#     print()

# driver.close()

In [None]:
# # Loop test 
# for i in tqdm(range(5049, 5060)):
#     url = "https://sandiego.nextrequest.com/requests/21-" + str(i) 
#     driver.get(url)
    
#     event_titles = driver.find_elements_by_class_name('event-title')
#     event_items = driver.find_elements_by_class_name('event-item')
#     times = driver.find_elements_by_class_name('time-quotes')
    
#     for title, item, time in list(zip(event_titles, event_items, times)):
#         print(title.text)
#         print(item.text)
#         print(time.text)
#         print()
    
#     print()

In [16]:
def scrape_record(url, request_id):
    '''
    Scrapes data about a given request on a NextRequest request database. For parallelization purposes.
    '''
    driver = webdriver.Firefox(options=options)
    driver.get(url + request_id) # Attempt to access the record
    
    # If the record does not exist, return nothing
    if (request_id not in driver.title):
        return
    
    status, desc, date, depts, poc, events = [None] * 6 # Initialize variables 
    try: # Attempt to scrape relevant data
        status = driver.find_element_by_class_name('request-status-label').text.strip() # Request status
        desc = driver.find_element_by_class_name('request-text.row').text # Request description
        date = driver.find_element_by_class_name('request_date').text # Request date
        depts = driver.find_element_by_class_name('current-department').text.split(',') # Department(s) assigned to the request
        poc = driver.find_element_by_class_name('request-detail').text # Person of contact

        event_titles = driver.find_elements_by_class_name('event-title') # Title for each message
        event_items = driver.find_elements_by_class_name('event-item') # Description for each message
        time_quotes = driver.find_elements_by_class_name('time-quotes') # Time string for each message

        events = pd.DataFrame({ # DataFrame consisting of all messages
            'title': get_webelement_text(event_titles),
            'item': get_webelement_text(event_items), 
            'time': get_webelement_text(time_quotes)
            })
    except NoSuchElementException: # Catch exception thrown if a specific element cannot be found, and silently pass
        pass
    except: # If some other exception occurs, print information about the exception
        print('Unexpected error:', sys.exc_info()[0])
    finally: # Return the request
        driver.close()

        return {'request_id': request_id,
        'status': status,
        'request_desc': desc,
        'request_date': date,
        'depts': depts,
        'poc': poc,
        'msgs': events}

def get_webelement_text(webelement):
    '''
    Gets the text of each web element in a list, if such a list exists
    '''
    return list(map(lambda x: x.text, webelement)) if (webelement is not None) else None

def get_city_from_url(url):
    '''
    Finds the city name from the NextRequest URL
    '''
    return re.findall(r'(?<=https://)[a-zA-Z]*', url)[0]

In [43]:
earliest_year = 21 # Earliest year to search requests for
latest_year = 21 # Latest year to search request for
id_start = 1 # ID value to start from
id_range = 100 # Number of IDs to try for each year
cooldown = 1 # Amount of time, in seconds, to wait between website accesses

# List of all request IDs
request_ids = [str(year) + '-' + str(num) for num in range(id_start, id_range + id_start) 
                                       for year in range(earliest_year, latest_year + 1)]

# URLs to extract data from
urls = ['https://sandiego.nextrequest.com/requests/', 'https://oaklandca.nextrequest.com/requests/']

In [41]:
# Parallelized script
sd_requests = Parallel(n_jobs=-1, prefer='threads', verbose=10)(delayed(lambda i: scrape_record(urls[0], i))(request_id) for request_id in request_ids)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   19.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   30.0s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   52.3s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done  96 out of 100 | elapsed:  4.2min remaining:   10.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  4.3min finished


In [38]:
# Non-parallelized script
driver = webdriver.Firefox(options=options) # Headless (non-visible) Firefox WebDriver

sd_requests = [] # List of dictionaries containing information on each request

for year in range(earliest_year, latest_year + 1):
    for num in tqdm(range(id_start, id_range + id_start)):
        request_id = str(year) + '-' + str(num) # NextRequest request IDs are a two-digit year and a number, with a dash in between
        url = 'https://sandiego.nextrequest.com/requests/' + request_id
        driver.get(url) # Attempt to access the record
        
        # print(driver.title) # For testing purposes
        
        # If the record does not exist, then move on to the next ID
        if (request_id not in driver.title):
            continue
            
        status, desc, date, depts, poc, events = [None] * 6 # Initialize variables 
        try: # Attempt to scrape relevant data
            status = driver.find_element_by_class_name('request-status-label').text.strip() # Request status
            desc = driver.find_element_by_class_name('request-text.row').text # Request description
            date = driver.find_element_by_class_name('request_date').text # Request date
            depts = driver.find_element_by_class_name('current-department').text # Department(s) assigned to the request
            poc = driver.find_element_by_class_name('request-detail').text # Person of contact
        
            event_titles = driver.find_elements_by_class_name('event-title') # Title for each message
            event_items = driver.find_elements_by_class_name('event-item') # Description for each message
            time_quotes = driver.find_elements_by_class_name('time-quotes') # Time string for each message
            
            events = { # Dictionary of all messages
                'title': get_webelement_text(event_titles),
                'item': get_webelement_text(event_items), 
                'time': get_webelement_text(time_quotes)
                } 
        except NoSuchElementException: # Catch exception thrown if a specific element cannot be found, and silently pass
            pass
        except: # If some other exception occurs, print information about the exceptionn
            print('Unexpected error:', sys.exc_info()[0])
        finally: # Add request to list
            sd_requests.append({
                'request_id': request_id,
                'status': status,
                'request_desc': desc,
                'request_date': date,
                'depts': depts,
                'poc': poc,
                'msgs': events
            })

        sleep(cooldown) # Cooldown between requests. TO-DO: can we write a script so that this isn't necessary?
        
driver.close()

100%|██████████| 100/100 [04:16<00:00,  2.56s/it]


In [42]:
sd_requests = [x for x in sd_requests if x['status'] is not None]
sd_requests_df = pd.DataFrame(sd_requests)
sd_requests_df

Unnamed: 0,request_id,status,request_desc,request_date,depts,poc,msgs
0,21-1,CLOSED,What money has been paid to the city of San Di...,"January 1, 2021 via web",[Code Enforcement],Ginger Rodriguez,title \ 0 Req...
1,21-2,CLOSED,Address for Russell and Verna Giles of home th...,"January 1, 2021 via web",[Public Records Administration],Angela Laurita,title \ 0 Req...
2,21-3,CLOSED,9095 Rio San Diego Dr APN: 438051130\nOpen Bui...,"January 1, 2021 via web",[Code Enforcement],Ginger Rodriguez,title \ 0 Req...
3,21-4,CLOSED,9095 San Diego Dr APN: 4380511300\nOpen Fire C...,"January 1, 2021 via web",[Fire-Rescue],Angela Laurita,title \ 0 Req...
4,21-5,CLOSED,9095 Rio San Diego Dr APN: 4380511300\nCertifi...,"January 1, 2021 via web",[Development Services],Ginger Rodriguez,title \ 0 Req...
...,...,...,...,...,...,...,...
94,21-96,CLOSED,Subject: Public Records/OPRA/FOIA Request\nTo ...,"January 7, 2021 via fax",[Department of Finance],Stephanie Hoover,title \ 0 Req...
95,21-97,CLOSED,I'm hoping to get the latest set of data for N...,"January 7, 2021 via web","[Police, Public Records Administration]",Angela Laurita,title \ 0 Req...
96,21-98,CLOSED,"Call logs, notes, recordings, etc. for the fol...","January 7, 2021 via web",[Police],Angela Laurita,title \ 0...
97,21-99,CLOSED,Request information (CAD Reports/notes/logs/et...,"January 7, 2021 via web","[Police, Public Records Administration]",Angela Laurita,title \ 0...


In [40]:
# Example showing how to see each request's message log: the following will display all the messages for request 21-1
msgs_21_1 = sd_requests_df['msgs'][0]
display(msgs_21_1)

{'title': ['Request Published\nPublic',
  'Request Closed\nPublic',
  'Department Assignment\nPublic',
  'Department Assignment\nPublic',
  'Department Assignment\nPublic',
  'Request Opened\nPublic'],
 'item': ['',
  'No responsive documents\nThe City of San Diego has no responsive documents.',
  'Removed: Office of the Independent Budget Analyst.',
  'Added: Code Enforcement.',
  'Office of the Independent Budget Analyst',
  'Request received via web'],
 'time': ['January 15, 2021, 9:37pm',
  'January 12, 2021, 2:37pm by Ginger Rodriguez, Public Records Administration Coordinator',
  'January 5, 2021, 7:04am by Angela Laurita, Public Records Administration Manager',
  'January 4, 2021, 6:58am by Lori Hernandez, Public Records Administration-Program Coordinator',
  'January 1, 2021, 9:52am',
  'January 1, 2021, 9:52am']}

In [36]:
# Create a dictionary showing whether a given request was scraped or not (does not account for requests that do not exist)
missing_requests = {i: (('21-' + str(i)) in sd_requests_df['request_id'].to_numpy()) for i in range(id_start, id_range + id_start)}
display(missing_requests)

{1: True,
 2: True,
 3: True,
 4: True,
 5: True,
 6: True,
 7: True,
 8: True,
 9: True,
 10: True,
 11: True,
 12: True,
 13: True,
 14: True,
 15: True,
 16: True,
 17: True,
 18: True,
 19: True,
 20: True,
 21: True,
 22: True,
 23: True,
 24: True,
 25: True,
 26: True,
 27: True,
 28: True,
 29: True,
 30: True,
 31: True,
 32: True,
 33: True,
 34: True,
 35: True,
 36: True,
 37: True,
 38: True,
 39: True,
 40: True,
 41: True,
 42: True,
 43: True,
 44: True,
 45: True,
 46: True,
 47: True,
 48: True,
 49: True,
 50: True,
 51: True,
 52: True,
 53: True,
 54: True,
 55: True,
 56: True,
 57: True,
 58: True,
 59: True,
 60: True,
 61: True,
 62: True,
 63: True,
 64: True,
 65: True,
 66: True,
 67: True,
 68: True,
 69: True,
 70: True,
 71: True,
 72: True,
 73: False,
 74: True,
 75: True,
 76: True,
 77: True,
 78: True,
 79: True,
 80: True,
 81: True,
 82: True,
 83: True,
 84: True,
 85: True,
 86: True,
 87: True,
 88: True,
 89: True,
 90: True,
 91: True,
 92: Tr

In [None]:
num = 10
ID = '21-' + str(num)

print('{} present in sd_requests_df?: {}'.format(ID, missing_requests[num]))
display(sd_requests_df.query('request_id == @ID'))

### TO-DO
- **Figure out a way to avoid requests being skipped over - increase cooldown time?**
- Changes data structure for 'Messages' column to dict
- Include information about documents - scrape info about them from requests, scrape the database of documents
- Run on Google Colab?