My attempt to scrape NextRequest data using the `selenium` library:

In [1]:
!pip install selenium

import re
from time import sleep

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from tqdm import tqdm

# Options for the driver
options = Options()
# Request headless mode through a browser argument. The `options.headless`
# setter is deprecated in Selenium 4 and absent from later releases, where
# assigning to it would silently leave the browser window visible.
options.add_argument('-headless')



In [2]:
driver = webdriver.Firefox(options=options) # Instantiate a headless Firefox WebDriver

# # Test to make sure the driver works
# # (NOTE: `urls` is defined in a later cell, so that cell must be run first)
# for url in urls:
#     driver.get(url)
#     print(driver.title)
#     sleep(5)
    
# driver.close()

In [3]:
# Base "requests" pages of the NextRequest portals used for testing
urls = [
    'https://sandiego.nextrequest.com/requests',
    'https://oaklandca.nextrequest.com/requests',
]

In [11]:
# Smoke test: load one specific request page from the first portal and show its title
driver.get('{}/{}'.format(urls[0], '21-4915'))
print(driver.title)

# Scratch code for printing the message info of that request:
# event_titles = driver.find_elements_by_class_name('event-title')
# event_items = driver.find_elements_by_class_name('event-item')
# times = driver.find_elements_by_class_name('time-quotes')
# for title, item, time in list(zip(event_titles, event_items, times)):
#     print(title.text)
#     print(item.text)
#     print(time.text)
#     print()

Request 21-4915 - NextRequest - Modern FOIA & Public Records Request Software


In [None]:
# Test: print the event title, event item text and time of every message
# found on a handful of consecutive request pages
for i in tqdm(range(5049, 5060)):
    url = "https://sandiego.nextrequest.com/requests/21-" + str(i)
    driver.get(url)

    # find_elements(By.CLASS_NAME, ...) replaces the find_elements_by_class_name
    # helper, which is no longer available in current Selenium 4 releases
    event_titles = driver.find_elements(By.CLASS_NAME, 'event-title')
    event_items = driver.find_elements(By.CLASS_NAME, 'event-item')
    times = driver.find_elements(By.CLASS_NAME, 'time-quotes')

    # zip stops at the shortest list, so unmatched trailing elements are skipped
    for title, item, timestamp in zip(event_titles, event_items, times):
        print(title.text)
        print(item.text)
        print(timestamp.text)
        print()

    print()

In [40]:
# Search window for the scrape: two-digit years (both ends inclusive) and the
# number of sequential request numbers to try within each year
id_range = 100      # Request numbers 1..id_range are tried for every year
earliest_year = 21  # First year to search requests for
latest_year = 21    # Last year to search requests for

In [31]:
def get_text(webelement):
    """Return the ``.text`` of every element in ``webelement`` as a list.

    Args:
        webelement: Iterable of objects exposing a ``.text`` attribute
            (e.g. the list returned by a ``find_elements`` call).

    Returns:
        list: The text of each element, in the original order.

    The texts are read immediately instead of through a lazy ``map``: a lazy
    iterator can be consumed only once, and it would read ``.text`` only when
    iterated, by which time the driver may already have left the page.
    """
    return [element.text for element in webelement]

In [46]:
driver = webdriver.Firefox(options=options) # Headless (non-visible) Firefox WebDriver

sd_requests = [] # List of dictionaries containing information on each request

try:
    for year in range(earliest_year, latest_year + 1):
        for num in tqdm(range(1, id_range + 1)):
            request_id = str(year) + '-' + str(num) # NextRequest request IDs are a two-digit year and a number, with a dash in between
            url = 'https://sandiego.nextrequest.com/requests/' + request_id
            driver.get(url) # Attempt to access the record

            # The record is treated as existing only if its ID appears in the page title
            if request_id in driver.title:
                # Scrape relevant data.
                # find_element(By..., ...) replaces the find_element_by_* helpers,
                # which are no longer available in current Selenium 4 releases.
                status = driver.find_element(By.CLASS_NAME, 'request-status-label').text.strip()
                # Matching two classes at once needs a CSS selector, not a single class name
                desc = driver.find_element(By.CSS_SELECTOR, '.request-text.row').text
                date = driver.find_element(By.CLASS_NAME, 'request_date').text
                depts = driver.find_element(By.CLASS_NAME, 'current-department').text
                poc = driver.find_element(By.CLASS_NAME, 'request-detail').text

                event_titles = driver.find_elements(By.CLASS_NAME, 'event-title')
                event_items = driver.find_elements(By.CLASS_NAME, 'event-item')
                time_quotes = driver.find_elements(By.CLASS_NAME, 'time-quotes')
                # One (title, item, time) tuple per message; zip stops at the shortest list
                events = list(zip(get_text(event_titles), get_text(event_items), get_text(time_quotes)))

                sd_requests.append({
                    'Request ID': request_id,
                    'Status': status,
                    'Request description': desc,
                    'Request date': date,
                    'Departments': depts,
                    'Point of Contact': poc,
                    'Messages': events
                    })

            # Pause after every page load, including IDs that do not exist, so a
            # run of missing records is not requested back-to-back without delay
            sleep(0.5)
finally:
    # Always shut the browser down, even if scraping fails part-way.
    # quit() ends the session and stops the driver process, whereas close()
    # only closes the current window. Records collected so far stay in sd_requests.
    driver.quit()

100%|██████████| 100/100 [01:37<00:00,  1.03it/s]


In [47]:
# Tabulate the scraped records: one row per request, one column per dictionary key
sd_requests_df = pd.DataFrame(data=sd_requests)
display(sd_requests_df)

Unnamed: 0,Request ID,Status,Request description,Request date,Departments,Point of Contact,Messages
0,21-1,CLOSED,What money has been paid to the city of San Di...,"January 1, 2021 via web",Code Enforcement,Ginger Rodriguez,"[(Request Published\nPublic, , January 15, 202..."
1,21-2,CLOSED,Address for Russell and Verna Giles of home th...,"January 1, 2021 via web",Public Records Administration,Angela Laurita,"[(Request Published\nPublic, , January 7, 2021..."
2,21-3,CLOSED,9095 Rio San Diego Dr APN: 438051130\nOpen Bui...,"January 1, 2021 via web",Code Enforcement,Ginger Rodriguez,"[(Request Published\nPublic, , January 7, 2021..."
3,21-4,CLOSED,9095 San Diego Dr APN: 4380511300\nOpen Fire C...,"January 1, 2021 via web",Fire-Rescue,Angela Laurita,"[(Request Published\nPublic, , January 8, 2021..."
4,21-5,CLOSED,9095 Rio San Diego Dr APN: 4380511300\nCertifi...,"January 1, 2021 via web",Development Services,Ginger Rodriguez,"[(Request Published\nPublic, , January 15, 202..."
...,...,...,...,...,...,...,...
60,21-61,CLOSED,I would like a copy of all email communication...,"January 6, 2021 via web","Police, Public Records Administration",Angela Laurita,"[(Request Published\nPublic, , January 18, 202..."
61,21-62,CLOSED,I Would like a copy fo any land surveys done i...,"January 6, 2021 via web",Public Records Administration,Angela Laurita,"[(Request Published\nPublic, , January 9, 2021..."
62,21-63,CLOSED,CAD Logs and event information for the SDPD an...,"January 6, 2021 via web",Police,Angela Laurita,"[(Request Published\nPublic, , January 9, 2021..."
63,21-64,CLOSED,This firm is performing a Phase I Environmenta...,"January 6, 2021 via web",Public Utilities,Stephanie Hoover,"[(Request Published\nPublic, , February 21, 20..."


**TO-DO**
- Find HTML elements for the following: department name, request reason, points of contact (name of requester), documents
- Modify loop to try and scrape all requests
- Run on Google Colab?