My attempt to scrape NextRequest data using the `selenium` library:

In [20]:
!pip install selenium

import re
from time import sleep

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from tqdm import tqdm

# Options for the driver.
# Headless mode is requested through the Firefox command-line flag instead of
# `options.headless = True`: that property was deprecated and later removed in Selenium 4,
# and on releases without it the assignment just sets an unused attribute, so Firefox
# would open a visible window. The flag works on both Selenium 3 and 4.
options = Options()
options.add_argument('-headless')



In [8]:
# Launch Firefox using the `options` object configured in the setup cell.
# This driver instance is the one used (and then closed) by the single-request test cell further down.
driver = webdriver.Firefox(options=options) # Instantiate a headless Firefox WebDriver

# # Test to make sure the driver works
# # (disabled; `urls` is defined in the following cell, so that cell must be run before enabling this)
# for url in urls:
#     driver.get(url)
#     print(driver.title)
#     sleep(5)
    
# driver.close()

In [9]:
# Base "requests" listing URL of each NextRequest portal used for testing
urls = [
    'https://sandiego.nextrequest.com/requests',
    'https://oaklandca.nextrequest.com/requests',
]

In [None]:
# Test for retrieving message info from a specific request
try:
    driver.get(urls[0] + '/21-4915')
    print(driver.title)

    # event_titles = driver.find_elements_by_class_name('event-title')
    # event_items = driver.find_elements_by_class_name('event-item')
    # times = driver.find_elements_by_class_name('time-quotes')
    # for title, item, time in list(zip(event_titles, event_items, times)):
    #     print(title.text)
    #     print(item.text)
    #     print(time.text)
    #     print()
finally:
    # quit() (rather than close()) also shuts down the geckodriver process, and the
    # finally clause guarantees the browser is released even if loading the page raises.
    driver.quit()

In [None]:
# # Loop test (disabled): print every message of a small range of San Diego request IDs
# for i in tqdm(range(5049, 5060)):
#     url = "https://sandiego.nextrequest.com/requests/21-" + str(i) 
#     driver.get(url)
    
#     event_titles = driver.find_elements_by_class_name('event-title')
#     event_items = driver.find_elements_by_class_name('event-item')
#     times = driver.find_elements_by_class_name('time-quotes')
    
#     for title, item, time in list(zip(event_titles, event_items, times)):
#         print(title.text)
#         print(item.text)
#         print(time.text)
#         print()
    
#     print()

In [31]:
# Two-digit years to search, both bounds inclusive
earliest_year = 21
latest_year = 21

# For each year, `id_range` consecutive request numbers are tried, beginning at `id_start`
id_start = 1
id_range = 500

def get_text(webelement):
    """Lazily yield the `.text` of every item in `webelement`.

    `webelement` is an iterable of objects exposing a `.text` attribute (the notebook
    passes the lists returned by the driver's find-elements calls). Returns a `map`
    iterator over those texts, or None when `webelement` itself is None.
    """
    if webelement is None:
        return None
    return map(lambda element: element.text, webelement)

In [32]:
def _find_text(driver, class_name):
    """Return the text of the first element matching `class_name`, or None if the page has none.

    Each field is looked up on its own so that one missing element does not blank out
    every field that would have been read after it.
    """
    try:
        # find_element(By.CLASS_NAME, ...) is available in both Selenium 3 and 4
        # (the find_element_by_* helpers were removed in Selenium 4.3).
        return driver.find_element(By.CLASS_NAME, class_name).text
    except NoSuchElementException: # Element is not present on this page
        return None


driver = webdriver.Firefox(options=options) # Headless (non-visible) Firefox WebDriver

sd_requests = [] # List of dictionaries containing information on each request
sd_skipped_ids = [] # IDs whose page title did not contain the request ID, kept so they can be inspected or retried

try:
    for year in range(earliest_year, latest_year + 1):
        for num in tqdm(range(id_start, id_range + id_start)):
            request_id = str(year) + '-' + str(num) # NextRequest request IDs are a two-digit year and a number, with a dash in between
            url = 'https://sandiego.nextrequest.com/requests/' + request_id
            driver.get(url) # Attempt to access the record

            # print(driver.title) # For testing purposes

            if request_id in driver.title:
                status = _find_text(driver, 'request-status-label') # Request status
                if status is not None:
                    status = status.strip()

                # find_elements returns an empty list (it does not raise) when nothing matches
                event_titles = driver.find_elements(By.CLASS_NAME, 'event-title') # Title for each message
                event_items = driver.find_elements(By.CLASS_NAME, 'event-item') # Description for each message
                time_quotes = driver.find_elements(By.CLASS_NAME, 'time-quotes') # Time string for each message
                events = list(zip(get_text(event_titles), get_text(event_items), get_text(time_quotes))) # List of all messages. TO-DO: Convert into a dictionary

                sd_requests.append({
                    'Request ID': request_id,
                    'Status': status,
                    'Request description': _find_text(driver, 'request-text.row'),
                    'Request date': _find_text(driver, 'request_date'),
                    'Departments': _find_text(driver, 'current-department'), # Department(s) assigned to the request
                    'Point of Contact': _find_text(driver, 'request-detail'),
                    'Messages': events
                })
            else:
                # The record does not exist (or the page did not load as expected); remember the ID and move on
                sd_skipped_ids.append(request_id)

            # Cooldown between requests. It now also runs after a skipped ID, so a run of
            # misses no longer hits the server back-to-back.
            # TO-DO: can we write a script so that this isn't necessary?
            sleep(0.5)
finally:
    # Always release the browser, even if the scrape is interrupted or raises;
    # quit() also stops the geckodriver process, which close() leaves running.
    driver.quit()

100%|██████████| 500/500 [06:56<00:00,  1.20it/s]


In [33]:
# One row per scraped request; these results were collected using a cooldown period of 0.5 seconds
sd_requests_df = pd.DataFrame(data=sd_requests)
display(sd_requests_df)

Unnamed: 0,Request ID,Status,Request description,Request date,Departments,Point of Contact,Messages
0,21-1,CLOSED,What money has been paid to the city of San Di...,"January 1, 2021 via web",Code Enforcement,Ginger Rodriguez,"[(Request Published\nPublic, , January 15, 202..."
1,21-2,CLOSED,Address for Russell and Verna Giles of home th...,"January 1, 2021 via web",Public Records Administration,Angela Laurita,"[(Request Published\nPublic, , January 7, 2021..."
2,21-3,CLOSED,9095 Rio San Diego Dr APN: 438051130\nOpen Bui...,"January 1, 2021 via web",Code Enforcement,Ginger Rodriguez,"[(Request Published\nPublic, , January 7, 2021..."
3,21-4,CLOSED,9095 San Diego Dr APN: 4380511300\nOpen Fire C...,"January 1, 2021 via web",Fire-Rescue,Angela Laurita,"[(Request Published\nPublic, , January 8, 2021..."
4,21-5,CLOSED,9095 Rio San Diego Dr APN: 4380511300\nCertifi...,"January 1, 2021 via web",Development Services,Ginger Rodriguez,"[(Request Published\nPublic, , January 15, 202..."
...,...,...,...,...,...,...,...
260,21-493,CLOSED,"On January 28, 2021, The City's Environment Co...","January 30, 2021 via web",Mayor,Angela Laurita,"[(Request Closed\nPublic, Request Re-opened af..."
261,21-494,CLOSED,"Greetings: on January 26, 2021, Mayor Gloria's...","January 30, 2021 via web",Mayor,Angela Laurita,"[(Request Published\nPublic, , February 14, 20..."
262,21-495,CLOSED,"Greetings, four SDGE representatives signed up...","January 30, 2021 via web",City Council District 1,Steven Hadley,"[(Request Published\nPublic, , February 13, 20..."
263,21-496,CLOSED,Hi I’m requesting a list of city owned vacant ...,"January 30, 2021 via web",Department of Real Estate and Airport Management,Lori Hernandez,"[(Request Published\nPublic, , February 14, 20..."


In [46]:
# Map each request number (1-500) to whether its ID '21-<number>' was scraped.
# NOTE: despite the name, True means the request IS present in sd_requests_df; False means it is missing.
scraped_ids = set(sd_requests_df['Request ID']) # Built once so each lookup is a hash check instead of a rescan of the column
missing_requests = {i: ('21-' + str(i)) in scraped_ids for i in range(1, 501)}
missing_requests

{1: True,
 2: True,
 3: True,
 4: True,
 5: True,
 6: True,
 7: True,
 8: True,
 9: True,
 10: True,
 11: True,
 12: True,
 13: True,
 14: True,
 15: True,
 16: True,
 17: True,
 18: True,
 19: True,
 20: True,
 21: True,
 22: True,
 23: True,
 24: True,
 25: True,
 26: True,
 27: True,
 28: True,
 29: True,
 30: True,
 31: True,
 32: True,
 33: True,
 34: True,
 35: True,
 36: True,
 37: True,
 38: True,
 39: False,
 40: False,
 41: False,
 42: False,
 43: False,
 44: False,
 45: False,
 46: False,
 47: False,
 48: False,
 49: False,
 50: False,
 51: False,
 52: False,
 53: False,
 54: False,
 55: False,
 56: False,
 57: False,
 58: False,
 59: False,
 60: False,
 61: False,
 62: False,
 63: False,
 64: False,
 65: False,
 66: False,
 67: False,
 68: True,
 69: True,
 70: True,
 71: True,
 72: True,
 73: True,
 74: True,
 75: True,
 76: True,
 77: True,
 78: True,
 79: True,
 80: True,
 81: True,
 82: True,
 83: True,
 84: True,
 85: True,
 86: True,
 87: True,
 88: True,
 89: True,
 

In [62]:
num = 167 # Change to any value between 1 and 500 inclusive to see if the webscraping script scraped that ID
ID = '21-' + str(num)

# Report the presence flag recorded for this number, then show its row(s), if any
was_scraped = missing_requests[num]
print('{} present in sd_requests_df?: {}'.format(ID, was_scraped))

matching_rows = sd_requests_df[sd_requests_df['Request ID'] == ID]
display(matching_rows)

21-167 present in sd_requests_df?: False


Unnamed: 0,Request ID,Status,Request description,Request date,Departments,Point of Contact,Messages


### TO-DO
- **Figure out a way to avoid requests being skipped over - increase cooldown time?**
- Change the data structure of the 'Messages' column from a list of tuples to a dict
- Include information about documents - scrape info about them from requests, scrape the database of documents
- Run on Google Colab?