In [3]:
!pip install selenium

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.firefox.options import Options # For Sai: Change 'firefox' 
                                                    # to your desired browser e.g. chrome
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException

from joblib import Parallel, delayed

import sys
import traceback
from timeit import default_timer as timer
from tqdm import tqdm
from time import sleep

from io import StringIO
import re
import zipfile

import numpy as np
import pandas as pd
import datetime

# Options for the driver
# Run the browser without a visible window. The '-headless' argument is used instead of
# assigning `options.headless = True` because that setter was deprecated and then removed
# in later Selenium 4 releases, where the assignment silently has no effect.
options = Options()
options.add_argument('-headless')



### TO-DO
- Figure out how to scrape documents when there are navigation bars in the documents section
- Move the scraper methods to a separate Python file
- Figure out if [Selenium Grid](https://www.selenium.dev/documentation/grid/) can potentially improve the performance of the scraper
- Run on Google Colab/Microsoft Azure/local desktop/some other place?
- Preliminary EDA:
    - Get close time(s) and reason(s)
    - Fill in empty department info

In [None]:
. # Keep this cell to prevent the rest of the notebook from automatically running

The following script uses the `selenium` library to, in theory, scrape every request from [San Diego's NextRequest database](https://sandiego.nextrequest.com/requests). It does so by using the fact that every request has its own unique URL, i.e., the request with ID 'yy-xxxx' will be found at '.../requests/yy-xxxx'. From each request webpage, the following information is extracted:

- `id` (str): ID of the request, yy-xxxx
- `status` (str): Whether the request is open or closed. Always takes on a value of either 'CLOSED' or 'OPEN'
- `desc` (str): Description of the request provided by the requester
- `date` (str): Initial request date
- `depts` (str): Current departments assigned to the request (may not be the ones the requester had initially)
- `docs` (DataFrame in CSV format): All documents attached to the request, if there are any, otherwise None. The columns are:
    - `title` (str): Title given to each document
    - `link` (str): Link to each document
- `poc` (str): Point of contact
- `msgs` (DataFrame in CSV format): All messages attached to the request. The columns are:
    - `title` (str): Title of each message
    - `item` (str): Message body
    - `time` (str): Date of each message

After a request is scraped, the next request can be navigated to by clicking on an arrow, and the scraper continues to run until the arrow cannot be found, either because the scraper has reached the last request in the database or due to a timeout. To address these potential timeouts, we stop the driver every time it cannot access a request, then restart it after a short delay, starting from the request that it timed out on.

In [None]:
def scrape_requests_sequential(requests, driver, num_requests=-1, cooldown=1, debug=0, progress=0):
    '''
    Scrapes records on a NextRequest request database, starting from the request page the
    driver currently has open and moving forward by clicking the "next request" arrow.
    Each scraped request is appended to the given list.

    Parameters:
        requests (list): List that every scraped request (a dict) is appended to.
        driver: Selenium WebDriver that has already navigated to the first request to scrape.
        num_requests (int): Number of requests to scrape. If non-positive, scrape as many
            requests as possible.
        cooldown (float): Seconds to sleep after navigating to the next request.
        debug (int): Passed to scrape_request_append; if truthy, a message is printed for
            each scraped request.
        progress (int): If truthy, progress is printed every `progress` requests and a
            final summary is printed when scraping stops.

    Returns:
        int: Number of requests scraped during this call.
    '''
    start = timer() # Timer for progress checking purposes
    counter = 0 # Keeps track of how many requests have been scraped

    # TO-DO: Add try-except-finally blocks for KeyboardInterrupt errors
    while True:
        # Only scrape a request if it was loaded properly; otherwise, stop the scraper
        if not driver.find_elements(By.CLASS_NAME, 'nextrequest'):
            if counter == 0:
                # The very first page failed to load, so there is nothing to summarize
                print('No requests scraped')
                return counter
            break

        scrape_request_append(requests, driver, counter=counter, debug=debug) # Scrape request
        counter += 1

        # For positive num_requests, stop once the counter reaches the desired number
        if (num_requests > 0) and (counter == num_requests):
            break

        # Show progress, if desired
        if progress and (counter % progress == 0):
            print_progress(counter, start, end=timer())

        # Stop when it is not possible to navigate to the next request, either because the
        # scraper reached the end of the database or because of a timeout
        next_arrows = driver.find_elements(By.CLASS_NAME, 'js-next-request')
        if not next_arrows:
            break

        next_arrows[0].click() # Click on the arrow to navigate to the next request
        sleep(cooldown) # Cooldown between scraping attempts

    # Final progress check
    if progress:
        print_progress_final(counter, start, end=timer(), last_request=requests[-1]['id'])

    return counter

def scrape_request_append(requests, driver, counter=-1, debug=0):
    '''
    Scrapes data about the request page the driver currently has open on a NextRequest
    request database, appending the result to the given list.

    A record is always appended, even when scraping fails part-way: fields that could not
    be scraped are left as None and the stack trace is printed.

    Parameters:
        requests (list): List that the scraped record (a dict) is appended to.
        driver: Selenium WebDriver showing the request page to scrape.
        counter (int): Number of requests scraped so far; only used in the error message
            when non-negative.
        debug (int): If truthy, print a message when the request is scraped successfully.
    '''
    record = dict.fromkeys(('id', 'status', 'desc', 'date', 'depts', 'docs', 'poc', 'msgs'))
    try: # Attempt to scrape relevant data
        record['id'] = driver.find_element(By.CLASS_NAME, 'request-title-text').text.split()[1][1:] # Request ID
        record['status'] = driver.find_element(By.CLASS_NAME, 'request-status-label').text.strip() # Request status

        desc_row = driver.find_element(By.CLASS_NAME, 'request-text') # Box containing request description
        for desc_read_more in desc_row.find_elements(By.PARTIAL_LINK_TEXT, 'Read more'): # Expand description if necessary
            desc_read_more.click()
        sleep(0.01) # TO-DO: Replace with a WebDriverWait
        record['desc'] = desc_row.find_element(By.ID, 'request-text').text # Full request description

        record['date'] = driver.find_element(By.CLASS_NAME, 'request_date').text # Request date
        record['depts'] = driver.find_element(By.CLASS_NAME, 'current-department').text # Department(s) assigned to the request
        record['poc'] = driver.find_element(By.CLASS_NAME, 'request-detail').text # Person of contact

        record['msgs'] = _scrape_messages_csv(driver)
        record['docs'] = _scrape_documents_csv(driver)

        # For testing purposes, print a message whenever a request is successfully scraped
        if debug:
            print(record['id'], 'scraped')
    except Exception: # Print the stack trace, but let KeyboardInterrupt/SystemExit stop the scraper
        print('Exception occured' + (' at count ' + str(counter + 1) if counter >= 0 else '') + ':')
        traceback.print_exc()
        print()

    # Append the request to the list, including whatever was scraped before a failure
    requests.append(record)

def _scrape_messages_csv(driver):
    '''
    Returns the messages on the open request page as a DataFrame-converted-to-CSV with the
    columns title, item and time, or None if the page has no messages.
    '''
    event_titles, event_items, time_quotes = [], [], []

    for event in driver.find_elements(By.CLASS_NAME, 'generic-event'): # All message blocks
        event_title = event.find_element(By.CLASS_NAME, 'event-title').text # Event title
        for details_toggle in event.find_elements(By.PARTIAL_LINK_TEXT, 'Details'): # Expand event item details
            details_toggle.click()
        event_item = '\n'.join(get_webelement_text(event.find_elements(By.CLASS_NAME, 'event-item'))) # Event item
        time_quote = event.find_element(By.CLASS_NAME, 'time-quotes').text # Time quote

        event_titles.append(event_title)
        event_items.append(event_item)
        time_quotes.append(time_quote)

    if not event_titles:
        return None

    return pd.DataFrame({
        'title': event_titles,
        'item': event_items,
        'time': time_quotes
        }).to_csv(index=False)

def _scrape_documents_csv(driver):
    '''
    Returns the documents attached to the open request page as a DataFrame-converted-to-CSV
    with the columns title and link, or None if the documents box reads '(none)'.

    CURRENTLY DOES NOT SCRAPE ALL DOCUMENTS.
    TO-DO: Figure out how to scrape all documents from a request whose folders also have
    navigation bars. The earlier sketch looked for 'pagy-nav' elements inside the document
    list and kept clicking the 'Next' link until a 'page next disable' element appeared,
    collecting the 'document-link' elements of every page and de-duplicating them.
    '''
    doc_list = driver.find_element(By.CLASS_NAME, 'document-list') # Box containing documents
    if '(none)' in doc_list.text: # Check for the presence of documents
        return None

    # Expand folders, if there are any
    for folder in doc_list.find_elements(By.CLASS_NAME, 'folder-toggle'):
        folder.click()
    sleep(0.01) # TO-DO: Replace with a WebDriverWait

    docs_all = doc_list.find_elements(By.CLASS_NAME, 'document-link')

    return pd.DataFrame({
        'title': get_webelement_text(docs_all),
        'link': remove_download_from_urls(get_webelement_link(docs_all))
        }).to_csv(index=False)

# Scraper utilities
def print_progress(counter, start, end):
    '''
    Prints how many requests have been scraped so far, together with the average
    runtime per request and the total runtime (both derived from end - start).
    '''
    elapsed = end - start
    avg_runtime = '{}s/request'.format(round(elapsed / counter, 2))
    total_runtime = '{}s'.format(round(elapsed, 1))
    fields = ['Requests scraped:', str(counter),
              '\tAvg runtime:', avg_runtime,
              '\tTotal runtime:', total_runtime]
    print(' '.join(fields))
    
def print_progress_final(counter, start, end, last_request):
    '''
    Prints the closing summary of a scraper run: the total number of requests scraped,
    the average and total runtime, and the ID of the last request scraped.
    '''
    elapsed = end - start
    avg_runtime = '{}s/request'.format(round(elapsed / counter, 2))
    total_runtime = '{}s'.format(round(elapsed, 1))
    summary = ' '.join(['Total requests scraped:', str(counter),
                        '\tAvg runtime:', avg_runtime,
                        '\tTotal runtime:', total_runtime])
    # Each part is followed by an empty line
    for line in (summary, '', ' '.join(['Last request scraped:', str(last_request)]), ''):
        print(line)

def get_city_from_url(url):
    '''
    Finds the city name from the NextRequest URL, i.e. the first label of the host name
    ('sandiego' for 'https://sandiego.nextrequest.com/requests/').

    Raises:
        ValueError: If the URL does not start with 'http://' or 'https://' followed by
            a host name.
    '''
    # The scheme is matched explicitly and the name captured in a group. The previous
    # pattern combined re.match (anchored at position 0) with a lookbehind for 'https://',
    # which can never succeed at the start of the string.
    match = re.match(r'https?://([a-zA-Z0-9-]+)', url)
    if match is None:
        raise ValueError('Could not find a city name in URL: ' + str(url))
    return match[1]

def get_webelement_text(webelement):
    '''
    Collects the `text` attribute of every web element in the given list. An empty or
    missing list yields an empty list.
    '''
    if not webelement:
        return []
    return [element.text for element in webelement]

def get_webelement_link(webelement):
    '''
    Collects the 'href' attribute of every web element in the given list. An empty or
    missing list yields an empty list.
    '''
    if not webelement:
        return []
    return [element.get_attribute('href') for element in webelement]

def remove_download_from_urls(urls):
    '''
    Removes '/download' (and anything after its last occurrence) from each URL in a list,
    if the list exists. URLs that do not contain '/download', and missing (None) URLs,
    are returned unchanged instead of raising.
    '''
    if not urls:
        return []

    trimmed = []
    for url in urls:
        # re.match returns None when there is no '/download' to cut at
        match = re.match(r'.*(?=/download)', url) if url else None
        trimmed.append(match[0] if match else url)
    return trimmed

Define important variables before running the scraper:

In [None]:
num_requests = -1 # Number of requests to scrape; non-positive means scrape as many as possible
cooldown = 1 # Cooldown, in seconds, between request accesses
progress = 100 # Show progress every N requests that are scraped
timeout = 10 # Wait time, in seconds, between scraper runs

requests = [] # List of dictionaries containing scraped info on each request

# URLs to scrape from. Every URL must end with a slash because the request ID is
# appended directly to it.
urls = ['https://lacity.nextrequest.com/requests/',
        'https://nola.nextrequest.com/requests/']

# Earliest IDs in the databases.
# NOTE(review): only one ID is listed for the two URLs above; presumably it belongs to
# the first URL - the earliest ID for the second database still needs to be filled in.
earliest_id = ['17-1', ]

# Name of CSV file and ZIP archive to export scraped data to
requests_name = ['lacity_requests',
                 'nola_requests']

The following cell contains the main scraper process. Run it to start the scraper.

**Important notes:**
- If the scraper stops running for any reason, then simply rerun this cell to start it back up. 
- **Do not run the previous cell if the scraper stops, or else all your progress will be lost!**

In [None]:
def scrape_nextrequest(requests, url, earliest_id, requests_name, num_requests=-1, cooldown=1, progress=100, timeout=10):
    """
    Main scraper routine. Repeatedly runs scrape_requests_sequential on one NextRequest
    site until the last scraped request has no "next request" arrow, then exports the
    scraped requests to 'data/<requests_name>.zip'.

    Parameters:
        requests (list): List of already-scraped requests (dicts); new requests are
            appended to it in place. If it is non-empty, scraping resumes from (and
            re-scrapes) its last request that has an ID.
        url (str): Base URL of the requests, ending with a slash; the request ID is
            appended directly to it.
        earliest_id (str): ID to start from when `requests` is empty.
        requests_name (str): Name of the CSV file and ZIP archive to export to.
        num_requests (int): Passed to scrape_requests_sequential for every iteration.
        cooldown (float): Passed to scrape_requests_sequential.
        progress (int): Passed to scrape_requests_sequential.
        timeout (float): Seconds to wait between iterations. This used to be read from
            a global variable of the same name.

    The driver is created from the module-level `options` and is always shut down,
    even if scraping raises.
    """
    import os # Only needed to make sure the output directory exists

    num_its = 1 # Keeps track of how many times the scraper has been run

    # Instantiate headless (non-visible) Firefox driver
    driver = webdriver.Firefox(options=options)
    try:
        # Initialize the current ID to be either the earliest ID possible if the
        # requests list is empty, or the last ID in the list
        _drop_unidentified_tail(requests)
        current_id = requests[-1]['id'] if requests else earliest_id
        driver.get(url + current_id)

        while True:
            # Print iteration number
            it_num_title = 'Iteration ' + str(num_its)
            print(it_num_title)
            print('-' * len(it_num_title))

            # Re-scrape the current request
            if requests:
                requests.pop()
            print('Starting request:', current_id)
            print()

            # Scrape requests until the scraper either reaches the end of the database or times out
            scrape_requests_sequential(requests, driver,
                                       num_requests=num_requests,
                                       cooldown=cooldown,
                                       progress=progress)

            num_its += 1
            sleep(timeout) # Wait after the script reaches the end of the database or after a timeout

            # Go back to the last request scraped; stop if nothing usable was scraped
            _drop_unidentified_tail(requests)
            if not requests:
                break
            current_id = requests[-1]['id']
            driver.get(url + current_id)

            # Without a "next request" arrow there is nothing left to scrape
            if not driver.find_elements(By.CLASS_NAME, 'js-next-request'):
                break
    finally:
        driver.quit() # End the browser session even when scraping fails

    # Convert to DataFrame
    scraped = [request for request in requests if (request and request['status'])]
    requests_df = pd.DataFrame(scraped).drop_duplicates()

    # Create a zipped CSV file of the DataFrame
    os.makedirs('data', exist_ok=True)
    compression_opts = dict(method='zip', archive_name=requests_name + '.csv')
    requests_df.to_csv('data/' + requests_name + '.zip', index=False, compression=compression_opts)

    return

def _drop_unidentified_tail(requests):
    '''
    Removes trailing records without an ID (left behind by failed scrapes) so that the
    last record can be used to resume scraping.
    '''
    while requests and not requests[-1]['id']:
        requests.pop()

In [None]:
# Run the scraper for one of the configured sites by delegating to scrape_nextrequest.
# The earlier inline copy of that routine used `url` and `num_its` before they were
# defined, and treated the list-valued `earliest_id` / `requests_name` as strings.
site = 0 # Index into urls / earliest_id / requests_name of the site to scrape

scrape_nextrequest(requests, urls[site], earliest_id[site], requests_name[site],
                   num_requests=num_requests,
                   cooldown=cooldown,
                   progress=progress)

# Convert to DataFrame (scrape_nextrequest has already exported the zipped CSV file)
requests = [request for request in requests if (request and request['status'])]
requests_df = pd.DataFrame(requests).drop_duplicates()

For Sai: **Run every code cell above this line (excluding the one with just the single period and comment)**

In [7]:
# Smoke test: start a Firefox WebDriver (created without the headless options) and close it again
driver = webdriver.Firefox()
print()
driver.close()

True


In [8]:
# Scratch check that the type of a string literal compares equal to str
type('test') == str

True

In [None]:
# Convert to DataFrame
# Keeps only non-empty records whose 'status' field is truthy, then drops exact duplicate rows
requests = [request for request in requests if (request and request['status'])]
requests_df = pd.DataFrame(requests).drop_duplicates()

In [None]:
# Create a zipped CSV file of the DataFrame
# NOTE(review): the configuration cell defines requests_name as a list of names, in which
# case `requests_name + '.csv'` raises a TypeError; this presumably expects a single name
# string - confirm which name to use before running.
compression_opts = dict(method='zip', archive_name=requests_name + '.csv')
requests_df.to_csv('data/' + requests_name + '.zip', index=False, compression=compression_opts)

In [None]:
# Check to make sure the CSV file was properly created
# Context managers close both the archive and the member file once the CSV has been read
with zipfile.ZipFile('data/sd_requests.zip', 'r') as archive:
    with archive.open('sd_requests.csv') as csv_file:
        test_df = pd.read_csv(csv_file)

In [None]:
# Display the DataFrame that was read back from the ZIP archive
test_df

Scraper tests:

In [None]:
# Scrape 11 consecutive San Diego requests, starting at 15-1810, into a fresh list
url = 'https://sandiego.nextrequest.com/requests/'

test_requests = []
driver = webdriver.Firefox()
try:
    driver.get(url + '15-1810')

    scrape_requests_sequential(test_requests, driver,
                               num_requests=11,
                               debug=1,
                               progress=2)
finally:
    driver.quit() # End the browser session even if the scraper raises

test = pd.DataFrame(test_requests)

In [None]:
# Display the DataFrame built from the requests scraped in the scraper test
test

In [None]:
# Parse the CSV string stored in 'msgs' for the row labelled 0 and show its last 10 rows
pd.read_csv(StringIO(test.loc[0]['msgs'])).tail(10)

The following process converts the CSV strings in the `docs` and `msgs` columns into DataFrames:

In [None]:
def df_fillna(df):
    '''
    Converts the columns of a DataFrame to the best available dtypes and replaces missing
    values with empty strings. None is passed through unchanged.
    '''
    if df is None:
        return None
    return df.convert_dtypes().fillna('')

test_df = df_fillna(test_df)
test_df

In [None]:
def csv_to_df(csv):
    '''
    Parses a CSV string into a DataFrame. A falsy value (e.g. an empty string) gives None.
    '''
    if not csv:
        return None
    return pd.read_csv(StringIO(csv))

# Add a parsed '<column>_df' column next to each CSV string column
for csv_column in ('docs', 'msgs'):
    test_df[csv_column + '_df'] = test_df[csv_column].apply(csv_to_df)
test_df.head()

Then, we fill the NA values in the individual `docs` and `msgs` DataFrames:

In [None]:
# Apply df_fillna to every per-request DataFrame (df_fillna returns None for None entries)
test_df['docs_df'] = test_df['docs_df'].apply(df_fillna)
test_df['msgs_df'] = test_df['msgs_df'].apply(df_fillna)
# Show the messages DataFrame of the row labelled 4
test_df.loc[4]['msgs_df']

Other EDA stuff:

In [None]:
# Preview the first rows of test_df
test_df.head()

In [None]:
test_df.shape[0] # Number of rows in test_df, i.e. the number of requests scraped

In [None]:
# Rows whose description still contains the text 'Read more'; presumably such
# descriptions were not fully expanded before being scraped
test_df[test_df['desc'].str.contains('Read more')] # Check if the descriptions were properly scraped

In [None]:
# Check for empty desc field: rows whose description is an empty string
empty_desc = test_df.query('desc == ""')
empty_desc

In [None]:
# Check for empty depts field: rows whose depts value is an empty string
empty_depts = test_df.query('depts == ""')
empty_depts

In [None]:
# Check for empty docs field: rows whose docs CSV consists of only the header line 'title,link'
empty_docs = test_df[test_df['docs'].str.fullmatch('title,link\n')]
empty_docs

In [None]:
# Which requests had the longest message history? (Useful for finding worst-case scenarios for the scraper)
# Counts the rows of each msgs_df (0 when it is None) and sorts the counts in descending order
long_msgs = test_df['msgs_df'].apply(lambda df: df.shape[0] if df is not None else 0).sort_values(ascending=False)
long_msgs.head(10)

In [None]:
# Sort requests by message history length by reordering test_df with the index of long_msgs
requests_long_msg = test_df.loc[long_msgs.index]
requests_long_msg.head(20)

In [None]:
# Query for info about a specific request
# The ID carries its own double quotes because it is spliced into the query expression
# as a string literal
request_id = '"17-3638"'
test_df.query('id == ' + request_id).iloc[0]['msgs_df']

In [None]:
# Find request descriptions with the given substring, case insensitive
# Note that `desc` is handed to str.contains unmodified
desc = 'Padres'
test_df[test_df['desc'].str.contains(desc, case=False)]

In [None]:
# Find requests whose department(s) contain the given substring, case insensitive
# Note that `dept` is handed to str.contains unmodified
dept = 'Police'
test_df[test_df['depts'].str.contains(dept, case=False)]

In [None]:
# Convert empty dataframes from docs_df into None
def remove_empty(df):
    '''
    Returns None for a None value, an empty string or an empty DataFrame; any other
    value is returned unchanged.
    '''
    if df is None:
        return None
    if type(df) == str and not df:
        return None
    if df.empty:
        return None
    return df

test_df['docs_df'] = test_df['docs_df'].apply(remove_empty)
test_df[test_df['docs'].str.fullmatch('title,link\n')]

In [None]:
# Split the date and request method from the date column
# Every date string is split on ' via '. The pieces are joined back onto test_df by index,
# the original 'date' column is dropped, and the first and second pieces are renamed
# to 'date' and 'via'.
dates = test_df['date'].to_numpy()
test_df = test_df.join(
        pd.DataFrame(list(map(lambda x: x.split(' via '), dates)))
    ).drop(
        columns='date'
    ).rename(
        columns={0: 'date', 1: 'via'}
    ).convert_dtypes()
test_df.head()

In [None]:
# Split the time and author from the time quote on each message
def split_time_author(msgs):
    '''
    Splits the 'time' column of a messages DataFrame on ' by ' into a 'time' column and
    a 'by' column, then passes the result through df_fillna. None is returned unchanged.
    '''
    if msgs is None:
        return None

    quote_parts = [quote.split(' by ') for quote in msgs['time'].to_numpy()]
    time_author = pd.DataFrame(quote_parts)

    combined = msgs.join(time_author)
    combined = combined.drop(columns='time')
    combined = combined.rename(columns={0: 'time', 1: 'by'})
    return df_fillna(combined)

# Apply the time/author split to the messages DataFrame of every request
test_df['msgs_df'] = test_df['msgs_df'].apply(split_time_author)
# Show the result for the row labelled 4
test_df.loc[4]['msgs_df']

In [None]:
# Convert columns with time strings into DateTime
def convert_time_to_dt(df, col='time'):
    '''
    Returns a copy of df with an extra column named '<col>_dt' that holds the values of
    column `col` parsed by pd.to_datetime. The input DataFrame is not modified.
    '''
    result = df.copy()
    result[col + '_dt'] = pd.to_datetime(df[col])
    return result

In [None]:
# Splitting departments for easier pivoting
# The result has one row per (request, department) pair: the depts string is split on ', '
# into numbered columns, which are then melted into a single 'dept' column.
depts = test_df['depts'].to_numpy() # depts column
test_df_depts = test_df.join(pd.DataFrame(list(map(lambda x: x.split(', '), depts)))) # Split departments into separate columns
test_df_depts = test_df_depts.melt( # Melt on the individual departments
        id_vars=test_df.columns
    )[lambda df: df['value'].apply(lambda x: x is not None)].drop( # Get rid of None values
        columns='variable'
    ).rename( # Drop the variable column, rename the value column, and reset indices
        columns={'value': 'dept'}
    ).reset_index().drop(
        columns='index'
    )
test_df_depts

In [None]:
# Count the rows per department, keeping only departments whose name contains 'Office' (case insensitive)
test_df_depts.value_counts('dept')[lambda x: x.index.str.contains('Office', case=False)]

In [None]:
# Find requests whose department(s) contain the given substring, case insensitive
# Note that `dept` is handed to str.contains unmodified
dept = 'Chief Operating Officer'
test_df[test_df['depts'].str.contains(dept, case=False)]

In [None]:
# Re-run dtype conversion on test_df, then show the messages DataFrame of the row labelled 0
test_df = test_df.convert_dtypes()
test_df.loc[0]['msgs_df']

In [None]:
# Show the row labelled 0 of the scraper-test DataFrame
test.loc[0]

Previous scraper function attempts:

In [None]:
def scrape_record(url, request_id, driver):
    '''
    Scrapes data about a given request on a NextRequest request database.

    Parameters:
        url (str): Base URL of the requests; the request ID is appended directly to it.
        request_id (str): ID of the request to scrape.
        driver: Selenium WebDriver used to load the page.

    Returns:
        dict or None: None if the request ID does not appear in the title of the loaded
        page. Otherwise a dict with the keys id, status, desc, date, depts, docs, poc and
        msgs, where docs and msgs are DataFrames (or None) and any field that could not
        be scraped is None.
    '''
    driver.get(url + request_id) # Attempt to access the record
    # TO-DO: an explicit WebDriverWait on the 'nextrequest' element (with a short timeout)
    # was tried here previously and is currently disabled.

    if (request_id not in driver.title):
        return

    record = {
        'id': request_id,
        'status': None,
        'desc': None,
        'date': None,
        'depts': None,
        'docs': None,
        'poc': None,
        'msgs': None
        }
    try: # Attempt to scrape relevant data
        record['status'] = driver.find_element(By.CLASS_NAME, 'request-status-label').text.strip() # Request status
        record['desc'] = driver.find_element(By.CLASS_NAME, 'request-text.row').text # Request description
        record['date'] = driver.find_element(By.CLASS_NAME, 'request_date').text # Request date
        record['depts'] = driver.find_element(By.CLASS_NAME, 'current-department').text # Department(s) assigned to the request
        record['poc'] = driver.find_element(By.CLASS_NAME, 'request-detail').text # Person of contact

        # Documents attached to the request, if there are any
        public_docs = driver.find_element(By.ID, 'document-list') # WebElement containing the documents
        if '(none)' not in public_docs.text:
            # Expand folders, if there are any
            for folder in public_docs.find_elements(By.CLASS_NAME, 'folder-toggle'):
                folder.click()

            doc_links = public_docs.find_elements(By.CLASS_NAME, 'document-link') # Links to documents

            # DataFrame consisting of all documents
            record['docs'] = pd.DataFrame({
                'title': get_webelement_text(doc_links),
                'link': remove_download_from_urls(get_webelement_link(doc_links))
                })

        # Show all message history, if the option is available
        show_all_history = driver.find_elements(By.CLASS_NAME, 'show-all-history')
        if show_all_history:
            show_all_history[0].click()

        # Messages recorded on the request page, if there are any
        event_history = driver.find_elements(By.CLASS_NAME, 'generic-event')
        if event_history:
            # Titles, descriptions, and time strings for each message
            event_titles = []
            event_items = []
            time_quotes = []

            # Scrape information from each individual event
            for event in event_history:
                event_titles.append(event.find_element(By.CLASS_NAME, 'event-title').text)
                event_items.append('\n'.join(get_webelement_text(event.find_elements(By.CLASS_NAME, 'event-item'))))
                time_quotes.append(event.find_element(By.CLASS_NAME, 'time-quotes').text)

            # DataFrame consisting of all messages
            record['msgs'] = pd.DataFrame({
                'title': event_titles,
                'item': event_items,
                'time': time_quotes
                })
    except NoSuchElementException: # A specific element cannot be found: keep what was scraped so far
        pass
    except Exception: # Some other error: print information about it, but still return the record
        traceback.print_exc()

    # Returned outside of a `finally` block so that KeyboardInterrupt/SystemExit are not swallowed
    return record
    
def scrape_record_parallel(url, request_id):
    '''
    Scraper method used for parallelization: starts its own Firefox driver (using the
    module-level `options`), scrapes a single request with scrape_record, and always
    shuts the driver down again, even if scraping raises.

    Returns whatever scrape_record returns for the request.
    '''
    driver = webdriver.Firefox(options=options)
    try:
        return scrape_record(url, request_id, driver)
    finally:
        driver.quit()

In [None]:
# Options for scraping
# Note: this cell overwrites cooldown, num_requests, progress and urls from the
# configuration cell of the main scraper.
earliest_year = 16 # Earliest year to search requests for
latest_year = 21 # Latest year to search request for
id_start = 1 # ID value to start from
# NOTE(review): id_max is not defined anywhere in the visible notebook, so this line raises
# a NameError on a fresh kernel - it needs to be defined before this cell is run.
id_range = id_max # Number of IDs to try for each year
cooldown = 0.9 # Amount of time, in seconds, to wait between website accesses

start_id = '15-1810' # The request to start scraping from
num_requests = -1 # Number of requests to scrape
progress = 100 # Display a message every 100 requests successfully scraped

# List of all request IDs, formatted as '<year>-<number>'; for each number, every year is listed in turn
request_ids = [str(year) + '-' + str(num) for num in range(id_start, id_range + id_start) 
                                       for year in range(earliest_year, latest_year + 1)]

# URLs to extract data from
urls = ['https://sandiego.nextrequest.com/requests/', 'https://oaklandca.nextrequest.com/requests/']

In [None]:
# Iterative script
driver = webdriver.Firefox(options=options) # Headless (non-visible) WebDriver

sd_requests = [] # List of dictionaries containing information on each request
i = 0 # Index for URLs - may be useful later for scraping multiple sites

try:
    for year in range(earliest_year, latest_year + 1):
        for num in tqdm(range(id_start, id_range + id_start)):
            # NextRequest request IDs are a two-digit year and a number, with a dash in between
            request_id = str(year) + '-' + str(num)

            # Scrape record
            sd_requests.append(scrape_record(urls[i], request_id, driver))

            # Cooldown
            sleep(cooldown)
finally:
    driver.quit() # End the browser session even if scraping fails or is interrupted

# Remove entries with incomplete information. scrape_record returns None when the request
# page could not be loaded, so those entries have to be skipped before reading 'status'.
sd_requests = [x for x in sd_requests if x is not None and x['status'] is not None]
sd_requests_df = pd.DataFrame(sd_requests) # Convert to DataFrame

# Create a zipped CSV file of the data
compression_opts = dict(method='zip', archive_name='sd_requests.csv')
sd_requests_df.to_csv('data/sd_requests_2.zip', index=False, compression=compression_opts)

In [None]:
# Parallelized script
# Every request ID is scraped by scrape_record_parallel, which starts its own Firefox
# driver per request; joblib runs the calls on threads using all available workers (n_jobs=-1).
scrape_request = lambda i: scrape_record_parallel(urls[0], i)
sd_requests = Parallel(n_jobs=-1, prefer='threads', verbose=10)(delayed(scrape_request)(request_id) for request_id in request_ids)

Miscellaneous tests:

In [None]:
# # Test remote WebDriver (run Selenium Grid instance locally before running this cell)
# driver = webdriver.Remote(desired_capabilities=DesiredCapabilities.FIREFOX, options=options)
# driver.get('http://www.google.com')
# driver.close()

In [None]:
# # Test to make sure the driver works
# 
# driver = webdriver.Firefox(options=options) # Instantiate a headless Firefox WebDriver
# for url in urls:
#     driver.get(url)
#     print(driver.title)
#     sleep(5)
    
# driver.close()

In [None]:
# %%timeit
# # Test for retrieving message info from a specific request
# driver = webdriver.Firefox(options=options)
# driver.get('https://sandiego.nextrequest.com/requests/21-4915')
# print(driver.title)

# event_titles = driver.find_elements_by_class_name('event-title')
# event_items = driver.find_elements_by_class_name('event-item')
# times = driver.find_elements_by_class_name('time-quotes')
# for title, item, time in list(zip(event_titles, event_items, times)):
#     print(title.text)
#     print(item.text)
#     print(time.text)
#     print()

# driver.close()

In [None]:
# # Scraping documents test
# driver = webdriver.Firefox(options=options)

# for request_id in [5369, 5313, 5374]:
#     url = 'https://sandiego.nextrequest.com/requests/21-' + str(request_id)
#     driver.get(url)

#     docs = driver.find_element_by_id('public-docs')
#     folders = docs.find_elements_by_class_name('folder-toggle')
#     if folders:
#         for folder in folders:
#             folder.click()
#     else:
#         print('No folders found for', request_id)
    
#     doc_links = docs.find_elements_by_class_name('document-link')
#     display(
#         pd.DataFrame(
#             list(zip(get_webelement_text(doc_links), remove_download_from_urls(get_webelement_link(doc_links)))),
#             columns=['title', 'link']
#         )
#     )
#     display(
#         pd.DataFrame({
#             'title': get_webelement_text(doc_links),
#             'link': remove_download_from_urls(get_webelement_link(doc_links))
#         })
#     )

# driver.close()