In [152]:
!pip install selenium

from selenium import webdriver # For Sai: Make sure you have the right driver 
                            # installed for the browser you want to use, and 
                            # ensure it is placed in a directory accessible to 
                            #this notebook
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.firefox.options import Options # For Sai: Change 'firefox' 
                                                    # to your desired browser e.g. chrome
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException

from joblib import Parallel, delayed

import sys
import traceback
from timeit import default_timer as timer
from tqdm import tqdm
from time import sleep

from io import StringIO
import pandas as pd
import numpy as np
import re
import zipfile

# Options for the driver: run the browser headless (no visible window).
# The '-headless' argument is passed explicitly because the `Options.headless`
# setter was deprecated and later removed in Selenium 4; assigning to it there
# no longer makes the browser headless.
# For Sai: Chrome expects the argument '--headless' instead.
options = Options()
options.add_argument('-headless')



### TO-DO
- Figure out how to scrape documents when there are navigation bars in the documents section
- Figure out if [Selenium Grid](https://www.selenium.dev/documentation/grid/) can potentially improve the performance of the scraper
- Run on Google Colab/Microsoft Azure/local desktop/some other place?
- Preliminary EDA:
    - Get close time(s)
    - Fill in empty department info

In [None]:
. # Keep this cell to prevent the rest of the notebook from automatically running

The following script uses the `selenium` library to, in theory, scrape every request from [San Diego's NextRequest database](https://sandiego.nextrequest.com/requests). It does so by using the fact that every request has its own unique URL i.e. the request with ID 'yy-xxxx' will be found at '.../requests/yy-xxxx'. From each request webpage, the following information is extracted:

- `id` (str): ID of the request, yy-xxxx
- `status` (str): Whether the request is opened or closed. Always takes on a value of either 'CLOSED' or 'OPEN'
- `desc` (str): Description of the request provided by the requester
- `date` (str): Initial request date
- `depts` (str): Current departments assigned to the request (may not be the ones the requester had initially)
- `docs` (DataFrame in CSV format): All documents attached to the request, if there are any, otherwise None. The columns are:
    - `title` (str): Title given to each document
    - `link` (str): Link to each document
- `poc` (str): Point of contact
- `msgs` (DataFrame in CSV format): All messages attached to the request. The columns are:
    - `title` (str): Title of each message
    - `item` (str): Message body
    - `time` (str): Date of each message

After a request is scraped, the next request can be navigated to by clicking on an arrow, and the scraper continues to run until the arrow cannot be found, either because the scraper has reached the last request in the database or due to a timeout. To address these potential timeouts, the scraper pauses for a short delay every time it cannot access a request, then reloads the last request it scraped and resumes from there (that request is scraped again, replacing its earlier entry).

In [153]:
def scrape_requests_sequential(requests, driver, num_requests=-1, cooldown=1, debug=0, progress=0):
    '''
    Scrapes requests on a NextRequest request database, starting from the request page the
    driver currently has loaded and moving forward by clicking the "next request" arrow.
    Each scraped request is appended to the given list.

    Parameters:
    - requests (list): List that every scraped request (a dict) is appended to
    - driver: Selenium WebDriver that has already loaded the first request page to scrape
    - num_requests (int): Number of requests to scrape. If non-positive, scrape as many
      requests as possible
    - cooldown (float): Seconds to sleep after navigating to the next request
    - debug (int): Passed on to scrape_request_append
    - progress (int): If truthy, print a progress line every `progress` requests and a
      summary when the run ends

    Returns the number of requests scraped by this call (0 if the first page did not load).
    '''
    counter = 0 # Keeps track of how many requests have been scraped
    start = timer() # Timer for progress checking purposes

    def print_progress(label):
        # One progress line: count so far, average seconds per request, total seconds
        elapsed = timer() - start
        print(label, counter,
              '\tAvg runtime:', str(round(elapsed/counter, 2)) + 's/request',
              '\tTotal runtime:', str(round(elapsed, 1)) + 's')

    # Only scrape a request if it was loaded properly; otherwise, stop the scraper
    if not driver.find_elements(By.CLASS_NAME, 'nextrequest'):
        print('No requests scraped')
        return counter

    while True:
        scrape_request_append(requests, driver, counter=counter, debug=debug) # Scrape request
        counter += 1

        # For positive num_requests, stop once the counter reaches the desired number
        if num_requests > 0 and counter == num_requests:
            break

        # Show progress, if desired
        if progress and counter % progress == 0:
            print_progress('Requests scraped:')

        # Stop when it is not possible to navigate to the next request, either due to the
        # scraper reaching the last request in the database or due to a timeout
        next_arrows = driver.find_elements(By.CLASS_NAME, 'js-next-request')
        if not next_arrows:
            break
        next_arrows[0].click() # Click on the arrow to navigate to the next request
        sleep(cooldown) # Cooldown between scraping attempts

        if not driver.find_elements(By.CLASS_NAME, 'nextrequest'):
            break

    # Final progress check
    if progress:
        print_progress('Total requests scraped:')
        print()
        print('Last request scraped:', requests[-1]['id'])
        print()

    return counter

def scrape_request_append(requests, driver, counter=-1, debug=0):
    '''
    Scrapes data about the request page currently loaded in the driver, appending the result
    to the given list.

    Parameters:
    - requests (list): List that the scraped request (a dict) is appended to
    - driver: Selenium WebDriver with a NextRequest request page loaded
    - counter (int): Number of requests scraped before this one; only used in the error
      message (ignored if negative)
    - debug (int): If truthy, print the request ID once the request has been scraped

    A dict with the keys id, status, desc, date, depts, docs, poc and msgs is always
    appended. If scraping fails partway through, the stack trace is printed and the
    fields that were not reached are left as None.
    '''
    request_id, status, desc, date, depts, docs, poc, events = [None] * 8 # Initialize variables
    try: # Attempt to scrape relevant data
        # The ID is the second word of the title text, without its first character
        request_id = driver.find_element(By.CLASS_NAME, 'request-title-text').text.split()[1][1:]
        status = driver.find_element(By.CLASS_NAME, 'request-status-label').text.strip() # Request status

        desc_row = driver.find_element(By.CLASS_NAME, 'request-text') # Box containing request description
        for desc_read_more in desc_row.find_elements(By.PARTIAL_LINK_TEXT, 'Read more'): # Expand description if necessary
            desc_read_more.click()
        # NOTE(review): scraped descriptions sometimes still contain 'Read more', so this
        # pause may not always be long enough for the expansion to finish - confirm
        sleep(0.01)
        desc = desc_row.find_element(By.ID, 'request-text').text # Full request description

        date = driver.find_element(By.CLASS_NAME, 'request_date').text # Request date
        depts = driver.find_element(By.CLASS_NAME, 'current-department').text # Department(s) assigned to the request
        poc = driver.find_element(By.CLASS_NAME, 'request-detail').text # Person of contact

        # Documents attached to the request, if there are any (CURRENTLY DOES NOT SCRAPE ALL DOCUMENTS)
        doc_list = driver.find_element(By.CLASS_NAME, 'document-list') # Box containing documents
        if '(none)' not in doc_list.text: # Check for the presence of documents
            # Expand folders, if there are any
            for folder in doc_list.find_elements(By.CLASS_NAME, 'folder-toggle'):
                folder.click()
            sleep(0.01)

            docs_all = doc_list.find_elements(By.CLASS_NAME, 'document-link')

            # TO-DO: Figure out how to scrape all documents from a request whose folders also
            # have navigation bars. An earlier sketch looked for 'pagy-nav' bars inside the
            # document list and kept clicking their 'Next' link (until a disabled next-page
            # element showed up), collecting the 'document-link' elements of every page.

            # DataFrame-converted-to-CSV consisting of all documents
            docs = pd.DataFrame({
                'title': get_webelement_text(docs_all),
                'link': remove_download_from_urls(get_webelement_link(docs_all))
                }).to_csv(index=False)

        # Messages recorded on the request page, if there are any
        event_history = driver.find_elements(By.CLASS_NAME, 'generic-event') # All message blocks
        if event_history: # Check for presence of messages
            # Titles, descriptions, and time strings for each message
            event_titles, event_items, time_quotes = [], [], []

            # Scrape information from each individual event
            for event in event_history:
                event_titles.append(event.find_element(By.CLASS_NAME, 'event-title').text) # Event title
                for details_toggle in event.find_elements(By.PARTIAL_LINK_TEXT, 'Details'): # Expand event item details
                    details_toggle.click()
                event_items.append('\n'.join(
                    get_webelement_text(event.find_elements(By.CLASS_NAME, 'event-item')))) # Event item
                time_quotes.append(event.find_element(By.CLASS_NAME, 'time-quotes').text) # Time quote

            # DataFrame-converted-to-CSV consisting of all messages
            events = pd.DataFrame({
                'title': event_titles,
                'item': event_items,
                'time': time_quotes
                }).to_csv(index=False)

        # For testing purposes, print a message whenever a request is successfully scraped
        if debug:
            print(request_id, 'scraped')
    except Exception: # Print the stack trace, but let KeyboardInterrupt/SystemExit stop the scraper
        print('Exception occured' + (' at count ' + str(counter + 1) if counter >= 0 else '') + ':')
        traceback.print_exc()
        print()
    finally: # Append the request to the list, with whatever was scraped so far
        requests.append({
            'id': request_id,
            'status': status,
            'desc': desc,
            'date': date,
            'depts': depts,
            'docs': docs,
            'poc': poc,
            'msgs': events
            })

# Scraper utilities
def get_city_from_url(url):
    '''
    Finds the city name from the NextRequest URL, i.e. the first label of the host name
    ('sandiego' for 'https://sandiego.nextrequest.com/requests/').

    Raises a ValueError if the URL does not start with 'http://' or 'https://' followed
    by a host name.
    '''
    # A capture group is used instead of a lookbehind: re.match only matches at the start
    # of the string, where a lookbehind such as (?<=https://) can never succeed
    match = re.match(r'https?://([a-zA-Z0-9-]+)', url)
    if match is None:
        raise ValueError('Could not find a city name in URL: ' + repr(url))
    return match[1]

def get_webelement_text(webelement):
    '''
    Collects the `text` of every web element in the given list. An empty or missing
    list gives an empty list.
    '''
    if not webelement:
        return []
    return [element.text for element in webelement]

def get_webelement_link(webelement):
    '''
    Collects the `href` attribute of every web element in the given list. An empty or
    missing list gives an empty list.
    '''
    if not webelement:
        return []
    return [element.get_attribute('href') for element in webelement]

def remove_download_from_urls(urls):
    '''
    Removes '/download' and everything after it from each URL in a list, if the list
    exists. URLs without '/download', and missing (None or empty) links, are returned
    unchanged rather than raising an error.
    '''
    def strip_download(url):
        if not url:
            return url
        # Greedy match: everything before the last '/download' in the URL
        match = re.match(r'.*(?=/download)', url)
        return match[0] if match else url

    return [strip_download(url) for url in urls] if urls else []

Define important variables before running the scraper:

In [148]:
# Base URL of the request pages; a request ID is appended directly to it.
# For Sai: Change this to the desired URL, and make sure to leave a / at the end
url = 'https://sandiego.nextrequest.com/requests/'
# Number of requests to scrape per scraper run; a non-positive value means as many as possible
num_requests = -1
# Cooldown (seconds slept) after navigating to the next request.
# For Sai: Adjust as desired, but keep in mind that a lower cooldown time = greater chance of a timeout
cooldown = 1
# Show progress every N requests that are scraped.
# For Sai: Adjust as desired
progress = 100
# Wait time (seconds slept) between scraper runs
timeout = 10
# Keeps track of how many times the scraper has been run; shown in the 'Iteration N' heading
num_its = 1

# List of dictionaries containing scraped info on each request.
# Re-running this cell resets it, discarding everything scraped so far
requests = []
# Earliest ID in the database; scraping starts here while the list above is empty.
# For Sai: Change this to the earliest ID in the desired database
earliest_id = '15-1810'
# Name of CSV file and ZIP archive to export scraped data to
# For Sai: Change this to a more suitable name for the database being scraped
requests_name = 'sd_requests'

The following cell contains the main scraper process. Run it to start the scraper.

**Important notes:**
- If the scraper stops running for any reason, then simply rerun this cell to start it back up. 
- **Do not run the previous cell if the scraper stops, or else all your progress will be lost!**

In [None]:
# Drop trailing entries whose scrape failed before an ID was read; they cannot be used to resume
while requests and not requests[-1]['id']:
    requests.pop()

# Initialize the current ID to be either the earliest ID possible if the requests list is empty, or the last ID in the list
current_id = requests[-1]['id'] if requests else earliest_id

# Instantiate headless (non-visible) Firefox driver. For Sai: Change 'Firefox' to your browser of choice e.g. Chrome
driver = webdriver.Firefox(options=options)

try:
    driver.get(url + current_id)

    # Each pass of this loop is one scraper run; runs repeat until the scraper reaches the
    # end of the database (the reloaded last request has no arrow to a next request)
    while True:
        # Print iteration number
        it_num_title = 'Iteration ' + str(num_its)
        print(it_num_title)
        print('-' * len(it_num_title))

        print('Starting request:', current_id)
        print()

        # The starting request is scraped again in this run, so remove its earlier entry.
        # On a fresh run the list is still empty and there is nothing to remove
        if requests:
            requests.pop()

        # Scrape requests until the scraper either reaches the end of the database or times out
        scrape_requests_sequential(requests, driver,
                                   num_requests=num_requests,
                                   cooldown=cooldown,
                                   progress=progress)

        num_its += 1
        sleep(timeout) # Wait after the script reaches the end of the database or after a timeout

        # Reload the last request that was scraped successfully
        while requests and not requests[-1]['id']:
            requests.pop()
        if not requests:
            print('No requests have been scraped; rerun this cell to try again')
            break

        current_id = requests[-1]['id']
        driver.get(url + current_id)

        if not driver.find_elements(By.CLASS_NAME, 'js-next-request'):
            break
finally:
    # Always shut the browser down, even if the scraper raises or is interrupted
    driver.quit()

# Convert to DataFrame, keeping only the requests whose status was scraped
requests = [request for request in requests if (request and request['status'])]
requests_df = pd.DataFrame(requests).drop_duplicates()

# Create a zipped CSV file of the DataFrame (the 'data' directory must already exist).
# Nothing is written when there is nothing to export, so an earlier archive is not
# overwritten with an empty one
compression_opts = dict(method='zip', archive_name=requests_name + '.csv')
if requests_df.empty:
    print('No requests to export')
else:
    requests_df.to_csv('data/' + requests_name + '.zip', index=False, compression=compression_opts)

For Sai: **Run every code cell above this line (excluding the one with just the single period and comment)**

In [2]:
# Convert to DataFrame, keeping only the requests whose status was scraped
# and dropping rows that are exact duplicates
requests = [request for request in requests if (request and request['status'])]
requests_df = pd.DataFrame(requests).drop_duplicates()

NameError: name 'sd_requests' is not defined

In [None]:
# Create a zipped CSV file of the DataFrame: the archive data/<requests_name>.zip
# holding a single file named <requests_name>.csv
compression_opts = dict(method='zip', archive_name=requests_name + '.csv')
requests_df.to_csv('data/' + requests_name + '.zip', index=False, compression=compression_opts)

In [3]:
# Check to make sure the CSV file was properly created.
# The context managers close the archive and the extracted member once the CSV has been read
with zipfile.ZipFile('data/sd_requests.zip', 'r') as archive:
    with archive.open('sd_requests.csv') as csv_file:
        test_df = pd.read_csv(csv_file)

In [4]:
# Show the table that was read back from the exported archive
test_df

Unnamed: 0,id,status,desc,date,depts,docs,poc,msgs
0,15-1810,CLOSED,"Notices of Violation/Notice to Comply, fire in...","December 7, 2015 via web",Code Enforcement,"title,link\n5040 ShorehamPlace building permit...",Ginger Rodriguez,"title,item,time\n""Request Closed\nPublic"",02. ..."
1,15-1811,CLOSED,The October 2015 monthly report for SeaWorld,"December 7, 2015 via web",Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time\n""Request Published\nPublic"",,..."
2,15-1812,CLOSED,Records related to the following BIDS: Adams ...,"December 7, 2015 via web",City Clerk,"title,link\nhttp://www.sandiego.gov/park-and-r...",Mailei Ross-Cerezo,"title,item,time\n""Request Closed\nPublic"",Stil..."
3,15-1813,CLOSED,Historical lease payments made by SeaWorld to ...,"December 7, 2015 via web",Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time\n""Request Closed\nPublic"",02. ..."
4,15-1814,CLOSED,"Open violations, variances, ordinances, approv...","December 7, 2015 via web",Code Enforcement,"title,link\n15-1814 Fire Responsive.pdf,https:...",Ginger Rodriguez,"title,item,time\n""Request Closed\nPublic"",02. ..."
...,...,...,...,...,...,...,...,...
28842,21-5579,CLOSED,"Hello, I would like a copy of the report from ...","October 28, 2021 via web",Animal Services,,Lori Hernandez,"title,item,time\n""Request Published\nPublic"",,..."
28843,21-5581,CLOSED,Requesting incident report and photos for San ...,"October 28, 2021 via web",Animal Services,,Lori Hernandez,"title,item,time\n""Request Published\nPublic"",,..."
28844,21-5584,CLOSED,request for call for service\n\n2110020816,"October 28, 2021 via web",Police,"title,link\n2110020816_Redacted.pdf,https://sa...",Lori Hernandez,"title,item,time\n""Request Published\nPublic"",,..."
28845,21-5588,CLOSED,request for call for service\n\nE20050048015,"October 28, 2021 via web",Police,"title,link\nE20050048015_Redacted.pdf,https://...",Lori Hernandez,"title,item,time\n""Request Published\nPublic"",,..."


Scraper tests:

In [154]:
url = 'https://sandiego.nextrequest.com/requests/'

# Scrape the first 11 requests with a visible browser, printing every scraped ID
test_requests = []
driver = webdriver.Firefox()
try:
    driver.get(url + '15-1810')

    scrape_requests_sequential(test_requests, driver,
                               num_requests=11,
                               debug=1,
                               progress=2)
finally:
    # Always shut the browser down, even if the scrape raises
    driver.quit()

test = pd.DataFrame(test_requests)

15-1810 scraped
15-1811 scraped
Requests scraped: 2 	Avg runtime: 1.36s/request 	Total runtime: 2.7s
15-1812 scraped
15-1813 scraped
Requests scraped: 4 	Avg runtime: 1.82s/request 	Total runtime: 7.3s
15-1814 scraped
15-1815 scraped
Requests scraped: 6 	Avg runtime: 1.95s/request 	Total runtime: 11.7s
15-1816 scraped
15-1817 scraped
Requests scraped: 8 	Avg runtime: 2.02s/request 	Total runtime: 16.2s
15-1818 scraped
15-1819 scraped
Requests scraped: 10 	Avg runtime: 2.06s/request 	Total runtime: 20.6s
15-1820 scraped
Total requests scraped: 11 	Avg runtime: 2.09s/request 	Total runtime: 22.9s

Last request scraped: 15-1820



In [155]:
# Show the requests collected by the scraper test run
test

Unnamed: 0,id,status,desc,date,depts,docs,poc,msgs
0,15-1810,CLOSED,"Notices of Violation/Notice to Comply, fire in...","December 7, 2015 via web",Code Enforcement,"title,link\n5040 ShorehamPlace building permit...",Ginger Rodriguez,"title,item,time\n""Request Closed\nPublic"",02. ..."
1,15-1811,CLOSED,The October 2015 monthly report for SeaWorld,"December 7, 2015 via web",Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time\n""Request Published\nPublic"",,..."
2,15-1812,CLOSED,Records related to the following BIDS: Adams ...,"December 7, 2015 via web",City Clerk,"title,link\nhttp://www.sandiego.gov/park-and-r...",Mailei Ross-Cerezo,"title,item,time\n""Request Closed\nPublic"",Stil..."
3,15-1813,CLOSED,Historical lease payments made by SeaWorld to ...,"December 7, 2015 via web",Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time\n""Request Closed\nPublic"",02. ..."
4,15-1814,CLOSED,"Open violations, variances, ordinances, approv...","December 7, 2015 via web",Code Enforcement,"title,link\nSite Plan - 11943 El Camino Real.p...",Ginger Rodriguez,"title,item,time\n""Request Closed\nPublic"",02. ..."
5,15-1815,CLOSED,Membership applications and sign-in sheets fro...,"December 7, 2015 via web",Public Records Administration,,Lea Fields-Bernard,"title,item,time\n""Request Closed\nPublic"",No r..."
6,15-1816,CLOSED,Records that detail the oaths of office of cit...,"December 7, 2015 via web",City Clerk,"title,link\n15-1816 Yisrael.pdf,https://sandie...",Mailei Ross-Cerezo,"title,item,time\n""Request Closed\nPublic"",02. ..."
7,15-1817,CLOSED,File materials related to:\nP12010041171\nP120...,"December 7, 2015 via web",Police,,Humberto Hernandez,"title,item,time\n""Request Closed\nPublic"",02c...."
8,15-1818,CLOSED,Incident and supplemental reports for:\nAugust...,"December 7, 2015 via web",Public Records Administration,,Lea Fields-Bernard,"title,item,time\n""Request Closed\nPublic"",02. ..."
9,15-1819,CLOSED,Documents related to services performed and/or...,"December 7, 2015 via web",Public Utilities,,Wilson Kennedy,"title,item,time\n""Request Closed\nPublic"",No r..."


In [146]:
# Parse the messages CSV of the first test request and show its last 10 messages
pd.read_csv(StringIO(test.loc[0, 'msgs'])).tail(10)

Unnamed: 0,title,item,time
102,Document(s) Released\nPublic,Resolution R311787 Item 206 6-11-18.pdf,"April 15, 2019, 5:27pm by Ginger Rodriguez, Pu..."
103,Document(s) Released\nPublic,San Diego Housing Commission Docs.pdf,"April 15, 2019, 5:27pm by Ginger Rodriguez, Pu..."
104,Document(s) Released\nPublic,Copy of InclusionaryFees-ListofProjects-(2019-...,"April 15, 2019, 5:26pm by Ginger Rodriguez, Pu..."
105,Department Assignment Details\nPublic,"Added: City Council District 9, City Council D...","April 9, 2019, 12:37pm by Ginger Rodriguez, Pu..."
106,Document(s) Released Details\nPublic,http://sandiego.granicus.com/MediaPlayer.php?v...,"April 2, 2019, 10:17am by Ginger Rodriguez, Pu..."
107,Document(s) Released Details\nPublic,R-299350_Authorizing The Affordable Housing Fu...,"March 15, 2019, 6:08pm by Ginger Rodriguez, Pu..."
108,Document(s) Released Details\nPublic,Affordable Housing Fund Fiscal Year 2012 Annua...,"March 15, 2019, 9:06am by Tina Davis"
109,Document(s) Released Details\nPublic,SDHC Affordable Housing Fund Fiscal Year 2006 ...,"March 8, 2019, 5:26pm by Ginger Rodriguez, Pub..."
110,Department Assignment\nPublic,Added: Development Services.,"January 17, 2019, 4:56pm by Angela Laurita, Pu..."
111,Request Opened\nPublic,Request received via web,"January 17, 2019, 3:40pm"


The following process converts the CSV strings in the `docs` and `msgs` columns into DataFrames:

In [6]:
def df_fillna(df):
    '''
    Converts the columns of a DataFrame to the best available dtypes and replaces
    missing values with empty strings. None is passed through unchanged.
    '''
    if df is None:
        return None
    return df.convert_dtypes().fillna('')
# Apply it to the requests table itself, so that missing values become empty strings
test_df = df_fillna(test_df)
test_df

Unnamed: 0,id,status,desc,date,depts,docs,poc,msgs
0,15-1810,CLOSED,"Notices of Violation/Notice to Comply, fire in...","December 7, 2015 via web",Code Enforcement,"title,link 5040 ShorehamPlace building permits...",Ginger Rodriguez,"title,item,time ""Request Closed Public"",02. Re..."
1,15-1811,CLOSED,The October 2015 monthly report for SeaWorld,"December 7, 2015 via web",Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time ""Request Published Public"",,""D..."
2,15-1812,CLOSED,Records related to the following BIDS: Adams ...,"December 7, 2015 via web",City Clerk,"title,link http://www.sandiego.gov/park-and-re...",Mailei Ross-Cerezo,"title,item,time ""Request Closed Public"",Still ..."
3,15-1813,CLOSED,Historical lease payments made by SeaWorld to ...,"December 7, 2015 via web",Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time ""Request Closed Public"",02. Re..."
4,15-1814,CLOSED,"Open violations, variances, ordinances, approv...","December 7, 2015 via web",Code Enforcement,"title,link 15-1814 Fire Responsive.pdf,https:/...",Ginger Rodriguez,"title,item,time ""Request Closed Public"",02. Re..."
...,...,...,...,...,...,...,...,...
28842,21-5579,CLOSED,"Hello, I would like a copy of the report from ...","October 28, 2021 via web",Animal Services,,Lori Hernandez,"title,item,time ""Request Published Public"",,""N..."
28843,21-5581,CLOSED,Requesting incident report and photos for San ...,"October 28, 2021 via web",Animal Services,,Lori Hernandez,"title,item,time ""Request Published Public"",,""N..."
28844,21-5584,CLOSED,request for call for service 2110020816,"October 28, 2021 via web",Police,"title,link 2110020816_Redacted.pdf,https://san...",Lori Hernandez,"title,item,time ""Request Published Public"",,""N..."
28845,21-5588,CLOSED,request for call for service E20050048015,"October 28, 2021 via web",Police,"title,link E20050048015_Redacted.pdf,https://s...",Lori Hernandez,"title,item,time ""Request Published Public"",,""N..."


In [7]:
def csv_to_df(csv):
    '''
    Parses a CSV string into a DataFrame. An empty string (or None) gives None.
    '''
    if not csv:
        return None
    return pd.read_csv(StringIO(csv))
# Parse the CSV strings of every request into DataFrames, stored in new columns
# next to the original strings (requests without documents/messages get None)
test_df['docs_df'] = test_df['docs'].apply(csv_to_df)
test_df['msgs_df'] = test_df['msgs'].apply(csv_to_df)
test_df.head()

Unnamed: 0,id,status,desc,date,depts,docs,poc,msgs,docs_df,msgs_df
0,15-1810,CLOSED,"Notices of Violation/Notice to Comply, fire in...","December 7, 2015 via web",Code Enforcement,"title,link 5040 ShorehamPlace building permits...",Ginger Rodriguez,"title,item,time ""Request Closed Public"",02. Re...",title \ ...,title ...
1,15-1811,CLOSED,The October 2015 monthly report for SeaWorld,"December 7, 2015 via web",Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time ""Request Published Public"",,""D...",,title ...
2,15-1812,CLOSED,Records related to the following BIDS: Adams ...,"December 7, 2015 via web",City Clerk,"title,link http://www.sandiego.gov/park-and-re...",Mailei Ross-Cerezo,"title,item,time ""Request Closed Public"",Still ...",...,title \ 0 ...
3,15-1813,CLOSED,Historical lease payments made by SeaWorld to ...,"December 7, 2015 via web",Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time ""Request Closed Public"",02. Re...",,title ...
4,15-1814,CLOSED,"Open violations, variances, ordinances, approv...","December 7, 2015 via web",Code Enforcement,"title,link 15-1814 Fire Responsive.pdf,https:/...",Ginger Rodriguez,"title,item,time ""Request Closed Public"",02. Re...",title \ 0 ...,title ...


Then, we fill the NA values in the individual `docs` and `msgs` DataFrames:

In [8]:
# Replace missing values inside every per-request documents/messages table
test_df['docs_df'] = test_df['docs_df'].apply(df_fillna)
test_df['msgs_df'] = test_df['msgs_df'].apply(df_fillna)
# Show the messages of one request as an example
test_df.loc[4, 'msgs_df']

Unnamed: 0,title,item,time
0,Request Closed Public,02. Released,"December 24, 2015, 10:34am by Ginger Rodriquez"
1,Document(s) Released Public,Site Plan - 11943 El Camino Real.pdf,"December 24, 2015, 10:33am by Ginger Rodriquez"
2,Request Reopened Public,,"December 24, 2015, 10:32am by Ginger Rodriquez"
3,Request Closed Public,02. Released,"December 24, 2015, 8:09am by Ginger Rodriquez"
4,Document(s) Released Public,15-1814 Fire Responsive.pdf,"December 24, 2015, 8:09am by Ginger Rodriquez"
5,Request Reopened Public,,"December 24, 2015, 8:02am by Amanda Alvarado, ..."
6,Request Closed Public,02. Released,"December 17, 2015, 4:00pm by Ginger Rodriquez"
7,Request Reopened Public,,"December 17, 2015, 3:56pm by Ginger Rodriquez"
8,Request Closed Public,02. Released,"December 17, 2015, 3:54pm by Ginger Rodriquez"
9,Request Closed Public,02. Released,"December 17, 2015, 3:52pm by Ginger Rodriquez"


Other EDA stuff:

In [24]:
# Preview the first rows, including the parsed docs_df and msgs_df columns
test_df.head()

Unnamed: 0,id,status,desc,date,depts,docs,poc,msgs,docs_df,msgs_df
0,15-1810,CLOSED,"Notices of Violation/Notice to Comply, fire in...","December 7, 2015 via web",Code Enforcement,"title,link 5040 ShorehamPlace building permits...",Ginger Rodriguez,"title,item,time ""Request Closed Public"",02. Re...",title \ ...,title ...
1,15-1811,CLOSED,The October 2015 monthly report for SeaWorld,"December 7, 2015 via web",Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time ""Request Published Public"",,""D...",,title ...
2,15-1812,CLOSED,Records related to the following BIDS: Adams ...,"December 7, 2015 via web",City Clerk,"title,link http://www.sandiego.gov/park-and-re...",Mailei Ross-Cerezo,"title,item,time ""Request Closed Public"",Still ...",...,title \ 0 R...
3,15-1813,CLOSED,Historical lease payments made by SeaWorld to ...,"December 7, 2015 via web",Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time ""Request Closed Public"",02. Re...",,title ...
4,15-1814,CLOSED,"Open violations, variances, ordinances, approv...","December 7, 2015 via web",Code Enforcement,"title,link 15-1814 Fire Responsive.pdf,https:/...",Ginger Rodriguez,"title,item,time ""Request Closed Public"",02. Re...",title \ 0 ...,title ...


In [41]:
# (number of requests, number of columns) of the full table
test_df.shape

(28847, 10)

In [51]:
# Requests with the most messages: count the rows of each request's messages table
# (0 when the request has none) and show the 20 largest counts by row label
test_df['msgs_df'].apply(lambda df: df.shape[0] if df is not None else 0).sort_values(ascending=False).head(20)

3092     1117
2495      814
1721      313
676       174
2562      160
3109      153
6674      145
3971      142
6727      130
5895      126
129       124
11961     112
2119      106
3720       94
1859       92
2700       89
14166      89
19495      88
19195      86
3050       84
Name: msgs_df, dtype: int64

In [124]:
# Inspect every field of a single request (row label 129)
test_df.loc[129]

id                                                   15-1939
status                                                CLOSED
desc       Formal RP proposal responses, Intergraph contr...
date                               December 29, 2015 via web
depts                               Purchasing & Contracting
docs       title,link\nCost Proposal Cover Page.docx,http...
poc                                   Stacy Roberts-Gilhooly
msgs       title,item,time\n"Request Closed\nPublic","02....
docs_df                                                  ...
msgs_df                                title  \
0        ...
Name: 129, dtype: object

In [None]:
# Requests whose assigned departments mention the given department
# (str.contains treats the text as a regular expression by default)
dept = 'Police'
test_df[test_df['depts'].str.contains(dept)]

In [9]:
# Requests whose scraped description contains the text 'Read more'
# (presumably descriptions that were not fully expanded when scraped - verify)
test_df[test_df['desc'].str.contains('Read more')]

Unnamed: 0,id,status,desc,date,depts,docs,poc,msgs,docs_df,msgs_df
7,15-1817,CLOSED,File materials related to: P12010041171 P12050...,"December 7, 2015 via web",Police,,Humberto Hernandez,"title,item,time ""Request Closed Public"",02c. R...",,title ...
8,15-1818,CLOSED,Incident and supplemental reports for: August ...,"December 7, 2015 via web",Public Records Administration,,Lea Fields-Bernard,"title,item,time ""Request Closed Public"",02. Re...",,title ...
80,15-1890,CLOSED,Latest emergency action plans for the followin...,"December 17, 2015 via web",Public Utilities,,Wilson Kennedy,"title,item,time ""Request Closed Hide Public""...",,title \ 0 Request ...
123,15-1933,CLOSED,Lease payments for the past 10 years for: SeaW...,"December 28, 2015 via web",Department of Real Estate and Airport Management,"title,link SD Coaster 10 Year Revenue.pdf,http...",Jeffrey Wallace,"title,item,time ""Request Closed Public"",""02. R...",title \ 0 ...,title \ 0 ...
313,16-174,CLOSED,Text or phone messages regarding police-involv...,"February 1, 2016 via web",Mayor,,Lea Fields-Bernard,"title,item,time ""Request Closed Public"",""No re...",,title \ 0 R...
...,...,...,...,...,...,...,...,...,...,...
28833,21-5526,CLOSED,"Hello, This is a public records request seeki...","October 15, 2021 via email",Police,"title,link 1281 9th Av 100112 to 030217 CFS Re...",Angela Laurita,"title,item,time ""Request Published Public"",,""O...",title...,title \ 0 Requ...
28834,21-5530,CLOSED,"Dear Custodian of Records, Under the Californ...","October 26, 2021 via web",Police,,Angela Laurita,"title,item,time ""Request Published Public"",,""O...",,title \ 0 Requ...
28835,21-5532,CLOSED,To Whom It May Concern: Pursuant to the Cali...,"October 26, 2021 via web",Police,"title,link PRA 21-5532 - ActiveSworn_Zip Codes...",Angela Laurita,"title,item,time ""Request Published Public"",,""N...",...,title \ 0 Requ...
28839,21-5552,CLOSED,The request is for the time of the towing call...,"October 27, 2021 via web",Police,,Angela Laurita,"title,item,time ""Request Published Public"",,""O...",,title \ 0 Requ...


In [26]:
# Requests whose documents CSV consists of the header row only, i.e. a document list
# was scraped but it produced no rows
test_df[test_df['docs'].str.fullmatch('title,link\n')]

Unnamed: 0,id,status,desc,date,depts,docs,poc,msgs,docs_df,msgs_df
5752,17-2715,CLOSED,Project: Project Number: ...,"September 26, 2017 via email",Engineering and Capital Projects,"title,link",Jacqueline Palmer,"title,item,time ""Request Published Public"",,""O...","Empty DataFrame Columns: [title, link] Index: []",title \ 0 Requ...
9395,18-2521,CLOSED,AC Water & Sewer Improvements - Group 1026 Con...,"July 3, 2018 via email",Engineering and Capital Projects,"title,link",Ginger Rodriguez,"title,item,time ""Request Published Public"",,""J...","Empty DataFrame Columns: [title, link] Index: []",title \ 0 Requ...
9398,18-2524,CLOSED,Point Loma Ocean Outfall Repair Contract#: L17...,"July 9, 2018 via email",Engineering and Capital Projects,"title,link",Ginger Rodriguez,"title,item,time ""Request Published Public"",,""J...","Empty DataFrame Columns: [title, link] Index: []",title \ 0 Requ...
9814,18-2940,CLOSED,"August 3, 2018  City of San Diego, CA City A...","August 3, 2018 via web",Engineering and Capital Projects,"title,link",Ginger Rodriguez,"title,item,time ""Request Published Public"",,""A...","Empty DataFrame Columns: [title, link] Index: []",title \ 0 ...
9922,18-3048,CLOSED,Re: Property Condition Report Public Rec...,"August 10, 2018 via web",Development Services,"title,link",Angela Laurita,"title,item,time ""Request Published Public"",,""S...","Empty DataFrame Columns: [title, link] Index: []",title \ 0 Requ...
11514,18-4642,CLOSED,Please provide all email communications (inter...,"December 11, 2018 via web",City Council Administration,"title,link",Lori Witzel,"title,item,time ""Request Published Public"",,""M...","Empty DataFrame Columns: [title, link] Index: []",title \ 0 Re...
11928,19-230,CLOSED,"Any and all documents, including, but not limi...","January 16, 2019 via web",Public Utilities,"title,link",Angela Laurita,"title,item,time ""Request Published Public"",,""F...","Empty DataFrame Columns: [title, link] Index: []",title \ 0 ...
12615,19-918,CLOSED,"Reference: Sewer Group 786, City of San Diego,...","February 27, 2019 via web",Engineering and Capital Projects,"title,link",Lori Hernandez,"title,item,time ""Request Published Public"",,""M...","Empty DataFrame Columns: [title, link] Index: []",title \ 0 Requ...
13050,19-1354,CLOSED,May I have a copy of:  Certificate of Occupan...,"March 29, 2019 via web",Development Services,"title,link",Ginger Rodriguez,"title,item,time ""Request Published Public"",,""A...","Empty DataFrame Columns: [title, link] Index: []",title \ 0 Requ...
13928,19-2232,CLOSED,Any and all records relating to 4506 Ocean Vie...,"May 24, 2019 via web",Code Enforcement,"title,link",Ginger Rodriguez,"title,item,time ""Request Published Public"",,""J...","Empty DataFrame Columns: [title, link] Index: []",title \ 0 Requ...


In [None]:
# Split the time and author from the time quote on each message
def split_time_author(df):
    '''
    Splits the `time` column of a messages DataFrame ('<time> by <author>') into a `time`
    column and a `by` column, which end up as the last two columns of the result.

    - None is passed through unchanged
    - A DataFrame that already has a `by` column is returned as is, so applying the
      function twice does not discard the authors
    - Messages without an author get an empty string in `by`
    '''
    if df is None or 'by' in df.columns:
        return df
    time_author = (
        df['time'].astype('string')
        .str.split(' by ', n=1, expand=True) # Split on the first ' by ' only, so an author containing ' by ' stays whole
        .reindex(columns=[0, 1]) # Guarantee both columns, even when no message has an author
        .astype('string')
        .rename(columns={0: 'time', 1: 'by'})
    )
    # The split keeps the index of df, so the join lines up whatever index df has
    return df_fillna(df.drop(columns='time').join(time_author))

# Run the split on every request's message table, then spot-check the one at label 4
test_df['msgs_df'] = test_df['msgs_df'].apply(split_time_author)
test_df.loc[4, 'msgs_df']

In [None]:
# Replace empty document tables in docs_df with None
def remove_empty(df):
    '''Returns None for a missing or empty DataFrame; otherwise returns df unchanged.'''
    if df is None or df.empty:
        return None
    return df

test_df['docs_df'] = test_df['docs_df'].apply(remove_empty)

# Rows whose raw 'docs' text consists of the 'title,link' header line only
test_df[test_df['docs'].str.fullmatch('title,link\n')]

In [None]:
# Splitting departments for easier pivoting: one output row per (request, department)
depts = test_df['depts'].to_numpy() # depts column

# Split each ', '-separated department string into integer-labelled columns
# (0, 1, ...); requests with fewer departments are padded with None
depts_wide = test_df.join(pd.DataFrame([dept_list.split(', ') for dept_list in depts]))

# Melt the frame built above. The previous version melted `depts_df`, a name this
# cell never defines, so the join result was thrown away and the cell depended on
# state created somewhere outside it.
test_df_depts = depts_wide.melt( # Melt on the individual departments
        id_vars=test_df.columns # Keep every original request column on each row
    ).dropna(
        subset=['value'] # Get rid of the None padding
    ).drop(
        columns='variable'
    ).rename(
        columns={'value': 'dept'}
    ).reset_index(drop=True) # Renumber the remaining rows from 0
test_df_depts

In [None]:
# Count how many rows of the melted frame share each request id and show the first 20 counts
test_df_depts.value_counts('id').head(20)

In [None]:
# Look up the description of a single request by its ID
request_id = '"19-3612"' # The inner double quotes make the ID a string literal inside the query expression
matching_requests = test_df.query('id == ' + request_id)
matching_requests.iloc[0]['desc']

In [None]:
# Requests whose description contains the search term
desc = 'Padres'
test_df.loc[test_df['desc'].str.contains(desc)]

In [None]:
# Split each date string on ' via ': the first piece stays as 'date', the second becomes 'via'
dates = test_df['date'].to_numpy()
date_parts = pd.DataFrame([date_quote.split(' via ') for date_quote in dates])
test_df = test_df.join(date_parts).drop(columns='date').rename(columns={0: 'date', 1: 'via'})

In [None]:
# Let pandas pick dtypes for every column, then inspect the message table of the row labelled 0
test_df = test_df.convert_dtypes()
test_df.loc[0, 'msgs_df']

In [None]:
# Show the row labelled 0.
# NOTE(review): `test` is not defined in the visible cells - possibly `test_df` was intended; confirm
test.loc[0]

Previous scraper function attempts:

In [None]:
def scrape_record(url, request_id, driver):
    '''
    Scrapes data about a given request on a NextRequest request database.

    Parameters:
        url (str): Base URL of the request database; request_id is appended to it
        request_id (str): ID of the request to scrape
        driver: Selenium WebDriver used to load and query the request page

    Returns:
        None if the title of the loaded page does not contain request_id.
        Otherwise a dictionary with the keys 'id', 'status', 'desc', 'date',
        'depts', 'docs', 'poc' and 'msgs'. Any field that could not be scraped
        is left as None; 'docs' and 'msgs' are DataFrames when present.
    '''
    driver.get(url + request_id) # Attempt to access the record
#     timeout = 2 # Timeout length, in seconds

#     try:
#         WebDriverWait(driver, timeout).until(EC.visibility_of_all_elements_located((By.CLASS_NAME, 'nextrequest')))
#     except TimeoutException:
#         print(request_id, 'timed out')
#         pass

    # The page title is used to decide whether the request page was reached
    if request_id not in driver.title:
        return None

    status, desc, date, depts, docs, poc, events = [None] * 7 # Initialize variables

    # Elements are located with find_element(By..., ...): the older
    # find_element_by_* helpers were removed in Selenium 4.3, while this
    # form is available in both older and current releases.
    try: # Attempt to scrape relevant data
        status = driver.find_element(By.CLASS_NAME, 'request-status-label').text.strip() # Request status
        # Two classes at once, so this is written explicitly as a CSS selector
        desc = driver.find_element(By.CSS_SELECTOR, '.request-text.row').text # Request description
        date = driver.find_element(By.CLASS_NAME, 'request_date').text # Request date
        depts = driver.find_element(By.CLASS_NAME, 'current-department').text # Department(s) assigned to the request
        poc = driver.find_element(By.CLASS_NAME, 'request-detail').text # Person of contact

        # Documents attached to the request, if there are any
        public_docs = driver.find_element(By.ID, 'document-list') # WebElement containing the documents
        if '(none)' not in public_docs.text:
            # Expand folders, if there are any
            for folder in public_docs.find_elements(By.CLASS_NAME, 'folder-toggle'):
                folder.click()

            doc_links = public_docs.find_elements(By.CLASS_NAME, 'document-link') # Links to documents

            # DataFrame consisting of all documents
            docs = pd.DataFrame({
                'title': get_webelement_text(doc_links),
                'link': remove_download_from_urls(get_webelement_link(doc_links))
                })

        # Show all message history, if the option is available
        show_all_history = driver.find_elements(By.CLASS_NAME, 'show-all-history')
        if show_all_history:
            show_all_history[0].click()

        # Messages recorded on the request page, if there are any
        event_history = driver.find_elements(By.CLASS_NAME, 'generic-event')
        if event_history:
            # Titles, descriptions, and time strings for each message
            event_titles = []
            event_items = []
            time_quotes = []

            # Scrape information from each individual event
            for event in event_history:
                event_titles.append(event.find_element(By.CLASS_NAME, 'event-title').text)
                event_items.append(
                    '\n'.join(get_webelement_text(event.find_elements(By.CLASS_NAME, 'event-item')))
                )
                time_quotes.append(event.find_element(By.CLASS_NAME, 'time-quotes').text)

            # DataFrame consisting of all messages
            events = pd.DataFrame({
                'title': event_titles,
                'item': event_items,
                'time': time_quotes
                })
    except NoSuchElementException: # A specific element is missing: keep whatever was scraped so far
        pass
    except Exception: # Any other error: report it, but still return the partial record
        # Only Exception is caught (and the return is no longer inside a
        # `finally`), so KeyboardInterrupt/SystemExit can stop a long scrape.
        traceback.print_exc()

    return {
        'id': request_id,
        'status': status,
        'desc': desc,
        'date': date,
        'depts': depts,
        'docs': docs,
        'poc': poc,
        'msgs': events
        }
    
def scrape_record_parallel(url, request_id):
    '''
    Scraper method used for parallelization.

    Starts a Firefox WebDriver for this one request, scrapes the request with
    scrape_record, and shuts the driver down before returning.

    Parameters:
        url (str): Base URL of the request database
        request_id (str): ID of the request to scrape

    Returns:
        Whatever scrape_record returns for this request.
    '''
    driver = webdriver.Firefox(options=options)
    try:
        return scrape_record(url, request_id, driver)
    finally:
        # quit() ends the whole browser session rather than only closing the
        # current window, and running it in `finally` means a request that
        # raises does not leave a browser behind.
        driver.quit()

In [None]:
# Scraper settings
earliest_year = 16 # First two-digit year whose requests are searched
latest_year = 21 # Last two-digit year whose requests are searched
id_start = 1 # First ID number tried within each year
id_range = id_max # How many consecutive ID numbers are tried for each year
cooldown = 0.9 # Seconds to pause between website accesses

start_id = '15-1810' # The request to start scraping from
num_requests = -1 # Number of requests to scrape
progress = 100 # Display a message every 100 requests successfully scraped

# Every candidate request ID; for each ID number, all years are listed in turn
request_ids = ['{}-{}'.format(year, num)
               for num in range(id_start, id_range + id_start)
               for year in range(earliest_year, latest_year + 1)]

# Base URLs of the request databases to extract data from
urls = [
    'https://sandiego.nextrequest.com/requests/',
    'https://oaklandca.nextrequest.com/requests/',
]

In [None]:
# Iterative script
driver = webdriver.Firefox(options=options) # Headless (non-visible) WebDriver

sd_requests = [] # List of dictionaries containing information on each request
i = 0 # Index for URLs - may be useful later for scraping multiple sites

try:
    for year in range(earliest_year, latest_year + 1):
        for num in tqdm(range(id_start, id_range + id_start)):
            # NextRequest request IDs are a two-digit year and a number, with a dash in between
            request_id = str(year) + '-' + str(num)

            # Scrape record
            sd_requests.append(scrape_record(urls[i], request_id, driver))

            # Cooldown
            sleep(cooldown)
finally:
    # Always end the browser session, even if the run fails or is interrupted
    driver.quit()

# scrape_record returns None when the request page was not reached, so those
# entries must be dropped before indexing into the dictionaries (indexing None
# would raise TypeError). Entries with incomplete information are removed too.
sd_requests = [x for x in sd_requests if x is not None and x['status'] is not None]
sd_requests_df = pd.DataFrame(sd_requests) # Convert to DataFrame

# Create a zipped CSV file of the data
compression_opts = dict(method='zip', archive_name='sd_requests.csv')
sd_requests_df.to_csv('data/sd_requests_2.zip', index=False, compression=compression_opts)

In [None]:
# Parallelized script: one scrape_record_parallel call per request ID
def scrape_request(i):
    '''Scrapes the request with ID i from the first URL in urls.'''
    return scrape_record_parallel(urls[0], i)

parallel_runner = Parallel(n_jobs=-1, prefer='threads', verbose=10)
sd_requests = parallel_runner(delayed(scrape_request)(request_id) for request_id in request_ids)

Miscellaneous tests:

In [None]:
# # Test remote WebDriver (run Selenium Grid instance locally before running this cell)
# driver = webdriver.Remote(desired_capabilities=DesiredCapabilities.FIREFOX, options=options)
# driver.get('http://www.google.com')
# driver.close()

In [None]:
# # Test to make sure the driver works
# 
# driver = webdriver.Firefox(options=options) # Instantiate a headless Firefox WebDriver
# for url in urls:
#     driver.get(url)
#     print(driver.title)
#     sleep(5)
    
# driver.close()

In [None]:
# %%timeit
# # Test for retrieving message info from a specific request
# driver = webdriver.Firefox(options=options)
# driver.get('https://sandiego.nextrequest.com/requests/21-4915')
# print(driver.title)

# event_titles = driver.find_elements_by_class_name('event-title')
# event_items = driver.find_elements_by_class_name('event-item')
# times = driver.find_elements_by_class_name('time-quotes')
# for title, item, time in list(zip(event_titles, event_items, times)):
#     print(title.text)
#     print(item.text)
#     print(time.text)
#     print()

# driver.close()

In [None]:
# # Scraping documents test
# driver = webdriver.Firefox(options=options)

# for request_id in [5369, 5313, 5374]:
#     url = 'https://sandiego.nextrequest.com/requests/21-' + str(request_id)
#     driver.get(url)

#     docs = driver.find_element_by_id('public-docs')
#     folders = docs.find_elements_by_class_name('folder-toggle')
#     if folders:
#         for folder in folders:
#             folder.click()
#     else:
#         print('No folders found for', request_id)
    
#     doc_links = docs.find_elements_by_class_name('document-link')
#     display(
#         pd.DataFrame(
#             list(zip(get_webelement_text(doc_links), remove_download_from_urls(get_webelement_link(doc_links)))),
#             columns=['title', 'link']
#         )
#     )
#     display(
#         pd.DataFrame({
#             'title': get_webelement_text(doc_links),
#             'link': remove_download_from_urls(get_webelement_link(doc_links))
#         })
#     )

# driver.close()