In [621]:
!pip install selenium

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException

from joblib import Parallel, delayed

import sys
import traceback
from timeit import default_timer as timer
from tqdm import tqdm
from time import sleep

from io import StringIO
import pandas as pd
import numpy as np
import re
import zipfile

# Options for the driver
options = Options()
options.headless = True



### TO-DO
- Figure out if [Selenium Grid](https://www.selenium.dev/documentation/grid/) can potentially improve the performance of the scraper
- Run on Google Colab/Microsoft Azure/local desktop/some other place?
- Preliminary EDA:
    - Get close time(s)
    - Fill in empty department info

In [809]:
. # Keep this cell to prevent the rest of the notebook from automatically running

SyntaxError: invalid syntax (<ipython-input-809-b2026be2226a>, line 1)

The following script uses the `selenium` library to, in theory, scrape every request from [San Diego's NextRequest database](https://sandiego.nextrequest.com/requests). It does so by using the fact that every request has its own unique URL i.e. the request with ID 'yy-xxxx' will be found at '.../requests/yy-xxxx'. From each request webpage, the following information is extracted:

- `id` (str): ID of the request, yy-xxxx
- `status` (str): Whether the request is opened or closed. Always takes on a value of either 'CLOSED' or 'OPEN'
- `desc` (str): Description of the request provided by the requester
- `date` (str): Initial request date
- `depts` (str): Current departments assigned to the request (may not be the ones the requester had initially)
- `docs` (DataFrame in CSV format): All documents attached to the request, if there are any, otherwise None. The columns are:
    - `title` (str): Title given to each document
    - `link` (str): Link to each document
- `poc` (str): Point of contact
- `msgs` (DataFrame in CSV format): All messages attached to the request. The columns are:
    - `title` (str): Title of each message
    - `item` (str): Message body
    - `time` (str): Date of each message

After a request is scraped, the next request can be navigated to by clicking on an arrow, and the scraper continues to run until the arrow cannot be found, either because the scraper has reached the last request in the database or due to a timeout. To address these potential timeouts, we stop the driver every time it cannot access a request, then restart it after a short delay, starting from the request that it timed out on.

In [623]:
urls = ['https://sandiego.nextrequest.com/requests/', 'https://oaklandca.nextrequest.com/requests/'] # URLs
start_ids = ['', '']
i = 0 # Index of URL to scrape from
current_id = sd_requests[-1]['id'] # The current ID, initialized to the ID to start scraping from
num_requests = -1 # Number of requests to scrape
cooldown = 1 # Cooldown between request accesses
timeout = 10 # Timeout wait time between scraper runs
progress = 100 # Number of requests to show progress for
num_runs = 1 # Keeps track of how many times the scraper has been run

driver = webdriver.Firefox(options=options) # Instantiate headless (non-visible) Firefox driver

# Run an intial iteration of the scraper
driver.get(urls[i] + current_id)

# Print iteration number
it_num_title = 'Iteration ' + str(num_runs)
print(it_num_title)
print('-' * len(it_num_title))

# Re-scrape the current request
print('Starting request:', sd_requests.pop()['id']) 
print()

# Scrape requests until it either reaches the end or times out
scrape_requests_sequential(sd_requests, driver, 
                           num_requests=num_requests, 
                           cooldown=cooldown, 
                           progress=progress)

num_runs += 1
sleep(timeout) # Wait after a timeout

# Restart the driver at the last request scraped 
current_id = sd_requests[-1]['id']
driver.get(urls[i] + current_id)

# Continue to scrape until the arrow to go to the next request is no longer present
while driver.find_elements_by_class_name('js-next-request'):
    # Scrape requests until the next timeout or the arrow is no longer present
    it_num_title = 'Iteration ' + str(num_runs)
    print(it_num_title)
    print('-' * len(it_num_title))
    
    print('Starting request:', sd_requests.pop()['id'])
    print()
    
    scrape_requests_sequential(sd_requests, driver, 
                               num_requests=num_requests, 
                               cooldown=cooldown, 
                               progress=progress)
    
    num_runs += 1
    sleep(timeout)

    current_id = sd_requests[-1]['id']
    driver.get(urls[i] + current_id)

driver.close()

# Convert to DataFrame
sd_requests = [request for request in sd_requests if (request and request['status'])]
sd_requests_df = pd.DataFrame(sd_requests).drop_duplicates()

# Create a zipped CSV file of the DataFrame
compression_opts = dict(method='zip', archive_name='sd_requests.csv')
sd_requests_df.to_csv('data/sd_requests.zip', index=False, compression=compression_opts)

Iteration 1
-----------
Starting request: 19-2477

Requests scraped: 100 	Avg runtime: 2.17s/request 	Total runtime: 216.9s
Requests scraped: 200 	Avg runtime: 2.18s/request 	Total runtime: 435.6s
Requests scraped: 300 	Avg runtime: 2.17s/request 	Total runtime: 651.6s
Requests scraped: 400 	Avg runtime: 2.16s/request 	Total runtime: 864.2s
Requests scraped: 500 	Avg runtime: 2.19s/request 	Total runtime: 1093.0s
Requests scraped: 600 	Avg runtime: 2.17s/request 	Total runtime: 1299.9s
Requests scraped: 700 	Avg runtime: 2.17s/request 	Total runtime: 1518.9s
Requests scraped: 800 	Avg runtime: 2.18s/request 	Total runtime: 1743.0s
Total requests scraped: 816 	Avg runtime: 2.22s/request 	Total runtime: 1814.7s

Last request scraped: 19-3292

Iteration 2
-----------
Starting request: 19-3292

Total requests scraped: 19 	Avg runtime: 2.91s/request 	Total runtime: 55.3s

Last request scraped: 19-3310

Iteration 3
-----------
Starting request: 19-3310

Requests scraped: 100 	Avg runtime: 2.

Requests scraped: 900 	Avg runtime: 2.26s/request 	Total runtime: 2029.8s
Requests scraped: 1000 	Avg runtime: 2.25s/request 	Total runtime: 2253.9s
Requests scraped: 1100 	Avg runtime: 2.25s/request 	Total runtime: 2476.0s
Requests scraped: 1200 	Avg runtime: 2.26s/request 	Total runtime: 2710.8s
Requests scraped: 1300 	Avg runtime: 2.27s/request 	Total runtime: 2947.1s
Total requests scraped: 1367 	Avg runtime: 2.27s/request 	Total runtime: 3108.4s

Last request scraped: 21-1378

Iteration 6
-----------
Starting request: 21-1378

Requests scraped: 100 	Avg runtime: 2.3s/request 	Total runtime: 230.3s
Requests scraped: 200 	Avg runtime: 2.29s/request 	Total runtime: 457.8s
Requests scraped: 300 	Avg runtime: 2.27s/request 	Total runtime: 681.0s
Total requests scraped: 335 	Avg runtime: 2.29s/request 	Total runtime: 768.1s

Last request scraped: 21-1730

Iteration 7
-----------
Starting request: 21-1730

Requests scraped: 100 	Avg runtime: 2.21s/request 	Total runtime: 221.0s
Requests 

Traceback (most recent call last):
  File "<ipython-input-622-6acee892921f>", line 22, in scrape_record_append
    folder.click()
  File "/Users/stevenyuan/opt/anaconda3/lib/python3.8/site-packages/selenium/webdriver/remote/webelement.py", line 80, in click
    self._execute(Command.CLICK_ELEMENT)
  File "/Users/stevenyuan/opt/anaconda3/lib/python3.8/site-packages/selenium/webdriver/remote/webelement.py", line 633, in _execute
    return self._parent.execute(command, params)
  File "/Users/stevenyuan/opt/anaconda3/lib/python3.8/site-packages/selenium/webdriver/remote/webdriver.py", line 321, in execute
    self.error_handler.check_response(response)
  File "/Users/stevenyuan/opt/anaconda3/lib/python3.8/site-packages/selenium/webdriver/remote/errorhandler.py", line 242, in check_response
    raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.StaleElementReferenceException: Message: The element reference of <a class="icon-edit icon-edit-inline folder-toggle" hr

Requests scraped: 3200 	Avg runtime: 2.31s/request 	Total runtime: 7403.2s
Requests scraped: 3300 	Avg runtime: 2.31s/request 	Total runtime: 7624.6s
Requests scraped: 3400 	Avg runtime: 2.31s/request 	Total runtime: 7849.0s
Requests scraped: 3500 	Avg runtime: 2.31s/request 	Total runtime: 8072.9s
Total requests scraped: 3572 	Avg runtime: 2.3s/request 	Total runtime: 8227.5s

Last request scraped: 21-5614



In [626]:
# Convert to DataFrame
sd_requests = [request for request in sd_requests if (request and request['status'])]
sd_requests_df = pd.DataFrame(sd_requests).drop_duplicates()

In [639]:
# Create a zipped CSV file of the DataFrame
compression_opts = dict(method='zip', archive_name='sd_requests.csv')
sd_requests_df.to_csv('data/sd_requests.zip', index=False, compression=compression_opts)

In [870]:
# Check to make sure the CSV file was properly created
test_df = pd.read_csv(zipfile.ZipFile('data/sd_requests.zip', 'r').open('sd_requests.csv'))

In [871]:
test_df

Unnamed: 0,id,status,desc,date,depts,docs,poc,msgs
0,15-1810,CLOSED,"Notices of Violation/Notice to Comply, fire in...","December 7, 2015 via web",Code Enforcement,"title,link\n5040 ShorehamPlace building permit...",Ginger Rodriguez,"title,item,time\n""Request Closed\nPublic"",02. ..."
1,15-1811,CLOSED,The October 2015 monthly report for SeaWorld,"December 7, 2015 via web",Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time\n""Request Published\nPublic"",,..."
2,15-1812,CLOSED,Records related to the following BIDS: Adams ...,"December 7, 2015 via web",City Clerk,"title,link\nhttp://www.sandiego.gov/park-and-r...",Mailei Ross-Cerezo,"title,item,time\n""Request Closed\nPublic"",Stil..."
3,15-1813,CLOSED,Historical lease payments made by SeaWorld to ...,"December 7, 2015 via web",Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time\n""Request Closed\nPublic"",02. ..."
4,15-1814,CLOSED,"Open violations, variances, ordinances, approv...","December 7, 2015 via web",Code Enforcement,"title,link\n15-1814 Fire Responsive.pdf,https:...",Ginger Rodriguez,"title,item,time\n""Request Closed\nPublic"",02. ..."
...,...,...,...,...,...,...,...,...
28842,21-5579,CLOSED,"Hello, I would like a copy of the report from ...","October 28, 2021 via web",Animal Services,,Lori Hernandez,"title,item,time\n""Request Published\nPublic"",,..."
28843,21-5581,CLOSED,Requesting incident report and photos for San ...,"October 28, 2021 via web",Animal Services,,Lori Hernandez,"title,item,time\n""Request Published\nPublic"",,..."
28844,21-5584,CLOSED,request for call for service\n\n2110020816,"October 28, 2021 via web",Police,"title,link\n2110020816_Redacted.pdf,https://sa...",Lori Hernandez,"title,item,time\n""Request Published\nPublic"",,..."
28845,21-5588,CLOSED,request for call for service\n\nE20050048015,"October 28, 2021 via web",Police,"title,link\nE20050048015_Redacted.pdf,https://...",Lori Hernandez,"title,item,time\n""Request Published\nPublic"",,..."


The following process converts the CSV strings in the `docs` and `msgs` columns into DataFrames:

In [872]:
test_df = test_df.convert_dtypes().fillna('')
test_df

Unnamed: 0,id,status,desc,date,depts,docs,poc,msgs
0,15-1810,CLOSED,"Notices of Violation/Notice to Comply, fire in...","December 7, 2015 via web",Code Enforcement,"title,link 5040 ShorehamPlace building permits...",Ginger Rodriguez,"title,item,time ""Request Closed Public"",02. Re..."
1,15-1811,CLOSED,The October 2015 monthly report for SeaWorld,"December 7, 2015 via web",Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time ""Request Published Public"",,""D..."
2,15-1812,CLOSED,Records related to the following BIDS: Adams ...,"December 7, 2015 via web",City Clerk,"title,link http://www.sandiego.gov/park-and-re...",Mailei Ross-Cerezo,"title,item,time ""Request Closed Public"",Still ..."
3,15-1813,CLOSED,Historical lease payments made by SeaWorld to ...,"December 7, 2015 via web",Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time ""Request Closed Public"",02. Re..."
4,15-1814,CLOSED,"Open violations, variances, ordinances, approv...","December 7, 2015 via web",Code Enforcement,"title,link 15-1814 Fire Responsive.pdf,https:/...",Ginger Rodriguez,"title,item,time ""Request Closed Public"",02. Re..."
...,...,...,...,...,...,...,...,...
28842,21-5579,CLOSED,"Hello, I would like a copy of the report from ...","October 28, 2021 via web",Animal Services,,Lori Hernandez,"title,item,time ""Request Published Public"",,""N..."
28843,21-5581,CLOSED,Requesting incident report and photos for San ...,"October 28, 2021 via web",Animal Services,,Lori Hernandez,"title,item,time ""Request Published Public"",,""N..."
28844,21-5584,CLOSED,request for call for service 2110020816,"October 28, 2021 via web",Police,"title,link 2110020816_Redacted.pdf,https://san...",Lori Hernandez,"title,item,time ""Request Published Public"",,""N..."
28845,21-5588,CLOSED,request for call for service E20050048015,"October 28, 2021 via web",Police,"title,link E20050048015_Redacted.pdf,https://s...",Lori Hernandez,"title,item,time ""Request Published Public"",,""N..."


In [873]:
csv_to_df = lambda csv: pd.read_csv(StringIO(csv)) if csv else None
test_df['docs_df'] = test_df['docs'].apply(csv_to_df)
test_df['msgs_df'] = test_df['msgs'].apply(csv_to_df)
test_df.head()

Unnamed: 0,id,status,desc,date,depts,docs,poc,msgs,docs_df,msgs_df
0,15-1810,CLOSED,"Notices of Violation/Notice to Comply, fire in...","December 7, 2015 via web",Code Enforcement,"title,link 5040 ShorehamPlace building permits...",Ginger Rodriguez,"title,item,time ""Request Closed Public"",02. Re...",title \ ...,title ...
1,15-1811,CLOSED,The October 2015 monthly report for SeaWorld,"December 7, 2015 via web",Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time ""Request Published Public"",,""D...",,title ...
2,15-1812,CLOSED,Records related to the following BIDS: Adams ...,"December 7, 2015 via web",City Clerk,"title,link http://www.sandiego.gov/park-and-re...",Mailei Ross-Cerezo,"title,item,time ""Request Closed Public"",Still ...",...,title \ 0 ...
3,15-1813,CLOSED,Historical lease payments made by SeaWorld to ...,"December 7, 2015 via web",Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time ""Request Closed Public"",02. Re...",,title ...
4,15-1814,CLOSED,"Open violations, variances, ordinances, approv...","December 7, 2015 via web",Code Enforcement,"title,link 15-1814 Fire Responsive.pdf,https:/...",Ginger Rodriguez,"title,item,time ""Request Closed Public"",02. Re...",title \ 0 ...,title ...


Then, we fill the NA values in the individual `docs` and `msgs` DataFrames:

In [874]:
df_fillna = lambda df: df.convert_dtypes().fillna('') if df is not None else None
test_df['docs_df'] = test_df['docs_df'].apply(df_fillna)
test_df['msgs_df'] = test_df['msgs_df'].apply(df_fillna)
test_df.loc[4]['msgs_df']

Unnamed: 0,title,item,time
0,Request Closed Public,02. Released,"December 24, 2015, 10:34am by Ginger Rodriquez"
1,Document(s) Released Public,Site Plan - 11943 El Camino Real.pdf,"December 24, 2015, 10:33am by Ginger Rodriquez"
2,Request Reopened Public,,"December 24, 2015, 10:32am by Ginger Rodriquez"
3,Request Closed Public,02. Released,"December 24, 2015, 8:09am by Ginger Rodriquez"
4,Document(s) Released Public,15-1814 Fire Responsive.pdf,"December 24, 2015, 8:09am by Ginger Rodriquez"
5,Request Reopened Public,,"December 24, 2015, 8:02am by Amanda Alvarado, ..."
6,Request Closed Public,02. Released,"December 17, 2015, 4:00pm by Ginger Rodriquez"
7,Request Reopened Public,,"December 17, 2015, 3:56pm by Ginger Rodriquez"
8,Request Closed Public,02. Released,"December 17, 2015, 3:54pm by Ginger Rodriquez"
9,Request Closed Public,02. Released,"December 17, 2015, 3:52pm by Ginger Rodriquez"


Other EDA stuff:

In [746]:
test_df.head()

Unnamed: 0,id,status,desc,date,depts,docs,poc,msgs,docs_df,msgs_df
0,15-1810,CLOSED,"Notices of Violation/Notice to Comply, fire in...","December 7, 2015 via web",Code Enforcement,"title,link 5040 ShorehamPlace building permits...",Ginger Rodriguez,"title,item,time ""Request Closed Public"",02. Re...",title \ ...,title ...
1,15-1811,CLOSED,The October 2015 monthly report for SeaWorld,"December 7, 2015 via web",Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time ""Request Published Public"",,""D...",,title ...
2,15-1812,CLOSED,Records related to the following BIDS: Adams ...,"December 7, 2015 via web",City Clerk,"title,link http://www.sandiego.gov/park-and-re...",Mailei Ross-Cerezo,"title,item,time ""Request Closed Public"",Still ...",...,title \ 0 R...
3,15-1813,CLOSED,Historical lease payments made by SeaWorld to ...,"December 7, 2015 via web",Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time ""Request Closed Public"",02. Re...",,title ...
4,15-1814,CLOSED,"Open violations, variances, ordinances, approv...","December 7, 2015 via web",Code Enforcement,"title,link 15-1814 Fire Responsive.pdf,https:/...",Ginger Rodriguez,"title,item,time ""Request Closed Public"",02. Re...",title \ 0 ...,title ...
...,...,...,...,...,...,...,...,...,...,...
28842,21-5579,CLOSED,"Hello, I would like a copy of the report from ...","October 28, 2021 via web",Animal Services,,Lori Hernandez,"title,item,time ""Request Published Public"",,""N...",,title \ 0 Requ...
28843,21-5581,CLOSED,Requesting incident report and photos for San ...,"October 28, 2021 via web",Animal Services,,Lori Hernandez,"title,item,time ""Request Published Public"",,""N...",,title \...
28844,21-5584,CLOSED,request for call for service 2110020816,"October 28, 2021 via web",Police,"title,link 2110020816_Redacted.pdf,https://san...",Lori Hernandez,"title,item,time ""Request Published Public"",,""N...",title ...,title \ 0 Requ...
28845,21-5588,CLOSED,request for call for service E20050048015,"October 28, 2021 via web",Police,"title,link E20050048015_Redacted.pdf,https://s...",Lori Hernandez,"title,item,time ""Request Published Public"",,""N...",title \ 0 E2005004801...,title \ 0 Requ...


In [855]:
dept = 'Police'
test_df[test_df['depts'].str.contains(dept)]

Unnamed: 0,id,status,desc,date,depts,docs,poc,msgs,docs_df,msgs_df
7,15-1817,CLOSED,File materials related to: P12010041171 P12050...,"December 7, 2015 via web",Police,,Humberto Hernandez,"title,item,time ""Request Closed Public"",02c. R...",,title ...
49,15-1859,CLOSED,All records related to the demographic study c...,"December 11, 2015 via web",Police,,Humberto Hernandez,"title,item,time ""Request Closed Hide Public""...",,title \ 0 Request...
65,15-1875,CLOSED,Reports related to incident of 11/11/2015 wher...,"December 15, 2015 via web",Police,,Humberto Hernandez,"title,item,time ""Request Closed Public"",""02. R...",,title \ 0 ...
81,15-1891,CLOSED,Communications by the City of San Diego regard...,"December 17, 2015 via web",Police,,Humberto Hernandez,"title,item,time ""Request Closed Public"",Still ...",,title ...
121,15-1931,CLOSED,"Vehicle Stop Data for period September 1, 2015...","December 28, 2015 via web",Police,,Humberto Hernandez,"title,item,time ""Request Closed Public"",Still ...",,title \ 0 ...
...,...,...,...,...,...,...,...,...,...,...
28837,21-5544,CLOSED,Dan you tell me when San Diego Police Officer ...,"October 26, 2021 via web",Police,,Angela Laurita,"title,item,time ""Request Published Public"",,""N...",,title \ 0 Req...
28838,21-5550,CLOSED,I request any police reports involving inciden...,"October 27, 2021 via web",Police,,Angela Laurita,"title,item,time ""Request Published Public"",,""O...",,title \ 0 Req...
28839,21-5552,CLOSED,The request is for the time of the towing call...,"October 27, 2021 via web",Police,,Angela Laurita,"title,item,time ""Request Published Public"",,""O...",,title \ 0 Req...
28844,21-5584,CLOSED,request for call for service 2110020816,"October 28, 2021 via web",Police,"title,link 2110020816_Redacted.pdf,https://san...",Lori Hernandez,"title,item,time ""Request Published Public"",,""N...",title ...,title \ 0 Req...


In [808]:
test_df.query('depts == ""')

Unnamed: 0,id,status,desc,date,depts,docs,poc,msgs,docs_df,msgs_df
32,15-1842,CLOSED,"Most recent ""Statement of Acccount-Unpaid Chec...","December 10, 2015 via web",,,Elena Perez,"title,item,time ""Request Closed Public"",No res...",,title ...
153,16-14,CLOSED,"Accounting of uncashed checks/warrants, tax ov...","January 5, 2016 via web",,"title,link http://www.sandiego.gov/comptroller...",Doug Enger,"title,item,time ""Request Closed Hide Public""...",...,title \ 0 Request ...
184,16-45,CLOSED,"Payments made in the amount of $10,000 or more...","January 12, 2016 via web",,"title,link PRA CCPRA2016-0045 1-21-2016.xlsx,h...",Elena Perez,"title,item,time ""Request Closed Public"",""02. R...",title \ 0 PRA...,title \ 0 R...
267,16-128,CLOSED,Unclaimed warrants,"January 26, 2016 via web",,,Elena Perez,"title,item,time ""Request Closed Public"",""02. R...",,title \ 0 R...
306,16-167,CLOSED,List of outstanding checks,"February 1, 2016 via web",,"title,link Outstanding Checks 1-19-2016.htm,ht...",Elena Perez,"title,item,time ""Request Closed Public"",""02. R...",title \ 0 Outs...,title \ 0 R...
...,...,...,...,...,...,...,...,...,...,...
12367,19-669,CLOSED,Can the City please provide me with a copy of ...,"February 12, 2019 via web",,"title,link 8531950_44131.pdf,https://sandiego....",Lori Hernandez,"title,item,time ""Request Published Public"",,""F...",title ...,title \ 0 Requ...
13494,19-1798,CLOSED,Please produce the following: Application for ...,"April 25, 2019 via web",,"title,link 19-1798 Permit #13756.pdf,https://s...",Lori Hernandez,"title,item,time ""Request Published Public"",,""A...",...,title \ 0...
13629,19-1933,CLOSED,"Copies of any and all contracts,service agreem...","May 6, 2019 via web",,"title,link Urban Corps Staffing for 08-07-14.p...",Lori Hernandez,"title,item,time ""Request Published Public"",,""J...",...,title \ 0 ...
15886,19-4190,CLOSED,I am requesting contracts for SDCCU Stadium th...,"September 4, 2019 via web",,,Lori Hernandez,"title,item,time ""Request Published Public"",,""S...",,title \ 0 Requ...


In [875]:
# Split the time and author from the time quote on each message
def split_time_author(df):
    if df is None:
        return None
    time_quotes = df['time'].to_numpy()
    time_author = pd.DataFrame(list(map(lambda x: x.split(' by '), time_quotes)))
    return df_fillna(df.join(
            time_author
        ).drop(
            columns='time'
        ).rename(
            columns={0: 'time', 1: 'by'}
        ))

test_df['msgs_df'] = test_df['msgs_df'].apply(split_time_author)
test_df.loc[4]['msgs_df']

Unnamed: 0,title,item,time,by
0,Request Closed Public,02. Released,"December 24, 2015, 10:34am",Ginger Rodriquez
1,Document(s) Released Public,Site Plan - 11943 El Camino Real.pdf,"December 24, 2015, 10:33am",Ginger Rodriquez
2,Request Reopened Public,,"December 24, 2015, 10:32am",Ginger Rodriquez
3,Request Closed Public,02. Released,"December 24, 2015, 8:09am",Ginger Rodriquez
4,Document(s) Released Public,15-1814 Fire Responsive.pdf,"December 24, 2015, 8:09am",Ginger Rodriquez
5,Request Reopened Public,,"December 24, 2015, 8:02am","Amanda Alvarado, Senior Clerk"
6,Request Closed Public,02. Released,"December 17, 2015, 4:00pm",Ginger Rodriquez
7,Request Reopened Public,,"December 17, 2015, 3:56pm",Ginger Rodriquez
8,Request Closed Public,02. Released,"December 17, 2015, 3:54pm",Ginger Rodriquez
9,Request Closed Public,02. Released,"December 17, 2015, 3:52pm",Ginger Rodriquez


In [879]:
# Remove empty dataframes from docs_df
remove_empty = lambda df: None if ((df is None) or df.empty) else df
test_df['docs_df'] = test_df['docs_df'].apply(remove_empty)
test_df[test_df['docs'].str.fullmatch('title,link\n')]

Unnamed: 0,id,status,desc,date,depts,docs,poc,msgs,docs_df,msgs_df
5752,17-2715,CLOSED,Project: Project Number: ...,"September 26, 2017 via email",Engineering and Capital Projects,"title,link",Jacqueline Palmer,"title,item,time ""Request Published Public"",,""O...",,title \ 0 Requ...
9395,18-2521,CLOSED,AC Water & Sewer Improvements - Group 1026 Con...,"July 3, 2018 via email",Engineering and Capital Projects,"title,link",Ginger Rodriguez,"title,item,time ""Request Published Public"",,""J...",,title \ 0 Requ...
9398,18-2524,CLOSED,Point Loma Ocean Outfall Repair Contract#: L17...,"July 9, 2018 via email",Engineering and Capital Projects,"title,link",Ginger Rodriguez,"title,item,time ""Request Published Public"",,""J...",,title \ 0 Requ...
9814,18-2940,CLOSED,"August 3, 2018  City of San Diego, CA City A...","August 3, 2018 via web",Engineering and Capital Projects,"title,link",Ginger Rodriguez,"title,item,time ""Request Published Public"",,""A...",,title \ 0 ...
9922,18-3048,CLOSED,Re: Property Condition Report Public Rec...,"August 10, 2018 via web",Development Services,"title,link",Angela Laurita,"title,item,time ""Request Published Public"",,""S...",,title \ 0 Requ...
11514,18-4642,CLOSED,Please provide all email communications (inter...,"December 11, 2018 via web",City Council Administration,"title,link",Lori Witzel,"title,item,time ""Request Published Public"",,""M...",,title \ 0 Re...
11928,19-230,CLOSED,"Any and all documents, including, but not limi...","January 16, 2019 via web",Public Utilities,"title,link",Angela Laurita,"title,item,time ""Request Published Public"",,""F...",,title \ 0 ...
12615,19-918,CLOSED,"Reference: Sewer Group 786, City of San Diego,...","February 27, 2019 via web",Engineering and Capital Projects,"title,link",Lori Hernandez,"title,item,time ""Request Published Public"",,""M...",,title \ 0 Requ...
13050,19-1354,CLOSED,May I have a copy of:  Certificate of Occupan...,"March 29, 2019 via web",Development Services,"title,link",Ginger Rodriguez,"title,item,time ""Request Published Public"",,""A...",,title \ 0 Requ...
13928,19-2232,CLOSED,Any and all records relating to 4506 Ocean Vie...,"May 24, 2019 via web",Code Enforcement,"title,link",Ginger Rodriguez,"title,item,time ""Request Published Public"",,""J...",,title \ 0 Requ...


In [811]:
# Splitting departments for easier pivoting
depts = test_df['depts'].to_numpy() # depts column
test_df_depts = test_df.join(pd.DataFrame(list(map(lambda x: x.split(', '), depts)))) # Split departments into separate columns
test_df_depts = depts_df.melt( # Melt on the individual departments
        id_vars=test_df.columns
    )[lambda df: df['value'].apply(lambda x: x is not None)].drop( # Get rid of None values
        columns='variable'
    ).rename( # Rename and reset indices
        columns={'value': 'dept'}
    ).reset_index().drop(
        columns='index'
    )
test_df_depts

Unnamed: 0,id,status,desc,date,depts,docs,poc,msgs,docs_df,msgs_df,dept
0,15-1810,CLOSED,"Notices of Violation/Notice to Comply, fire in...","December 7, 2015 via web",Code Enforcement,"title,link 5040 ShorehamPlace building permits...",Ginger Rodriguez,"title,item,time ""Request Closed Public"",02. Re...",title \ ...,title ...,Code Enforcement
1,15-1811,CLOSED,The October 2015 monthly report for SeaWorld,"December 7, 2015 via web",Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time ""Request Published Public"",,""D...",,title ...,Department of Real Estate and Airport Management
2,15-1812,CLOSED,Records related to the following BIDS: Adams ...,"December 7, 2015 via web",City Clerk,"title,link http://www.sandiego.gov/park-and-re...",Mailei Ross-Cerezo,"title,item,time ""Request Closed Public"",Still ...",...,title \ 0 R...,City Clerk
3,15-1813,CLOSED,Historical lease payments made by SeaWorld to ...,"December 7, 2015 via web",Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time ""Request Closed Public"",02. Re...",,title ...,Department of Real Estate and Airport Management
4,15-1814,CLOSED,"Open violations, variances, ordinances, approv...","December 7, 2015 via web",Code Enforcement,"title,link 15-1814 Fire Responsive.pdf,https:/...",Ginger Rodriguez,"title,item,time ""Request Closed Public"",02. Re...",title \ 0 ...,title ...,Code Enforcement
...,...,...,...,...,...,...,...,...,...,...,...
32428,20-4254,CLOSED,All emails with the attachment of the prelimin...,"September 23, 2020 via web","City Attorney, City Council District 4, Develo...","title,link ""Attachment to Email A, Forensic Re...",Angela Laurita,"title,item,time ""Request Published Public"",,""J...",...,title \ ...,Department of Finance
32429,20-4254,CLOSED,All emails with the attachment of the prelimin...,"September 23, 2020 via web","City Attorney, City Council District 4, Develo...","title,link ""Attachment to Email A, Forensic Re...",Angela Laurita,"title,item,time ""Request Published Public"",,""J...",...,title \ ...,Deputy Chief Operating Officer - Smart & Susta...
32430,20-4254,CLOSED,All emails with the attachment of the prelimin...,"September 23, 2020 via web","City Attorney, City Council District 4, Develo...","title,link ""Attachment to Email A, Forensic Re...",Angela Laurita,"title,item,time ""Request Published Public"",,""J...",...,title \ ...,Office of Sustainability
32431,20-4254,CLOSED,All emails with the attachment of the prelimin...,"September 23, 2020 via web","City Attorney, City Council District 4, Develo...","title,link ""Attachment to Email A, Forensic Re...",Angela Laurita,"title,item,time ""Request Published Public"",,""J...",...,title \ ...,Homelessness Strategies


In [833]:
test_df_depts.value_counts('id').head(20)

id
20-4254    20
17-787     11
19-5387    11
19-3612    10
19-4896     9
19-6066     9
17-929      9
19-5926     9
19-4893     8
21-132      8
20-3876     7
17-1309     7
17-2538     7
19-4888     6
20-3439     6
21-1641     6
20-1682     6
19-2651     6
17-3366     5
17-2894     5
dtype: int64

In [834]:
request_id = '"19-3612"'
test_df.query('id == ' + request_id).iloc[0]['desc']

'After reviewing the City Attorney’s April 4 memo, which interpreted the Brown Act to require City Council members to have public discussions of most SDSU-related issues, I am hopeful the City Attorney is similarly interpreting the California Public Records Act, the March 21, 2019 MOU signed by negotiating teams notwithstanding. While the MOU does create a set of documents that are not to be voluntarily shared by the negotiating teams, it obviously does not take precedence over the requirements of the CPRA. So, we’d like to request all documents shared by the negotiating teams with one another. These documents may include some non-public records, such as the appraisal, but would seem to consistent mostly of public records sent by one government agency to an entirely different government agency, which indicates public documents being shared by public officials with other public officials.\nRead more'

In [835]:
desc = 'Padres'
test_df[test_df['desc'].str.contains(desc)]

Unnamed: 0,id,status,desc,date,depts,docs,poc,msgs,docs_df,msgs_df
487,16-348,CLOSED,"Joint ballpark ownership expenses, incremental...","February 29, 2016 via web",Department of Real Estate and Airport Management,"title,link FY2014-2015 Ballpark Sales Budget 2...",Jeffrey Wallace,"title,item,time ""Request Closed Public"",""02. R...",...,title \ 0 R...
583,16-444,CLOSED,Amount of times the SDFD and EMS have been cal...,"March 11, 2016 via web",Public Records Administration,"title,link PetCo Park 911 EMS Response for Fou...",Lea Fields-Bernard,"title,item,time ""Request Closed Public"",""02. R...",...,title \ 0 R...
584,16-445,CLOSED,Padres projected 2016 capital expenditures bas...,"March 11, 2016 via web",Department of Real Estate and Airport Management,"title,link CapEx Letter 2015-2016 executed.pdf...",Jeffrey Wallace,"title,item,time ""Request Closed Public"",""02. R...",title \ 0 C...,title \ 0 R...
622,16-486,CLOSED,I am requesting an opportunity to inspect or o...,"March 16, 2016 via web",Department of Real Estate and Airport Management,"title,link 2012 Petco Park Non Baseball Events...",Jeffrey Wallace,"title,item,time ""Request Published Public"",,""M...",...,title \ 0 Requ...
648,16-513,CLOSED,-information about any time the San Diego Depa...,"March 21, 2016 via web",Public Records Administration,,Lea Fields-Bernard,"title,item,time ""Request Published Public"",,""M...",,title \ 0 Requ...
1354,16-1221,CLOSED,Any emails exchanged between the city and the ...,"June 16, 2016 via web",Public Records Administration,,Lea Fields-Bernard,"title,item,time ""Request Published Public"",,""J...",,title \ 0 Requ...
2400,16-2268,CLOSED,"All emails (dated between January 1, 2012 and ...","September 27, 2016 via web",City Council District 7,,Barrett Tetlow,"title,item,time ""Request Published Public"",,""O...",,title \ 0 Requ...
2940,16-2840,CLOSED,I am requesting as copy of the city's contract...,"December 7, 2016 via web",Public Records Administration,"title,link Ballpark Debt Service Schedule.pdf,...",Lea Fields-Bernard,"title,item,time ""Request Published Public"",,""D...",...,title \ 0 Requ...
3268,17-198,CLOSED,"All e-mails between Miguel Duran, city ballpar...","January 26, 2017 via web",,"title,link Miguel Durans Petco Ballpark Emails...",Angela Laurita,"title,item,time ""Request Published Public"",,""F...",...,title \ 0 Requ...
5564,17-2495,CLOSED,Requesting copies of public records (that are ...,"September 13, 2017 via web",,"title,link FY 2016 Capital Expenditures - Fina...",Angela Laurita,"title,item,time ""Request Published Public"",,""S...",...,title \ 0 ...


In [843]:
dates = test_df['date'].to_numpy()
test_df = test_df.join(
        pd.DataFrame(list(map(lambda x: x.split(' via '), dates)))
    ).drop(
        columns='date'
    ).rename(
        columns={0: 'date', 1: 'via'}
    )

In [850]:
test_df = test_df.convert_dtypes()
test_df.loc[0]['msgs_df']

KeyError: 0

Scraper functions:

In [733]:
def scrape_requests_sequential(requests, driver, num_requests=-1, cooldown=1, debug=0, progress=0):
    '''
    Scrapes all records on a NextRequest request database starting from the given ID and
    moving forward chronologically until the number of requests scraped reaches a given
    number. Each scraped requests is added to a given list. If num_requests is non-positive, 
    then scrape as many records as possible.
    '''
    counter = 0 # Keeps track of how many requests have been scraped
    start = timer() # Timer for progress checking purposes
    
    # Start by scraping the initial record
    
    # Only scrape a request if it was loaded properly; otherwise, stop the scraper
    if not driver.find_elements_by_class_name('nextrequest'):
        print('No requests scraped')
        return counter

    scrape_request_append(requests, driver, counter=counter, debug=debug) # Scrape request

    counter += 1
    
    # For positive num_requests, return the list of requests if the counter reaches the desired number
    if ((num_requests > 0) and (counter == num_requests)):
        if progress:
            end = timer()
            print('Total requests scraped:', counter, 
                  '\tAvg runtime:', str(round((end - start)/counter, 2)) + 's/request', 
                  '\tTotal runtime:', str(round(end - start, 1)) + 's')
            print()
            print('Last request scraped:', requests[-1]['id'])
            print()
        
        return counter

    # Show progress, if desired
    if progress and (counter % progress == 0):
        end = timer()
        print('Requests scraped:', counter, 
              '\tAvg runtime:', str(round((end - start)/counter, 2)) + 's/request', 
              '\tTotal runtime:', str(round(end - start, 1)) + 's')
    
    # Continue to scrape until the arrow to go to the next request is no longer present
    while driver.find_elements_by_class_name('js-next-request'): 
        driver.find_element_by_class_name('js-next-request').click() # Click on the arrow to navigate to the next request
        sleep(cooldown) # Cooldown between scraping attempts
        
        # Scrape request
        if not driver.find_elements_by_class_name('nextrequest'):
            break
        
        scrape_request_append(requests, driver, counter=counter, debug=debug) 
        
        counter += 1
        
        if ((num_requests > 0) and (counter == num_requests)):
            break
        
        if progress and (counter % progress == 0):
            end = timer()
            print('Requests scraped:', counter, 
                  '\tAvg runtime:', str(round((end - start)/counter, 2)) + 's/request', 
                  '\tTotal runtime:', str(round(end - start, 1)) + 's')
    
    # Final progress check
    if progress:
        end = timer()
        print('Total requests scraped:', counter, 
              '\tAvg runtime:', str(round((end - start)/counter, 2)) + 's/request', 
              '\tTotal runtime:', str(round(end - start, 1)) + 's')
        print()
        print('Last request scraped:', requests[-1]['id'])
        print()
        
    return counter

def scrape_request_append(requests, driver, counter=-1, debug=0):
    '''
    Scrapes data about a given request on a NextRequest request database, appending the result
    to the given list.
    '''
    request_id, status, desc, date, depts, docs, poc, events = [None] * 8 # Initialize variables 
    try: # Attempt to scrape relevant data
        request_id = driver.find_element_by_class_name('request-title-text').text.split()[1][1:] # Request ID
        status = driver.find_element_by_class_name('request-status-label').text.strip() # Request status
        desc = driver.find_element_by_class_name('request-text.row').text # Request description
        date = driver.find_element_by_class_name('request_date').text # Request date
        depts = driver.find_element_by_class_name('current-department').text # Department(s) assigned to the request
        poc = driver.find_element_by_class_name('request-detail').text # Person of contact

        # Documents attached to the request, if there are any
        public_docs = driver.find_element_by_id('public-docs') # Documents block
        if '(none)' not in public_docs.text: # Check for the presence of documents
            # Expand folders, if there are any
            folders = public_docs.find_elements_by_class_name('folder-toggle') 
            if folders:
                for folder in folders:
                    folder.click()

            doc_links = public_docs.find_elements_by_class_name('document-link') # Links to documents

            # DataFrame-converted-to-CSV consisting of all documents
            docs = pd.DataFrame({
                'title': get_webelement_text(doc_links),
                'link': remove_download_from_urls(get_webelement_link(doc_links))
                }).to_csv(index=False)

        # Messages recorded on the request page, if there are any
        event_history = driver.find_elements_by_class_name('generic-event') # All message blocks
        if event_history: # Check for presence of 
            num_events = len(event_history)

            # Titles, descriptions, and time strings for each message
            event_titles = [None] * num_events
            event_items = [None] * num_events
            time_quotes = [None] * num_events

            # Scrape information from each individual event
            for i in range(len(event_history)):
                event = event_history[i]

                event_title = event.find_element_by_class_name('event-title').text
                event_item = '\n'.join(get_webelement_text(event.find_elements_by_class_name('event-item'))) # Necessary to address the case where there are multiple event-items
                time_quote = event.find_element_by_class_name('time-quotes').text

                event_titles[i] = event_title
                event_items[i] = event_item
                time_quotes[i] = time_quote

            # DataFrame-converted-to-CSV consisting of all messages
            events = pd.DataFrame({ 
                'title': event_titles,
                'item': event_items,
                'time': time_quotes
                }).to_csv(index=False)
            
        # For testing purposes, print a message whenever a request is successfully scraped
        if debug:
            print(request_id, 'scraped')
    except: # If an exception occurs, print the stack trace
        print('Exception occured' + (' at count ' + str(counter + 1) if counter >= 0 else '') + ':')
        traceback.print_exc()
        print()
    finally: # Append the request to the list
        requests.append({
            'id': request_id,
            'status': status,
            'desc': desc,
            'date': date,
            'depts': depts,
            'docs': docs,
            'poc': poc,
            'msgs': events
            })

Scraper utility functions:

In [197]:
def get_city_from_url(url):
    '''
    Finds the city name from the NextRequest URL.
    '''
    return re.match(r'(?<=https://)[a-zA-Z]*', url)[0]

def get_webelement_text(webelement):
    '''
    Gets the text of each web element in a list, if such a list exists.
    '''
    return list(map(lambda x: x.text, webelement)) if webelement else []

def get_webelement_link(webelement):
    '''
    Gets the link of each web element in a list, if such a list exists.
    '''
    return list(map(lambda x: x.get_attribute('href'), webelement)) if webelement else []

def remove_download_from_urls(urls):
    '''
    Removes '/download' from the end of a list of URLs, if the list exists.
    '''
    return list(map(lambda url: re.match(r'.*(?=/download)', url)[0], urls)) if urls else []

Previous scraper functions:

In [880]:
def scrape_record(url, request_id, driver):
    '''
    Scrapes data about a given request on a NextRequest request database
    '''
    driver.get(url + request_id) # Attempt to access the record
#     timeout = 2 # Timeout length, in seconds
    
#     try:
#         WebDriverWait(driver, timeout).until(EC.visibility_of_all_elements_located((By.CLASS_NAME, 'nextrequest')))
#     except TimeoutException:
#         print(request_id, 'timed out')
#         pass
    
    if (request_id not in driver.title):
        return
    
    status, desc, date, depts, docs, poc, events = [None] * 7 # Initialize variables 
    try: # Attempt to scrape relevant data
        status = driver.find_element_by_class_name('request-status-label').text.strip() # Request status
        desc = driver.find_element_by_class_name('request-text.row').text # Request description
        date = driver.find_element_by_class_name('request_date').text # Request date
        depts = driver.find_element_by_class_name('current-department').text # Department(s) assigned to the request
        poc = driver.find_element_by_class_name('request-detail').text # Person of contact
        
        # Documents attached to the request, if there are any
        public_docs = driver.find_element_by_id('document-list') # WebElement containing the documents
        if '(none)' not in public_docs.text:
            # Expand folders, if there are any
            folders = public_docs.find_elements_by_class_name('folder-toggle') 
            if folders:
                for folder in folders:
                    folder.click()

            doc_links = public_docs.find_elements_by_class_name('document-link') # Links to documents
            
            # DataFrame consisting of all documents
            docs = pd.DataFrame({
                'title': get_webelement_text(doc_links),
                'link': remove_download_from_urls(get_webelement_link(doc_links))
                })
            
        # Messages recorded on the request page, if there are any
        event_history = driver.find_elements_by_class_name('generic-event')
        if event_history:
            num_events = len(event_history)
            
            # Titles, descriptions, and time strings for each message
            event_titles = [None] * num_events
            event_items = [None] * num_events
            time_quotes = [None] * num_events
            
            # Scrape information from each individual event
            for i in range(len(event_history)):
                event = event_history[i]
                
                event_title = event.find_element_by_class_name('event-title').text
                event_item = '\n'.join(get_webelement_text(event.find_elements_by_class_name('event-item')))
                time_quote = event.find_element_by_class_name('time-quotes').text
                
                event_titles[i] = event_title
                event_items[i] = event_item
                time_quotes[i] = time_quote
                
            # DataFrame consisting of all messages
            events = pd.DataFrame({ 
                'title': event_titles,
                'item': event_items,
                'time': time_quotes
                })
    except NoSuchElementException: # Catch exception thrown if a specific element cannot be found, and silently pass
        pass
    except: # If some other exception occurs, print information about the exception
        traceback.print_exc()
    finally: # Return the request
        return {
            'id': request_id,
            'status': status,
            'desc': desc,
            'date': date,
            'depts': depts,
            'docs': docs,
            'poc': poc,
            'msgs': events
            }
    
def scrape_record_parallel(url, request_id):
    '''
    Scraper method used for parallelization
    '''
    driver = webdriver.Firefox(options=options)
    request_info = scrape_record(url, request_id, driver)
    driver.close()
    return request_info

In [514]:
# Options for scraping
earliest_year = 16 # Earliest year to search requests for
latest_year = 21 # Latest year to search request for
id_start = 1 # ID value to start from
id_range = id_max # Number of IDs to try for each year
cooldown = 0.9 # Amount of time, in seconds, to wait between website accesses

start_id = '15-1810' # The request to start scraping from
num_requests = -1 # Number of requests to scrape
progress = 100 # Display a message every 100 requests successfully scraped

# List of all request IDs
request_ids = [str(year) + '-' + str(num) for num in range(id_start, id_range + id_start) 
                                       for year in range(earliest_year, latest_year + 1)]

# URLs to extract data from
urls = ['https://sandiego.nextrequest.com/requests/', 'https://oaklandca.nextrequest.com/requests/']

In [319]:
# Iterative script
driver = webdriver.Firefox(options=options
                          ) # Headless (non-visible) WebDriver

sd_requests = [] # List of dictionaries containing information on each request
i = 0 # Index for URLs - may be useful later for scraping multiple sites

for year in range(earliest_year, latest_year + 1):
    for num in tqdm(range(id_start, id_range + id_start)):
        # NextRequest request IDs are a two-digit year and a number, with a dash in between
        request_id = str(year) + '-' + str(num)
        
        # Scrape record
        sd_requests.append(scrape_record(urls[i], request_id, driver)) 

        # Cooldown
        sleep(cooldown)
    
    # sleep(cooldown) # Cooldown
        
driver.close()

sd_requests = [x for x in sd_requests if x['status'] is not None] # Remove entries with incomplete information
sd_requests_df = pd.DataFrame(sd_requests) # Convert to DataFrame

# Create a zipped CSV file of the data
compression_opts = dict(method='zip', archive_name='sd_requests.csv')
sd_requests_df.to_csv('data/sd_requests_2.zip', index=False, compression=compression_opts)

100%|██████████| 10/10 [00:23<00:00,  2.32s/it]


Miscellaneous tests:

In [37]:
# # Test remote WebDriver (run Selenium Grid instance locally before running this cell)
# driver = webdriver.Remote(desired_capabilities=DesiredCapabilities.FIREFOX, options=options)
# driver.get('http://www.google.com')
# driver.close()

In [None]:
# # Test to make sure the driver works
# 
# driver = webdriver.Firefox(options=options) # Instantiate a headless Firefox WebDriver
# for url in urls:
#     driver.get(url)
#     print(driver.title)
#     sleep(5)
    
# driver.close()

In [None]:
# %%timeit
# # Test for retrieving message info from a specific request
# driver = webdriver.Firefox(options=options)
# driver.get('https://sandiego.nextrequest.com/requests/21-4915')
# print(driver.title)

# event_titles = driver.find_elements_by_class_name('event-title')
# event_items = driver.find_elements_by_class_name('event-item')
# times = driver.find_elements_by_class_name('time-quotes')
# for title, item, time in list(zip(event_titles, event_items, times)):
#     print(title.text)
#     print(item.text)
#     print(time.text)
#     print()

# driver.close()

In [None]:
# # Scraping documents test
# driver = webdriver.Firefox(options=options)

# for request_id in [5369, 5313, 5374]:
#     url = 'https://sandiego.nextrequest.com/requests/21-' + str(request_id)
#     driver.get(url)

#     docs = driver.find_element_by_id('public-docs')
#     folders = docs.find_elements_by_class_name('folder-toggle')
#     if folders:
#         for folder in folders:
#             folder.click()
#     else:
#         print('No folders found for', request_id)
    
#     doc_links = docs.find_elements_by_class_name('document-link')
#     display(
#         pd.DataFrame(
#             list(zip(get_webelement_text(doc_links), remove_download_from_urls(get_webelement_link(doc_links)))),
#             columns=['title', 'link']
#         )
#     )
#     display(
#         pd.DataFrame({
#             'title': get_webelement_text(doc_links),
#             'link': remove_download_from_urls(get_webelement_link(doc_links))
#         })
#     )

# driver.close()

In [235]:
# Parallelized script
scrape_request = lambda i: scrape_record_parallel(urls[0], i)
sd_requests = Parallel(n_jobs=-1, prefer='threads', verbose=10)(delayed(scrape_request)(request_id) for request_id in request_ids)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   21.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   29.2s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   43.2s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.4min


KeyboardInterrupt: 