In [134]:
import requests
from bs4 import BeautifulSoup
import logging
import json
import sys
import pandas as pd
import numpy as np
from time import sleep
from requests.exceptions import Timeout

In [6]:
BASE_URL           = "https://sandiego.nextrequest.com"
REQUESTS_TABLE_URL = "https://sandiego.nextrequest.com/requests?requests_smart_listing[page]="
REQUEST_PAGE_URL   = "https://sandiego.nextrequest.com/requests"
PAGES_COUNT        = 682 # going past the last page just forwards back to the last one
SUCCESS_CODE       = 200

In [88]:
def getDictKeys(list_of_dicts):
    try:
        keys = set()
        for _dict in list_of_dicts:
            if (len(_dict) > 0):
                for key in _dict.keys():
                    keys.add(key)
        return keys
    except:
        logging.error(_dict)

In [8]:
def clean_empty_space(string):
    return " ".join(string.split())

In [9]:
def get_rpage_data(pra_link, failed={}):
    try:
        r_data_i = {}
        
        r_page = BeautifulSoup(requests.get(BASE_URL + pra_link).text)
        
        
        r_id = pra_link.split("/")[-1]

        r_text = clean_empty_space(r_page.find("div", {"id": "request-text"}).get_text())
    
        r_date = r_page.findAll("p", {"class": "request_date"})[0].get_text() # get the submission date
        r_date = r_date.split()
        r_channel = r_date[-1]
        r_date = " ".join(r_date[:3])
        
        r_department = r_page.findAll("p", {"class": "current-department"})[0].get_text() # get the respective department
        r_department = clean_empty_space(r_department)
        r_contact = ' '.join(r_page.findAll("div", {"class": "staff-details"})[0].text.split("Contact",1)[1].split()) # get the request contact
        
        r_events = r_page.findAll("div", {"class": "generic-event"}) # get each public event
        r_events.reverse()
        for event in r_events:
            event_title = event.findAll("div", {"class": "event-title"})[0].text
            event_text  = event.findAll("div", {"class": "event-item"})[0].text
            event_title = ' '.join(event_title.split())
            event_text  = ' '.join(event_text.split())
            if (event_title == "Request Closed Public"):
                r_data_i["Close_Date"] = event_title + " " + " ".join(event.findAll("div")[-1].get_text().split()[:3]).replace(",","")
            if(~(event_title in r_data_i.keys())):
                r_data_i[event_title] = event_text
            else:
                r_data_i[event_title] = r_data_i[event_title] + "<NEXT> " + event_text
        
        row = [r_id, r_date, r_channel, r_department, r_contact, r_data_i]
        sleep(1)
        return row
    except:
        logging.error("This request returned an error: " + pra_link + ": " + str(sys.exc_info()[0]))
        failed[pra_link] = sys.exc_info()[0]
        return []

In [26]:
def getDictKeys(list_of_dicts):
    keys = set()
    for _dict in list_of_dicts:
        if (len(_dict) > 0):
            for key in _dict[-1].keys():
                keys.add(key)
    return keys 

## Test Code Below

This pull was last run at 2p.m. Monday November 25, 2019. Around 17,000 public records request IDs acquired.

In [None]:
pra_ids = []
for i in range(1,PAGES_COUNT):
    page_request = requests.get(REQUESTS_TABLE_URL + str(i))
    
    if page_request.status_code != SUCCESS_CODE:
        logging.warning("FAILED CODE " + str(page_request.status_code) + "FOR PAGE " + REQUESTS_TABLE_URL + str(i))
        
    if (i % 100 == 0):
        print(str(i) + " PAGES COMPLETED")
        #logging.info()
        
    page_text = BeautifulSoup(page_request.text)

    request_table = page_text.find_all("table", class_="request_table")[0]
    
    for row in request_table.tbody.findAll('tr'):
        pra_ids.append(row.findAll('td')[0].a.get('href'))

In [None]:
page_request = requests.get(REQUESTS_TABLE_URL + str(680))

if page_request.status_code != SUCCESS_CODE:
    logging.warning("FAILED CODE " + str(page_request.status_code) + "FOR PAGE " + REQUESTS_TABLE_URL + str(i))
    
if (i % 100 == 0):
    print(str(i) + " PAGES COMPLETED")
    #logging.info()
    
page_text = BeautifulSoup(page_request.text)

request_table = page_text.find_all("table", class_="request_table")[0]

for row in request_table.tbody.findAll('tr'):
    pra_ids.append(row.findAll('td')[0].a.get('href'))

In [None]:
len(pra_ids)

In [None]:
#with open('pra_href_scrape.txt', 'w') as outfile:
#    json.dump(pra_ids, outfile)

## Scrape Request Page

In [None]:
# READ IN THE REQUEST IDs
with open('pra_href_scrape.txt') as json_file:
    pra_ids = json.load(json_file)

In [None]:
r_page = BeautifulSoup(requests.get(REQUEST_PAGE_URL + '/19-5666').text)

In [None]:
failed_requests = {}

In [None]:
test_get_rpage_data('/requests/18-45')

In [None]:
corpus = []

In [None]:
[corpus.append(get_rpage_data(link)) for link in pra_ids]

In [None]:
print("done")

In [None]:
len(corpus)

In [None]:
len(failed_requests)

In [None]:
failed_requests

In [None]:
#with open('pra_data.txt', 'w') as outfile:
#    json.dump(corpus, outfile)

In [None]:
failed_requests_dict = {r_id: str(_type) for (r_id, _type) in zip(failed_requests.keys(), failed_requests.values())}

In [None]:
#with open('failed_pulls_00.txt', 'w') as outfile:
#    json.dump(failed_requests_dict, outfile)

In [10]:
# READ IN THE FAILED SCRAPES
with open('failed_pulls_00.txt') as json_file:
    failed_requests = json.load(json_file)

In [11]:
new_failed = {}

In [14]:
success_requests = [get_rpage_data(pra_link, new_failed) for pra_link in failed_requests.keys()]

In [22]:
#with open('failed_pulls_success.txt', 'w') as outfile:
#    json.dump(success_requests, outfile)

# Process the Data

In [93]:
# READ IN THE FAILED SCRAPES
with open('pra_data.txt') as json_file:
    corpus = json.load(json_file)

In [94]:
# READ IN THE FAILED SCRAPES
with open('failed_pulls_success.txt') as json_file:
    corpus01 = json.load(json_file)

In [100]:
corpus = [c for c in corpus if len(c) != 0]

In [102]:
# READ in and append the newly acquired request
corpus.extend(corpus01)

In [109]:
#with open('pra_fulldata.txt', 'w') as outfile:
#    json.dump(corpus, outfile)

In [104]:
_dicts = [c[-1] for c in corpus]

In [106]:
event_names = getDictKeys(_dicts)

In [107]:
len(event_names)

12

In [111]:
print(event_names)

{'Document(s) Released Public', 'Request Closed Public', 'Department Assignment Details Public', 'Request Opened Public', 'Document(s) Released Details Public', 'Document(s) Released to Requester Details Public', 'Request Reopened Public', 'Document(s) Released to Requester Public', 'Request Closed Hide Public', 'Request Published Public', 'Close_Date', 'Department Assignment Public'}


In [None]:
# Now, I need to expand each entry into its row representaion

In [114]:
corpus[11]

['19-5556',
 'November 18, 2019',
 'email',
 'Public Records Administration',
 'Angela Laurita',
 {'Request Opened Public': 'Request received via email',
  'Department Assignment Public': 'Public Records Administration',
  'Document(s) Released Public': 'General Application.pdf',
  'Close_Date': 'Request Closed Public November 18 2019',
  'Request Closed Public': '02. Released All responsive documents have been released pursuant to the California Public Records Act.',
  'Request Published Public': ''}]

In [118]:
col_names = ["r_id", "date_submitted", "submission_method", "receiving_department", "assigned_pro"] + list(event_names)

In [119]:
col_names

['r_id',
 'date_submitted',
 'submission_method',
 'receiving_department',
 'assigned_pro',
 'Document(s) Released Public',
 'Request Closed Public',
 'Department Assignment Details Public',
 'Request Opened Public',
 'Document(s) Released Details Public',
 'Document(s) Released to Requester Details Public',
 'Request Reopened Public',
 'Document(s) Released to Requester Public',
 'Request Closed Hide Public',
 'Request Published Public',
 'Close_Date',
 'Department Assignment Public']

In [128]:
def expand_row(row, col_names):
    new_row = {k:np.nan for k in col_names}
    new_row['r_id'] = row[0]
    new_row['date_submitted'] = row[1]
    new_row['submission_method'] = row[2]
    new_row['receiving_department'] = row[3]
    new_row['assigned_pro'] = row[4]
    for key in row[5].keys():
        new_row[key] = row[5][key]
    return new_row

In [131]:
corpus = [expand_row(request, col_names) for request in corpus]

In [136]:
df = pd.DataFrame(corpus)

In [137]:
df.to_csv("request_corpus.csv")