# NextRequest data cleaning and EDA
*Author: Steven Yuan*

This notebook contains potentially useful data cleaning and EDA routines for scraped NextRequest data.

In [41]:
import pandas as pd
from io import StringIO
import zipfile

from nextrequest_eda_utils import *

In [42]:
test_df = pd.read_csv(zipfile.ZipFile('../data/sd_requests.zip', 'r').open('sd_requests.csv'))
test_df.head()

Unnamed: 0,id,status,desc,date,depts,docs,poc,msgs
0,15-1810,CLOSED,"Notices of Violation/Notice to Comply, fire in...","December 7, 2015 via web",Code Enforcement,"title,link\n5040 ShorehamPlace building permit...",Ginger Rodriguez,"title,item,time\n""Request Closed\nPublic"",02. ..."
1,15-1811,CLOSED,The October 2015 monthly report for SeaWorld,"December 7, 2015 via web",Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time\n""Request Published\nPublic"",,..."
2,15-1812,CLOSED,Records related to the following BIDS: Adams ...,"December 7, 2015 via web",City Clerk,"title,link\nhttp://www.sandiego.gov/park-and-r...",Mailei Ross-Cerezo,"title,item,time\n""Request Closed\nPublic"",Stil..."
3,15-1813,CLOSED,Historical lease payments made by SeaWorld to ...,"December 7, 2015 via web",Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time\n""Request Closed\nPublic"",02. ..."
4,15-1814,CLOSED,"Open violations, variances, ordinances, approv...","December 7, 2015 via web",Code Enforcement,"title,link\nSite Plan - 11943 El Camino Real.p...",Ginger Rodriguez,"title,item,time\n""Request Closed\nPublic"",02. ..."


In [43]:
test_df_clean = nextrequest_df_clean(test_df)
test_df_clean.head()

Unnamed: 0,id,status,desc,date,depts,docs,poc,msgs,docs_df,msgs_df
0,15-1810,CLOSED,"Notices of Violation/Notice to Comply, fire in...","December 7, 2015 via web",Code Enforcement,"title,link 5040 ShorehamPlace building permits...",Ginger Rodriguez,"title,item,time ""Request Closed Public"",02. Re...",title \ ...,title ...
1,15-1811,CLOSED,The October 2015 monthly report for SeaWorld,"December 7, 2015 via web",Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time ""Request Published Public"",,""D...",,title ...
2,15-1812,CLOSED,Records related to the following BIDS: Adams ...,"December 7, 2015 via web",City Clerk,"title,link http://www.sandiego.gov/park-and-re...",Mailei Ross-Cerezo,"title,item,time ""Request Closed Public"",Still ...",...,title \ 0 R...
3,15-1813,CLOSED,Historical lease payments made by SeaWorld to ...,"December 7, 2015 via web",Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time ""Request Closed Public"",02. Re...",,title ...
4,15-1814,CLOSED,"Open violations, variances, ordinances, approv...","December 7, 2015 via web",Code Enforcement,"title,link Site Plan - 11943 El Camino Real.pd...",Ginger Rodriguez,"title,item,time ""Request Closed Public"",02. Re...",title \ 0 ...,title ...


Other EDA stuff:

In [44]:
test_df_clean.shape[0] # Number of requests scraped

28848

In [45]:
test_df_clean.loc[4]['msgs_df']

Unnamed: 0,title,item,time
0,Request Closed Public,02. Released,"December 24, 2015, 10:34am by Ginger Rodriquez"
1,Document(s) Released Public,Site Plan - 11943 El Camino Real.pdf,"December 24, 2015, 10:33am by Ginger Rodriquez"
2,Request Reopened Public,,"December 24, 2015, 10:32am by Ginger Rodriquez"
3,Request Closed Public,02. Released,"December 24, 2015, 8:09am by Ginger Rodriquez"
4,Document(s) Released Public,15-1814 Fire Responsive.pdf,"December 24, 2015, 8:09am by Ginger Rodriquez"
5,Request Reopened Public,,"December 24, 2015, 8:02am by Amanda Alvarado, ..."
6,Request Closed Public,02. Released,"December 17, 2015, 4:00pm by Ginger Rodriquez"
7,Request Reopened Public,,"December 17, 2015, 3:56pm by Ginger Rodriquez"
8,Request Closed Public,02. Released,"December 17, 2015, 3:54pm by Ginger Rodriquez"
9,Request Closed Public,02. Released,"December 17, 2015, 3:52pm by Ginger Rodriquez"


In [16]:
# Query for info about a specific request
request_id = '"17-22"'
test_df.query('id == ' + request_id).iloc[0]['msgs_df']

Unnamed: 0,title,item,time
0,Request Published Public,,"January 16, 2017, 4:10pm"
1,Request Closed Public,02. Released All responsive documents have bee...,"January 13, 2017, 4:06pm by Ginger Rodriquez"
2,Document(s) Released Public,20160315_131321.jpg,"January 13, 2017, 4:00pm by Ginger Rodriquez"
3,Document(s) Released Public,20160315_131307.jpg,"January 13, 2017, 4:00pm by Ginger Rodriquez"
4,Document(s) Released Public,20160315_131230.jpg,"January 13, 2017, 4:00pm by Ginger Rodriquez"
...,...,...,...
1112,Document(s) Released Public,20160307_155141.jpg,"January 13, 2017, 3:54pm by Ginger Rodriquez"
1113,Document(s) Released Public,20160307_155139.jpg,"January 13, 2017, 3:54pm by Ginger Rodriquez"
1114,Document(s) Released Public,7411 Hillside Drive Photos.pdf,"January 13, 2017, 3:54pm by Ginger Rodriquez"
1115,Department Assignment Public,Code Enforcement,"January 6, 2017, 3:23pm by Lea Fields-Bernard,..."


In [17]:
# Find request descriptions with the given substring, case insensitive
desc = 'Padres'
test_df[test_df['desc'].str.contains(desc, case=False)]

Unnamed: 0,id,status,desc,date,depts,docs,poc,msgs,docs_df,msgs_df
487,16-348,CLOSED,"Joint ballpark ownership expenses, incremental...","February 29, 2016 via web",Department of Real Estate and Airport Management,"title,link 2015 JBOE Reconciliation.pdf,https:...",Jeffrey Wallace,"title,item,time ""Request Closed Public"",""02. R...",...,title \ 0 R...
583,16-444,CLOSED,Amount of times the SDFD and EMS have been cal...,"March 11, 2016 via web",Public Records Administration,"title,link PetCo Park 911 EMS Response for Fou...",Lea Fields-Bernard,"title,item,time ""Request Closed Public"",""02. R...",...,title \ 0 R...
584,16-445,CLOSED,Padres projected 2016 capital expenditures bas...,"March 11, 2016 via web",Department of Real Estate and Airport Management,"title,link CapEx Letter 2015-2016 executed.pdf...",Jeffrey Wallace,"title,item,time ""Request Closed Public"",""02. R...",title \ 0 C...,title \ 0 R...
622,16-486,CLOSED,I am requesting an opportunity to inspect or o...,"March 16, 2016 via web",Department of Real Estate and Airport Management,"title,link 2013 Petco Park Non Baseball Events...",Jeffrey Wallace,"title,item,time ""Request Published Public"",,""M...",...,title \ 0 Requ...
648,16-513,CLOSED,-information about any time the San Diego Depa...,"March 21, 2016 via web",Public Records Administration,,Lea Fields-Bernard,"title,item,time ""Request Published Public"",,""M...",,title \ 0 Requ...
1354,16-1221,CLOSED,Any emails exchanged between the city and the ...,"June 16, 2016 via web",Public Records Administration,,Lea Fields-Bernard,"title,item,time ""Request Published Public"",,""J...",,title \ 0 Requ...
2400,16-2268,CLOSED,"All emails (dated between January 1, 2012 and ...","September 27, 2016 via web",City Council District 7,,Barrett Tetlow,"title,item,time ""Request Published Public"",,""O...",,title \ 0 Requ...
2940,16-2840,CLOSED,I am requesting as copy of the city's contract...,"December 7, 2016 via web",Public Records Administration,"title,link Ballpark Debt Service Schedule.pdf,...",Lea Fields-Bernard,"title,item,time ""Request Published Public"",,""D...",...,title \ 0 Requ...
3268,17-198,CLOSED,"All e-mails between Miguel Duran, city ballpar...","January 26, 2017 via web",,"title,link Miguel Durans Petco Ballpark Emails...",Angela Laurita,"title,item,time ""Request Published Public"",,""F...",...,title \ 0 Requ...
5564,17-2495,CLOSED,Requesting copies of public records (that are ...,"September 13, 2017 via web",,"title,link FY 2016 Capital Expenditures - Fina...",Angela Laurita,"title,item,time ""Request Published Public"",,""S...",...,title \ 0 ...


In [46]:
# Which requests had the longest message history? (Useful for finding worst-case scenarios for the scraper)
long_msgs = test_df_clean['msgs_df'].apply(lambda df: df.shape[0] if df is not None else 0).sort_values(ascending=False)
long_msgs.head(10)

3092    1117
2495     814
1721     313
676      174
2562     160
3109     153
6674     145
3971     142
6727     130
5895     126
Name: msgs_df, dtype: int64

In [None]:
# Sort requests by message history length
requests_long_msg = test_df_clean.loc[long_msgs.index]
requests_long_msg.head(20)

In [39]:
test_df_depts.value_counts('dept')[lambda x: x.index.str.contains('Police', case=False)]

dept
Police                            6886
Commission on Police Practices      51
dtype: int64

In [26]:
# Find requests whose department(s) contain the given substring, case insensitive
dept = 'Chief Operating Officer'
test_df_clean[lambda df: df['depts'].str.contains(dept, case=False)]

Unnamed: 0,id,status,desc,depts,docs,poc,msgs,docs_df,msgs_df,date,via
30,15-1840,CLOSED,Writings and communications related to Sai Kir...,Deputy Chief Operating Officer - Infrastructur...,"title,link Letter to Finch (CCPRA 2015-1840).p...",Jacqueline Palmer,"title,item,time ""Request Closed Hide Public""...",...,title \ 0 Request ...,"December 9, 2015",web
351,16-212,CLOSED,Records relating to SAP software agreement and...,Deputy Chief Operating Officer - Infrastructur...,,Erin Noel,"title,item,time ""Request Closed Public"",""02. R...",,title \ 0 R...,"February 8, 2016",web
3397,17-327,CLOSED,Location: Intersection of Canon St. & Catalina...,Deputy Chief Operating Officer - Infrastructur...,,Travis Brady,"title,item,time ""Request Published Public"",,""F...",,title \ 0 Requ...,"February 9, 2017",web
3730,17-660,CLOSED,1. Please provide a copy of the executed ...,"Information Technology, Deputy Chief Operating...","title,link 17-660 Email Results_Redacted.pdf,h...",Jacqueline Palmer,"title,item,time ""Request Published Public"",,""J...",...,title \ 0 Re...,"March 20, 2017",web
3731,17-661,CLOSED,Please provide all conflict of interest disclo...,Deputy Chief Operating Officer - Infrastructur...,"title,link H166584 Exhibit H Signed_02-29-16.p...",Travis Brady,"title,item,time ""Request Published Public"",,""A...",...,title \ 0 Re...,"March 20, 2017",web
...,...,...,...,...,...,...,...,...,...,...,...
25496,21-1956,CLOSED,"Hello, Pursuant to the California Public Recor...",Office of the Chief Operating Officer,"title,link Pages from 20-296_COO Emails re 101...",Angela Laurita,"title,item,time ""Request Published Public"",,""M...",...,title \ 0 Requ...,"April 16, 2021",web
25701,21-2169,CLOSED,"Hello, Pursuant to the California Public Recor...",Office of the Chief Operating Officer,"title,link 21-2169_RVilla Emails_09-2016_Redac...",Angela Laurita,"title,item,time ""Request Published Public"",,""J...",title ...,title \ 0 Requ...,"April 26, 2021",web
25870,21-2338,CLOSED,"Hello, Pursuant to the California Public Recor...","Mayor, Office of the Chief Operating Officer",,Angela Laurita,"title,item,time ""Request Published Public"",,""M...",,title \ 0 Requ...,"May 2, 2021",web
27701,21-4226,CLOSED,"Hello, Pursuant to the California Public Reco...",Office of the Chief Operating Officer,"title,link PRAR 21-4226 - KM Calendar_Redacted...",Angela Laurita,"title,item,time ""Request Published Public"",,""S...",title \ ...,title \ 0 Requ...,"August 4, 2021",web


In [51]:
def get_open_time(msgs):
    if msgs is None:
        return None
    
    request_published = msgs[lambda df: df['title'].str.contains('Opened')].sort_values(by='time_dt', ignore_index=True)
    return request_published.loc[0]['time_dt']

test_df['open_time'] = test_df['msgs_df'].apply(get_submission_time)
test_df.head()

KeyError: 0

In [50]:
test_df.value_counts('via')

via
web      26220
email     1855
mail       517
phone       99
fax         46
dtype: int64

In [30]:
# TODO: Extract department assignment additions and removals based on message history
def find_dept_assign(msgs):
    dept_assign = msgs[lambda df: df['title'].str.contains('Department')]
    
    dept_assign_added = dept_assign[lambda df: df['item'].str.contains('Added:')]
    dept_assign_removed = dept_assign[lambda df: df['item'].str.contains('Removed:')]

test_df.loc[1]['msgs_df']

Unnamed: 0,title,item,time,by,time_dt
0,Request Published Public,,"December 16, 2015, 9:54am",Jeffrey Wallace,2015-12-16 09:54:00
1,Request Closed Public,02. Released,"December 15, 2015, 7:12am",Jeffrey Wallace,2015-12-15 07:12:00
2,Department Assignment Public,Real Estate Assets,"December 7, 2015, 5:22pm",,2015-12-07 17:22:00
3,Request Opened Public,Request received via web,"December 7, 2015, 5:22pm","Lea Fields-Bernard, Public Records Administrat...",2015-12-07 17:22:00
4,Request Published Public,,"December 7, 2015, 5:22pm",,2015-12-07 17:22:00
