# NextRequest data cleaning and EDA
*Author: Steven Yuan*

This notebook contains potentially useful data cleaning and EDA routines for scraped NextRequest data.

In [2]:
import pandas as pd
from io import StringIO
import zipfile

In [3]:
test_df = pd.read_csv(zipfile.ZipFile('../data/sd_requests.zip', 'r').open('sd_requests.csv'))
test_df.head()

Unnamed: 0,id,status,desc,date,depts,docs,poc,msgs
0,15-1810,CLOSED,"Notices of Violation/Notice to Comply, fire in...","December 7, 2015 via web",Code Enforcement,"title,link\n5040 ShorehamPlace building permit...",Ginger Rodriguez,"title,item,time\n""Request Closed\nPublic"",02. ..."
1,15-1811,CLOSED,The October 2015 monthly report for SeaWorld,"December 7, 2015 via web",Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time\n""Request Published\nPublic"",,..."
2,15-1812,CLOSED,Records related to the following BIDS: Adams ...,"December 7, 2015 via web",City Clerk,"title,link\nhttp://www.sandiego.gov/park-and-r...",Mailei Ross-Cerezo,"title,item,time\n""Request Closed\nPublic"",Stil..."
3,15-1813,CLOSED,Historical lease payments made by SeaWorld to ...,"December 7, 2015 via web",Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time\n""Request Closed\nPublic"",02. ..."
4,15-1814,CLOSED,"Open violations, variances, ordinances, approv...","December 7, 2015 via web",Code Enforcement,"title,link\nSite Plan - 11943 El Camino Real.p...",Ginger Rodriguez,"title,item,time\n""Request Closed\nPublic"",02. ..."


The following process converts the CSV strings in the `docs` and `msgs` columns into DataFrames:

In [3]:
df_fillna = lambda df: df.convert_dtypes().fillna('') if df is not None else None
test_df = df_fillna(test_df)
test_df

Unnamed: 0,id,status,desc,date,depts,docs,poc,msgs
0,15-1810,CLOSED,"Notices of Violation/Notice to Comply, fire in...","December 7, 2015 via web",Code Enforcement,"title,link 5040 ShorehamPlace building permits...",Ginger Rodriguez,"title,item,time ""Request Closed Public"",02. Re..."
1,15-1811,CLOSED,The October 2015 monthly report for SeaWorld,"December 7, 2015 via web",Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time ""Request Published Public"",,""D..."
2,15-1812,CLOSED,Records related to the following BIDS: Adams ...,"December 7, 2015 via web",City Clerk,"title,link http://www.sandiego.gov/park-and-re...",Mailei Ross-Cerezo,"title,item,time ""Request Closed Public"",Still ..."
3,15-1813,CLOSED,Historical lease payments made by SeaWorld to ...,"December 7, 2015 via web",Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time ""Request Closed Public"",02. Re..."
4,15-1814,CLOSED,"Open violations, variances, ordinances, approv...","December 7, 2015 via web",Code Enforcement,"title,link Site Plan - 11943 El Camino Real.pd...",Ginger Rodriguez,"title,item,time ""Request Closed Public"",02. Re..."
...,...,...,...,...,...,...,...,...
28843,21-5579,CLOSED,"Hello, I would like a copy of the report from ...","October 28, 2021 via web",Animal Services,,Lori Hernandez,"title,item,time ""Request Published Public"",,""N..."
28844,21-5581,CLOSED,Requesting incident report and photos for San ...,"October 28, 2021 via web",Animal Services,,Lori Hernandez,"title,item,time ""Request Published Public"",,""N..."
28845,21-5584,CLOSED,request for call for service 2110020816,"October 28, 2021 via web",Police,"title,link 2110020816_Redacted.pdf,https://san...",Lori Hernandez,"title,item,time ""Request Published Public"",,""N..."
28846,21-5588,CLOSED,request for call for service E20050048015,"October 28, 2021 via web",Police,"title,link E20050048015_Redacted.pdf,https://s...",Lori Hernandez,"title,item,time ""Request Published Public"",,""N..."


In [4]:
csv_to_df = lambda csv: pd.read_csv(StringIO(csv)) if csv else None
test_df['docs_df'] = test_df['docs'].apply(csv_to_df)
test_df['msgs_df'] = test_df['msgs'].apply(csv_to_df)
test_df.head()

Unnamed: 0,id,status,desc,date,depts,docs,poc,msgs,docs_df,msgs_df
0,15-1810,CLOSED,"Notices of Violation/Notice to Comply, fire in...","December 7, 2015 via web",Code Enforcement,"title,link 5040 ShorehamPlace building permits...",Ginger Rodriguez,"title,item,time ""Request Closed Public"",02. Re...",title \ ...,title ...
1,15-1811,CLOSED,The October 2015 monthly report for SeaWorld,"December 7, 2015 via web",Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time ""Request Published Public"",,""D...",,title ...
2,15-1812,CLOSED,Records related to the following BIDS: Adams ...,"December 7, 2015 via web",City Clerk,"title,link http://www.sandiego.gov/park-and-re...",Mailei Ross-Cerezo,"title,item,time ""Request Closed Public"",Still ...",...,title \ 0 ...
3,15-1813,CLOSED,Historical lease payments made by SeaWorld to ...,"December 7, 2015 via web",Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time ""Request Closed Public"",02. Re...",,title ...
4,15-1814,CLOSED,"Open violations, variances, ordinances, approv...","December 7, 2015 via web",Code Enforcement,"title,link Site Plan - 11943 El Camino Real.pd...",Ginger Rodriguez,"title,item,time ""Request Closed Public"",02. Re...",title \ 0 ...,title ...


Then, we fill the NA values in the individual `docs` and `msgs` DataFrames:

In [5]:
test_df['docs_df'] = test_df['docs_df'].apply(df_fillna)
test_df['msgs_df'] = test_df['msgs_df'].apply(df_fillna)
test_df.loc[4]['msgs_df']

Unnamed: 0,title,item,time
0,Request Closed Public,02. Released,"December 24, 2015, 10:34am by Ginger Rodriquez"
1,Document(s) Released Public,Site Plan - 11943 El Camino Real.pdf,"December 24, 2015, 10:33am by Ginger Rodriquez"
2,Request Reopened Public,,"December 24, 2015, 10:32am by Ginger Rodriquez"
3,Request Closed Public,02. Released,"December 24, 2015, 8:09am by Ginger Rodriquez"
4,Document(s) Released Public,15-1814 Fire Responsive.pdf,"December 24, 2015, 8:09am by Ginger Rodriquez"
5,Request Reopened Public,,"December 24, 2015, 8:02am by Amanda Alvarado, ..."
6,Request Closed Public,02. Released,"December 17, 2015, 4:00pm by Ginger Rodriquez"
7,Request Reopened Public,,"December 17, 2015, 3:56pm by Ginger Rodriquez"
8,Request Closed Public,02. Released,"December 17, 2015, 3:54pm by Ginger Rodriquez"
9,Request Closed Public,02. Released,"December 17, 2015, 3:52pm by Ginger Rodriquez"


Other EDA stuff:

In [None]:
test_df.head()

In [None]:
test_df.shape[0] # Number of requests scraped

In [10]:
test_df[test_df['desc'].str.contains('Read more')] # Check if the descriptions were properly scraped

Unnamed: 0,id,status,desc,date,depts,docs,poc,msgs,docs_df,msgs_df


In [11]:
empty_desc = test_df.query('desc == ""')
empty_desc

Unnamed: 0,id,status,desc,date,depts,docs,poc,msgs,docs_df,msgs_df
227,16-88,CLOSED,,"January 20, 2016 via web",Public Records Administration,,Lea Fields-Bernard,"title,item,time ""Request Closed Public"",""Not a...",,title \ 0 R...


In [12]:
# Check for empty depts field
empty_depts = test_df.query('depts == ""')
empty_depts

Unnamed: 0,id,status,desc,date,depts,docs,poc,msgs,docs_df,msgs_df
32,15-1842,CLOSED,"Most recent ""Statement of Acccount-Unpaid Chec...","December 10, 2015 via web",,,Elena Perez,"title,item,time ""Request Closed Public"",No res...",,title ...
153,16-14,CLOSED,"Accounting of uncashed checks/warrants, tax ov...","January 5, 2016 via web",,"title,link http://www.sandiego.gov/comptroller...",Doug Enger,"title,item,time ""Request Closed Hide Public""...",...,title \ 0 Request ...
184,16-45,CLOSED,"Payments made in the amount of $10,000 or more...","January 12, 2016 via web",,"title,link PRA CCPRA2016-0045 1-21-2016.xlsx,h...",Elena Perez,"title,item,time ""Request Closed Public"",""02. R...",title \ 0 PRA...,title \ 0 R...
267,16-128,CLOSED,Unclaimed warrants,"January 26, 2016 via web",,,Elena Perez,"title,item,time ""Request Closed Public"",""02. R...",,title \ 0 R...
306,16-167,CLOSED,List of outstanding checks,"February 1, 2016 via web",,"title,link Outstanding Checks 1-19-2016.htm,ht...",Elena Perez,"title,item,time ""Request Closed Public"",""02. R...",title \ 0 Outs...,title \ 0 R...
...,...,...,...,...,...,...,...,...,...,...
12367,19-669,CLOSED,Can the City please provide me with a copy of ...,"February 12, 2019 via web",,"title,link 8531950_44131.pdf,https://sandiego....",Lori Hernandez,"title,item,time ""Request Published Public"",,""F...",title ...,title \ 0 Requ...
13494,19-1798,CLOSED,Please produce the following: Application for ...,"April 25, 2019 via web",,"title,link 19-1798 Permit #13756.pdf,https://s...",Lori Hernandez,"title,item,time ""Request Published Public"",,""A...",...,title \ 0...
13629,19-1933,CLOSED,"Copies of any and all contracts,service agreem...","May 6, 2019 via web",,"title,link Urban Corps Staffing for 08-07-14.p...",Lori Hernandez,"title,item,time ""Request Published Public"",,""J...",...,title \ 0 ...
15886,19-4190,CLOSED,I am requesting contracts for SDCCU Stadium th...,"September 4, 2019 via web",,,Lori Hernandez,"title,item,time ""Request Published Public"",,""S...",,title \ 0 Requ...


In [13]:
# Check for empty docs field
empty_docs = test_df[test_df['docs'].str.fullmatch('title,link\n')]
empty_docs

Unnamed: 0,id,status,desc,date,depts,docs,poc,msgs,docs_df,msgs_df
6659,17-3623,CLOSED,Dear Records Officer: I would like to request ...,"December 8, 2017 via web",Engineering and Capital Projects,"title,link",Jacqueline Palmer,"title,item,time ""Request Published Public"",,""D...","Empty DataFrame Columns: [title, link] Index: []",title \ 0 ...
7533,18-658,CLOSED,All emails sent by Council member Barbara Bry ...,"February 16, 2018 via web",City Council District 1,"title,link",Steven Hadley,"title,item,time ""Department Assignment Public""...","Empty DataFrame Columns: [title, link] Index: []",title \ 0 Departme...
9814,18-2940,CLOSED,"August 3, 2018  City of San Diego, CA City A...","August 3, 2018 via web",Engineering and Capital Projects,"title,link",Ginger Rodriguez,"title,item,time ""Request Published Public"",,""A...","Empty DataFrame Columns: [title, link] Index: []",title \ 0 ...
10883,18-4010,CLOSED,"Please provide copies of PID, PAD, PCD, NUP, P...","October 25, 2018 via web",Development Services,"title,link",Ginger Rodriguez,"title,item,time ""Request Published Public"",,""N...","Empty DataFrame Columns: [title, link] Index: []",title \ 0 ...
10894,18-4021,CLOSED,"October 22, 2018 To whom it may concern: Und...","October 25, 2018 via web","Police, Public Records Administration","title,link",Angela Laurita,"title,item,time ""Request Published Public"",,""N...","Empty DataFrame Columns: [title, link] Index: []",title \ 0 Requ...
...,...,...,...,...,...,...,...,...,...,...
28633,21-5230,CLOSED,Please provide any insurance information regar...,"October 6, 2021 via web",Development Services,"title,link",Ginger Rodriguez,"title,item,time ""Request Published Public"",,""O...","Empty DataFrame Columns: [title, link] Index: []",title \ 0 Requ...
28660,21-5261,CLOSED,request for 2 call for service reports 211000...,"October 7, 2021 via web",Police,"title,link",Lori Hernandez,"title,item,time ""Request Published Public"",,""O...","Empty DataFrame Columns: [title, link] Index: []",title \ 0 Requ...
28769,21-5392,CLOSED,request for 4 call for services 21100008435 2...,"October 15, 2021 via web",Police,"title,link",Angela Laurita,"title,item,time ""Request Published Public"",,""O...","Empty DataFrame Columns: [title, link] Index: []",title \ 0 Requ...
28770,21-5394,CLOSED,I'm looking for code enforcement information o...,"October 16, 2021 via web",Code Enforcement,"title,link",Ginger Rodriguez,"title,item,time ""Request Published Public"",,""O...","Empty DataFrame Columns: [title, link] Index: []",title \ 0 Requ...


In [14]:
# Which requests had the longest message history? (Useful for finding worst-case scenarios for the scraper)
long_msgs = test_df['msgs_df'].apply(lambda df: df.shape[0] if df is not None else 0).sort_values(ascending=False)
long_msgs.head(10)

3092    1117
2495     814
1721     313
676      174
2562     160
3109     153
6674     145
3971     142
6727     130
5895     126
Name: msgs_df, dtype: int64

In [15]:
# Sort requests by message history length
requests_long_msg = test_df.loc[long_msgs.index]
requests_long_msg.head(20)

Unnamed: 0,id,status,desc,date,depts,docs,poc,msgs,docs_df,msgs_df
3092,17-22,CLOSED,I would like to review the photographs in File...,"January 4, 2017 via web",Code Enforcement,"title,link P3090002.JPG,https://sandiego.nextr...",Ginger Rodriguez,"title,item,time ""Request Published Public"",,""J...",title \ 0 ...,title \ 0 ...
2495,16-2363,CLOSED,Pursuant to the California Public Records Ac...,"October 7, 2016 via web",Public Records Administration,"title,link Various Emails (8).pdf,https://sand...",Lea Fields-Bernard,"title,item,time ""Request Published Public"",,""J...",...,title \ 0 ...
1721,16-1588,CLOSED,The San Diego Police Department’s “Policies an...,"July 29, 2016 via web",Public Records Administration,"title,link DP 809.pdf,https://sandiego.nextreq...",Lea Fields-Bernard,"title,item,time ""Request Published Public"",,""S...",...,title \ 0 ...
676,16-541,CLOSED,This is a request for: 1. all inspection and e...,"March 24, 2016 via web",Public Records Administration,"title,link IMG_4868.pdf,https://sandiego.nextr...",Lea Fields-Bernard,"title,item,time ""Request Published Public"",,""M...",tit...,title \ 0 ...
2562,16-2430,CLOSED,This is a request under the California Public ...,"October 14, 2016 via web",Public Records Administration,"title,link 6. Council Resolutions_R-310557 7-6...",Lea Fields-Bernard,"title,item,time ""Request Published Public"",,""N...",...,title \ 0 ...
3109,17-39,CLOSED,"Good afternoon, I'm requesting all appointment...","January 5, 2017 via web",City Council Administration,"title,link 2016 Calendar MK.pdf,https://sandie...",Lori Witzel,"title,item,time ""Document(s) Released Detail...",...,title \ ...
6674,17-3638,CLOSED,We are requesting that you produce all record...,"December 5, 2017 via email",City Attorney,,Nancy Shapiro,"title,item,time ""Request Published Public"",,""D...",,title ...
3971,17-901,CLOSED,Re residential real property at 1867 Irving Av...,"April 12, 2017 via web","Code Enforcement, Development Services","title,link 030.JPG,https://sandiego.nextreques...",Angela Laurita,"title,item,time ""Request Closed Public"",""Reque...",...,title \ 0 ...
6727,17-3691,CLOSED,I am requesting copies of the following public...,"December 14, 2017 via email",Personnel,,Yajaira Gharst,"title,item,time ""Request Published Public"",,""D...",,title ...
5895,17-2858,CLOSED,Records related to the Linda Vista skatepark p...,"October 3, 2017 via mail","Equal Opportunity Contracting, Engineering and...","title,link Letter of Complaint - 3-D.pdf,https...",Jacqueline Palmer,"title,item,time ""Request Published Public"",,""D...",...,title \ 0 ...


In [16]:
# Query for info about a specific request
request_id = '"17-22"'
test_df.query('id == ' + request_id).iloc[0]['msgs_df']

Unnamed: 0,title,item,time
0,Request Published Public,,"January 16, 2017, 4:10pm"
1,Request Closed Public,02. Released All responsive documents have bee...,"January 13, 2017, 4:06pm by Ginger Rodriquez"
2,Document(s) Released Public,20160315_131321.jpg,"January 13, 2017, 4:00pm by Ginger Rodriquez"
3,Document(s) Released Public,20160315_131307.jpg,"January 13, 2017, 4:00pm by Ginger Rodriquez"
4,Document(s) Released Public,20160315_131230.jpg,"January 13, 2017, 4:00pm by Ginger Rodriquez"
...,...,...,...
1112,Document(s) Released Public,20160307_155141.jpg,"January 13, 2017, 3:54pm by Ginger Rodriquez"
1113,Document(s) Released Public,20160307_155139.jpg,"January 13, 2017, 3:54pm by Ginger Rodriquez"
1114,Document(s) Released Public,7411 Hillside Drive Photos.pdf,"January 13, 2017, 3:54pm by Ginger Rodriquez"
1115,Department Assignment Public,Code Enforcement,"January 6, 2017, 3:23pm by Lea Fields-Bernard,..."


In [17]:
# Find request descriptions with the given substring, case insensitive
desc = 'Padres'
test_df[test_df['desc'].str.contains(desc, case=False)]

Unnamed: 0,id,status,desc,date,depts,docs,poc,msgs,docs_df,msgs_df
487,16-348,CLOSED,"Joint ballpark ownership expenses, incremental...","February 29, 2016 via web",Department of Real Estate and Airport Management,"title,link 2015 JBOE Reconciliation.pdf,https:...",Jeffrey Wallace,"title,item,time ""Request Closed Public"",""02. R...",...,title \ 0 R...
583,16-444,CLOSED,Amount of times the SDFD and EMS have been cal...,"March 11, 2016 via web",Public Records Administration,"title,link PetCo Park 911 EMS Response for Fou...",Lea Fields-Bernard,"title,item,time ""Request Closed Public"",""02. R...",...,title \ 0 R...
584,16-445,CLOSED,Padres projected 2016 capital expenditures bas...,"March 11, 2016 via web",Department of Real Estate and Airport Management,"title,link CapEx Letter 2015-2016 executed.pdf...",Jeffrey Wallace,"title,item,time ""Request Closed Public"",""02. R...",title \ 0 C...,title \ 0 R...
622,16-486,CLOSED,I am requesting an opportunity to inspect or o...,"March 16, 2016 via web",Department of Real Estate and Airport Management,"title,link 2013 Petco Park Non Baseball Events...",Jeffrey Wallace,"title,item,time ""Request Published Public"",,""M...",...,title \ 0 Requ...
648,16-513,CLOSED,-information about any time the San Diego Depa...,"March 21, 2016 via web",Public Records Administration,,Lea Fields-Bernard,"title,item,time ""Request Published Public"",,""M...",,title \ 0 Requ...
1354,16-1221,CLOSED,Any emails exchanged between the city and the ...,"June 16, 2016 via web",Public Records Administration,,Lea Fields-Bernard,"title,item,time ""Request Published Public"",,""J...",,title \ 0 Requ...
2400,16-2268,CLOSED,"All emails (dated between January 1, 2012 and ...","September 27, 2016 via web",City Council District 7,,Barrett Tetlow,"title,item,time ""Request Published Public"",,""O...",,title \ 0 Requ...
2940,16-2840,CLOSED,I am requesting as copy of the city's contract...,"December 7, 2016 via web",Public Records Administration,"title,link Ballpark Debt Service Schedule.pdf,...",Lea Fields-Bernard,"title,item,time ""Request Published Public"",,""D...",...,title \ 0 Requ...
3268,17-198,CLOSED,"All e-mails between Miguel Duran, city ballpar...","January 26, 2017 via web",,"title,link Miguel Durans Petco Ballpark Emails...",Angela Laurita,"title,item,time ""Request Published Public"",,""F...",...,title \ 0 Requ...
5564,17-2495,CLOSED,Requesting copies of public records (that are ...,"September 13, 2017 via web",,"title,link FY 2016 Capital Expenditures - Fina...",Angela Laurita,"title,item,time ""Request Published Public"",,""S...",...,title \ 0 ...


In [18]:
# Find requests whose department(s) contain the given substring, case insensitive
dept = 'Police'
test_df[test_df['depts'].str.contains(dept, case=False)]

Unnamed: 0,id,status,desc,date,depts,docs,poc,msgs,docs_df,msgs_df
7,15-1817,CLOSED,File materials related to: P12010041171 P12050...,"December 7, 2015 via web",Police,,Humberto Hernandez,"title,item,time ""Request Closed Public"",02c. R...",,title ...
49,15-1859,CLOSED,All records related to the demographic study c...,"December 11, 2015 via web",Police,,Humberto Hernandez,"title,item,time ""Request Closed Hide Public""...",,title \ 0 Request ...
65,15-1875,CLOSED,Reports related to incident of 11/11/2015 wher...,"December 15, 2015 via web",Police,,Humberto Hernandez,"title,item,time ""Request Closed Public"",""02. R...",,title \ 0 R...
81,15-1891,CLOSED,Communications by the City of San Diego regard...,"December 17, 2015 via web",Police,,Humberto Hernandez,"title,item,time ""Request Closed Public"",Still ...",,title ...
121,15-1931,CLOSED,"Vehicle Stop Data for period September 1, 2015...","December 28, 2015 via web",Police,,Humberto Hernandez,"title,item,time ""Request Closed Public"",Still ...",,title \ 0 R...
...,...,...,...,...,...,...,...,...,...,...
28838,21-5544,CLOSED,Dan you tell me when San Diego Police Officer ...,"October 26, 2021 via web",Police,,Angela Laurita,"title,item,time ""Request Published Public"",,""N...",,title \ 0 Requ...
28839,21-5550,CLOSED,I request any police reports involving inciden...,"October 27, 2021 via web",Police,,Angela Laurita,"title,item,time ""Request Published Public"",,""O...",,title \ 0 Requ...
28840,21-5552,CLOSED,The request is for the time of the towing call...,"October 27, 2021 via web",Police,,Angela Laurita,"title,item,time ""Request Published Public"",,""O...",,title \ 0 Requ...
28845,21-5584,CLOSED,request for call for service 2110020816,"October 28, 2021 via web",Police,"title,link 2110020816_Redacted.pdf,https://san...",Lori Hernandez,"title,item,time ""Request Published Public"",,""N...",title ...,title \ 0 Requ...


In [6]:
# Convert empty dataframes from docs_df into None
remove_empty = lambda df: None if ((df is None) or (type(df) == str and not df) or df.empty) else df
test_df['docs_df'] = test_df['docs_df'].apply(remove_empty)
test_df[test_df['docs'].str.fullmatch('title,link\n')]

Unnamed: 0,id,status,desc,date,depts,docs,poc,msgs,docs_df,msgs_df
6659,17-3623,CLOSED,Dear Records Officer: I would like to request ...,"December 8, 2017 via web",Engineering and Capital Projects,"title,link",Jacqueline Palmer,"title,item,time ""Request Published Public"",,""D...",,title \ 0 ...
7533,18-658,CLOSED,All emails sent by Council member Barbara Bry ...,"February 16, 2018 via web",City Council District 1,"title,link",Steven Hadley,"title,item,time ""Department Assignment Public""...",,title \ 0 Departme...
9814,18-2940,CLOSED,"August 3, 2018  City of San Diego, CA City A...","August 3, 2018 via web",Engineering and Capital Projects,"title,link",Ginger Rodriguez,"title,item,time ""Request Published Public"",,""A...",,title \ 0 ...
10883,18-4010,CLOSED,"Please provide copies of PID, PAD, PCD, NUP, P...","October 25, 2018 via web",Development Services,"title,link",Ginger Rodriguez,"title,item,time ""Request Published Public"",,""N...",,title \ 0 ...
10894,18-4021,CLOSED,"October 22, 2018 To whom it may concern: Und...","October 25, 2018 via web","Police, Public Records Administration","title,link",Angela Laurita,"title,item,time ""Request Published Public"",,""N...",,title \ 0 Requ...
...,...,...,...,...,...,...,...,...,...,...
28633,21-5230,CLOSED,Please provide any insurance information regar...,"October 6, 2021 via web",Development Services,"title,link",Ginger Rodriguez,"title,item,time ""Request Published Public"",,""O...",,title \ 0 Requ...
28660,21-5261,CLOSED,request for 2 call for service reports 211000...,"October 7, 2021 via web",Police,"title,link",Lori Hernandez,"title,item,time ""Request Published Public"",,""O...",,title \ 0 Requ...
28769,21-5392,CLOSED,request for 4 call for services 21100008435 2...,"October 15, 2021 via web",Police,"title,link",Angela Laurita,"title,item,time ""Request Published Public"",,""O...",,title \ 0 Requ...
28770,21-5394,CLOSED,I'm looking for code enforcement information o...,"October 16, 2021 via web",Code Enforcement,"title,link",Ginger Rodriguez,"title,item,time ""Request Published Public"",,""O...",,title \ 0 Requ...


In [7]:
# Split the date and request method from the date column
dates = test_df['date'].to_numpy()
test_df = test_df.join(
        pd.DataFrame(list(map(lambda x: x.split(' via '), dates)))
    ).drop(
        columns='date'
    ).rename(
        columns={0: 'date', 1: 'via'}
    ).convert_dtypes()
test_df.head()

Unnamed: 0,id,status,desc,depts,docs,poc,msgs,docs_df,msgs_df,date,via
0,15-1810,CLOSED,"Notices of Violation/Notice to Comply, fire in...",Code Enforcement,"title,link 5040 ShorehamPlace building permits...",Ginger Rodriguez,"title,item,time ""Request Closed Public"",02. Re...",title \ ...,title ...,"December 7, 2015",web
1,15-1811,CLOSED,The October 2015 monthly report for SeaWorld,Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time ""Request Published Public"",,""D...",,title ...,"December 7, 2015",web
2,15-1812,CLOSED,Records related to the following BIDS: Adams ...,City Clerk,"title,link http://www.sandiego.gov/park-and-re...",Mailei Ross-Cerezo,"title,item,time ""Request Closed Public"",Still ...",...,title \ 0 R...,"December 7, 2015",web
3,15-1813,CLOSED,Historical lease payments made by SeaWorld to ...,Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time ""Request Closed Public"",02. Re...",,title ...,"December 7, 2015",web
4,15-1814,CLOSED,"Open violations, variances, ordinances, approv...",Code Enforcement,"title,link Site Plan - 11943 El Camino Real.pd...",Ginger Rodriguez,"title,item,time ""Request Closed Public"",02. Re...",title \ 0 ...,title ...,"December 7, 2015",web


In [8]:
# Split the time and author from the time quote on each message
def split_time_author(msgs):
    if msgs is None:
        return None
    time_quotes = msgs['time'].to_numpy()
    time_author = pd.DataFrame(list(map(lambda x: x.split(' by '), time_quotes)))
    # dataset.str.split(expand=True)
    return df_fillna(msgs.join(
            time_author
        ).drop(
            columns='time'
        ).rename(
            columns={0: 'time', 1: 'by'}
        ))

test_df['msgs_df'] = test_df['msgs_df'].apply(split_time_author)
test_df.loc[4]['msgs_df']

Unnamed: 0,title,item,time,by
0,Request Closed Public,02. Released,"December 24, 2015, 10:34am",Ginger Rodriquez
1,Document(s) Released Public,Site Plan - 11943 El Camino Real.pdf,"December 24, 2015, 10:33am",Ginger Rodriquez
2,Request Reopened Public,,"December 24, 2015, 10:32am",Ginger Rodriquez
3,Request Closed Public,02. Released,"December 24, 2015, 8:09am",Ginger Rodriquez
4,Document(s) Released Public,15-1814 Fire Responsive.pdf,"December 24, 2015, 8:09am",Ginger Rodriquez
5,Request Reopened Public,,"December 24, 2015, 8:02am","Amanda Alvarado, Senior Clerk"
6,Request Closed Public,02. Released,"December 17, 2015, 4:00pm",Ginger Rodriquez
7,Request Reopened Public,,"December 17, 2015, 3:56pm",Ginger Rodriquez
8,Request Closed Public,02. Released,"December 17, 2015, 3:54pm",Ginger Rodriquez
9,Request Closed Public,02. Released,"December 17, 2015, 3:52pm",Ginger Rodriquez


In [9]:
# Convert columns with time strings into DateTime
def convert_time_to_dt(df, col='time'):
    return df.assign(**{col + '_dt': pd.to_datetime(df[col])})

test_df['msgs_df'] = test_df['msgs_df'].apply(convert_time_to_dt)
test_df.loc[4]['msgs_df']

Unnamed: 0,title,item,time,by,time_dt
0,Request Closed Public,02. Released,"December 24, 2015, 10:34am",Ginger Rodriquez,2015-12-24 10:34:00
1,Document(s) Released Public,Site Plan - 11943 El Camino Real.pdf,"December 24, 2015, 10:33am",Ginger Rodriquez,2015-12-24 10:33:00
2,Request Reopened Public,,"December 24, 2015, 10:32am",Ginger Rodriquez,2015-12-24 10:32:00
3,Request Closed Public,02. Released,"December 24, 2015, 8:09am",Ginger Rodriquez,2015-12-24 08:09:00
4,Document(s) Released Public,15-1814 Fire Responsive.pdf,"December 24, 2015, 8:09am",Ginger Rodriquez,2015-12-24 08:09:00
5,Request Reopened Public,,"December 24, 2015, 8:02am","Amanda Alvarado, Senior Clerk",2015-12-24 08:02:00
6,Request Closed Public,02. Released,"December 17, 2015, 4:00pm",Ginger Rodriquez,2015-12-17 16:00:00
7,Request Reopened Public,,"December 17, 2015, 3:56pm",Ginger Rodriquez,2015-12-17 15:56:00
8,Request Closed Public,02. Released,"December 17, 2015, 3:54pm",Ginger Rodriquez,2015-12-17 15:54:00
9,Request Closed Public,02. Released,"December 17, 2015, 3:52pm",Ginger Rodriquez,2015-12-17 15:52:00


In [10]:
# Splitting departments for easier pivoting
depts = test_df['depts'].to_numpy() # depts column
test_df_depts = test_df.join(pd.DataFrame(list(map(lambda x: x.split(', '), depts)))) # Split departments into separate columns
test_df_depts = test_df_depts.melt( # Melt on the individual departments
        id_vars=test_df.columns
    )[lambda df: df['value'].apply(lambda x: x is not None)].drop( # Get rid of None values
        columns='variable'
    ).rename( # Drop the variable column, rename the value column, and reset indices
        columns={'value': 'dept'}
    ).reset_index().drop(
        columns='index'
    )
test_df_depts

Unnamed: 0,id,status,desc,depts,docs,poc,msgs,docs_df,msgs_df,date,via,dept
0,15-1810,CLOSED,"Notices of Violation/Notice to Comply, fire in...",Code Enforcement,"title,link 5040 ShorehamPlace building permits...",Ginger Rodriguez,"title,item,time ""Request Closed Public"",02. Re...",title \ ...,title ...,"December 7, 2015",web,Code Enforcement
1,15-1811,CLOSED,The October 2015 monthly report for SeaWorld,Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time ""Request Published Public"",,""D...",,title ...,"December 7, 2015",web,Department of Real Estate and Airport Management
2,15-1812,CLOSED,Records related to the following BIDS: Adams ...,City Clerk,"title,link http://www.sandiego.gov/park-and-re...",Mailei Ross-Cerezo,"title,item,time ""Request Closed Public"",Still ...",...,title \ 0 R...,"December 7, 2015",web,City Clerk
3,15-1813,CLOSED,Historical lease payments made by SeaWorld to ...,Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time ""Request Closed Public"",02. Re...",,title ...,"December 7, 2015",web,Department of Real Estate and Airport Management
4,15-1814,CLOSED,"Open violations, variances, ordinances, approv...",Code Enforcement,"title,link Site Plan - 11943 El Camino Real.pd...",Ginger Rodriguez,"title,item,time ""Request Closed Public"",02. Re...",title \ 0 ...,title ...,"December 7, 2015",web,Code Enforcement
...,...,...,...,...,...,...,...,...,...,...,...,...
32430,20-4254,CLOSED,All emails with the attachment of the prelimin...,"City Attorney, City Council District 4, Develo...","title,link ""Attachment to Email A, Forensic Re...",Angela Laurita,"title,item,time ""Request Published Public"",,""J...",...,title \ ...,"September 23, 2020",web,Department of Finance
32431,20-4254,CLOSED,All emails with the attachment of the prelimin...,"City Attorney, City Council District 4, Develo...","title,link ""Attachment to Email A, Forensic Re...",Angela Laurita,"title,item,time ""Request Published Public"",,""J...",...,title \ ...,"September 23, 2020",web,Deputy Chief Operating Officer - Smart & Susta...
32432,20-4254,CLOSED,All emails with the attachment of the prelimin...,"City Attorney, City Council District 4, Develo...","title,link ""Attachment to Email A, Forensic Re...",Angela Laurita,"title,item,time ""Request Published Public"",,""J...",...,title \ ...,"September 23, 2020",web,Office of Sustainability
32433,20-4254,CLOSED,All emails with the attachment of the prelimin...,"City Attorney, City Council District 4, Develo...","title,link ""Attachment to Email A, Forensic Re...",Angela Laurita,"title,item,time ""Request Published Public"",,""J...",...,title \ ...,"September 23, 2020",web,Homelessness Strategies


In [11]:
test_df_depts.value_counts('dept')[lambda x: x.index.str.contains('Police', case=False)]

dept
Police                            6886
Commission on Police Practices      51
dtype: int64

In [26]:
# Find requests whose department(s) contain the given substring, case insensitive
dept = 'Chief Operating Officer'
test_df[test_df['depts'].str.contains(dept, case=False)]

Unnamed: 0,id,status,desc,depts,docs,poc,msgs,docs_df,msgs_df,date,via
30,15-1840,CLOSED,Writings and communications related to Sai Kir...,Deputy Chief Operating Officer - Infrastructur...,"title,link Letter to Finch (CCPRA 2015-1840).p...",Jacqueline Palmer,"title,item,time ""Request Closed Hide Public""...",...,title \ 0 Request ...,"December 9, 2015",web
351,16-212,CLOSED,Records relating to SAP software agreement and...,Deputy Chief Operating Officer - Infrastructur...,,Erin Noel,"title,item,time ""Request Closed Public"",""02. R...",,title \ 0 R...,"February 8, 2016",web
3397,17-327,CLOSED,Location: Intersection of Canon St. & Catalina...,Deputy Chief Operating Officer - Infrastructur...,,Travis Brady,"title,item,time ""Request Published Public"",,""F...",,title \ 0 Requ...,"February 9, 2017",web
3730,17-660,CLOSED,1. Please provide a copy of the executed ...,"Information Technology, Deputy Chief Operating...","title,link 17-660 Email Results_Redacted.pdf,h...",Jacqueline Palmer,"title,item,time ""Request Published Public"",,""J...",...,title \ 0 Re...,"March 20, 2017",web
3731,17-661,CLOSED,Please provide all conflict of interest disclo...,Deputy Chief Operating Officer - Infrastructur...,"title,link H166584 Exhibit H Signed_02-29-16.p...",Travis Brady,"title,item,time ""Request Published Public"",,""A...",...,title \ 0 Re...,"March 20, 2017",web
...,...,...,...,...,...,...,...,...,...,...,...
25496,21-1956,CLOSED,"Hello, Pursuant to the California Public Recor...",Office of the Chief Operating Officer,"title,link Pages from 20-296_COO Emails re 101...",Angela Laurita,"title,item,time ""Request Published Public"",,""M...",...,title \ 0 Requ...,"April 16, 2021",web
25701,21-2169,CLOSED,"Hello, Pursuant to the California Public Recor...",Office of the Chief Operating Officer,"title,link 21-2169_RVilla Emails_09-2016_Redac...",Angela Laurita,"title,item,time ""Request Published Public"",,""J...",title ...,title \ 0 Requ...,"April 26, 2021",web
25870,21-2338,CLOSED,"Hello, Pursuant to the California Public Recor...","Mayor, Office of the Chief Operating Officer",,Angela Laurita,"title,item,time ""Request Published Public"",,""M...",,title \ 0 Requ...,"May 2, 2021",web
27701,21-4226,CLOSED,"Hello, Pursuant to the California Public Reco...",Office of the Chief Operating Officer,"title,link PRAR 21-4226 - KM Calendar_Redacted...",Angela Laurita,"title,item,time ""Request Published Public"",,""S...",title \ ...,title \ 0 Requ...,"August 4, 2021",web


In [52]:
test_df.loc[100]['msgs_df']

Unnamed: 0,title,item,time,by,time_dt
0,Request Closed Public,02. Released All responsive documents have bee...,"January 3, 2016, 1:04pm",Ginger Rodriquez,2016-01-03 13:04:00
1,Request Reopened Public,,"January 3, 2016, 1:02pm",Ginger Rodriquez,2016-01-03 13:02:00
2,Request Closed Public,02. Released All responsive documents have bee...,"January 3, 2016, 12:59pm",Ginger Rodriquez,2016-01-03 12:59:00
3,Document(s) Released Public,3010 Science Park Dr open building permits.pdf,"January 3, 2016, 12:58pm",Ginger Rodriquez,2016-01-03 12:58:00
4,Request Published Public,,"December 22, 2015, 9:36am",,2015-12-22 09:36:00
5,Department Assignment Public,Development Services,"December 22, 2015, 9:36am","Lea Fields-Bernard, Public Records Administrat...",2015-12-22 09:36:00
6,Request Opened Public,Request received via web,"December 22, 2015, 9:36am","Lea Fields-Bernard, Public Records Administrat...",2015-12-22 09:36:00


In [51]:
def get_open_time(msgs):
    request_published = msgs[lambda df: df['title'].str.contains('Opened')].sort_values(by='time_dt', ignore_index=True)
    return request_published.loc[0]['time_dt']

test_df['open_time'] = test_df['msgs_df'].apply(get_submission_time)
test_df.head()

KeyError: 0

In [50]:
test_df.value_counts('via')

via
web      26220
email     1855
mail       517
phone       99
fax         46
dtype: int64

In [30]:
# TODO: Extract department assignment additions and removals based on message history
def find_dept_assign(msgs):
    dept_assign = msgs[lambda df: df['title'].str.contains('Department')]
    
    dept_assign_added = dept_assign[lambda df: df['item'].str.contains('Added:')]
    dept_assign_removed = dept_assign[lambda df: df['item'].str.contains('Removed:')]

test_df.loc[1]['msgs_df']

Unnamed: 0,title,item,time,by,time_dt
0,Request Published Public,,"December 16, 2015, 9:54am",Jeffrey Wallace,2015-12-16 09:54:00
1,Request Closed Public,02. Released,"December 15, 2015, 7:12am",Jeffrey Wallace,2015-12-15 07:12:00
2,Department Assignment Public,Real Estate Assets,"December 7, 2015, 5:22pm",,2015-12-07 17:22:00
3,Request Opened Public,Request received via web,"December 7, 2015, 5:22pm","Lea Fields-Bernard, Public Records Administrat...",2015-12-07 17:22:00
4,Request Published Public,,"December 7, 2015, 5:22pm",,2015-12-07 17:22:00
