# NextRequest data cleaning and EDA
*Author: Steven Yuan*

This notebook collects data-cleaning and exploratory data analysis (EDA) routines that may be useful when working with scraped NextRequest data.

In [1]:
import pandas as pd
from io import StringIO
import zipfile

from nextrequest_eda_utils import *

In [2]:
# Read the San Diego request export straight out of its zip archive. The
# context managers close both the archive and the member file handle once
# pandas has finished parsing, instead of leaving them open for the kernel's lifetime.
with zipfile.ZipFile('../data/sd_requests.zip', 'r') as sd_zip, sd_zip.open('sd_requests.csv') as sd_csv:
    sd = pd.read_csv(sd_csv)
sd.head()

Unnamed: 0,id,status,desc,date,depts,docs,poc,msgs
0,15-1810,CLOSED,"Notices of Violation/Notice to Comply, fire in...","December 7, 2015 via web",Code Enforcement,"title,link\n5040 ShorehamPlace building permit...",Ginger Rodriguez,"title,item,time\n""Request Closed\nPublic"",02. ..."
1,15-1811,CLOSED,The October 2015 monthly report for SeaWorld,"December 7, 2015 via web",Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time\n""Request Published\nPublic"",,..."
2,15-1812,CLOSED,Records related to the following BIDS: Adams ...,"December 7, 2015 via web",City Clerk,"title,link\nhttp://www.sandiego.gov/park-and-r...",Mailei Ross-Cerezo,"title,item,time\n""Request Closed\nPublic"",Stil..."
3,15-1813,CLOSED,Historical lease payments made by SeaWorld to ...,"December 7, 2015 via web",Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time\n""Request Closed\nPublic"",02. ..."
4,15-1814,CLOSED,"Open violations, variances, ordinances, approv...","December 7, 2015 via web",Code Enforcement,"title,link\nSite Plan - 11943 El Camino Real.p...",Ginger Rodriguez,"title,item,time\n""Request Closed\nPublic"",02. ..."


In [8]:
# Number of requests scraped.
# NOTE: sd_clean is created by the nextrequest_df_clean cell, which must be run before this one.
len(sd_clean)

28848

In [4]:
# Clean the raw San Diego frame. With debug=True the helper prints a progress
# line after each stage (fillna, docs_df, msgs_df, date-via split, time-by split).
# NOTE(review): nextrequest_df_clean presumably comes from the star import of
# nextrequest_eda_utils -- confirm.
sd_clean = nextrequest_df_clean(sd, debug=True)

fillna complete
docs_df complete
msgs_df complete
date-via split complete
time-by split in msgs complete


In [12]:
# Preview the cleaned frame: alongside the raw columns it now carries
# docs_df, msgs_df, date, via and date_dt.
sd_clean.head()

Unnamed: 0,id,status,desc,depts,docs,poc,msgs,docs_df,msgs_df,date,via,date_dt
0,15-1810,CLOSED,"Notices of Violation/Notice to Comply, fire in...",Code Enforcement,"title,link 5040 ShorehamPlace building permits...",Ginger Rodriguez,"title,item,time ""Request Closed Public"",02. Re...",title \ ...,title ...,"December 7, 2015",via web,2015-12-07
1,15-1811,CLOSED,The October 2015 monthly report for SeaWorld,Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time ""Request Published Public"",,""D...",,title ...,"December 7, 2015",via web,2015-12-07
2,15-1812,CLOSED,Records related to the following BIDS: Adams ...,City Clerk,"title,link http://www.sandiego.gov/park-and-re...",Mailei Ross-Cerezo,"title,item,time ""Request Closed Public"",Still ...",...,title \ 0 R...,"December 7, 2015",via web,2015-12-07
3,15-1813,CLOSED,Historical lease payments made by SeaWorld to ...,Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time ""Request Closed Public"",02. Re...",,title ...,"December 7, 2015",via web,2015-12-07
4,15-1814,CLOSED,"Open violations, variances, ordinances, approv...",Code Enforcement,"title,link Site Plan - 11943 El Camino Real.pd...",Ginger Rodriguez,"title,item,time ""Request Closed Public"",02. Re...",title \ 0 ...,title ...,"December 7, 2015",via web,2015-12-07


In [11]:
# Message history (stored as a DataFrame in the msgs_df column) of the request at index label 4
sd_clean.loc[4, 'msgs_df']

Unnamed: 0,title,item,time,by,time_dt
0,Request Closed Public,02. Released,"December 24, 2015, 10:34am",Ginger Rodriquez,2015-12-24 10:34:00
1,Document(s) Released Public,Site Plan - 11943 El Camino Real.pdf,"December 24, 2015, 10:33am",Ginger Rodriquez,2015-12-24 10:33:00
2,Request Reopened Public,,"December 24, 2015, 10:32am",Ginger Rodriquez,2015-12-24 10:32:00
3,Request Closed Public,02. Released,"December 24, 2015, 8:09am",Ginger Rodriquez,2015-12-24 08:09:00
4,Document(s) Released Public,15-1814 Fire Responsive.pdf,"December 24, 2015, 8:09am",Ginger Rodriquez,2015-12-24 08:09:00
5,Request Reopened Public,,"December 24, 2015, 8:02am","Amanda Alvarado, Senior Clerk",2015-12-24 08:02:00
6,Request Closed Public,02. Released,"December 17, 2015, 4:00pm",Ginger Rodriquez,2015-12-17 16:00:00
7,Request Reopened Public,,"December 17, 2015, 3:56pm",Ginger Rodriquez,2015-12-17 15:56:00
8,Request Closed Public,02. Released,"December 17, 2015, 3:54pm",Ginger Rodriquez,2015-12-17 15:54:00
9,Request Closed Public,02. Released,"December 17, 2015, 3:52pm",Ginger Rodriquez,2015-12-17 15:52:00


In [15]:
# Query for info about a specific request.
# The ID is kept as a plain string and passed to query() via the @variable
# syntax, so no quoting has to be embedded in the value or spliced into the
# query expression by hand.
request_id = '17-22'
sd_clean.query('id == @request_id').iloc[0]['msgs_df']

Unnamed: 0,title,item,time,by,time_dt
0,Request Published Public,,"January 16, 2017, 4:10pm",,2017-01-16 16:10:00
1,Request Closed Public,02. Released All responsive documents have bee...,"January 13, 2017, 4:06pm",Ginger Rodriquez,2017-01-13 16:06:00
2,Document(s) Released Public,20160315_131321.jpg,"January 13, 2017, 4:00pm",Ginger Rodriquez,2017-01-13 16:00:00
3,Document(s) Released Public,20160315_131307.jpg,"January 13, 2017, 4:00pm",Ginger Rodriquez,2017-01-13 16:00:00
4,Document(s) Released Public,20160315_131230.jpg,"January 13, 2017, 4:00pm",Ginger Rodriquez,2017-01-13 16:00:00
...,...,...,...,...,...
1112,Document(s) Released Public,20160307_155141.jpg,"January 13, 2017, 3:54pm",Ginger Rodriquez,2017-01-13 15:54:00
1113,Document(s) Released Public,20160307_155139.jpg,"January 13, 2017, 3:54pm",Ginger Rodriquez,2017-01-13 15:54:00
1114,Document(s) Released Public,7411 Hillside Drive Photos.pdf,"January 13, 2017, 3:54pm",Ginger Rodriquez,2017-01-13 15:54:00
1115,Department Assignment Public,Code Enforcement,"January 6, 2017, 3:23pm","Lea Fields-Bernard, Public Records Administrat...",2017-01-06 15:23:00


In [None]:
# Find request descriptions with the given substring, case insensitive.
# regex=False makes the search term a literal substring (as the comment
# promises) and na=False treats missing descriptions as non-matches.
desc = 'Padres'
sd_clean[sd_clean['desc'].str.contains(desc, case=False, regex=False, na=False)]

In [None]:
# Find requests whose department(s) contain the given substring, case insensitive.
# regex=False keeps characters such as parentheses in the search term literal,
# and na=False treats rows without a department as non-matches.
dept = 'police'
sd_clean[sd_clean['depts'].str.contains(dept, case=False, regex=False, na=False)]

In [None]:
# Which requests had the longest message history? (Useful for finding worst-case scenarios for the scraper)
# Count the rows of each request's message table (0 when there is no table),
# then order the counts from largest to smallest.
msg_counts = sd_clean['msgs_df'].apply(lambda msgs: 0 if msgs is None else len(msgs))
long_msgs = msg_counts.sort_values(ascending=False)
long_msgs.head(10)

In [None]:
# Sort requests by message history length.
# long_msgs is already ordered longest-first, so selecting rows by its index
# returns the requests in that same order.
requests_long_msg = sd_clean.loc[long_msgs.index]
requests_long_msg.head(20)

In [46]:
# TODO: Extract department assignment additions and removals based on message history
def get_dept_assign(msgs, get_time=False):
    """Summarise the department assignment changes found in a message history.

    Parameters
    ----------
    msgs : pandas.DataFrame or None
        Message history with at least ``title``, ``item`` and ``time_dt``
        columns. Rows whose title contains 'Department' are treated as
        department assignment messages.
    get_time : bool, default False
        Reserved for returning timestamps alongside the names; currently
        ignored.

    Returns
    -------
    dict or None
        None when ``msgs`` is None or holds no department assignment rows.
        Otherwise a dict with two sorted lists of unique department names:
        ``'dept_added'`` (initial assignments plus every "Added: ..." entry)
        and ``'dept_removed'`` (every "Removed: ..." entry).
    """
    if msgs is None:
        return None

    # na=False: a missing title simply is not a department message.
    dept_assign = msgs[
        msgs['title'].str.contains('Department', na=False)
    ].sort_values(by='time_dt', ignore_index=True)  # chronological, for the planned time option

    if dept_assign.empty:
        return None

    def split_depts(text):
        # 'Dept A, Dept B. ' -> ['Dept A', 'Dept B']; empty fragments are dropped.
        # Assumes department lists are ', '-separated -- TODO confirm for initial assignments.
        return [name for name in (part.strip(' .') for part in text.split(', ')) if name]

    # TODO: Add option to include time information
    added, removed = set(), set()
    for item in dept_assign['item'].fillna('').astype(str):
        has_added = 'Added: ' in item
        has_removed = 'Removed: ' in item

        if not (has_added or has_removed):
            # No marker at all: this is an initial department assignment.
            added.update(split_depts(item))
            continue

        # Cut each section at the other marker so the parse works regardless
        # of which of "Added:" / "Removed:" comes first in the message.
        if has_added:
            added.update(split_depts(item.split('Added: ', 1)[1].split('Removed: ', 1)[0]))
        if has_removed:
            removed.update(split_depts(item.split('Removed: ', 1)[1].split('Added: ', 1)[0]))

    return {
        'dept_added': sorted(added),
        'dept_removed': sorted(removed)
        }

In [106]:
# Number of department-assignment messages per request, largest first.
# Requests without a message table count as 0 (matching the None guard used
# for long_msgs), and na=False keeps missing titles from breaking the match.
testtest = sd_clean['msgs_df'].apply(
    lambda df: 0 if df is None else int(df['title'].str.contains('Department', na=False).sum())
).sort_values(ascending=False)

In [107]:
# The 20 requests with the most department-assignment messages (index label -> count)
testtest.head(20)

22065    53
21905    53
7386     50
14991    50
14983    49
14984    47
19196    45
19496    44
14988    44
14986    43
14166    42
14985    40
14987    38
14989    36
26149    30
20013    27
6398     27
5607     26
20200    25
22047    25
Name: msgs_df, dtype: int64

In [178]:
# Scratch check of the "Removed:" parsing on the first request.
# test: department-assignment messages of the request at index label 0
test = sd_clean.loc[0]['msgs_df'][lambda df: df['title'].str.contains('Department')]
# aa: for every message mentioning 'Removed', the text after 'Removed: ' split into department names
aa = test[lambda df: df['item'].str.contains('Removed')]['item'].str.split('Removed: ').str[-1].str.strip('.').str.split(', ').to_numpy()
# aaa: flattened, de-duplicated list of removed departments (empty for this request)
aaa = list(set([dept for l in list(aa) for dept in l]))
aaa

[]

In [112]:
# Read the Vallejo request export straight out of its zip archive. The
# context managers close both the archive and the member file handle once
# pandas has finished parsing.
with zipfile.ZipFile('../data/vallejo_requests.zip', 'r') as vallejo_zip, vallejo_zip.open('vallejo_requests.csv') as vallejo_csv:
    vallejo = pd.read_csv(vallejo_csv)
vallejo.head()

Unnamed: 0,id,status,desc,date,depts,docs,poc,msgs
0,16-1,CLOSED,I need my birth record and if possible open my...,"April 5, 2016 via web",City Clerk’s Office,,"Dawn Abrahamson, City Clerk","title,item,time\n""Request Closed Hide\nPubli..."
1,16-3,CLOSED,"Address:\n408 Tennessee Street, Vallejo, CA 94...","April 27, 2016 via web",,"title,link\nPERMITS FOR 408 TENNESSEE STREET.d...","Livian Ellis, Secretary","title,item,time\n""Request Closed\nPublic"",""Rec..."
2,16-15,CLOSED,A log of records requests submitted to the cit...,"May 19, 2016 via web",None Assigned,"title,link\n2012 Public Records Request log.pd...","Dawn Abrahamson, City Clerk","title,item,time\n""Request Closed\nPublic"",""Re-..."
3,16-16,CLOSED,RE:\nThe Ridge Townhomes\n301 Fairgrounds Driv...,"May 23, 2016 via web",,"title,link\nSD02-0033 Site Map.pdf,https://val...","Leslie Trybull, Executive Secretary","title,item,time\n""Request Closed Hide\nPubli..."
4,16-17,CLOSED,"Under the FOIA, I want to ask for any staff re...","May 20, 2016 via email",,"title,link\nVMT Lease Agreement (Amendment 3)....","Angelina Abella, Administrative Clerk II","title,item,time\n""Request Closed Hide\nPubli..."


In [114]:
# Apply the same cleaning routine to the Vallejo data; debug=True prints a
# progress line after each cleaning stage.
vallejo_clean = nextrequest_df_clean(vallejo, debug=True)

fillna complete
docs_df complete
msgs_df complete
date-via split complete
time-by split in msgs complete


In [119]:
# Number of department-assignment messages per Vallejo request, largest first.
# Requests without a message table count as 0, and na=False keeps missing
# titles from breaking the match.
eeeeee = vallejo_clean['msgs_df'].apply(
    lambda df: 0 if df is None else int(df['title'].str.contains('Department', na=False).sum())
).sort_values(ascending=False)

In [123]:
# Message history of the Vallejo request at index label 2197
vallejo_clean.loc[2197, 'msgs_df']

Unnamed: 0,title,item,time,by,time_dt
0,Request Closed Hide Public,We have provided all records responsive to you...,"December 11, 2020, 4:29pm","Dale Miller, Administrative Analyst II",2020-12-11 16:29:00
1,Document(s) Released to Requester Public,Mathis Group_Risk Assessments_2.9.2017 through...,"December 11, 2020, 4:27pm","Dale Miller, Administrative Analyst II",2020-12-11 16:27:00
2,Department Assignment Public,Added: City Clerk’s Office. Removed: Police De...,"December 4, 2020, 1:53pm",Melissa Rhodes,2020-12-04 13:53:00
3,Department Assignment Public,Added: Police Department.,"December 4, 2020, 9:47am","Dawn Abrahamson, City Clerk",2020-12-04 09:47:00
4,Request Published Public,,"November 23, 2020, 7:20am","Dawn Abrahamson, City Clerk",2020-11-23 07:20:00
5,Department Assignment Public,Removed: All Other Departments.,"November 23, 2020, 7:20am","Dawn Abrahamson, City Clerk",2020-11-23 07:20:00
6,Department Assignment Public,All Other Departments,"November 22, 2020, 9:25am",,2020-11-22 09:25:00
7,Request Opened Public,Request received via web,"November 22, 2020, 9:25am",,2020-11-22 09:25:00


In [122]:
# The 20 Vallejo requests with the most department-assignment messages (index label -> count)
eeeeee.head(20)

2197    4
1277    3
3468    3
1517    3
2726    3
1127    3
1852    3
1173    3
410     3
3384    3
829     3
3082    3
2126    3
1181    3
1200    3
3666    3
422     3
963     3
90      3
1092    3
Name: msgs_df, dtype: int64