# NextRequest data cleaning and EDA
*Author: Steven Yuan*

This notebook performs preliminary exploratory data analysis (EDA) on NextRequest data using pandas.

## TO-DO
- Write a procedure to extract all the documents that are attached to a request based on the message history
- Significance test to determine whether response times for police-related requests differ from response times for non-police-related requests
- Some requests were re-opened/re-closed; how do we factor that into our analysis?
- Test EDA functions on different datasets once they are scraped

In [37]:
import pandas as pd
import numpy as np
from io import StringIO
import zipfile

from nextrequest_eda_utils import *

In [2]:
# Read the scraped requests CSV straight out of the zip archive. The context
# managers close both the archive and the member file handle as soon as pandas
# has finished loading, instead of leaving them open for the kernel's lifetime.
with zipfile.ZipFile('../data/sd_requests.zip', 'r') as archive:
    with archive.open('sd_requests.csv') as csv_file:
        sd = pd.read_csv(csv_file)
sd.head()

Unnamed: 0,id,status,desc,date,depts,docs,poc,msgs
0,15-1810,CLOSED,"Notices of Violation/Notice to Comply, fire in...","December 7, 2015 via web",Code Enforcement,"title,link\n5040 ShorehamPlace building permit...",Ginger Rodriguez,"title,item,time\n""Request Closed\nPublic"",02. ..."
1,15-1811,CLOSED,The October 2015 monthly report for SeaWorld,"December 7, 2015 via web",Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time\n""Request Published\nPublic"",,..."
2,15-1812,CLOSED,Records related to the following BIDS: Adams ...,"December 7, 2015 via web",City Clerk,"title,link\nhttp://www.sandiego.gov/park-and-r...",Mailei Ross-Cerezo,"title,item,time\n""Request Closed\nPublic"",Stil..."
3,15-1813,CLOSED,Historical lease payments made by SeaWorld to ...,"December 7, 2015 via web",Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time\n""Request Closed\nPublic"",02. ..."
4,15-1814,CLOSED,"Open violations, variances, ordinances, approv...","December 7, 2015 via web",Code Enforcement,"title,link\nSite Plan - 11943 El Camino Real.p...",Ginger Rodriguez,"title,item,time\n""Request Closed\nPublic"",02. ..."


In [3]:
# Run the project cleaning helper on the raw frame; every later cell works on
# its result. NOTE(review): debug=True appears to print one progress line per
# cleaning stage (see the cell output) — confirm in nextrequest_eda_utils.
sd_clean = nextrequest_df_clean(sd, debug=True)

fillna complete
docs_df complete
msgs_df complete
date-via split complete
time-by split in msgs complete


In [4]:
# Number of requests (rows) in the cleaned frame
len(sd_clean)

28848

In [5]:
# Preview the first five rows of the cleaned frame.
sd_clean.head()

Unnamed: 0,id,status,desc,depts,docs,poc,msgs,docs_df,msgs_df,date,via,date_dt
0,15-1810,CLOSED,"Notices of Violation/Notice to Comply, fire in...",Code Enforcement,"title,link 5040 ShorehamPlace building permits...",Ginger Rodriguez,"title,item,time ""Request Closed Public"",02. Re...",title \ ...,title ...,"December 7, 2015",via web,2015-12-07
1,15-1811,CLOSED,The October 2015 monthly report for SeaWorld,Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time ""Request Published Public"",,""D...",,title ...,"December 7, 2015",via web,2015-12-07
2,15-1812,CLOSED,Records related to the following BIDS: Adams ...,City Clerk,"title,link http://www.sandiego.gov/park-and-re...",Mailei Ross-Cerezo,"title,item,time ""Request Closed Public"",Still ...",...,title \ 0 R...,"December 7, 2015",via web,2015-12-07
3,15-1813,CLOSED,Historical lease payments made by SeaWorld to ...,Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time ""Request Closed Public"",02. Re...",,title ...,"December 7, 2015",via web,2015-12-07
4,15-1814,CLOSED,"Open violations, variances, ordinances, approv...",Code Enforcement,"title,link Site Plan - 11943 El Camino Real.pd...",Ginger Rodriguez,"title,item,time ""Request Closed Public"",02. Re...",title \ 0 ...,title ...,"December 7, 2015",via web,2015-12-07


In [6]:
# Message history stored for the request at row label 4
sd_clean.loc[4, 'msgs_df']

Unnamed: 0,title,item,time,by,time_dt
0,Request Closed Public,02. Released,"December 24, 2015, 10:34am",Ginger Rodriquez,2015-12-24 10:34:00
1,Document(s) Released Public,Site Plan - 11943 El Camino Real.pdf,"December 24, 2015, 10:33am",Ginger Rodriquez,2015-12-24 10:33:00
2,Request Reopened Public,,"December 24, 2015, 10:32am",Ginger Rodriquez,2015-12-24 10:32:00
3,Request Closed Public,02. Released,"December 24, 2015, 8:09am",Ginger Rodriquez,2015-12-24 08:09:00
4,Document(s) Released Public,15-1814 Fire Responsive.pdf,"December 24, 2015, 8:09am",Ginger Rodriquez,2015-12-24 08:09:00
5,Request Reopened Public,,"December 24, 2015, 8:02am","Amanda Alvarado, Senior Clerk",2015-12-24 08:02:00
6,Request Closed Public,02. Released,"December 17, 2015, 4:00pm",Ginger Rodriquez,2015-12-17 16:00:00
7,Request Reopened Public,,"December 17, 2015, 3:56pm",Ginger Rodriquez,2015-12-17 15:56:00
8,Request Closed Public,02. Released,"December 17, 2015, 3:54pm",Ginger Rodriquez,2015-12-17 15:54:00
9,Request Closed Public,02. Released,"December 17, 2015, 3:52pm",Ginger Rodriquez,2015-12-17 15:52:00


In [15]:
# Query for info about a specific request
request_id = '17-22'
# `@request_id` makes pandas substitute the Python variable into the query, so
# the ID no longer needs quotes embedded in its value and the query string is
# not assembled by concatenation.
sd_clean.query('id == @request_id').iloc[0]['msgs_df']

Unnamed: 0,title,item,time,by,time_dt
0,Request Published Public,,"January 16, 2017, 4:10pm",,2017-01-16 16:10:00
1,Request Closed Public,02. Released All responsive documents have bee...,"January 13, 2017, 4:06pm",Ginger Rodriquez,2017-01-13 16:06:00
2,Document(s) Released Public,20160315_131321.jpg,"January 13, 2017, 4:00pm",Ginger Rodriquez,2017-01-13 16:00:00
3,Document(s) Released Public,20160315_131307.jpg,"January 13, 2017, 4:00pm",Ginger Rodriquez,2017-01-13 16:00:00
4,Document(s) Released Public,20160315_131230.jpg,"January 13, 2017, 4:00pm",Ginger Rodriquez,2017-01-13 16:00:00
...,...,...,...,...,...
1112,Document(s) Released Public,20160307_155141.jpg,"January 13, 2017, 3:54pm",Ginger Rodriquez,2017-01-13 15:54:00
1113,Document(s) Released Public,20160307_155139.jpg,"January 13, 2017, 3:54pm",Ginger Rodriquez,2017-01-13 15:54:00
1114,Document(s) Released Public,7411 Hillside Drive Photos.pdf,"January 13, 2017, 3:54pm",Ginger Rodriquez,2017-01-13 15:54:00
1115,Department Assignment Public,Code Enforcement,"January 6, 2017, 3:23pm","Lea Fields-Bernard, Public Records Administrat...",2017-01-06 15:23:00


In [None]:
# Find request descriptions with the given substring, case insensitive.
# Searches `sd_clean`: the previously referenced `test_df` is not defined
# anywhere in this notebook, so the cell failed on a fresh kernel.
desc = 'Padres'
sd_clean[sd_clean['desc'].str.contains(desc, case=False)]

In [54]:
# Requests assigned to at least one department whose name contains the
# substring below (case-insensitive match)
dept_search = 'police'
sd_clean[sd_clean['depts'].str.contains(dept_search, case=False)]

Unnamed: 0,id,status,desc,depts,docs,poc,msgs,docs_df,msgs_df,date,via,date_dt,open_time,close_time,resp
7,15-1817,CLOSED,File materials related to: P12010041171 P12050...,Police,,Humberto Hernandez,"title,item,time ""Request Closed Public"",02c. R...",,title ...,"December 7, 2015",via web,2015-12-07,2015-12-08 11:40:00,2015-12-21 10:48:00,12 days 23:08:00
49,15-1859,CLOSED,All records related to the demographic study c...,Police,,Humberto Hernandez,"title,item,time ""Request Closed Hide Public""...",,title \ 0 Request ...,"December 11, 2015",via web,2015-12-11,2015-12-14 18:07:00,2016-01-12 10:06:00,28 days 15:59:00
65,15-1875,CLOSED,Reports related to incident of 11/11/2015 wher...,Police,,Humberto Hernandez,"title,item,time ""Request Closed Public"",""02. R...",,title \ 0 R...,"December 15, 2015",via web,2015-12-15,2015-12-16 11:32:00,2016-01-13 07:40:00,27 days 20:08:00
81,15-1891,CLOSED,Communications by the City of San Diego regard...,Police,,Humberto Hernandez,"title,item,time ""Request Closed Public"",Still ...",,title ...,"December 17, 2015",via web,2015-12-17,2015-12-18 10:30:00,2015-12-28 15:26:00,10 days 04:56:00
121,15-1931,CLOSED,"Vehicle Stop Data for period September 1, 2015...",Police,,Humberto Hernandez,"title,item,time ""Request Closed Public"",Still ...",,title \ 0 R...,"December 28, 2015",via web,2015-12-28,2015-12-28 10:56:00,2015-12-31 13:40:00,3 days 02:44:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28838,21-5544,CLOSED,Dan you tell me when San Diego Police Officer ...,Police,,Angela Laurita,"title,item,time ""Request Published Public"",,""N...",,title \ 0 Requ...,"October 26, 2021",via web,2021-10-26,2021-10-26 21:34:00,2021-10-29 07:49:00,2 days 10:15:00
28839,21-5550,CLOSED,I request any police reports involving inciden...,Police,,Angela Laurita,"title,item,time ""Request Published Public"",,""O...",,title \ 0 Requ...,"October 27, 2021",via web,2021-10-27,2021-10-27 11:12:00,2021-10-27 12:50:00,0 days 01:38:00
28840,21-5552,CLOSED,The request is for the time of the towing call...,Police,,Angela Laurita,"title,item,time ""Request Published Public"",,""O...",,title \ 0 Requ...,"October 27, 2021",via web,2021-10-27,2021-10-27 12:22:00,2021-10-28 15:00:00,1 days 02:38:00
28845,21-5584,CLOSED,request for call for service 2110020816,Police,"title,link 2110020816_Redacted.pdf,https://san...",Lori Hernandez,"title,item,time ""Request Published Public"",,""N...",title ...,title \ 0 Requ...,"October 28, 2021",via web,2021-10-28,2021-10-28 13:56:00,2021-10-29 13:59:00,1 days 00:03:00


In [8]:
# Derive the open and close timestamps of every request from its message
# history, reporting progress after each column is filled in.
for time_col, extract_time in (('open_time', get_open_time), ('close_time', get_close_time)):
    sd_clean[time_col] = sd_clean['msgs_df'].apply(extract_time)
    print(time_col + ' done')

open_time done
close_time done


In [13]:
# Response time = elapsed time between a request's open and close timestamps
sd_clean['resp'] = sd_clean['close_time'].sub(sd_clean['open_time'])
sd_clean.head()

Unnamed: 0,id,status,desc,depts,docs,poc,msgs,docs_df,msgs_df,date,via,date_dt,open_time,close_time,resp
0,15-1810,CLOSED,"Notices of Violation/Notice to Comply, fire in...",Code Enforcement,"title,link 5040 ShorehamPlace building permits...",Ginger Rodriguez,"title,item,time ""Request Closed Public"",02. Re...",title \ ...,title ...,"December 7, 2015",via web,2015-12-07,2015-12-07 17:08:00,2015-12-17 16:34:00,9 days 23:26:00
1,15-1811,CLOSED,The October 2015 monthly report for SeaWorld,Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time ""Request Published Public"",,""D...",,title ...,"December 7, 2015",via web,2015-12-07,2015-12-07 17:22:00,2015-12-15 07:12:00,7 days 13:50:00
2,15-1812,CLOSED,Records related to the following BIDS: Adams ...,City Clerk,"title,link http://www.sandiego.gov/park-and-re...",Mailei Ross-Cerezo,"title,item,time ""Request Closed Public"",Still ...",...,title \ 0 R...,"December 7, 2015",via web,2015-12-07,2015-12-07 17:29:00,2015-12-22 11:32:00,14 days 18:03:00
3,15-1813,CLOSED,Historical lease payments made by SeaWorld to ...,Department of Real Estate and Airport Management,,Jeffrey Wallace,"title,item,time ""Request Closed Public"",02. Re...",,title ...,"December 7, 2015",via web,2015-12-07,2015-12-07 17:33:00,2015-12-15 07:10:00,7 days 13:37:00
4,15-1814,CLOSED,"Open violations, variances, ordinances, approv...",Code Enforcement,"title,link Site Plan - 11943 El Camino Real.pd...",Ginger Rodriguez,"title,item,time ""Request Closed Public"",02. Re...",title \ 0 ...,title ...,"December 7, 2015",via web,2015-12-07,2015-12-07 17:39:00,2015-12-24 10:34:00,16 days 16:55:00


In [25]:
# Mean response time over requests whose department(s) match the pattern
dept_search = r'police|sheriff'
matches_dept = sd_clean['depts'].str.contains(dept_search, case=False)
sd_clean.loc[matches_dept, 'resp'].mean()

Timedelta('13 days 09:46:00.469225199')

In [53]:
# Requests whose department(s) match the pattern (case-insensitive)
dept_search = r'commission'
sd_clean[sd_clean['depts'].str.contains(dept_search, case=False)]

Unnamed: 0,id,status,desc,depts,docs,poc,msgs,docs_df,msgs_df,date,via,date_dt,open_time,close_time,resp
664,16-529,CLOSED,I am writing to request documents related to “...,Planning Commission,,Ginger Rodriguez,"title,item,time ""Request Published Public"",,""A...",,title \ 0 Requ...,"March 23, 2016",via web,2016-03-23,2016-03-23 09:49:00,2016-04-08 14:31:00,16 days 04:42:00
915,16-780,CLOSED,All records related to or arising out of: (i) ...,Planning Commission,,Ginger Rodriguez,"title,item,time ""Request Published Public"",,""M...",,title \ 0 Requ...,"April 22, 2016",via web,2016-04-22,2016-04-22 10:26:00,2016-05-16 16:08:00,24 days 05:42:00
3205,17-135,CLOSED,Saranya Kalai Associate Attorney Solomon Salts...,Planning Commission,"title,link PRA 17-135 Conditional Use Permit 6...",Ginger Rodriguez,"title,item,time ""Request Published Public"",,""J...",...,title \ 0 Requ...,"January 18, 2017",via web,2017-01-18,2017-01-18 13:24:00,2017-01-27 17:05:00,9 days 03:41:00
3434,17-364,CLOSED,"Dear Sir or Madam, I am writing to request all...",Planning Commission,,Angela Laurita,"title,item,time ""Request Published Public"",,""F...",,title \ 0 Requ...,"February 14, 2017",via web,2017-02-14,2017-02-14 14:42:00,2017-02-24 16:12:00,10 days 01:30:00
3888,17-818,CLOSED,Records related to Ranch Valley Farms/Rancho V...,"Development Services, Planning Commission",,Angela Laurita,"title,item,time ""Request Published Public"",,""M...",,title \ 0 Requ...,"April 5, 2017",via mail,2017-04-05,2017-04-05 15:22:00,2017-05-12 12:05:00,36 days 20:43:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27791,21-4322,CLOSED,Municipal Code updates. I often use the Munici...,"Development Services, Planning Commission","title,link https://www.sandiego.gov/city-clerk...",Ginger Rodriguez,"title,item,time ""Request Published Public"",,""A...",...,title \ 0 Requ...,"August 10, 2021",via web,2021-08-10,2021-08-10 13:21:00,2021-08-12 13:54:00,2 days 00:33:00
27935,21-4470,CLOSED,To Whom It May Concern/Custodian of Personnel ...,Commission on Police Practices,,Angela Laurita,"title,item,time ""Request Published Public"",,""S...",,title \ 0 Requ...,"August 18, 2021",via web,2021-08-18,2021-08-18 16:43:00,2021-09-07 08:24:00,19 days 15:41:00
28021,21-4560,CLOSED,"August 18, 2021 San Diego Police Department Re...","Police, Commission on Police Practices",,Angela Laurita,"title,item,time ""Request Published Public"",,""A...",,title \ 0 Requ...,"August 23, 2021",via mail,2021-08-23,2021-08-24 10:38:00,2021-08-24 11:50:00,0 days 01:12:00
28065,21-4606,CLOSED,"The agreement that represents the written ""bus...","Development Services, Planning Commission","title,link https://docs.sandiego.gov/council_r...",Ginger Rodriguez,"title,item,time ""Request Published Public"",,""S...",...,title \ 0 ...,"August 26, 2021",via web,2021-08-26,2021-08-26 15:18:00,2021-09-21 13:24:00,25 days 22:06:00


In [27]:
# Build the department-level frame with the project helper melt_depts; the
# cells below read its 'dept' column.
# NOTE(review): presumably one row per (request, department) pair — confirm
# against nextrequest_eda_utils.
sd_depts = melt_depts(sd_clean)

Timedelta('24 days 16:48:34.721515504')

In [44]:
# Count rows per distinct 'dept' value, most frequent first.
sd_depts.value_counts('dept')

dept
Police                                               6886
Development Services                                 4832
Public Records Administration                        4621
Code Enforcement                                     2038
Fire-Rescue                                          1755
                                                     ... 
Commission on Gang Prevention and Intervention          5
Deputy Chief Operating Officer - General Services       5
Compliance Department                                   4
Human Relations Commission                              4
Citizens Assistance                                     4
Length: 71, dtype: int64

In [46]:
# Number of distinct departments. Counting does not need the values sorted or
# copied into a list; dropna=False keeps a missing department counted as its
# own value, matching len(unique()).
sd_depts['dept'].nunique(dropna=False)

71

In [46]:
# Extract department assignment additions and removals based on message history
def _split_dept_names(names):
    """Split a ', '-separated department list into clean, non-empty names."""
    # Leading/trailing spaces and periods are sentence punctuation left over
    # from the message text (e.g. ' Police, Fire-Rescue. '), not name content.
    return [name.strip() for name in names.strip(' .').split(', ') if name.strip()]


def get_dept_assign(msgs, get_time=False):
    """Collect the departments added to and removed from one request.

    Only messages whose ``title`` contains ``'Department'`` are considered,
    processed in ascending ``time_dt`` order. Each message ``item`` is read as:

    * ``'Added: A, B.'`` / ``'Removed: C.'`` segments (either or both, in any
      order) -- the listed departments go to the added / removed collection;
    * anything without such a label -- the initial assignment, whose
      ``', '``-separated departments count as added.

    Parameters
    ----------
    msgs : pandas.DataFrame or None
        Message history with at least ``title``, ``item`` and ``time_dt``
        columns.
    get_time : bool, default False
        Not implemented yet; accepted for forward compatibility and ignored.
        TODO: add the option to include time information.

    Returns
    -------
    dict or None
        ``{'dept_added': [...], 'dept_removed': [...]}`` with de-duplicated,
        alphabetically sorted department names, or ``None`` when ``msgs`` is
        ``None`` or holds no department messages.
    """
    import re  # local import keeps this cell runnable on its own

    if msgs is None:
        return None

    # na=False: a message with a missing title is simply not a department message.
    is_dept_msg = msgs['title'].str.contains('Department', na=False)
    dept_msgs = msgs[is_dept_msg].sort_values(by='time_dt', ignore_index=True)
    if dept_msgs.empty:
        return None

    added, removed = set(), set()
    for item in dept_msgs['item']:
        if not isinstance(item, str):
            continue  # missing item (NaN/None): nothing to parse

        # Splitting on the captured label yields
        # [prefix, label, names, label, names, ...], so the result does not
        # depend on whether 'Added' or 'Removed' comes first in the message.
        parts = re.split(r'\b(Added|Removed):', item)
        if len(parts) == 1:
            # No label at all: this is the initial department assignment.
            added.update(_split_dept_names(item))
            continue

        for label, names in zip(parts[1::2], parts[2::2]):
            target = added if label == 'Added' else removed
            target.update(_split_dept_names(names))

    # Sorted lists make the output deterministic (plain set order is not).
    return {'dept_added': sorted(added), 'dept_removed': sorted(removed)}

In [106]:
# For every request, count the messages whose title contains 'Department',
# then order the requests from most to fewest such messages.
testtest = sd_clean['msgs_df'].apply(lambda df: len(df[df['title'].str.contains('Department')])).sort_values(ascending=False)

In [107]:
# Show the first 20 entries of the per-request 'Department' message counts.
testtest.head(20)

22065    53
21905    53
7386     50
14991    50
14983    49
14984    47
19196    45
19496    44
14988    44
14986    43
14166    42
14985    40
14987    38
14989    36
26149    30
20013    27
6398     27
5607     26
20200    25
22047    25
Name: msgs_df, dtype: int64

In [178]:
# Scratch check of the 'Removed' parsing on a single request (row label 0).
# Keep only that request's messages whose title contains 'Department'.
test = sd_clean.loc[0]['msgs_df'][lambda df: df['title'].str.contains('Department')]
# From items containing 'Removed', take the text after the last 'Removed: ',
# strip leading/trailing periods, and split it on ', ' into one list per message.
aa = test[lambda df: df['item'].str.contains('Removed')]['item'].str.split('Removed: ').str[-1].str.strip('.').str.split(', ').to_numpy()
# Flatten the per-message lists and de-duplicate the department names.
aaa = list(set([dept for l in list(aa) for dept in l]))
aaa

[]