# NextRequest data cleaning and EDA
*Author: Steven Yuan*

This notebook performs preliminary exploratory data analysis (EDA) on NextRequest data using pandas.

In [35]:
import pandas as pd
import numpy as np
from io import StringIO
import zipfile

import nextrequest_eda_utils
from nextrequest_eda_utils import *

In [36]:
import importlib

# Reload the helper module so edits to nextrequest_eda_utils.py are picked up
# without restarting the kernel.
importlib.reload(nextrequest_eda_utils)
# reload() re-executes the module but does not rebind names that were copied
# into this namespace by the earlier star import, so repeat the import here;
# otherwise the notebook keeps calling the pre-reload helper functions.
from nextrequest_eda_utils import *

<module 'nextrequest_eda_utils' from '/home/powerofapoint/notebooks/police-records-analysis/steven/eda/nextrequest_eda_utils.py'>

## San Diego

In [None]:
# Read the San Diego export straight out of its zip archive. The context
# managers close both the archive and the member handle once the CSV has been
# parsed, instead of leaving the ZipFile open for the life of the kernel.
with zipfile.ZipFile('../data/sd_requests.zip', 'r') as sd_zip:
    with sd_zip.open('sd_requests.csv') as sd_csv:
        sd = pd.read_csv(sd_csv)
sd.head()

In [None]:
# Run the shared NextRequest cleaning helper on the raw San Diego frame.
# NOTE(review): debug=True appears to print a "... complete" message after each
# cleaning stage (see the output of the Los Angeles run) — confirm in the utils module.
sd_clean = nextrequest_df_clean(sd, debug=True)

In [None]:
# Number of rows in the cleaned San Diego frame.
sd_clean.shape[0]

In [None]:
# Preview the first rows of the cleaned San Diego frame.
sd_clean.head()

In [None]:
# Message-history frame stored in the 'msgs_df' cell of the row labelled 4.
sd_clean.loc[4, 'msgs_df']

In [None]:
# Look up the message history of one request by its id.
# request_id carries its own double quotes because it is pasted verbatim into
# the query expression as a string literal.
request_id = '"17-22"'
sd_clean.query(f'id == {request_id}').iloc[0]['msgs_df']

In [None]:
# Requests whose description contains the search term (case insensitive).
desc_search = 'Padres'
sd_clean[sd_clean['desc'].str.contains(desc_search, case=False)]

In [None]:
# Requests whose department field contains the search term (case insensitive).
dept_search = 'police'
sd_clean[sd_clean['depts'].str.contains(dept_search, case=False)]

In [None]:
# Derive the open and close timestamps of every request from its message
# history, printing a marker after each column is filled in.
for time_col, extract_time_fn in (
    ('open_time', get_open_time),
    ('close_time', get_close_time),
):
    sd_clean[time_col] = sd_clean['msgs_df'].apply(extract_time_fn)
    print(time_col + ' done')

In [None]:
# Response time of each request: close time minus open time.
sd_clean['resp'] = sd_clean['close_time'].sub(sd_clean['open_time'])
sd_clean.head()

In [None]:
# Mean response time over requests whose department field matches
# "police" or "sheriff" (regex alternation, case insensitive).
dept_search = r'police|sheriff'
sd_clean.loc[
    sd_clean['depts'].str.contains(dept_search, case=False),
    'resp',
].mean()

In [None]:
# Reshape the cleaned frame with the melt_depts helper; the result exposes a
# 'dept' column that the following cells count.
# NOTE(review): presumably one row per (request, department) pair — confirm in the utils module.
sd_depts = melt_depts(sd_clean)

In [None]:
# Number of rows per value of the 'dept' column.
sd_depts.value_counts('dept')

In [None]:
# Number of distinct departments. Counting does not need the values sorted or
# copied into a list, and np.sort fails on a column that mixes NaN with
# strings; dropna=False keeps NaN counted as a value, as unique() did.
sd_depts['dept'].nunique(dropna=False)

## Los Angeles

In [111]:
# Read the Los Angeles export straight out of its zip archive. The context
# managers close both the archive and the member handle once the CSV has been
# parsed, instead of leaving the ZipFile open for the life of the kernel.
with zipfile.ZipFile('../data/lacity_requests_fix.zip', 'r') as la_zip:
    with la_zip.open('lacity_requests.csv') as la_csv:
        la = pd.read_csv(la_csv)
la.head()

Unnamed: 0,id,status,desc,date,depts,docs,poc,msgs
0,17-1,CLOSED,From: Andrew Pletcher <Andrew@JMLLAW.com>\nDat...,"December 4, 2017 via email",City Clerk,,Clerk CPRA Coordinator,"title,item,time\n""Request Published\nPublic"",,..."
1,17-3,CLOSED,"Hello,\n Relating to Council File No. 12-0621...","December 8, 2017 via web",City Clerk,,Clerk CPRA Coordinator,"title,item,time\n""Request Closed Hide\nPubli..."
2,17-4,CLOSED,To Custodian of Records:\n Pursuant to my rig...,"December 8, 2017 via web",City Clerk,,Clerk CPRA Coordinator,"title,item,time\n""Request Closed Hide\nPubli..."
3,17-5,CLOSED,Warner Center 2035 Plan section 10.3.1 (a) say...,"December 8, 2017 via web",City Clerk,,Clerk CPRA Coordinator,"title,item,time\n""Request Closed Hide\nPubli..."
4,17-8,CLOSED,"Ordinance/Resolution, staff reports, memorandu...","December 13, 2017 via web",City Clerk,,Clerk CPRA Coordinator,"title,item,time\n""Request Closed Hide\nPubli..."


In [112]:
# Rebuild the per-request message frames by hand: fill missing values in the
# raw LA frame, then pass each 'msgs' CSV string through
# csv_to_df -> df_fillna -> remove_empty_df.
msgs_df = df_fillna(la)['msgs'].apply(
    lambda raw_csv: remove_empty_df(df_fillna(csv_to_df(raw_csv)))
)

In [113]:
# Apply extract_time (with on=' by ') and then convert_time_to_dt to the 'time'
# column of every message frame.
# Known failure: on the LA data this currently raises
#   ParserError: Unknown string format: November 19, 2020, (Staff)
# NOTE(review): presumably some 'time' values carry a trailing "(Staff)" marker
# that extract_time does not strip — confirm against the utils module.
msgs_df.apply(
            lambda df: convert_time_to_dt(extract_time(df, col='time', on=' by '), col='time')
        )

ParserError: Unknown string format: November 19, 2020, (Staff)

In [3]:
# Run the shared cleaning helper on the raw Los Angeles frame.
# Known failure: after printing "date-via split complete" this currently raises
#   TypeError: string indices must be integers
# which matches the TODO item about msgs_df entries being strings rather than
# DataFrames. la_clean is therefore not defined until that bug is fixed.
la_clean = nextrequest_df_clean(la, debug=True)

fillna complete
docs_df complete
msgs_df complete
date-via split complete


TypeError: string indices must be integers

In [None]:
# Inspect the raw 'msgs' CSV text of the row labelled 4 in the LA frame.
la.loc[4]['msgs']

In [None]:
# Los Angeles requests whose department field contains the search term
# (case insensitive).
dept_search = 'police'
# Filter on dept_search. The original passed desc_search (the 'Padres'
# description term from the San Diego section), so the police filter defined
# on the line above was never applied.
la_clean[lambda df: df['depts'].str.contains(dept_search, case=False)]

## TODO
- Fix bug with LA data: some dfs in msgs_df column are being interpreted as strings, not dataframes
- Write a procedure to extract all the documents that are attached to a request based on the message history
- Run a significance test to determine whether response times for police-related requests differ from response times for non-police-related requests
- Some requests were re-opened/re-closed, how do we factor that into our analysis?
- Test EDA functions on different datasets once they are scraped

In [None]:
# TODO: Extract department assignment additions and removals based on message history
def get_dept_assign(msgs, get_time=False):
    """Summarise the department assignments recorded in a request's messages.

    Parameters
    ----------
    msgs : pandas.DataFrame or None
        Message history of a single request. Must provide the 'title', 'item'
        and 'time_dt' columns.
    get_time : bool, default False
        Reserved for returning timestamps alongside the departments; it is
        currently ignored.

    Returns
    -------
    dict or None
        None when ``msgs`` is None or no message title contains 'Department'.
        Otherwise a dict with two sorted, de-duplicated lists of strings:
        'dept_added' (initial assignments plus every department listed after
        'Added: ') and 'dept_removed' (every department listed after
        'Removed: ').
    """
    if msgs is None:
        return None

    # na=False keeps rows with a missing title/item from poisoning the masks.
    dept_assign = msgs[
        msgs['title'].str.contains('Department', na=False)
    ].sort_values(by='time_dt', ignore_index=True)

    if dept_assign.empty:
        return None

    items = dept_assign['item']
    has_added = items.str.contains('Added', na=False)
    has_removed = items.str.contains('Removed', na=False)

    # TODO: Add option to include time information
    # Items mentioning neither keyword are the initial assignment; each such
    # item is kept whole as one department name (assumption carried over from
    # the original code — confirm against real data).
    dept_init = items[~(has_added | has_removed)].dropna().tolist()

    # Text following the first 'Added: ' / 'Removed: ' marker, split on ', '.
    added_lists = (
        items[has_added]
        .str.split(r'Added: |Removed: ')
        .str[1]
        .str.strip(' .')
        .str.split(', ')
        .tolist()
    )
    # Text following the last 'Removed: ' marker, split on ', '.
    removed_lists = (
        items[has_removed]
        .str.split('Removed: ')
        .str[-1]
        .str.strip(' .')
        .str.split(', ')
        .tolist()
    )

    # The original called list(...).append(...), which returns None and made
    # the comprehension raise TypeError; concatenate the lists instead.
    # Non-list entries (NaN from an item that did not parse) are skipped.
    added = {
        dept
        for depts in [dept_init] + added_lists
        if isinstance(depts, list)
        for dept in depts
    }
    removed = {
        dept
        for depts in removed_lists
        if isinstance(depts, list)
        for dept in depts
    }

    return {
        'dept_added': sorted(added),
        'dept_removed': sorted(removed),
    }

In [None]:
# Per request, count the messages whose title mentions 'Department', then
# order the requests from most to fewest such messages.
testtest = (
    sd_clean['msgs_df']
    .apply(lambda msgs: len(msgs[msgs['title'].str.contains('Department')]))
    .sort_values(ascending=False)
)

In [None]:
# The 20 requests with the most department-related messages.
testtest.head(20)

In [None]:
# Scratch check of the 'Removed: ' parsing on the department messages of the
# request in row 0: take the text after the last 'Removed: ', drop trailing
# periods, split on ', ', and collect the distinct names.
test = sd_clean.loc[0]['msgs_df'][lambda msgs: msgs['title'].str.contains('Department')]
aa = (
    test[lambda rows: rows['item'].str.contains('Removed')]['item']
    .str.split('Removed: ')
    .str[-1]
    .str.strip('.')
    .str.split(', ')
    .to_numpy()
)
aaa = list({name for names in list(aa) for name in names})
aaa