In [1]:
import pandas as pd
import numpy as np

In [2]:
def mem_usage(pandas_obj):
    """Return the deep memory footprint of a pandas object as an 'X.XX MB' string.

    Parameters
    ----------
    pandas_obj : pd.DataFrame or pd.Series
        Anything that is not a DataFrame is treated as a Series.

    Returns
    -------
    str
        Footprint in mebibytes (bytes / 1024**2), two decimals, suffixed ' MB'.
    """
    footprint_bytes = pandas_obj.memory_usage(deep=True)
    if isinstance(pandas_obj, pd.DataFrame):
        # DataFrame.memory_usage gives one value per column (plus the index);
        # collapse them into a single total.
        footprint_bytes = footprint_bytes.sum()
    footprint_mb = footprint_bytes / 1024 ** 2
    return "{:03.2f} MB".format(footprint_mb)

In [3]:
# Keyword arguments shared by every pd.read_csv call in this notebook.
csv_opts = dict(
    sep='|',             # pipe-delimited fields
    quotechar='"',
    compression='gzip',  # inputs are .csv.gz
    encoding='utf-8',
    header=5,            # zero-based row 5 holds the column names; earlier rows are skipped
)

In [4]:
# Column dtypes for the arrests files: everything is categorical except the
# apprehension date, which is read as text.
arrest_dtypes = dict.fromkeys(
    [
        'Area of Responsibility',
        'Apprehension Date',
        'Apprehension Method',
        'Apprehension Landmark',
        'Operation',
        'Processing Disposition',
        'Birth Date',
        'Citizenship',
        'Gender',
    ],
    'category',
)
arrest_dtypes['Apprehension Date'] = 'str'  # existing key: position in the dict is kept

# Column dtypes for the encounters files: everything is categorical except the
# event date, which is read as text.
encounter_dtypes = dict.fromkeys(
    [
        'Event Area of Responsibility',
        'Event Date',
        'Landmark',
        'Operation',
        'Processing Disposition',
        'Birth Date',
        'Citizenship',
        'Gender',
    ],
    'category',
)
encounter_dtypes['Event Date'] = 'str'  # existing key: position in the dict is kept

# Column dtypes for the removals files: categorical by default, with the three
# date columns read as text.
removal_dtypes = dict.fromkeys(
    [
        'Area of Responsibility',
        'Apprehension Date',
        'Processing Disposition Code',
        'Birth Date',
        'Citizenship',
        'Gender',
        'RC Threat Level',
        'Final Charge Section',
        'Departed Date',
        'Removal Date',
    ],
    'category',
)
# All three keys already exist, so the original key order is kept.
removal_dtypes.update(
    dict.fromkeys(['Apprehension Date', 'Departed Date', 'Removal Date'], 'str')
)

In [None]:
def _read_fiscal_year(dataset, fiscal_year, dtypes, date_columns):
    """Read one gzipped, pipe-delimited fiscal-year file from ../input.

    Parameters
    ----------
    dataset : str
        File-name prefix, e.g. 'EROArrests'.
    fiscal_year : int
        Four-digit fiscal year used in the file name (FY2016 ... FY2019).
    dtypes : dict
        Column -> dtype mapping passed to ``pd.read_csv(dtype=...)``.
    date_columns : list of str
        Columns passed to ``pd.read_csv(parse_dates=...)``.

    Returns
    -------
    pd.DataFrame
    """
    path = '../input/{}_FY{}.csv.gz'.format(dataset, fiscal_year)
    return pd.read_csv(path,
                       **csv_opts,
                       dtype=dtypes,
                       parse_dates=date_columns)


# One frame per dataset and fiscal year. The per-year names are kept because
# later cells concatenate, count and delete them individually.
arrests16, arrests17, arrests18, arrests19 = [
    _read_fiscal_year('EROArrests', fiscal_year, arrest_dtypes,
                      ['Apprehension Date'])
    for fiscal_year in (2016, 2017, 2018, 2019)
]
encounters16, encounters17, encounters18, encounters19 = [
    _read_fiscal_year('EROEncounters', fiscal_year, encounter_dtypes,
                      ['Event Date'])
    for fiscal_year in (2016, 2017, 2018, 2019)
]
# Only the departed/removal dates are parsed for removals; 'Apprehension Date'
# stays as text there, exactly as in the per-file calls this replaces.
removals16, removals17, removals18, removals19 = [
    _read_fiscal_year('ICERemovals', fiscal_year, removal_dtypes,
                      ['Departed Date', 'Removal Date'])
    for fiscal_year in (2016, 2017, 2018, 2019)
]

In [None]:
# Stack the four fiscal years of each dataset into one frame.
# ignore_index=True gives every row a unique 0..N-1 label; without it each
# yearly file's own row labels are kept, so labels repeat across years and
# label-aligned assignments or .loc lookups on the combined frame become
# ambiguous.
arrests = pd.concat([arrests16, arrests17, arrests18, arrests19],
                    ignore_index=True)
encounters = pd.concat([encounters16, encounters17, encounters18, encounters19],
                       ignore_index=True)
removals = pd.concat([removals16, removals17, removals18, removals19],
                     ignore_index=True)

In [None]:
# Convert 'object' columns to categories, where efficient.
# Implementation via https://www.dataquest.io/blog/pandas-big-data/
#
# A column is converted when fewer than half of its values are distinct.
# Each column is converted directly on the frame: this avoids copying every
# object column into a scratch DataFrame and avoids writing the result back
# through a label-aligned multi-column assignment.
dfs_to_convert = [arrests, encounters, removals]
for df in dfs_to_convert:
    num_total_values = len(df)
    if num_total_values == 0:
        # Nothing to convert, and the ratio below would divide by zero.
        continue

    for col in df.select_dtypes(include=['object']).columns:
        num_unique_values = len(df[col].unique())
        if num_unique_values / num_total_values < 0.5:
            df[col] = df[col].astype('category')

In [None]:
# Print a summary (columns, dtypes, non-null counts, memory) of the combined arrests frame.
arrests.info()

In [None]:
# Print a summary (columns, dtypes, non-null counts, memory) of the combined encounters frame.
encounters.info()

In [None]:
# Print a summary (columns, dtypes, non-null counts, memory) of the combined removals frame.
removals.info()

In [None]:
# Print the row count of every per-year frame.
individual_dfs = [arrests16,
                  arrests17,
                  arrests18,
                  arrests19,
                  encounters16,
                  encounters17,
                  encounters18,
                  encounters19,
                  removals16,
                  removals17,
                  removals18,
                  removals19]
for df in individual_dfs:
    print(len(df))
# `del df` inside the loop only unbound the loop variable; the list itself
# still referenced all twelve frames, so deleting the per-year names afterwards
# could not release their memory. Drop the list (and the loop variable) once
# the counts are printed.
del df, individual_dfs

In [None]:
# Unbind the per-year frames; the combined arrests/encounters/removals frames remain.
del (
    arrests16, arrests17, arrests18, arrests19,
    encounters16, encounters17, encounters18, encounters19,
    removals16, removals17, removals18, removals19,
)

In [None]:
# Drop the redacted column, then normalize headers: lower-case, spaces -> underscores.
redacted = ['Birth Date']
arrests = arrests.drop(columns=redacted)
arrests.columns = arrests.columns.str.lower().str.replace(' ', '_')

In [None]:
# Drop the redacted column, then normalize headers: lower-case, spaces -> underscores.
redacted = ['Birth Date']
encounters = encounters.drop(columns=redacted)
encounters.columns = encounters.columns.str.lower().str.replace(' ', '_')

In [None]:
# Drop the redacted column, then normalize headers: lower-case, spaces -> underscores.
redacted = ['Birth Date']
removals = removals.drop(columns=redacted)
removals.columns = removals.columns.str.lower().str.replace(' ', '_')

In [None]:
# Give every row of each combined frame a positional id (0 .. len-1).
for frame in (arrests, removals, encounters):
    frame['id'] = range(len(frame))

In [None]:
# Use the same short column name, 'aor', for area of responsibility in all three frames.
arrests = arrests.rename(columns={'area_of_responsibility': 'aor'})
encounters = encounters.rename(columns={'event_area_of_responsibility': 'aor'})
removals = removals.rename(columns={'area_of_responsibility': 'aor'})

In [None]:
# Shorten the aor labels by removing the literal phrase 'Area of Responsibility',
# then store the column as a category again.
# Removing the phrase alone leaves the whitespace that surrounded it in the
# label (presumably values look like '<Office> Area of Responsibility' --
# confirm against the data), so the result is stripped as well.
for frame in (removals, arrests, encounters):
    frame['aor'] = (
        frame['aor']
        .str.replace('Area of Responsibility', '', regex=False)  # literal match, not a regex
        .str.strip()
        .astype('category')
    )

In [None]:
# Print the arrests summary again, now with the renamed columns and the cleaned 'aor' category.
arrests.info()

In [None]:
# Count arrests per area of responsibility within yearly bins that start each
# October ('AS-OCT'), using apprehension_date as the time index.
grouper = [pd.Grouper(freq='AS-OCT'), 'aor']
(
    arrests
    .set_index('apprehension_date')
    .groupby(grouper)['id']
    .count()
)