# Analysis: Arrests to Offenses

Workflow: 3 <br>
Goal: Analyze the ratio of arrests to offenses. <br>

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import crime_helper as ch

In [2]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Show full cell contents without truncation. `None` is the supported way to
# disable the column-width limit; the old `-1` sentinel is deprecated and is
# rejected by current pandas. Fall back to `-1` only on releases that refuse None.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

## Initial Setup

In [3]:
# Lookup tables (paths are relative to the notebook's working directory).
STATE_FILE = 'data/lookup_state.csv'
REGION_FILE = 'data/lookup_region.csv'

# State-level input files.
ARREST_FILE = 'data/arrest_tkm_state.csv'
OFFENSE_FILE = 'data/offense_tkm_state.csv'
OFFENDER_FILE = 'data/offender_tkm_state.csv'
VICTIM_FILE = 'data/victim_tkm_state.csv'
EMPLOYMENT_FILE = 'data/employment_state.csv'

In [4]:
# Load the state lookup table, then preview the first rows and the column summary.
state_df = pd.read_csv(filepath_or_buffer=STATE_FILE)
state_df.head(n=5)
state_df.info()

Unnamed: 0,region_code,state_abbr,state_fips_code,state_id,state_name
0,4,AK,2.0,1,Alaska
1,3,AL,1.0,2,Alabama
2,3,AR,5.0,3,Arkansas
3,99,AS,60.0,4,American Samoa
4,4,AZ,4.0,5,Arizona


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58 entries, 0 to 57
Data columns (total 5 columns):
region_code        58 non-null int64
state_abbr         58 non-null object
state_fips_code    57 non-null float64
state_id           58 non-null int64
state_name         58 non-null object
dtypes: float64(1), int64(2), object(2)
memory usage: 2.3+ KB


In [5]:
# Load the region lookup table and display it in full.
region_df = pd.read_csv(filepath_or_buffer=REGION_FILE)
region_df

Unnamed: 0,region_code,region_desc,region_name
0,0,U.S. Territories,U.S. Territories
1,1,Region I,Northeast
2,2,Region II,Midwest
3,3,Region III,South
4,4,Region IV,West
5,99,Other,Other


In [6]:
# Attach region description/name to every state via the shared region_code key.
state_full_df = state_df.merge(region_df, how='inner', on='region_code')
state_full_df.head()

Unnamed: 0,region_code,state_abbr,state_fips_code,state_id,state_name,region_desc,region_name
0,4,AK,2.0,1,Alaska,Region IV,West
1,4,AZ,4.0,5,Arizona,Region IV,West
2,4,CA,6.0,6,California,Region IV,West
3,4,CO,8.0,7,Colorado,Region IV,West
4,4,HI,15.0,15,Hawaii,Region IV,West


In [7]:
# Keep only the columns needed downstream, ordered alphabetically by state
# abbreviation, with a fresh 0..n-1 index.
state_clean_df = (
    state_full_df[['state_abbr', 'state_name', 'region_code', 'region_name']]
    .sort_values('state_abbr')
    .reset_index(drop=True)
)
state_clean_df.head(10)
state_clean_df.info()

Unnamed: 0,state_abbr,state_name,region_code,region_name
0,AK,Alaska,4,West
1,AL,Alabama,3,South
2,AR,Arkansas,3,South
3,AS,American Samoa,99,Other
4,AZ,Arizona,4,West
5,CA,California,4,West
6,CO,Colorado,4,West
7,CT,Connecticut,1,Northeast
8,CZ,Canal Zone,99,Other
9,DC,District of Columbia,3,South


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58 entries, 0 to 57
Data columns (total 4 columns):
state_abbr     58 non-null object
state_name     58 non-null object
region_code    58 non-null int64
region_name    58 non-null object
dtypes: int64(1), object(3)
memory usage: 1.9+ KB


## Explore Arrests

In [8]:
# Load the arrest records and give the generic 'count' column a descriptive name.
arrest_df = pd.read_csv(ARREST_FILE).rename(columns={'count': 'arrests'})
arrest_df.head()
arrest_df.info()

Unnamed: 0,state,year,gender,offense,arrests
0,AK,2000,female,aggravated-assault,194
1,AK,2000,female,arson,3
2,AK,2000,female,burglary,49
3,AK,2000,female,curfew,1
4,AK,2000,female,disorderly-conduct,221


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51102 entries, 0 to 51101
Data columns (total 5 columns):
state      51102 non-null object
year       51102 non-null int64
gender     51102 non-null object
offense    51102 non-null object
arrests    51102 non-null int64
dtypes: int64(2), object(3)
memory usage: 1.9+ MB


In [9]:
# Total arrests per (state, year), summed over gender and offense type.
# The 'arrests' column is selected explicitly so the string columns (gender,
# offense) are never handed to sum(): relying on pandas to drop them silently is
# deprecated, and newer releases concatenate the strings instead.
arrest_total_df = (
    arrest_df
    .groupby(['state', 'year'])[['arrests']]
    .sum()
    .sort_index()       # order by state, then year
    .reset_index()      # turn the group keys back into ordinary columns
)
arrest_total_df.head()

Unnamed: 0,state,year,arrests
0,AK,2000,21354
1,AK,2001,20267
2,AK,2002,19818
3,AK,2003,22746
4,AK,2004,21027


In [10]:
# Number of (state, year) rows available per year in the arrest totals.
arrest_year_df = (
    arrest_total_df
    .groupby('year')
    .count()
    .sort_index()
    .rename(columns={'arrests': 'row_count'})
)
arrest_year_df

Unnamed: 0_level_0,state,row_count
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,48,48
2001,50,50
2002,50,50
2003,50,50
2004,50,50
2005,50,50
2006,50,50
2007,50,50
2008,50,50
2009,50,50


## Explore Offenses

Conclusion: Not all states are available.

In [11]:
# Load the offense records and give the generic 'count' column a descriptive name.
offense_df = pd.read_csv(OFFENSE_FILE).rename(columns={'count': 'offenses'})
offense_df.head()
offense_df.info()

Unnamed: 0,state,year,offense,offenses
0,AL,1991,aggravated-assault,21909
1,AL,1991,arson,92
2,AL,1991,burglary,47915
3,AL,1991,homicide,434
4,AL,1991,larceny,107142


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7323 entries, 0 to 7322
Data columns (total 4 columns):
state       7323 non-null object
year        7323 non-null int64
offense     7323 non-null object
offenses    7323 non-null int64
dtypes: int64(2), object(2)
memory usage: 228.9+ KB


In [12]:
# Total offenses per (state, year) for years after 2000, summed over offense type.
# NOTE(review): the arrest data shown above starts at 2000, while this filter keeps
# only years strictly greater than 2000 — confirm that excluding 2000 is intended.
# The 'offenses' column is selected explicitly so the string 'offense' column is
# never handed to sum(): relying on pandas to drop it silently is deprecated, and
# newer releases concatenate the strings instead.
offense_total_df = (
    offense_df
    .loc[offense_df['year'] > 2000]
    .groupby(['state', 'year'])[['offenses']]
    .sum()
    .sort_index()       # order by state, then year
    .reset_index()      # turn the group keys back into ordinary columns
)
offense_total_df.head()

Unnamed: 0,state,year,offenses
0,AL,2006,5036
1,AL,2007,5144
2,AL,2008,4876
3,AL,2009,4762
4,AL,2010,4524


In [13]:
# Number of (state, year) rows available per year in the offense totals.
offense_year_df = (
    offense_total_df
    .groupby('year')
    .count()
    .sort_index()
    .rename(columns={'offenses': 'row_count'})
)
offense_year_df

Unnamed: 0_level_0,state,row_count
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,23,23
2002,24,24
2003,27,27
2004,31,31
2005,33,33
2006,35,35
2007,35,35
2008,36,36
2009,36,36
2010,36,36


## Merge DataFrames

In [14]:
# Keep only the (state, year) pairs present in both the arrest and offense totals.
merge_df = arrest_total_df.merge(offense_total_df, how='inner', on=['state', 'year'])
merge_df.head()
merge_df.info()

Unnamed: 0,state,year,arrests,offenses
0,AL,2006,99804,5036
1,AL,2007,122305,5144
2,AL,2008,130248,4876
3,AL,2009,118120,4762
4,AL,2010,85352,4524


<class 'pandas.core.frame.DataFrame'>
Int64Index: 626 entries, 0 to 625
Data columns (total 4 columns):
state       626 non-null object
year        626 non-null int64
arrests     626 non-null int64
offenses    626 non-null int64
dtypes: int64(3), object(1)
memory usage: 24.5+ KB


In [30]:
# Per-row arrests-to-offenses ratio, rounded to four decimal places.
merge_df['ratio'] = merge_df['arrests'].div(merge_df['offenses']).round(4)
merge_df.head()

Unnamed: 0,state,year,arrests,offenses,ratio
0,AL,2006,99804,5036,19.8181
1,AL,2007,122305,5144,23.7762
2,AL,2008,130248,4876,26.7121
3,AL,2009,118120,4762,24.8047
4,AL,2010,85352,4524,18.8665


In [31]:
# Number of merged rows available per year.
merge_year_df = merge_df.groupby('year').count().sort_index()
merge_year_df

Unnamed: 0_level_0,state,arrests,offenses,ratio
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2001,23,23,23,23
2002,24,24,24,24
2003,27,27,27,27
2004,31,31,31,31
2005,33,33,33,33
2006,35,35,35,35
2007,35,35,35,35
2008,36,36,36,36
2009,36,36,36,36
2010,36,36,36,36


In [16]:
# Yearly totals across all states in merge_df.
# Only the additive count columns are aggregated: summing the per-state 'ratio'
# column (or the 'state' strings) would not give a meaningful yearly figure.
merge_count_df = (
    merge_df
    .groupby('year')[['arrests', 'offenses']]
    .sum()
    .sort_index()
)
# Yearly arrests-to-offenses ratio, derived from the yearly totals.
merge_count_df['ratio'] = merge_count_df['arrests'] / merge_count_df['offenses']
merge_count_df

Unnamed: 0_level_0,arrests,offenses,ratio
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2001,2973231,3614358,0.822617
2002,2972690,3834648,0.775218
2003,3169240,3931718,0.80607
2004,4060281,4373688,0.928343
2005,4289353,4952372,0.866121
2006,4619419,5166540,0.894103
2007,4807110,5256348,0.914534
2008,4881589,5310958,0.919154
2009,4743882,5377078,0.882242
2010,4457328,5426710,0.821368


In [17]:
# Explore whether the sparse data can be broken up by region:
# attach state name and region to every merged (state, year) row.
combined_df = merge_df.merge(
    state_clean_df,
    how='inner',
    left_on='state',
    right_on='state_abbr',
)
combined_df.head()

Unnamed: 0,state,year,arrests,offenses,state_abbr,state_name,region_code,region_name
0,AL,2006,99804,5036,AL,Alabama,3,South
1,AL,2007,122305,5144,AL,Alabama,3,South
2,AL,2008,130248,4876,AL,Alabama,3,South
3,AL,2009,118120,4762,AL,Alabama,3,South
4,AL,2010,85352,4524,AL,Alabama,3,South


In [18]:
# Number of rows per region and year.
combined_region_df = combined_df.groupby(['region_name', 'year'])[['state']].count()
combined_region_df

Unnamed: 0_level_0,Unnamed: 1_level_0,state
region_name,year,Unnamed: 2_level_1
Midwest,2001,8
Midwest,2002,8
Midwest,2003,8
Midwest,2004,9
Midwest,2005,9
Midwest,2006,10
Midwest,2007,10
Midwest,2008,10
Midwest,2009,10
Midwest,2010,10


## Create 3 Period DataFrames

In [22]:
# Rows whose year falls in 2004-2005 (both endpoints included).
in_0405 = (combined_df['year'] >= 2004) & (combined_df['year'] <= 2005)
period_0405_df = combined_df.loc[in_0405]
period_0405_df.head()
period_0405_df.shape

Unnamed: 0,state,year,arrests,offenses,state_abbr,state_name,region_code,region_name
16,AR,2004,95911,126152,AR,Arkansas,3,South
17,AR,2005,103396,152712,AR,Arkansas,3,South
31,AZ,2004,232119,3930,AZ,Arizona,4,West
32,AZ,2005,222729,5886,AZ,Arizona,4,West
49,CO,2004,160958,203544,CO,Colorado,4,West


(64, 8)

In [23]:
# Rows whose year falls in 2011-2012 (both endpoints included).
in_1112 = (combined_df['year'] >= 2011) & (combined_df['year'] <= 2012)
period_1112_df = combined_df.loc[in_1112]
period_1112_df.head()
period_1112_df.shape

Unnamed: 0,state,year,arrests,offenses,state_abbr,state_name,region_code,region_name
5,AL,2011,1676,4524,AL,Alabama,3,South
6,AL,2012,1597,4574,AL,Alabama,3,South
23,AR,2011,77941,265542,AR,Arkansas,3,South
24,AR,2012,79201,264634,AR,Arkansas,3,South
38,AZ,2011,210903,12394,AZ,Arizona,4,West


(73, 8)

In [24]:
# Rows whose year falls in 2017-2018 (both endpoints included).
in_1718 = (combined_df['year'] >= 2017) & (combined_df['year'] <= 2018)
period_1718_df = combined_df.loc[in_1718]
period_1718_df.head()
period_1718_df.shape

Unnamed: 0,state,year,arrests,offenses,state_abbr,state_name,region_code,region_name
11,AL,2017,77342,2156,AL,Alabama,3,South
12,AL,2018,76309,2028,AL,Alabama,3,South
29,AR,2017,71449,212578,AR,Arkansas,3,South
30,AR,2018,75950,206696,AR,Arkansas,3,South
44,AZ,2017,209542,18722,AZ,Arizona,4,West


(83, 8)

In [27]:
# Examine the number of rows per region in each of the three periods.
# Each statement stays a top-level expression so that all three tables are shown.
period_0405_df.groupby('region_name')[['state', 'year']].count()
period_1112_df.groupby('region_name')[['state', 'year']].count()
period_1718_df.groupby('region_name')[['state', 'year']].count()

Unnamed: 0_level_0,state,year
region_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Midwest,18,18
Northeast,12,12
South,22,22
West,12,12


Unnamed: 0_level_0,state,year
region_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Midwest,20,20
Northeast,13,13
South,26,26
West,14,14


Unnamed: 0_level_0,state,year
region_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Midwest,24,24
Northeast,14,14
South,29,29
West,16,16


In [19]:
# Write the merged arrests/offenses table to disk (without the index), then report
# the size of the written file in units of 1e6 bytes.
save_path = 'data/crime_data_full.csv'
merge_df.to_csv(save_path, index=False)
size_mb = round(os.path.getsize(save_path) / 1e6, 2)
f'{size_mb} mb'

'0.01 mb'