In [22]:
# %%
# Core numeric / tabular / plotting stack plus filesystem path handling
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# pandarallel is initialised at import time; it logs the worker count it will use.
# NOTE(review): no pandarallel call appears in the cells visible here — confirm
# it is still needed before keeping the start-up cost.
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)


# Project-local module; us_state_abbrev is used below to turn full state names
# into their abbreviations
from us_state_abbrev import us_state_abbrev, abbrev_us_state
%matplotlib inline
# %%
# Directory (relative to the notebook) that holds the input files and receives
# the exported merged table
datap = Path('../Data')  # set the data path


INFO: Pandarallel will run on 24 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


<br>
<br>

# Import Dataset: Diversity Index by location

In [2]:
# Load the diversity-index table (one row per Location)
dfdiv = pd.read_csv(datap/"diversityindex.csv")


# Create State and County columns from the generic "County, State" Location
# column. Splitting once from the right with expand=True always yields exactly
# two index-aligned columns, even if a county name itself contains a comma.
dfdiv[['County', 'State']] = dfdiv['Location'].str.rsplit(',', n=1, expand=True)
dfdiv['County'] = dfdiv['County'].str.strip()
dfdiv['State'] = dfdiv['State'].str.strip()
dfdiv = dfdiv.drop(columns='Location')  # remove the original

# Rows whose Location has no comma end up with a null State and are dropped.
# @TODO find the source of the NaNs
# NOTE(review): the last recorded run reported 52 such rows, which would be
# consistent with state-/national-level aggregate rows — confirm against the CSV.
print(f'Number of nulls in State: {dfdiv["State"].isna().sum()}')
dfdiv = dfdiv.dropna(subset=['State'])
print(f'Number of nulls in State after dropping: {dfdiv["State"].isna().sum()}')


# Move the geographic keys to the front, selecting them by name rather than by
# position so the result does not depend on where the new columns were appended
geo_cols = ['State', 'County']
dfdiv = dfdiv[geo_cols + [col for col in dfdiv.columns if col not in geo_cols]]

# Remove extraneous instances of the 'County' string. regex=False makes the
# literal matching explicit instead of relying on the pandas-version default.
dfdiv['County'] = (
    dfdiv['County']
    .str.replace(' County', '', regex=False)
    .str.replace('County ', '', regex=False)
    .str.replace('County', '', regex=False)
)

dfdiv = dfdiv.sort_values(by=['State', 'County']).reset_index(drop=True)  # sort by geo loc

# Display the df and stats
print('\n\nTransposed DataFrame Head:')
display(dfdiv.head(5).T)  # transposed version is easier to see on screen
dfdiv.info()  # info() prints its own report and returns None, so it is not wrapped in display()
display(dfdiv.describe().T)


Number of nulls in State: 52
Number of nulls in State after dropping: 0


Transposed DataFrame Head:


Unnamed: 0,0,1,2,3,4
State,AK,AK,AK,AK,AK
County,Aleutians East Borough,Aleutians West Census Area,Anchorage Municipality,Bethel Census Area,Bristol Bay Borough
Diversity-Index,0.738867,0.769346,0.601515,0.315556,0.648271
"Black or African American alone, percent, 2013",7.7,7.4,6.3,0.7,0.0
"American Indian and Alaska Native alone, percent, 2013",21.8,13.8,8.1,81.8,33.8
"Asian alone, percent, 2013",41.4,31.1,8.9,1.0,1.1
"Native Hawaiian and Other Pacific Islander alone, percent,",0.7,2.3,2.3,0.3,0.3
"Two or More Races, percent, 2013",3.7,4.8,7.8,4.1,17.5
"Hispanic or Latino, percent, 2013",13.5,14.6,8.6,1.6,3.9
"White alone, not Hispanic or Latino, percent, 2013",12.9,29.2,60.5,11.5,45.3


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3143 entries, 0 to 3142
Data columns (total 10 columns):
 #   Column                                                      Non-Null Count  Dtype  
---  ------                                                      --------------  -----  
 0   State                                                       3143 non-null   object 
 1   County                                                      3143 non-null   object 
 2   Diversity-Index                                             3143 non-null   float64
 3   Black or African American alone, percent, 2013              3143 non-null   float64
 4   American Indian and Alaska Native alone, percent, 2013      3143 non-null   float64
 5   Asian alone, percent, 2013                                  3143 non-null   float64
 6   Native Hawaiian and Other Pacific Islander alone, percent,  3143 non-null   float64
 7   Two or More Races, percent, 2013                            3143 non-null   float64
 8 

None

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Diversity-Index,3143.0,0.298603,0.181991,0.021816,0.126351,0.266234,0.471548,0.769346
"Black or African American alone, percent, 2013",3143.0,9.17986,14.44597,0.0,0.7,2.3,10.7,85.3
"American Indian and Alaska Native alone, percent, 2013",3143.0,2.231594,7.618798,0.0,0.3,0.6,1.2,92.9
"Asian alone, percent, 2013",3143.0,1.344798,2.668245,0.0,0.4,0.6,1.2,42.6
"Native Hawaiian and Other Pacific Islander alone, percent,",3143.0,0.117086,0.943694,0.0,0.0,0.0,0.1,47.8
"Two or More Races, percent, 2013",3143.0,1.857111,1.504005,0.0,1.1,1.5,2.1,29.5
"Hispanic or Latino, percent, 2013",3143.0,8.839644,13.398905,0.2,2.0,3.7,9.0,95.7
"White alone, not Hispanic or Latino, percent, 2013",3143.0,77.399459,19.85886,3.2,65.8,84.9,93.35,98.9


<br>
<br>

# Import Dataset: Offenses Known to Law Enforcement

In [3]:
# Load the 2016 "Offenses Known to Law Enforcement" county table (Table 8).
# The first 4 spreadsheet rows are skipped so the next row becomes the header
# (presumably title rows — verify against the file).
dfoff = pd.read_excel(
    datap/"Table_8_Offenses_Known_to_Law_Enforcement_by_State_by_Metropolitan_and_Nonmetropolitan_Counties_2016.xls",
    skiprows=4,
)

# Clean the labels: the column headers contain embedded line breaks
dfoff.columns = [label.replace('\n', '_') for label in dfoff.columns]

# Fill forward the State column as it was originally a multiindex and
# had NaNs after import. Assigning the result (instead of an in-place fillna
# on the column selection) avoids the deprecated fillna(method=...) form and
# is guaranteed to write back to the frame.
dfoff['State'] = dfoff['State'].ffill()

# Drop the last 9 rows, which were apparently a footer; copy so that later
# column assignments act on an independent frame rather than a slice
dfoff = dfoff.iloc[:-9].copy()

# Display the df of known offenses
print('Transposed DataFrame Head:')
display(dfoff.head(6).T)  # transpose is easier to see on screen
dfoff.info()  # info() prints its own report and returns None, so it is not wrapped in display()
display(dfoff.describe().T)

Transposed DataFrame Head:


Unnamed: 0,0,1,2,3,4,5
State,ALABAMA - Metropolitan Counties,ALABAMA - Metropolitan Counties,ALABAMA - Metropolitan Counties,ALABAMA - Metropolitan Counties,ALABAMA - Metropolitan Counties,ALABAMA - Metropolitan Counties
County,Autauga,Baldwin,Bibb,Blount,Calhoun,Chilton
Violent_crime,73.0,127.0,0.0,394.0,23.0,151.0
Murder and_nonnegligent_manslaughter,1.0,1.0,0.0,1.0,0.0,0.0
Rape_(revised_definition)1,12.0,5.0,0.0,17.0,7.0,10.0
Rape_(legacy_definition)2,,,,,,
Robbery,8.0,23.0,0.0,9.0,5.0,3.0
Aggravated_assault,52.0,98.0,0.0,367.0,11.0,138.0
Property_crime,429.0,613.0,37.0,867.0,319.0,592.0
Burglary,146.0,229.0,20.0,261.0,137.0,247.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2612 entries, 0 to 2611
Data columns (total 13 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   State                                 2612 non-null   object 
 1   County                                2612 non-null   object 
 2   Violent_crime                         2610 non-null   float64
 3   Murder and_nonnegligent_manslaughter  2612 non-null   float64
 4   Rape_(revised_definition)1            2276 non-null   float64
 5   Rape_(legacy_definition)2             335 non-null    float64
 6   Robbery                               2612 non-null   float64
 7   Aggravated_assault                    2611 non-null   float64
 8   Property_crime                        2598 non-null   float64
 9   Burglary                              2602 non-null   float64
 10  Larceny-_theft                        2609 non-null   float64
 11  Motor_vehicle_the

None

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Violent_crime,2610.0,84.9341,343.189771,0.0,5.0,19.0,56.0,8854.0
Murder and_nonnegligent_manslaughter,2612.0,1.265697,4.893567,0.0,0.0,0.0,1.0,94.0
Rape_(revised_definition)1,2276.0,10.226714,29.212003,0.0,0.0,3.0,9.0,537.0
Rape_(legacy_definition)2,335.0,6.871642,16.962399,0.0,0.0,3.0,7.0,162.0
Robbery,2612.0,14.078867,99.094475,0.0,0.0,1.0,4.0,3044.0
Aggravated_assault,2611.0,59.794332,221.547003,0.0,4.0,14.0,42.0,5181.0
Property_crime,2598.0,548.147036,1927.669957,0.0,45.0,140.0,372.0,45228.0
Burglary,2602.0,135.31937,386.059692,0.0,13.0,42.0,115.0,9162.0
Larceny-_theft,2609.0,364.490609,1365.058087,0.0,26.0,82.0,231.0,29962.0
Motor_vehicle_theft,2610.0,47.947893,223.335308,0.0,3.0,9.0,28.0,6104.0


<br>
<br>

In [4]:
# Remove extraneous data from the State strings: keep only the part before
# ' - ' (e.g. "ALABAMA - Metropolitan Counties" -> "ALABAMA"), then bring it to
# standard capitalization so it can be looked up in us_state_abbrev
state_names = dfoff['State'].str.split(' - ').str[0].str.strip().str.title()

# Convert full State names to abbreviations
state_codes = state_names.map(us_state_abbrev)

# Fail loudly, and before touching dfoff, if any name has no abbreviation;
# this keeps the frame unchanged when the lookup cannot be completed
unknown_states = state_names[state_codes.isna()].unique().tolist()
if unknown_states:
    raise KeyError(f'No abbreviation found for state name(s): {unknown_states}')

dfoff['State'] = state_codes
dfoff['State'].sample(12, random_state=0).tolist()  # display a reproducible sample of States

['NC', 'AZ', 'WI', 'OK', 'PA', 'CO', 'NJ', 'TX', 'KS', 'FL', 'MO', 'OK']

In [5]:
# Order the offense rows by geographical location (state first, then county)
# and renumber the index to match the new order
dfoff = (
    dfoff
    .sort_values(by=['State', 'County'])
    .reset_index(drop=True)
)
dfoff

Unnamed: 0,State,County,Violent_crime,Murder and_nonnegligent_manslaughter,Rape_(revised_definition)1,Rape_(legacy_definition)2,Robbery,Aggravated_assault,Property_crime,Burglary,Larceny-_theft,Motor_vehicle_theft,Arson3
0,AL,Autauga,73.0,1.0,12.0,,8.0,52.0,429.0,146.0,233.0,50.0,
1,AL,Baldwin,127.0,1.0,5.0,,23.0,98.0,613.0,229.0,342.0,42.0,
2,AL,Barbour,32.0,0.0,4.0,,2.0,26.0,124.0,50.0,65.0,9.0,
3,AL,Bibb,0.0,0.0,0.0,,0.0,0.0,37.0,20.0,14.0,3.0,
4,AL,Blount,394.0,1.0,17.0,,9.0,367.0,867.0,261.0,501.0,105.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2607,WY,Sublette,3.0,0.0,0.0,,1.0,2.0,79.0,5.0,69.0,5.0,0.0
2608,WY,Sweetwater,25.0,0.0,7.0,,0.0,18.0,124.0,33.0,69.0,22.0,5.0
2609,WY,Uinta,4.0,0.0,0.0,,0.0,4.0,60.0,5.0,50.0,5.0,0.0
2610,WY,Washakie,0.0,0.0,0.0,,0.0,0.0,14.0,4.0,7.0,3.0,0.0


In [6]:
# Preview the diversity table keyed by (State, County); the result is only
# displayed and is not assigned back to dfdiv
dfdiv.set_index(keys=['State', 'County'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Diversity-Index,"Black or African American alone, percent, 2013","American Indian and Alaska Native alone, percent, 2013","Asian alone, percent, 2013","Native Hawaiian and Other Pacific Islander alone, percent,","Two or More Races, percent, 2013","Hispanic or Latino, percent, 2013","White alone, not Hispanic or Latino, percent, 2013"
State,County,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AK,Aleutians East Borough,0.738867,7.7,21.8,41.4,0.7,3.7,13.5,12.9
AK,Aleutians West Census Area,0.769346,7.4,13.8,31.1,2.3,4.8,14.6,29.2
AK,Anchorage Municipality,0.601515,6.3,8.1,8.9,2.3,7.8,8.6,60.5
AK,Bethel Census Area,0.315556,0.7,81.8,1.0,0.3,4.1,1.6,11.5
AK,Bristol Bay Borough,0.648271,0.0,33.8,1.1,0.3,17.5,3.9,45.3
...,...,...,...,...,...,...,...,...,...
WY,Sweetwater,0.340296,1.7,1.2,0.9,0.2,1.7,15.9,79.6
WY,Teton,0.325230,1.5,1.0,1.3,0.1,1.5,15.1,80.7
WY,Uinta,0.225552,0.8,1.3,0.4,0.2,1.7,9.1,87.5
WY,Washakie,0.296868,0.8,1.7,0.7,0.1,1.7,14.2,82.6


<br>
<br>

# Merge the DataFrames

In [20]:
# Combine diversity and offense data for counties present in BOTH tables
merge_keys = ['State', 'County']
df = dfdiv.merge(dfoff, how='inner', on=merge_keys)

# An inner join silently discards rows whose (State, County) key is missing on
# the other side, so report how much data was lost to unmatched keys
offense_match = dfoff[merge_keys].merge(
    dfdiv[merge_keys].drop_duplicates(), how='left', on=merge_keys, indicator=True
)
n_offense_unmatched = (offense_match['_merge'] == 'left_only').sum()
print(f'Rows: diversity={len(dfdiv)}, offenses={len(dfoff)}, merged={len(df)}')
print(f'Offense rows without a matching diversity row: {n_offense_unmatched}')
df

Unnamed: 0,State,County,Diversity-Index,"Black or African American alone, percent, 2013","American Indian and Alaska Native alone, percent, 2013","Asian alone, percent, 2013","Native Hawaiian and Other Pacific Islander alone, percent,","Two or More Races, percent, 2013","Hispanic or Latino, percent, 2013","White alone, not Hispanic or Latino, percent, 2013",...,Murder and_nonnegligent_manslaughter,Rape_(revised_definition)1,Rape_(legacy_definition)2,Robbery,Aggravated_assault,Property_crime,Burglary,Larceny-_theft,Motor_vehicle_theft,Arson3
0,AL,Autauga,0.388898,18.4,0.5,1.1,0.1,1.7,2.7,75.9,...,1.0,12.0,,8.0,52.0,429.0,146.0,233.0,50.0,
1,AL,Baldwin,0.297942,9.5,0.7,0.9,0.1,1.5,4.6,83.1,...,1.0,5.0,,23.0,98.0,613.0,229.0,342.0,42.0,
2,AL,Barbour,0.556605,47.6,0.6,0.5,0.2,0.9,4.8,46.3,...,0.0,4.0,,2.0,26.0,124.0,50.0,65.0,9.0,
3,AL,Bibb,0.394100,22.1,0.4,0.2,0.1,0.9,2.1,74.6,...,0.0,0.0,,0.0,0.0,37.0,20.0,14.0,3.0,
4,AL,Blount,0.221056,1.8,0.6,0.3,0.1,1.1,8.7,87.8,...,1.0,17.0,,9.0,367.0,867.0,261.0,501.0,105.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2465,WY,Sublette,0.218882,1.4,1.1,1.1,0.0,1.4,7.8,88.0,...,0.0,0.0,,1.0,2.0,79.0,5.0,69.0,5.0,0.0
2466,WY,Sweetwater,0.340296,1.7,1.2,0.9,0.2,1.7,15.9,79.6,...,0.0,7.0,,0.0,18.0,124.0,33.0,69.0,22.0,5.0
2467,WY,Uinta,0.225552,0.8,1.3,0.4,0.2,1.7,9.1,87.5,...,0.0,0.0,,0.0,4.0,60.0,5.0,50.0,5.0,0.0
2468,WY,Washakie,0.296868,0.8,1.7,0.7,0.1,1.7,14.2,82.6,...,0.0,0.0,,0.0,0.0,14.0,4.0,7.0,3.0,0.0


<br>
<br>

# Export the DataFrame to a file

In [23]:
# Persist the merged table. to_parquet compresses with snappy by default, so
# gzip is requested explicitly to match the '.gzip' suffix of the file name.
df.to_parquet(datap/'merged_df.parquet.gzip', compression='gzip')