# **Data Cleaning - 38741-0001-Data.tsv and 38741-0002-Data.tsv**
* The datasets contain Uniform Crime Reporting data: the number of arrests by state, county, and police agency, broken down by age, sex, and race, for 2017
* County information is provided by number from 1 to 254, which are mapped with the county name using another data obtained separately
* **Source:** United States. Federal Bureau of Investigation 2017
* **URL:** https://www.icpsr.umich.edu/web/ICPSR/studies/38741
* **Data Dictionary:** 38741-0001-Codebook-ICPSR.pdf

* **Source:** Originating Agency Identifier (ORI) Lookup Table
* **URL:** https://www.icpsr.umich.edu/files/NACJD/ORIs/STATESoris.html

* **Data dictionary:** 38741-0001-Codebook-ICPSR.pdf

# **Import Modules**

In [25]:
#### Import the libraries needed
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from pathlib import Path
import os
import glob

import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning
warnings.simplefilter('ignore', ConvergenceWarning)
warnings.filterwarnings('ignore')
%matplotlib notebook
%matplotlib inline

# **Set Environment**

In [26]:
# Select the Plotly renderer used for figures shown in this notebook.
# NOTE(review): "vscode" targets the VS Code notebook UI; figures may not display
# when the notebook is opened elsewhere — confirm the intended environment.
import plotly.io as pio
pio.renderers.default = "vscode"

In [27]:
# Resolve every data folder relative to the directory the notebook is launched from
working_directory = Path.cwd()
data_directory = working_directory / 'data'
# One folder per pipeline stage: raw inputs, processed outputs, final deliverables
raw_data_directory, processed_data_directory, final_data_directory = (
    data_directory / stage for stage in ('raw', 'processed', 'final')
)

In [28]:
# Display every row and column of a frame (scrollable output) and show up to
# 100 characters of text per cell
display_options = {
    'display.max_rows': None,
    'display.max_columns': None,
    'display.max_colwidth': 100,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [29]:
# Start from the 'fivethirtyeight' style sheet, then override individual defaults
plt.style.use('fivethirtyeight')

plt.rcParams.update({
    # figure size and suptitle
    "figure.figsize": [10, 5],
    "figure.titlesize": 22,
    "figure.titleweight": "bold",
    # general text color
    'text.color': '#333333',
    # axes title
    "axes.titlesize": 16,
    "axes.titleweight": "bold",
    "axes.titlelocation": "left",
    'axes.titlecolor': '#333333',
    # axes labels
    "axes.labelsize": 12,
    "axes.labelweight": "bold",
    'axes.labelcolor': '#333333',
    # spines: keep bottom/left, hide top/right
    "axes.spines.bottom": True,
    "axes.spines.left": True,
    "axes.spines.top": False,
    "axes.spines.right": False,
    # tick colors
    'xtick.color': "#333333",
    'ytick.color': "#333333",
    # line width
    'lines.linewidth': 1,
})

# **Load 38741-0001-Data.tsv and 38741-0002-Data.tsv**

* 38741-0001-Data.tsv has 294,667 rows and 78 columns; 38741-0002-Data.tsv has 257,361 rows and 78 columns

* According to the data dictionary, the values of 'SUMLEV', 'REGION' and 'DIVISION' are as follows:
**'SUMLEV'**
040 = State and/or Statistical Equivalent  
050 = County and/or Statistical Equivalent 

**'REGION'**
1 = Northeast  
2 = Midwest  
3 = South  
4 = West  

**'DIVISION'**
1 = New England  
2 = Middle Atlantic  
3 = East North Central  
4 = West North Central  
5 = South Atlantic  
6 = East South Central  
7 = West South Central  
8 = Mountain  
9 = Pacific  

In [30]:
# Read the two tab-separated ICPSR data files (parts 0001 and 0002) into separate frames.
# No dtype is specified, so pandas infers each column's type on its own.
file = os.path.join(raw_data_directory, '38741-0001-Data.tsv')
crime_2017_1_df = pd.read_csv(file, sep='\t')
file = os.path.join(raw_data_directory, '38741-0002-Data.tsv')
crime_2017_2_df = pd.read_csv(file, sep='\t')

In [31]:
# Report the (rows, columns) of each part
for frame_name, frame in (("crime_2017_1_df", crime_2017_1_df),
                          ("crime_2017_2_df", crime_2017_2_df)):
    print(f"Shape of {frame_name}", frame.shape)

Shape of crime_2017_1_df (294667, 78)
Shape of crime_2017_2_df (257361, 78)


In [32]:
# Preview the first three rows of part 0001
crime_2017_1_df.head(3)

Unnamed: 0,ASR_ID,CONTENTS,STATE,ORI,GROUP,DIV,YEAR,MSA,SUB,REPORT,ADJUST,SEQNO,COUNTY,CORE,POP,AGENCNT,AGENCY,STNAME,CARD1,CARD2,CARD3,OFFENSE,M0_9,M10_12,M13_14,M15,M16,M17,M18,M19,M20,M21,M22,M23,M24,M25_29,M30_34,M35_39,M40_44,M45_49,M50_54,M55_59,M60_64,M65,F0_9,F10_12,F13_14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25_29,F30_34,F35_39,F40_44,F45_49,F50_54,F55_59,F60_64,F65,JW,JB,JI,JA,JH,JN,AW,AB,AI,AA,AH,AN
0,,1,,017 0,,,,9998,,,9,,,,,,,,8,8,8,998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,999998
1,,1,,017 1,,,,9998,,,9,,,,,,,,8,8,8,998,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,999998
2,,1,,017 1,,,,9998,,,9,,,,,,,,8,8,8,998,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,2,999998


In [33]:
# Preview the first three rows of part 0002
crime_2017_2_df.head(3)

Unnamed: 0,ASR_ID,CONTENTS,STATE,ORI,GROUP,DIV,YEAR,MSA,SUB,REPORT,ADJUST,SEQNO,COUNTY,CORE,POP,AGENCNT,AGENCY,STNAME,CARD1,CARD2,CARD3,OFFENSE,M0_9,M10_12,M13_14,M15,M16,M17,M18,M19,M20,M21,M22,M23,M24,M25_29,M30_34,M35_39,M40_44,M45_49,M50_54,M55_59,M60_64,M65,F0_9,F10_12,F13_14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25_29,F30_34,F35_39,F40_44,F45_49,F50_54,F55_59,F60_64,F65,JW,JB,JI,JA,JH,JN,AW,AB,AI,AA,AH,AN
0,3,2,50,AK00101,1C,9,2017,9998,0,0,0,0,0,Y,296188,1,ANCHORAGE,ALASKA,1,1,1,11,0,0,0,0,1,0,2,0,1,1,0,2,0,1,2,1,2,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,6,3,5,1,0,15
1,3,2,50,AK00101,1C,9,2017,9998,0,0,0,0,0,Y,296188,1,ANCHORAGE,ALASKA,0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,3,2,50,AK00101,1C,9,2017,9998,0,0,0,0,0,Y,296188,1,ANCHORAGE,ALASKA,1,1,1,20,0,0,1,1,0,1,0,2,1,1,1,0,0,4,5,3,7,3,1,0,3,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,4,9,7,16,1,0,33


In [34]:
# Preview the last three rows of part 0001
crime_2017_1_df.tail(3)

Unnamed: 0,ASR_ID,CONTENTS,STATE,ORI,GROUP,DIV,YEAR,MSA,SUB,REPORT,ADJUST,SEQNO,COUNTY,CORE,POP,AGENCNT,AGENCY,STNAME,CARD1,CARD2,CARD3,OFFENSE,M0_9,M10_12,M13_14,M15,M16,M17,M18,M19,M20,M21,M22,M23,M24,M25_29,M30_34,M35_39,M40_44,M45_49,M50_54,M55_59,M60_64,M65,F0_9,F10_12,F13_14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25_29,F30_34,F35_39,F40_44,F45_49,F50_54,F55_59,F60_64,F65,JW,JB,JI,JA,JH,JN,AW,AB,AI,AA,AH,AN
294664,3,2,49,WYDI050,7,8,2017,9998,0,0,0,0,0,N,0,1,WIND RIVER AGENCY,WYOMIN,0,0,0,990,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
294665,3,1,49,WYWHP00,8D,8,2017,9998,0,5,0,0,0,N,0,1,WYOMING HIGHWAY PATROL,WYOMIN,8,8,8,998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998
294666,3,1,49,WYWHP01,9E,8,2017,9998,1,5,0,0,11,N,0,1,"HIGHWAY PATROL, CHEYENNE",WYOMIN,8,8,8,998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998


In [35]:
# Preview the last three rows of part 0002
crime_2017_2_df.tail(3)

Unnamed: 0,ASR_ID,CONTENTS,STATE,ORI,GROUP,DIV,YEAR,MSA,SUB,REPORT,ADJUST,SEQNO,COUNTY,CORE,POP,AGENCNT,AGENCY,STNAME,CARD1,CARD2,CARD3,OFFENSE,M0_9,M10_12,M13_14,M15,M16,M17,M18,M19,M20,M21,M22,M23,M24,M25_29,M30_34,M35_39,M40_44,M45_49,M50_54,M55_59,M60_64,M65,F0_9,F10_12,F13_14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25_29,F30_34,F35_39,F40_44,F45_49,F50_54,F55_59,F60_64,F65,JW,JB,JI,JA,JH,JN,AW,AB,AI,AA,AH,AN
257358,3,2,49,WY02301,6,8,2017,9998,0,0,0,0,23,N,3537,1,NEWCASTLE,WYOMIN,1,1,1,260,0,6,1,0,0,1,1,0,0,0,0,0,0,2,3,4,0,1,1,4,0,0,0,1,0,1,0,2,0,0,0,0,0,0,0,0,1,3,2,1,1,0,0,0,12,0,0,0,3,9,23,0,1,0,0,24
257359,3,2,49,WY02301,6,8,2017,9998,0,0,0,0,23,N,3537,1,NEWCASTLE,WYOMIN,0,0,0,990,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
257360,3,1,49,WYDI050,7,8,2017,9998,0,0,0,0,0,N,0,1,WIND RIVER AGENCY,WYOMIN,8,8,8,998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998


## **Merge 38741-0001-Data.tsv and 38741-0002-Data.tsv**

In [36]:
# Stack the two parts row-wise and renumber the rows from 0
crime_2017_df = pd.concat((crime_2017_1_df, crime_2017_2_df), ignore_index=True)

In [37]:
# (rows, columns) of the combined frame
crime_2017_df.shape

(552028, 78)

In [38]:
# Column dtypes and non-null counts of the combined frame
crime_2017_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 552028 entries, 0 to 552027
Data columns (total 78 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   ASR_ID    552028 non-null  object
 1   CONTENTS  552028 non-null  int64 
 2   STATE     552028 non-null  object
 3   ORI       552028 non-null  object
 4   GROUP     552028 non-null  object
 5   DIV       552028 non-null  object
 6   YEAR      552028 non-null  object
 7   MSA       552028 non-null  int64 
 8   SUB       552028 non-null  object
 9   REPORT    552028 non-null  object
 10  ADJUST    552028 non-null  int64 
 11  SEQNO     552028 non-null  object
 12  COUNTY    552028 non-null  object
 13  CORE      552028 non-null  object
 14  POP       552028 non-null  object
 15  AGENCNT   552028 non-null  object
 16  AGENCY    552028 non-null  object
 17  STNAME    552028 non-null  object
 18  CARD1     552028 non-null  int64 
 19  CARD2     552028 non-null  int64 
 20  CARD3     552028 non-null 

In [39]:
# Summary statistics of the numeric columns, one row per column
crime_2017_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
CONTENTS,552028.0,1.982611,0.130714,1.0,2.0,2.0,2.0,2.0
MSA,552028.0,9998.0,0.0,9998.0,9998.0,9998.0,9998.0,9998.0
ADJUST,552028.0,0.053861,0.404443,0.0,0.0,0.0,0.0,9.0
CARD1,552028.0,1.001429,0.986069,0.0,1.0,1.0,1.0,8.0
CARD2,552028.0,0.883524,1.037629,0.0,1.0,1.0,1.0,8.0
CARD3,552028.0,0.749944,1.077633,0.0,0.0,1.0,1.0,8.0
OFFENSE,552028.0,172.712573,162.4542,0.0,80.0,180.0,200.0,998.0
M0_9,552028.0,17316.130475,130446.402081,0.0,0.0,0.0,0.0,999998.0
M10_12,552028.0,17316.242767,130446.387182,0.0,0.0,0.0,0.0,999998.0
M13_14,552028.0,17316.53291,130446.348699,0.0,0.0,0.0,0.0,999998.0


In [40]:
# Count, distinct values, and most frequent value of the object columns, one row per column
crime_2017_df.describe(include='object').T

Unnamed: 0,count,unique,top,freq
ASR_ID,552028,3,3,527452
STATE,552028,70,42,69411
ORI,552028,19231,MD01606,98
GROUP,552028,21,6,136065
DIV,552028,19,7,98616
YEAR,552028,3,2017,527452
SUB,552028,5,1,286144
REPORT,552028,8,0,500148
SEQNO,552028,3,0,527452
COUNTY,552028,408,2,16409


In [41]:
# The number of null (NaN) cells in the entire dataset.
# Summing the boolean mask counts the nulls. Indexing the frame with the mask and
# summing the selected values instead (the earlier form) gives 0 however many cells are null.
# Blank-string placeholders (' ') are not NaN, so they are not counted here.
crime_2017_df.isna().sum().sum()

0.0

# **Load ori_county_mapping.tsv**

* Originating Agency Identifier (ORI) Lookup Table was scraped from the web and created as **ori_county_mapping.tsv**
* SOUTH CENTRAL KY DRUG TASK FORCE in KY has "_Undetermined" in 'county'
* **Source:** Originating Agency Identifier (ORI) Lookup Table
* **URL:** https://www.icpsr.umich.edu/files/NACJD/ORIs/STATESoris.html

In [42]:
# Load the tab-separated lookup table that maps each ORI to a county name
file = os.path.join(raw_data_directory, 'ori_county_mapping.tsv')
ori_county_mapping_df = pd.read_csv(file, sep='\t')

In [43]:
# (rows, columns) of the ORI lookup table
ori_county_mapping_df.shape

(23522, 5)

In [44]:
# Preview the first three rows of the ORI lookup table
ori_county_mapping_df.head(3)

Unnamed: 0,agency,ORI7,ORI9,state,county
0,AUTAUGA COUNTY SHERIFF'S OFFICE,AL00400,AL0040000,ALABAMA,AUTAUGA
1,AUTAUGAVILLE POLICE DEPARTMENT,AL00402,AL0040200,ALABAMA,AUTAUGA
2,PRATTVILLE POLICE DEPARTMENT,AL00401,AL0040100,ALABAMA,AUTAUGA


In [45]:
# Preview the last three rows of the ORI lookup table
ori_county_mapping_df.tail(3)

Unnamed: 0,agency,ORI7,ORI9,state,county
23519,NEWCASTLE POLICE DEPT,WY02301,WY0230100,WYOMING,WESTON
23520,UPTON POLICE DEPARTMENT,WY02302,WY0230200,WYOMING,WESTON
23521,WESTON COUNTY SHERIFF'S OFFICE,WY02300,WY0230000,WYOMING,WESTON


## **Data Cleanup and Preparation**

* About 10%, 73448 county names are still missing

* 'MSA', 'ASR_ID', 'SEQNO', and 'AGENCNT' each have a single value --> delete 'MSA', 'ASR_ID', 'SEQNO', 'AGENCNT'
* Drop rows where 'STATE', 'STNAME', 'AGENCY', and 'COUNTY' are all null
* Convert to the str data type if a column contains both string and numerical data types
    * 'YEAR' has 3 unique values --> contain string and integer --> convert integer to string
    * 'STATE' has 70 unique values --> contain string and integer --> convert integer to string
    * 'STNAME' has 58 unique values --> holds at most 6 characters of the state name; names shorter than 6 characters are padded with leading spaces
* Create 'state_name' to populate complete state names
* Create 'county_name' to populate county names
* Drop 'STNAME'
* Convert POP to the int data type
* 999998 in the columns from M0_9 through M65, F0_9 through F65, and JW through AN means 0 --> Replace with 0
* Check null --> 11 rows where county_name is missing
* Those rows have all 0s after 'M0_9' columns --> Delete 

* Check unique values in the categorical columns

In [46]:
# Print the distinct values of every object/category column to reveal mixed
# types and blank placeholders
cat_cols = crime_2017_df.select_dtypes(include=['object', 'category']).columns
for col in cat_cols:
    column_values = crime_2017_df[col]
    print(
        f"{col} has {column_values.nunique()} unique values",
        column_values.unique(),
        "",
        sep="\n",
    )

ASR_ID has 3 unique values
[' ' '3' 3]

STATE has 70 unique values
[' ' '50' '99' '1' '54' '3' 3 99 2 4 98 5 6 52 8 7 9 10 '10' '55' '51'
 '14' '11' 11 12 13 15 16 17 20 19 18 21 69 22 24 23 25 26 32 33 28 29 30
 27 31 34 35 36 37 53 38 39 40 41 42 43 45 62 44 46 48 47 49 50 1 51 14
 '12' '13']

ORI has 19231 unique values
['017   0' '017   1' 'AK00101' ... 'NJ00407' 'OK01601' 'TX22002']

GROUP has 21 unique values
[' ' '1C' ' 4' ' 6' ' 7' ' 5' '8E' '8A' '9E' '9A' ' 2' ' 3' '9B' '9D' '9C'
 '8D' '8C' '8B' '1A' '1B' ' 0']

DIV has 19 unique values
[' ' '9' '6' '0' '7' 7 9 8 1 0 5 '5' '4' '8' 3 4 6 2 '3']



YEAR has 3 unique values
[' ' '2017' 2017]

SUB has 5 unique values
[' ' '0' '1' 1 0]

REPORT has 8 unique values
[' ' '0' '5' '2' 0 2 5 1]

SEQNO has 3 unique values
[' ' '0' 0]

COUNTY has 408 unique values
[' ' '0' '37' '49' '51' '1' '2' '3' '4' '5' '6' '7' '8' '9' '10' '11' '12'
 '13' '14' '15' '16' '17' '18' '19' '20' '21' '22' '23' '24' '25' '26'
 '27' '28' '29' '30' '31' '32' '33' '34' '35' '36' '38' '39' '40' '41'
 '42' '43' '44' '45' '46' '47' '48' '50' '52' '53' '54' '55' '56' '57'
 '58' '59' '60' '61' '62' '63' '64' '65' '66' '67' 4 5 6 7 8 9 10 11 12 13
 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 63 0
 61 62 64 65 66 67 68 69 70 71 72 73 74 75 1 2 3 300 999 112 76 77 78 79
 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102
 103 104 105 106 107 108 109 110 111 113 114 115 '115' '116' '117' '118'
 '119' '68' '120' '121' '122' '123' '124' '125' '126' '127' '

* Drop the 'ASR_ID', 'CONTENTS', 'MSA', 'SEQNO', 'AGENCNT' columns

In [47]:
# Remove the columns that are not needed downstream and renumber the rows
cols_to_drop = ['ASR_ID', 'CONTENTS', 'MSA', 'SEQNO', 'AGENCNT']
crime_2017_df = crime_2017_df.drop(columns=cols_to_drop).reset_index(drop=True)

* Drop rows where 'STATE', 'STNAME', 'AGENCY', and 'COUNTY' are all null ('AGENCNT' was already dropped above)

In [48]:
# Report which remaining object columns use a single blank ' ' as a placeholder
# and replace that placeholder with NaN
cat_cols = [col for col in cat_cols if col not in cols_to_drop]
for col in cat_cols:
    blank_count = (crime_2017_df[col] == ' ').sum()
    if blank_count > 0:
        print(f"{col} has {blank_count} ' '")
        # Assign the result back to the frame. Calling replace(..., inplace=True) on
        # crime_2017_df[col] is a chained in-place operation that modifies a temporary
        # object and leaves the frame unchanged under pandas Copy-on-Write.
        crime_2017_df[col] = crime_2017_df[col].replace(' ', np.nan)

STATE has 50 ' '
GROUP has 50 ' '
DIV has 50 ' '
YEAR has 50 ' '
SUB has 50 ' '
REPORT has 50 ' '
COUNTY has 50 ' '
CORE has 50 ' '
POP has 50 ' '
AGENCY has 50 ' '
STNAME has 50 ' '


In [49]:
# Flag the rows in which 'STATE', 'STNAME', 'AGENCY' and 'COUNTY' are all missing
key_cols = ['STATE', 'STNAME', 'AGENCY', 'COUNTY']
conditions = crime_2017_df[key_cols].isna().all(axis=1)

In [50]:
# Keep only the rows that were not flagged (552028 --> 551978 rows) and renumber them
crime_2017_df = crime_2017_df.loc[~conditions].reset_index(drop=True)
crime_2017_df.shape

(551978, 73)

In [51]:
# Show every row that still contains at least one null
crime_2017_df.loc[crime_2017_df.isna().any(axis=1)]

Unnamed: 0,STATE,ORI,GROUP,DIV,YEAR,SUB,REPORT,ADJUST,COUNTY,CORE,POP,AGENCY,STNAME,CARD1,CARD2,CARD3,OFFENSE,M0_9,M10_12,M13_14,M15,M16,M17,M18,M19,M20,M21,M22,M23,M24,M25_29,M30_34,M35_39,M40_44,M45_49,M50_54,M55_59,M60_64,M65,F0_9,F10_12,F13_14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25_29,F30_34,F35_39,F40_44,F45_49,F50_54,F55_59,F60_64,F65,JW,JB,JI,JA,JH,JN,AW,AB,AI,AA,AH,AN


* Convert integer to string data type where columns contain both string and integer

In [52]:
# If columns contain both string and integer, convert integer values to string data type
def convert_to_str(val):
    """Return ``val`` as a string, leaving missing values untouched.

    Parameters
    ----------
    val : scalar
        A single cell value, e.g. an int such as ``3`` or a string such as ``'3'``.

    Returns
    -------
    str or the original missing value
        ``str(val)`` for any non-missing value. Missing values (``None``, ``NaN``)
        are returned unchanged so they remain detectable with ``isna()`` instead of
        becoming the literal text ``'nan'`` / ``'None'``.
    """
    # str() does not raise ValueError, so no try/except is needed here;
    # the only special case is a missing value.
    if pd.isna(val):
        return val
    return str(val)
    
# Run the converter over each remaining object column, one value at a time
for col in cat_cols:
    crime_2017_df[col] = crime_2017_df[col].map(convert_to_str)

In [53]:
# Re-check the object columns after the string conversion
crime_2017_df.describe(include='O').T

Unnamed: 0,count,unique,top,freq
STATE,551978,59,42,69411
ORI,551978,19228,OH07714,98
GROUP,551978,20,6,136065
DIV,551978,10,7,98962
YEAR,551978,1,2017,551978
SUB,551978,2,1,295498
REPORT,551978,4,0,523632
COUNTY,551978,258,0,17287
CORE,551978,2,N,513700
POP,551978,10682,0,62420


* Create 'state_name' and populate the actual state name

In [54]:
# List the distinct (truncated) state names stored in 'STNAME'
crime_2017_df['STNAME'].unique()

array(['ALASKA', ' OTHER', 'ALABAM', 'AMERIC', 'ARKANS', 'ARIZON',
       'CALIFO', 'FEDERA', 'COLORA', 'CONNEC', 'CANAL', 'DISTRI',
       'DELAWA', 'FLORID', 'GEORGI', '  GUAM', 'HAWAII', '  IOWA',
       ' IDAHO', 'ILLINO', 'INDIAN', 'KANSAS', 'KENTUC', 'LOUISI',
       'MASSAC', 'MARYLA', ' MAINE', 'MICHIG', 'MARIAN', 'MINNES',
       'MISSOU', 'MISSIS', 'MONTAN', 'NEBRAS', 'NORTH', 'NEW HA',
       'NEW JE', 'NEW ME', 'NEVADA', 'NEW YO', '  OHIO', 'OKLAHO',
       'OREGON', 'PENNSY', 'PUERTO', 'RHODE', 'SOUTH', 'TENNES', ' TEXAS',
       '  UTAH', 'VIRGIN', 'U.S. V', 'VERMON', 'WASHIN', 'WISCON',
       'WEST V', 'WYOMIN'], dtype=object)

In [55]:
# Collect the distinct ('STNAME', 'STATE') pairs and index them as {'STATE': 'STNAME'}
stname_state_mapping = set(zip(crime_2017_df['STNAME'], crime_2017_df['STATE']))
stname_state_mapping_dic = {
    state_code: stname for stname, state_code in stname_state_mapping
}

In [56]:
# 'STATE' code -> full state name, completed by hand.
# Only the 50 states and the District of Columbia are listed; codes for US
# territories and other entries are intentionally left out.
stname_state_mapping_dict = {
    '1': 'ALABAMA',
    '2': 'ARIZONA',
    '3': 'ARKANSAS',
    '4': 'CALIFORNIA',
    '5': 'COLORADO',
    '6': 'CONNECTICUT',
    '7': 'DELAWARE',
    '8': 'DISTRICT OF COLUMBIA',
    '9': 'FLORIDA',
    '10': 'GEORGIA',
    '11': 'IDAHO',
    '12': 'ILLINOIS',
    '13': 'INDIANA',
    '14': 'IOWA',
    '15': 'KANSAS',
    '16': 'KENTUCKY',
    '17': 'LOUISIANA',
    '18': 'MAINE',
    '19': 'MARYLAND',
    '20': 'MASSACHUSETTS',
    '21': 'MICHIGAN',
    '22': 'MINNESOTA',
    '23': 'MISSISSIPPI',
    '24': 'MISSOURI',
    '25': 'MONTANA',
    '26': 'NEBRASKA',
    '27': 'NEVADA',
    '28': 'NEW HAMPSHIRE',
    '29': 'NEW JERSEY',
    '30': 'NEW MEXICO',
    '31': 'NEW YORK',
    '32': 'NORTH CAROLINA',
    '33': 'NORTH DAKOTA',
    '34': 'OHIO',
    '35': 'OKLAHOMA',
    '36': 'OREGON',
    '37': 'PENNSYLVANIA',
    '38': 'RHODE ISLAND',
    '39': 'SOUTH CAROLINA',
    '40': 'SOUTH DAKOTA',
    '41': 'TENNESSEE',
    '42': 'TEXAS',
    '43': 'UTAH',
    '44': 'VERMONT',
    '45': 'VIRGINIA',
    '46': 'WASHINGTON',
    '47': 'WEST VIRGINIA',
    '48': 'WISCONSIN',
    '49': 'WYOMING',
    '50': 'ALASKA',
    '51': 'HAWAII',
}

In [57]:
# Translate each 'STATE' code to its full name; codes missing from the lookup
# are tagged "delete" so those rows can be removed later
state_name = [stname_state_mapping_dict.get(code, "delete") for code in crime_2017_df['STATE']]
crime_2017_df['state_name'] = state_name

* Create 'county_name' and populate the county name

In [58]:
# Look up each record's county by matching 'ORI' against 'ORI7' in ori_county_mapping.tsv.
# The lookup table is built once and applied with .map(); filtering the whole mapping
# frame once per record costs rows x mapping-rows comparisons for the same result.
# When an ORI7 appears more than once, its first row in the mapping file is used.
ori_to_county = (
    ori_county_mapping_df
    .dropna(subset=['ORI7'])
    .drop_duplicates(subset='ORI7', keep='first')
    .set_index('ORI7')['county']
)
# ORIs that are absent from the lookup come back as NaN
county_to_fill = crime_2017_df['ORI'].map(ori_to_county).tolist()

# Add county_to_fill to crime_2017_df
crime_2017_df['county_name'] = county_to_fill

In [59]:
# Pair every row's 'county_name' with its 'COUNTY' and 'STATE' codes, then keep
# the distinct (county_name, COUNTY, STATE) combinations.
# Rows whose county was not found by the ORI lookup contribute tuples whose first
# element is not a string.
county_state_combination = tuple(zip(crime_2017_df['county_name'], crime_2017_df['COUNTY'], crime_2017_df['STATE']))
unique_county_state_combination = set(county_state_combination)

In [60]:
# Keep only the combinations that carry an actual county name (a str) together
# with the 'COUNTY' county code and the 'STATE' state code
county_name_mapping = [
    (name, county_code, state_code)
    for name, county_code, state_code in unique_county_state_combination
    if type(name) is str
]

In [61]:
# Fill 'county_name' for rows the ORI lookup could not resolve, using rows that share
# the same ('COUNTY', 'STATE') codes and already have a name.
# When one code pair carries several names, the most frequent name is used (ties broken
# alphabetically) so the result is identical on every run; looping over a set of tuples
# and keeping the last match made the chosen name depend on set iteration order.
# NOTE(review): many rows carry 'COUNTY' code 0; if 0 means "no county" rather than a
# real code, filling from it may assign an unrelated county — confirm against the codebook.
has_county_name = crime_2017_df['county_name'].map(lambda value: type(value) is str).astype(bool)
code_columns = ['COUNTY', 'STATE']

# One row per ('COUNTY', 'STATE') pair holding the name chosen for that pair
county_by_code = (
    crime_2017_df.loc[has_county_name, code_columns + ['county_name']]
    .groupby(code_columns + ['county_name'], sort=False)
    .size()
    .reset_index(name='n_rows')
    .sort_values(['n_rows', 'county_name'], ascending=[False, True])
    .drop_duplicates(subset=code_columns, keep='first')
    .drop(columns='n_rows')
)

# A left merge keeps the unresolved rows in their original order, so the result
# lines up position-by-position with rows_to_fill
rows_to_fill = crime_2017_df.index[~has_county_name]
fallback_names = (
    crime_2017_df.loc[rows_to_fill, code_columns]
    .merge(county_by_code, on=code_columns, how='left', validate='many_to_one')['county_name']
    .to_numpy()
)

# Only overwrite rows for which a name was found; the rest stay missing
resolved = pd.notna(fallback_names)
crime_2017_df.loc[rows_to_fill[resolved], 'county_name'] = fallback_names[resolved]

* Delete rows that are US territory records

In [62]:
# Remove the rows whose 'state_name' was tagged 'delete' (551978 --> 551704 rows)
condition = crime_2017_df['state_name'].eq('delete')
crime_2017_df = crime_2017_df.loc[~condition].reset_index(drop=True)
crime_2017_df.head()

Unnamed: 0,STATE,ORI,GROUP,DIV,YEAR,SUB,REPORT,ADJUST,COUNTY,CORE,POP,AGENCY,STNAME,CARD1,CARD2,CARD3,OFFENSE,M0_9,M10_12,M13_14,M15,M16,M17,M18,M19,M20,M21,M22,M23,M24,M25_29,M30_34,M35_39,M40_44,M45_49,M50_54,M55_59,M60_64,M65,F0_9,F10_12,F13_14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25_29,F30_34,F35_39,F40_44,F45_49,F50_54,F55_59,F60_64,F65,JW,JB,JI,JA,JH,JN,AW,AB,AI,AA,AH,AN,state_name,county_name
0,50,AK00101,1C,9,2017,0,0,0,0,Y,296188,ANCHORAGE,ALASKA,1,1,1,11,0,0,0,0,1,0,2,0,1,1,0,2,0,1,2,1,2,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,6,3,5,1,0,15,ALASKA,ANCHORAGE MUNICIPALITY
1,50,AK00101,1C,9,2017,0,0,0,0,Y,296188,ANCHORAGE,ALASKA,0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,ALASKA,ANCHORAGE MUNICIPALITY
2,50,AK00101,1C,9,2017,0,0,0,0,Y,296188,ANCHORAGE,ALASKA,1,1,1,20,0,0,1,1,0,1,0,2,1,1,1,0,0,4,5,3,7,3,1,0,3,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,4,9,7,16,1,0,33,ALASKA,ANCHORAGE MUNICIPALITY
3,50,AK00101,1C,9,2017,0,0,0,0,Y,296188,ANCHORAGE,ALASKA,1,1,1,30,0,0,2,4,3,2,9,8,2,8,8,7,7,30,37,21,10,11,5,0,0,0,0,0,2,0,0,0,3,3,0,3,3,1,1,9,7,6,3,1,0,0,0,0,6,5,2,0,1,12,78,46,68,11,1,202,ALASKA,ANCHORAGE MUNICIPALITY
4,50,AK00101,1C,9,2017,0,0,0,0,Y,296188,ANCHORAGE,ALASKA,1,1,1,40,0,9,10,8,7,5,9,18,16,25,24,21,28,141,108,85,57,58,36,33,21,11,0,0,6,5,1,3,6,2,11,9,9,13,15,64,43,31,12,23,16,11,4,2,23,17,9,5,0,54,361,145,370,86,5,957,ALASKA,ANCHORAGE MUNICIPALITY


In [63]:
# (rows, columns) after removing the rows tagged 'delete'
crime_2017_df.shape

(551704, 75)

* Drop 'STNAME'

In [64]:
# 'state_name' now holds the full name, so the truncated 'STNAME' is no longer needed
crime_2017_df = crime_2017_df.drop(columns="STNAME")

* Convert POP to the int data type

In [65]:
# Cast the string-typed 'POP' column to integers
crime_2017_df = crime_2017_df.astype({'POP': int})

* 999998 in the columns from M0_9 through M65, F0_9 through F65, and JW through AN means 0 --> Replace with 0

In [66]:
# All numeric columns (kept under this name because later cells refer to num_cols)
num_cols = crime_2017_df.select_dtypes(exclude=['object', 'category']).columns

# Replace the 999998 placeholder with 0, but only in the arrest-count columns named in
# the cleaning notes ('M0_9' through 'AN'). Applying it to every numeric column would
# also overwrite a genuine 999998 elsewhere, for example in 'POP'.
count_cols = [col for col in crime_2017_df.loc[:, 'M0_9':'AN'].columns if col in num_cols]
crime_2017_df[count_cols] = crime_2017_df[count_cols].replace(999998, 0)

* Check crime_2017_df

In [67]:
# Preview the first rows after cleaning
crime_2017_df.head()

Unnamed: 0,STATE,ORI,GROUP,DIV,YEAR,SUB,REPORT,ADJUST,COUNTY,CORE,POP,AGENCY,CARD1,CARD2,CARD3,OFFENSE,M0_9,M10_12,M13_14,M15,M16,M17,M18,M19,M20,M21,M22,M23,M24,M25_29,M30_34,M35_39,M40_44,M45_49,M50_54,M55_59,M60_64,M65,F0_9,F10_12,F13_14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25_29,F30_34,F35_39,F40_44,F45_49,F50_54,F55_59,F60_64,F65,JW,JB,JI,JA,JH,JN,AW,AB,AI,AA,AH,AN,state_name,county_name
0,50,AK00101,1C,9,2017,0,0,0,0,Y,296188,ANCHORAGE,1,1,1,11,0,0,0,0,1,0,2,0,1,1,0,2,0,1,2,1,2,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,6,3,5,1,0,15,ALASKA,ANCHORAGE MUNICIPALITY
1,50,AK00101,1C,9,2017,0,0,0,0,Y,296188,ANCHORAGE,0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,ALASKA,ANCHORAGE MUNICIPALITY
2,50,AK00101,1C,9,2017,0,0,0,0,Y,296188,ANCHORAGE,1,1,1,20,0,0,1,1,0,1,0,2,1,1,1,0,0,4,5,3,7,3,1,0,3,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,4,9,7,16,1,0,33,ALASKA,ANCHORAGE MUNICIPALITY
3,50,AK00101,1C,9,2017,0,0,0,0,Y,296188,ANCHORAGE,1,1,1,30,0,0,2,4,3,2,9,8,2,8,8,7,7,30,37,21,10,11,5,0,0,0,0,0,2,0,0,0,3,3,0,3,3,1,1,9,7,6,3,1,0,0,0,0,6,5,2,0,1,12,78,46,68,11,1,202,ALASKA,ANCHORAGE MUNICIPALITY
4,50,AK00101,1C,9,2017,0,0,0,0,Y,296188,ANCHORAGE,1,1,1,40,0,9,10,8,7,5,9,18,16,25,24,21,28,141,108,85,57,58,36,33,21,11,0,0,6,5,1,3,6,2,11,9,9,13,15,64,43,31,12,23,16,11,4,2,23,17,9,5,0,54,361,145,370,86,5,957,ALASKA,ANCHORAGE MUNICIPALITY


In [68]:
# Preview the last rows after cleaning
crime_2017_df.tail()

Unnamed: 0,STATE,ORI,GROUP,DIV,YEAR,SUB,REPORT,ADJUST,COUNTY,CORE,POP,AGENCY,CARD1,CARD2,CARD3,OFFENSE,M0_9,M10_12,M13_14,M15,M16,M17,M18,M19,M20,M21,M22,M23,M24,M25_29,M30_34,M35_39,M40_44,M45_49,M50_54,M55_59,M60_64,M65,F0_9,F10_12,F13_14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25_29,F30_34,F35_39,F40_44,F45_49,F50_54,F55_59,F60_64,F65,JW,JB,JI,JA,JH,JN,AW,AB,AI,AA,AH,AN,state_name,county_name
551699,49,WY02301,6,8,2017,0,0,0,23,N,3537,NEWCASTLE,1,1,1,220,0,1,0,2,0,1,1,7,3,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,2,1,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,7,17,0,1,0,2,16,WYOMING,WESTON
551700,49,WY02301,6,8,2017,0,0,0,23,N,3537,NEWCASTLE,1,1,1,240,0,0,2,0,0,0,0,0,1,0,0,0,0,0,2,1,0,1,2,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,2,0,0,0,1,0,0,0,3,0,0,0,1,2,12,0,0,0,0,12,WYOMING,WESTON
551701,49,WY02301,6,8,2017,0,0,0,23,N,3537,NEWCASTLE,1,1,1,260,0,6,1,0,0,1,1,0,0,0,0,0,0,2,3,4,0,1,1,4,0,0,0,1,0,1,0,2,0,0,0,0,0,0,0,0,1,3,2,1,1,0,0,0,12,0,0,0,3,9,23,0,1,0,0,24,WYOMING,WESTON
551702,49,WY02301,6,8,2017,0,0,0,23,N,3537,NEWCASTLE,0,0,0,990,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,WYOMING,WESTON
551703,49,WYDI050,7,8,2017,0,0,0,0,N,0,WIND RIVER AGENCY,8,8,8,998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,WYOMING,FREMONT


* Check missing values

In [69]:
# Report which columns still contain nulls and how many rows lack a county name
null_counts = crime_2017_df.isna().sum()
print(f"Columns have missing values: {crime_2017_df.columns[null_counts > 0].values}")
missing_county_count = crime_2017_df['county_name'].isna().sum()
print(f"Number of rows have missing values: {missing_county_count}")

Columns have missing values: ['county_name']
Number of rows have missing values: 11


In [70]:
# Display the rows that have no county name
crime_2017_df.loc[crime_2017_df['county_name'].isna()]

Unnamed: 0,STATE,ORI,GROUP,DIV,YEAR,SUB,REPORT,ADJUST,COUNTY,CORE,POP,AGENCY,CARD1,CARD2,CARD3,OFFENSE,M0_9,M10_12,M13_14,M15,M16,M17,M18,M19,M20,M21,M22,M23,M24,M25_29,M30_34,M35_39,M40_44,M45_49,M50_54,M55_59,M60_64,M65,F0_9,F10_12,F13_14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25_29,F30_34,F35_39,F40_44,F45_49,F50_54,F55_59,F60_64,F65,JW,JB,JI,JA,JH,JN,AW,AB,AI,AA,AH,AN,state_name,county_name
39630,8,DCPPD00,9D,5,2017,1,5,0,999,N,0,UNITED STATES PARK POLIC,8,8,8,998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,DISTRICT OF COLUMBIA,
51419,51,HIDEA01,7,9,2017,0,5,0,0,N,0,"DEA, HONOLULU",8,8,8,998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,HAWAII,
51420,51,HIDOD01,7,9,2017,0,5,0,0,N,0,"DEPT OF DEFENSE, HONOLULU",8,8,8,998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,HAWAII,
51421,51,HIFBIHN,7,9,2017,1,5,0,0,N,0,"FBI, HONOLULU",8,8,8,998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,HAWAII,
165184,31,NY10100,8E,2,2017,0,5,0,314,N,0,"STATE POLICE HQS, ALBANY",8,8,8,998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,NEW YORK,
195672,37,PA02233,7,2,2017,0,5,0,0,N,0,PENNSYLVANIA GAME COMMISS,8,8,8,998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,PENNSYLVANIA,
204169,37,PA05112,7,2,2017,0,5,0,0,N,0,FIRST JUDICIAL DISTRICT O,8,8,8,998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,PENNSYLVANIA,
214675,39,SCDEA02,7,5,2017,0,5,0,0,N,0,"DEA, COLUMBIA",8,8,8,998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,SOUTH CAROLINA,
214676,39,SCFBICO,7,5,2017,1,5,0,0,N,0,"FBI, COLUMBIA",8,8,8,998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,SOUTH CAROLINA,
214680,39,SCSHP06,8E,5,2017,0,5,0,0,N,0,SOUTH CAROLINA HIGHWAY PA,8,8,8,998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,SOUTH CAROLINA,


In [71]:
# Sum every arrest-count column ('M0_9' through 'AN') for the rows with no county name.
# The columns are selected by label. The positional slice num_cols[12:] depended on how
# many numeric columns precede 'M0_9' and skipped the first few count columns, so those
# were never included in the check.
missing_county = crime_2017_df.loc[crime_2017_df['county_name'].isna(), 'M0_9':'AN']
missing_county.sum(axis=1)

39630     0
51419     0
51420     0
51421     0
165184    0
195672    0
204169    0
214675    0
214676    0
214680    0
291569    0
dtype: int64

* Delete those 11 rows

In [72]:
# Discard the rows without a county name and renumber the remaining rows
crime_2017_df = crime_2017_df.dropna(subset=['county_name']).reset_index(drop=True)

In [73]:
# Confirm that no missing values remain and report the final dimensions.
# The second figure is the column count, so it is labelled as such.
n_rows, n_columns = crime_2017_df.shape
print(f"Number of missing values: {crime_2017_df.isna().sum().sum()}")
print(f"Number of rows: {n_rows}, Number of columns: {n_columns}")

Number of mising values: 0
Number of rows: 551693, Number of rows: 74


# **Save the Dataframe to CSV**

In [74]:
# Write the cleaned frame to data/processed without the index column.
# The folder is created first so a fresh checkout without data/processed does not fail.
processed_data_directory.mkdir(parents=True, exist_ok=True)
file = os.path.join(processed_data_directory, 'cleaned_crime_2017.csv')
crime_2017_df.to_csv(file, index=False)

# **Verify the CSV File**

In [75]:
# Read the saved CSV back to confirm it loads.
# Note: this rebinds crime_2017_df to the reloaded frame, whose column dtypes are
# inferred again by read_csv.
file = os.path.join(processed_data_directory, 'cleaned_crime_2017.csv')
crime_2017_df = pd.read_csv(file)

In [76]:
# Preview the first rows of the reloaded file
crime_2017_df.head()

Unnamed: 0,STATE,ORI,GROUP,DIV,YEAR,SUB,REPORT,ADJUST,COUNTY,CORE,POP,AGENCY,CARD1,CARD2,CARD3,OFFENSE,M0_9,M10_12,M13_14,M15,M16,M17,M18,M19,M20,M21,M22,M23,M24,M25_29,M30_34,M35_39,M40_44,M45_49,M50_54,M55_59,M60_64,M65,F0_9,F10_12,F13_14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25_29,F30_34,F35_39,F40_44,F45_49,F50_54,F55_59,F60_64,F65,JW,JB,JI,JA,JH,JN,AW,AB,AI,AA,AH,AN,state_name,county_name
0,50,AK00101,1C,9,2017,0,0,0,0,Y,296188,ANCHORAGE,1,1,1,11,0,0,0,0,1,0,2,0,1,1,0,2,0,1,2,1,2,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,6,3,5,1,0,15,ALASKA,ANCHORAGE MUNICIPALITY
1,50,AK00101,1C,9,2017,0,0,0,0,Y,296188,ANCHORAGE,0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,ALASKA,ANCHORAGE MUNICIPALITY
2,50,AK00101,1C,9,2017,0,0,0,0,Y,296188,ANCHORAGE,1,1,1,20,0,0,1,1,0,1,0,2,1,1,1,0,0,4,5,3,7,3,1,0,3,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,4,9,7,16,1,0,33,ALASKA,ANCHORAGE MUNICIPALITY
3,50,AK00101,1C,9,2017,0,0,0,0,Y,296188,ANCHORAGE,1,1,1,30,0,0,2,4,3,2,9,8,2,8,8,7,7,30,37,21,10,11,5,0,0,0,0,0,2,0,0,0,3,3,0,3,3,1,1,9,7,6,3,1,0,0,0,0,6,5,2,0,1,12,78,46,68,11,1,202,ALASKA,ANCHORAGE MUNICIPALITY
4,50,AK00101,1C,9,2017,0,0,0,0,Y,296188,ANCHORAGE,1,1,1,40,0,9,10,8,7,5,9,18,16,25,24,21,28,141,108,85,57,58,36,33,21,11,0,0,6,5,1,3,6,2,11,9,9,13,15,64,43,31,12,23,16,11,4,2,23,17,9,5,0,54,361,145,370,86,5,957,ALASKA,ANCHORAGE MUNICIPALITY


In [77]:
# Preview the last rows of the reloaded file
crime_2017_df.tail()

Unnamed: 0,STATE,ORI,GROUP,DIV,YEAR,SUB,REPORT,ADJUST,COUNTY,CORE,POP,AGENCY,CARD1,CARD2,CARD3,OFFENSE,M0_9,M10_12,M13_14,M15,M16,M17,M18,M19,M20,M21,M22,M23,M24,M25_29,M30_34,M35_39,M40_44,M45_49,M50_54,M55_59,M60_64,M65,F0_9,F10_12,F13_14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25_29,F30_34,F35_39,F40_44,F45_49,F50_54,F55_59,F60_64,F65,JW,JB,JI,JA,JH,JN,AW,AB,AI,AA,AH,AN,state_name,county_name
551688,49,WY02301,6,8,2017,0,0,0,23,N,3537,NEWCASTLE,1,1,1,220,0,1,0,2,0,1,1,7,3,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,2,1,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,7,17,0,1,0,2,16,WYOMING,WESTON
551689,49,WY02301,6,8,2017,0,0,0,23,N,3537,NEWCASTLE,1,1,1,240,0,0,2,0,0,0,0,0,1,0,0,0,0,0,2,1,0,1,2,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,2,0,0,0,1,0,0,0,3,0,0,0,1,2,12,0,0,0,0,12,WYOMING,WESTON
551690,49,WY02301,6,8,2017,0,0,0,23,N,3537,NEWCASTLE,1,1,1,260,0,6,1,0,0,1,1,0,0,0,0,0,0,2,3,4,0,1,1,4,0,0,0,1,0,1,0,2,0,0,0,0,0,0,0,0,1,3,2,1,1,0,0,0,12,0,0,0,3,9,23,0,1,0,0,24,WYOMING,WESTON
551691,49,WY02301,6,8,2017,0,0,0,23,N,3537,NEWCASTLE,0,0,0,990,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,WYOMING,WESTON
551692,49,WYDI050,7,8,2017,0,0,0,0,N,0,WIND RIVER AGENCY,8,8,8,998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,WYOMING,FREMONT
