# Hate Crime ETL Project
## January 12, 2022

Start by loading in the proper modules and importing the data file

In [1]:
# Import necessary modules
import pandas as pd
import numpy as np
import psycopg2
# Import the .py file that I created to store my password
from passwords import password

In [65]:
# Import csv file from FBI CDE website
hate_crime = pd.read_csv("hate_crime.csv", low_memory=False)

# View data
hate_crime.head()

Unnamed: 0,INCIDENT_ID,DATA_YEAR,ORI,PUB_AGENCY_NAME,PUB_AGENCY_UNIT,AGENCY_TYPE_NAME,STATE_ABBR,STATE_NAME,DIVISION_NAME,REGION_NAME,...,OFFENDER_RACE,OFFENDER_ETHNICITY,VICTIM_COUNT,OFFENSE_NAME,TOTAL_INDIVIDUAL_VICTIMS,LOCATION_NAME,BIAS_DESC,VICTIM_TYPES,MULTIPLE_OFFENSE,MULTIPLE_BIAS
0,3015,1991,AR0040200,Rogers,,City,AR,Arkansas,West South Central,South,...,White,,1,Intimidation,1.0,Highway/Road/Alley/Street/Sidewalk,Anti-Black or African American,Individual,S,S
1,3016,1991,AR0290100,Hope,,City,AR,Arkansas,West South Central,South,...,Black or African American,,1,Simple Assault,1.0,Highway/Road/Alley/Street/Sidewalk,Anti-White,Individual,S,S
2,43,1991,AR0350100,Pine Bluff,,City,AR,Arkansas,West South Central,South,...,Black or African American,,1,Aggravated Assault,1.0,Residence/Home,Anti-Black or African American,Individual,S,S
3,44,1991,AR0350100,Pine Bluff,,City,AR,Arkansas,West South Central,South,...,Black or African American,,2,Aggravated Assault;Destruction/Damage/Vandalis...,1.0,Highway/Road/Alley/Street/Sidewalk,Anti-White,Individual,M,S
4,3017,1991,AR0350100,Pine Bluff,,City,AR,Arkansas,West South Central,South,...,Black or African American,,1,Aggravated Assault,1.0,Service/Gas Station,Anti-White,Individual,S,S


Complete some small preprocessing steps so that the data will properly insert into postgres

In [66]:
# Fill in null values so that SQL will understand them
hate_crime = hate_crime.fillna(np.nan).replace([np.nan], [None])

In [67]:
# Eliminate commas in certain variables. 
# so, sql will not be confused by the comma delimited and assume extra columns
hate_crime["BIAS_DESC"] = hate_crime["BIAS_DESC"].str.replace(","," ")
hate_crime["OFFENSE_NAME"] = hate_crime["OFFENSE_NAME"].str.replace(","," ")
hate_crime["POPULATION_GROUP_DESC"] = hate_crime["POPULATION_GROUP_DESC"].str.replace(","," ")
hate_crime["PUB_AGENCY_UNIT"] = hate_crime["PUB_AGENCY_UNIT"].str.replace(","," ")
hate_crime["PUB_AGENCY_NAME"] = hate_crime["PUB_AGENCY_NAME"].str.replace(","," ")

Separate the data file into our two tables:
1) Incident

2) Location

**Incident Table**

In [68]:
# Separate the incident data
Incident = hate_crime[['INCIDENT_ID', 'DATA_YEAR', 'INCIDENT_DATE', 'ADULT_VICTIM_COUNT', 'JUVENILE_VICTIM_COUNT',
                      'TOTAL_OFFENDER_COUNT', 'ADULT_OFFENDER_COUNT', 'JUVENILE_OFFENDER_COUNT', 'OFFENDER_RACE',
                      'OFFENDER_ETHNICITY', 'VICTIM_COUNT', 'OFFENSE_NAME', 'TOTAL_INDIVIDUAL_VICTIMS', 'BIAS_DESC',
                      'VICTIM_TYPES', 'MULTIPLE_OFFENSE', 'MULTIPLE_BIAS', 'ORI']]
# View incident data
Incident.sort_values("INCIDENT_ID").head()

Unnamed: 0,INCIDENT_ID,DATA_YEAR,INCIDENT_DATE,ADULT_VICTIM_COUNT,JUVENILE_VICTIM_COUNT,TOTAL_OFFENDER_COUNT,ADULT_OFFENDER_COUNT,JUVENILE_OFFENDER_COUNT,OFFENDER_RACE,OFFENDER_ETHNICITY,VICTIM_COUNT,OFFENSE_NAME,TOTAL_INDIVIDUAL_VICTIMS,BIAS_DESC,VICTIM_TYPES,MULTIPLE_OFFENSE,MULTIPLE_BIAS,ORI
12,2,1991,15-JAN-91,,,0,,,Unknown,,1,Intimidation,1.0,Anti-Black or African American,Individual,S,S,AZ0072300
13,3,1991,22-JAN-91,,,1,,,Unknown,,1,Intimidation,1.0,Anti-Jewish,Individual,S,S,AZ0072300
14,4,1991,23-JAN-91,,,1,,,Unknown,,1,Intimidation,1.0,Anti-Arab,Individual,S,S,AZ0072300
15,5,1991,04-FEB-91,,,1,,,White,,1,Aggravated Assault,1.0,Anti-Black or African American,Individual,S,S,AZ0072300
16,6,1991,14-FEB-91,,,0,,,Unknown,,1,Destruction/Damage/Vandalism of Property,0.0,Anti-Protestant,Religious Organization,S,S,AZ0072300


In [69]:
# Confirm the number of unique incident_ids, which is our primary key
Incident['INCIDENT_ID'].nunique()

219577

In [70]:
# Confirm that this value matches the number of rows
len(Incident.index)

219577

In [71]:
# Convert to data frame type
Incident_df = pd.DataFrame(Incident)

**Location Table**

In [72]:
# Separate the location data
Location = hate_crime[['ORI', 'LOCATION_NAME', 'PUB_AGENCY_NAME', 'PUB_AGENCY_UNIT', 'AGENCY_TYPE_NAME', 'STATE_ABBR',
                      'STATE_NAME', 'DIVISION_NAME', 'REGION_NAME', 'POPULATION_GROUP_CODE', 'POPULATION_GROUP_DESC']]

# View the data
Location.head()

Unnamed: 0,ORI,LOCATION_NAME,PUB_AGENCY_NAME,PUB_AGENCY_UNIT,AGENCY_TYPE_NAME,STATE_ABBR,STATE_NAME,DIVISION_NAME,REGION_NAME,POPULATION_GROUP_CODE,POPULATION_GROUP_DESC
0,AR0040200,Highway/Road/Alley/Street/Sidewalk,Rogers,,City,AR,Arkansas,West South Central,South,5,Cities from 10 000 thru 24 999
1,AR0290100,Highway/Road/Alley/Street/Sidewalk,Hope,,City,AR,Arkansas,West South Central,South,6,Cities from 2 500 thru 9 999
2,AR0350100,Residence/Home,Pine Bluff,,City,AR,Arkansas,West South Central,South,3,Cities from 50 000 thru 99 999
3,AR0350100,Highway/Road/Alley/Street/Sidewalk,Pine Bluff,,City,AR,Arkansas,West South Central,South,3,Cities from 50 000 thru 99 999
4,AR0350100,Service/Gas Station,Pine Bluff,,City,AR,Arkansas,West South Central,South,3,Cities from 50 000 thru 99 999


In [73]:
# Convert to data frame type
Location_df = pd.DataFrame(Location)

In [74]:
# Delete duplicates of combination of LOCATION_NAME and ORI
Location_df = Location_df.drop_duplicates(subset=['ORI', 'LOCATION_NAME'], keep=False)

In [75]:
# View the data frame to check results
Location_df.head()

Unnamed: 0,ORI,LOCATION_NAME,PUB_AGENCY_NAME,PUB_AGENCY_UNIT,AGENCY_TYPE_NAME,STATE_ABBR,STATE_NAME,DIVISION_NAME,REGION_NAME,POPULATION_GROUP_CODE,POPULATION_GROUP_DESC
4,AR0350100,Service/Gas Station,Pine Bluff,,City,AR,Arkansas,West South Central,South,3,Cities from 50 000 thru 99 999
5,AR0350100,Grocery/Supermarket,Pine Bluff,,City,AR,Arkansas,West South Central,South,3,Cities from 50 000 thru 99 999
6,AR0600200,School/College,Little Rock,,City,AR,Arkansas,West South Central,South,2,Cities from 100 000 thru 249 999
11,AR0670000,School/College,Sevier,,County,AR,Arkansas,West South Central,South,8D,Non-MSA counties under 10 000
67,CO0010000,Specialty Store,Adams,,County,CO,Colorado,Mountain,West,9B,MSA counties from 25 000 thru 99 999


In [76]:
# Save these data frames to csv
Location_df.to_csv('hate_crime_location.csv', index=False)
Incident_df.to_csv('hate_crime_incident.csv', index=False)

In [33]:
Incident_df.dtypes

INCIDENT_ID                   int64
DATA_YEAR                     int64
INCIDENT_DATE                object
ADULT_VICTIM_COUNT          float64
JUVENILE_VICTIM_COUNT       float64
TOTAL_OFFENDER_COUNT          int64
ADULT_OFFENDER_COUNT        float64
JUVENILE_OFFENDER_COUNT     float64
OFFENDER_RACE                object
OFFENDER_ETHNICITY           object
VICTIM_COUNT                  int64
OFFENSE_NAME                 object
TOTAL_INDIVIDUAL_VICTIMS    float64
BIAS_DESC                    object
VICTIM_TYPES                 object
MULTIPLE_OFFENSE             object
MULTIPLE_BIAS                object
ORI                          object
LOCATION_NAME                object
dtype: object

**Add Data to SQL Database**

In [28]:
# Connect to postgreSQL database with default connections
# Need to add my password for it to work

# This instantiates a persistant client to speak with
conn = psycopg2.connect(host='localhost', dbname='postgres', user='postgres', password=password)

# The above 'conn' code created a connection "object" for us to use in future code

# Next, we instantiate the 'cursor' object, which helps us to execute certain commands
cur = conn.cursor()

**Incident Table**

In [78]:
# Create the new transaction
conn = psycopg2.connect(host='localhost', dbname='postgres', user='postgres', password=password)
cur = conn.cursor()


# Create an empty table, with which we will eventually populate data
# The table is called "practice"
# The 'id' column is an integer, and it is our primary key
# The next column is email, then name, then address; they are all text values
cur.execute("""
    CREATE TABLE Incident(
    INCIDENT_ID integer PRIMARY KEY,
    DATA_YEAR integer,
    INCIDENT_DATE date,
    ADULT_VICTIM_COUNT float,
    JUVENILE_VICTIM_COUNT float,
    TOTAL_OFFENDER_COUNT integer,
    ADULT_OFFENDER_COUNT float,
    JUVENILE_OFFENDER_COUNT float, 
    OFFENDER_RACE text,
    OFFENDER_ETHNICITY text,
    VICTIM_COUNT integer,
    OFFENSE_NAME text,
    TOTAL_INDIVIDUAL_VICTIMS float,
    BIAS_DESC text,
    VICTIM_TYPES text,
    MULTIPLE_OFFENSE text,
    MULTIPLE_BIAS text,
    ORI text
    
)
""")

conn.commit()

In [79]:
# Create the new transaction
conn = psycopg2.connect(host='localhost', dbname='postgres', user='postgres', password=password)
cur = conn.cursor()

# open the saved csv file
with open('hate_crime_incident.csv', 'r') as i:
    # Skip the header row
    next(i) 
    
    # Call the 'incident' table we created; delimited by a comma
    cur.copy_from(i, 'incident', sep=',', null='')

conn.commit()
conn.close()

In [80]:
Location_df.head()

Unnamed: 0,ORI,LOCATION_NAME,PUB_AGENCY_NAME,PUB_AGENCY_UNIT,AGENCY_TYPE_NAME,STATE_ABBR,STATE_NAME,DIVISION_NAME,REGION_NAME,POPULATION_GROUP_CODE,POPULATION_GROUP_DESC
4,AR0350100,Service/Gas Station,Pine Bluff,,City,AR,Arkansas,West South Central,South,3,Cities from 50 000 thru 99 999
5,AR0350100,Grocery/Supermarket,Pine Bluff,,City,AR,Arkansas,West South Central,South,3,Cities from 50 000 thru 99 999
6,AR0600200,School/College,Little Rock,,City,AR,Arkansas,West South Central,South,2,Cities from 100 000 thru 249 999
11,AR0670000,School/College,Sevier,,County,AR,Arkansas,West South Central,South,8D,Non-MSA counties under 10 000
67,CO0010000,Specialty Store,Adams,,County,CO,Colorado,Mountain,West,9B,MSA counties from 25 000 thru 99 999


**Location Table**

In [83]:
# Create the new transaction
conn = psycopg2.connect(host='localhost', dbname='postgres', user='postgres', password=password)
cur = conn.cursor()


# Create an empty table, with which we will eventually populate data
# The table is called "practice"
# The 'id' column is an integer, and it is our primary key
# The next column is email, then name, then address; they are all text values
cur.execute("""
    CREATE TABLE location(
    ORI text NOT NULL,
    LOCATION_NAME text NOT NULL,
    PUB_AGENCY_NAME text,
    PUB_AGENCY_UNIT text,
    AGENCY_TYPE_NAME text,
    STATE_ABBR text,
    STATE_NAME text, 
    DIVISION_NAME text,
    REGION_NAME text,
    POPULATION_GROUP_CODE text,
    POPULATION_GROUP_DESC text,
    CONSTRAINT location_id PRIMARY KEY (ORI,LOCATION_NAME)
    
)
""")

conn.commit()
conn.close()

In [84]:
# Create the new transaction
conn = psycopg2.connect(host='localhost', dbname='postgres', user='postgres', password=password)
cur = conn.cursor()

# open the saved csv file
with open('hate_crime_location.csv', 'r') as l:
    # Skip the header row
    next(l) 
    
    # Call the 'location' table we created; delimited by a comma
    cur.copy_from(l, 'location', sep=',', null='')

conn.commit()
conn.close()

# Compute Queries

1. What states have the highest adult victim count? - Join required
2. Which region had the most incidences in 2020? - Join required
3. What are the top 10 bias descriptions - No join
4. Which offense types are most common among juvenile offenders? - no join - No join
5. What are the top 10 location types for incidents involving adult victims? - Join required

**Query1**

What states have the highest adult victim count? - Join required

In [17]:
with psycopg2.connect(host='localhost', dbname='postgres', user='postgres', password=password) as conn:
    
    query1 = """SELECT state_abbr as state, COUNT(adult_victim_count) as adult_victim_count
                FROM incident AS i
                LEFT JOIN location AS l
                ON i.ori = l.ori
                GROUP BY state_abbr
                ORDER BY adult_victim_count DESC;"""
    state_count = pd.read_sql_query(query1,conn)

state_count.head()

Unnamed: 0,state,adult_victim_count
0,CA,31116
1,WA,24420
2,OH,16284
3,MI,15483
4,NJ,14755


**Query2**

Which region had the most incidences in 2020? - Join required

In [22]:
with psycopg2.connect(host='localhost', dbname='postgres', user='postgres', password=password) as conn:
    
    query2 = """SELECT region_name AS region, COUNT(incident_id) AS incident_count, data_year AS year
                FROM incident AS i
                LEFT JOIN location AS l
                ON i.ori = l.ori
                WHERE data_year = 2020
                GROUP BY region_name, data_year
                ORDER BY incident_count DESC"""
    inc_region = pd.read_sql_query(query2,conn)

inc_region.head()

Unnamed: 0,region,incident_count,year
0,West,15357,2020
1,Northeast,10358,2020
2,South,9514,2020
3,Midwest,9153,2020
4,Other,893,2020


**Query3**

What are the top 10 bias descriptions - No join

In [3]:
with psycopg2.connect(host='localhost', dbname='postgres', user='postgres', password=password) as conn:

    query3 = """SELECT bias_desc, COUNT(*)
                FROM incident
                GROUP BY bias_desc
                ORDER BY count DESC;"""
    top_bias = pd.read_sql_query(query3,conn)

top_bias.head(10)

Unnamed: 0,bias_desc,count
0,Anti-Black or African American,74762
1,Anti-Jewish,28013
2,Anti-White,25193
3,Anti-Gay (Male),21854
4,Anti-Hispanic or Latino,14030
5,Anti-Other Race/Ethnicity/Ancestry,10772
6,Anti-Lesbian Gay Bisexual or Transgender (M...,6707
7,Anti-Asian,6424
8,Anti-Multiple Races Group,5232
9,Anti-Lesbian (Female),4510


**Query4**

Which offense types are most common among juvenile offenders? - No join

In [29]:
with psycopg2.connect(host='localhost', dbname='postgres', user='postgres', password=password) as conn:
    
    query4 = """SELECT offense_name, count(juvenile_offender_count) as juvenile_offender_count
                FROM incident
                GROUP BY offense_name
                ORDER BY juvenile_offender_count DESC"""
    juv_count = pd.read_sql_query(query4,conn)

juv_count.head()

Unnamed: 0,offense_name,juvenile_offender_count
0,Intimidation,13345
1,Destruction/Damage/Vandalism of Property,11118
2,Simple Assault,10741
3,Aggravated Assault,5335
4,Robbery,867


What percent of victims are juvenile victims?

In [13]:
with psycopg2.connect(host='localhost', dbname='postgres', user='postgres', password=password) as conn:
    
    query4a = """SELECT CAST((sum(juvenile_victim_count)/sum(adult_victim_count+juvenile_victim_count)*100) 
                as DECIMAL(10,2))
                as proportion
                FROM incident;"""
    prop_juv = pd.read_sql_query(query4a,conn)

prop_juv

Unnamed: 0,proportion
0,12.1


**Query5**

What are the top 10 location types for incidents involving adult victims? - Join required

In [28]:
with psycopg2.connect(host='localhost', dbname='postgres', user='postgres', password=password) as conn:
    
    query5 = """SELECT location_name, SUM(adult_victim_count) as Total_adult_victims
                FROM incident as i
                LEFT JOIN location as l
                ON i.ori = l.ori
                WHERE adult_victim_count > 0
                GROUP BY location_name
                ORDER BY total_adult_victims DESC"""
    loc_adult = pd.read_sql_query(query5,conn)

loc_adult.head(10)

Unnamed: 0,location_name,total_adult_victims
0,Jail/Prison/Penitentiary/Corrections Facility,5700.0
1,Construction Site,5496.0
2,Department/Discount Store,4978.0
3,Service/Gas Station,4947.0
4,Convenience Store,4843.0
5,Government/Public Building,4781.0
6,Bank/Savings and Loan,4773.0
7,School-Elementary/Secondary,4596.0
8,Grocery/Supermarket,4540.0
9,Shopping Mall,4428.0
