# Data Exploration: Combining all datasets together

In [11]:
import configparser
import json
import os

import requests
# config.ini is looked up in the parent of the current working directory
# (presumably the project root when the notebook runs from a sub-folder).
configPath = os.path.join(os.path.dirname(os.getcwd()), 'config.ini')

# NOTE: ConfigParser.read() silently skips files it cannot open and returns the
# list of files it actually parsed, so an empty list in the cell output means
# that no configuration was loaded.
config = configparser.ConfigParser()
config.read(configPath)

['/home/lpascual/Projects/PoliceShootingsDashboard/config.ini']

In [12]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, StringType, DateType, IntegerType, BooleanType, FloatType

# Local Spark session using all available cores; getOrCreate() reuses an
# already-running session if there is one.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('DataExploration')
    .getOrCreate()
)

In [14]:
# Police Shootings dataset schema, declared as (column, Spark type, nullable)
# triples in the order the columns are applied to the CSV.
psColumns = [
    ('id', IntegerType(), False),
    ('name', StringType(), True),
    ('date', DateType(), True),
    ('manner_of_death', StringType(), True),
    ('armed', StringType(), True),
    ('age', IntegerType(), True),
    ('gender', StringType(), True),
    ('race', StringType(), True),
    ('city', StringType(), True),
    ('state', StringType(), True),
    ('s_o_m_i', BooleanType(), True),
    ('threat_level', StringType(), True),
    ('flee', StringType(), True),
    ('body_camera', BooleanType(), True),
    ('longitude', FloatType(), True),
    ('latitude', FloatType(), True),
    ('is_geocoding_exact', BooleanType(), True),
]
psSchema = StructType([
    StructField(colName, colType, isNullable)
    for colName, colType, isNullable in psColumns
])

# Bearer-token header for the unemployment API; the key is read from config.ini.
headersAuth = {
    'Authorization': 'Bearer {}'.format(config['unemploymentAPI']['unemployment_api_key'])
}

In [None]:
# Load the four raw datasets and register each one as a temp view so the
# following cells can query them with Spark SQL.

# US Cities and Counties (CSV with header, schema inferred)
usCitiesDF = spark.read.option('header', 'True').option('inferSchema', 'true').csv(config['pathways']['usCities'])
usCitiesDF.createOrReplaceTempView('usCities')

# US Demographics (CSV with header, schema inferred)
usDemoDF = spark.read.option('header', 'true').option('inferSchema', 'true').csv(config['pathways']['usDemographics'])
usDemoDF.createOrReplaceTempView('usDemo')

# US Unemployment by County (California only), fetched from the CareerOneStop API
endPointTemplate = 'https://api.careeronestop.org/v1/unemployment/{}/{}/{}'
url = endPointTemplate.format(config['unemploymentAPI']['unemployment_userID'], 'CA', 'county')
# The timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() reports HTTP errors (e.g. a rejected API key) right here
# instead of as a confusing JSON/KeyError on the next line.
apiResponse = requests.get(url, headers=headersAuth, verify=True, timeout=30)
apiResponse.raise_for_status()
response = apiResponse.json()['CountyList']
# spark.read.json() expects an RDD of JSON *strings*. Handing it Python dicts
# makes PySpark fall back to str(dict), which is a Python repr rather than JSON
# and yields corrupt records for None/True/False values, so serialise first.
caUEDataRDD = spark.sparkContext.parallelize([json.dumps(record) for record in response])
# The request above is for 'CA' only, so tag every row with that state code.
caUEDataDF = spark.read.json(caUEDataRDD).withColumn("State", func.lit('CA'))
caUEDataDF.createOrReplaceTempView('usUnemployment')

# Police Shootings (CSV with header, explicit schema from psSchema)
psDF = spark.read.option('header', 'True').schema(psSchema).csv(config['pathways']['policeShootings'])
psDF.createOrReplaceTempView('policeShootings')

## US Cities and Counties


In [None]:
# Keep only the location columns from the 'usCities' view, exposing
# county_name as `county`, and publish the result as 'usCitiesNorm'.
usCitiesDF = spark.table('usCities').select(
    'state_id',
    'state_name',
    func.col('county_name').alias('county'),
    'city',
)

usCitiesDF.createOrReplaceTempView('usCitiesNorm')
usCitiesDF.show(5)

## US Demographics

In [None]:
# Normalize the demographics data and keep only the relevant columns.
# Note: a NULL in sex, race, min_age or max_age marks an aggregate row
#       ("across all" sexes / races / ages). Those rows are filtered out,
#       otherwise the population would be counted more than once.
# NOTE(review): the CASE has no ELSE branch, so a race value that matches none
#       of the listed prefixes ends up as NULL in the output.
# NOTE(review): the labels produced here ('African American', 'American Indian')
#       differ from the ones used for policeShootingsNorm ('Black', 'Native');
#       confirm before joining the two on race.
usDemoQuery = """   
                        SELECT
                            state_name,
                            county_name,
                            sex,
                            min_age,
                            max_age,
                            year,
                            CASE 
                                WHEN race like 'AMERICAN INDIAN%' then 'American Indian'
                                WHEN race like 'SOME OTHER RACE%' then 'Other'
                                WHEN race like 'WHITE%' then 'White'
                                WHEN race like 'ASIAN%' then 'Asian'
                                WHEN race like 'NATIVE HAWAIIAN%' then 'Native Hawaiian'
                                WHEN race like 'TWO OR MORE%' then 'Mixed'
                                WHEN race like 'BLACK%' then 'African American'
                            END as race,
                            population
                        FROM usDemo
                        WHERE year = '2010' AND race is NOT NULL AND sex is NOT NULL 
                              AND min_age is NOT NULL AND max_age is NOT NULL
                        """
usDemoNorm = spark.sql(usDemoQuery)
usDemoNorm.createOrReplaceTempView('usDemoNorm')
usDemoNorm.show(5)

## Police Shootings

In [None]:
# Normalize the police shootings data: expand the one-letter race codes into
# readable labels (anything else, including NULL, becomes 'Not Documented')
# and expose the `state` column as `state_id`.
policeShootingsQuery = """
        SELECT 
            id,
            name,
            date,
            manner_of_death,
            armed,
            age,
            gender,
            s_o_m_i,
            threat_level,
            flee,
            body_camera,
            CASE
                WHEN race = 'A' THEN 'Asian'
                WHEN race = 'B' THEN 'Black'
                WHEN race = 'N' THEN 'Native'
                WHEN race = 'H' THEN 'Hispanic'
                WHEN race = 'W' THEN 'White'
                WHEN race = 'O' THEN 'Other'
                ELSE 'Not Documented'
            END as race, 
            city, 
            state as state_id
        FROM    
          policeShootings

"""
policeShootingsNorm = spark.sql(policeShootingsQuery)
policeShootingsNorm.createOrReplaceTempView('policeShootingsNorm')
policeShootingsNorm.show(5)

## US Unemployment

In [None]:
# Normalize the unemployment data: rename the API columns and strip the
# ' County' suffix from AreaName so the county can be matched against the
# `county` column of usCitiesNorm.
#
# The query reads from the raw 'usUnemployment' view (registered when the data
# was loaded) rather than first registering caUEDataDF under the output view
# name 'caUE'. The previous version rebound both caUEDataDF and 'caUE' to the
# normalized result, so running the cell a second time failed because the raw
# columns (State, AreaName, ...) no longer existed. This version can be re-run.
caUEDataDF = spark.sql("""
        SELECT 
            State as state_id, 
            REPLACE(AreaName, ' County', '') as county,
            UnEmpCount as unemployment_count,
            UnEmpRate as unemployment_rate
        FROM usUnemployment
        """)

caUEDataDF.createOrReplaceTempView("caUE")
caUEDataDF.show(5)

## California Subset

In [None]:
# California subset: attach the county (via usCitiesNorm) and the county-level
# unemployment figures (via caUE) to every police shooting record.
# Both joins are inner joins, so a shooting is dropped when its (state, city)
# has no match in usCitiesNorm or when its county has no row in caUE.
caSubsetQuery = """
    SELECT 
        usc.state_name,
        usc.county,
        ps.name, 
        ps.manner_of_death,
        ps.armed,
        ps.age,
        ps.gender,
        ps.s_o_m_i,
        ps.threat_level,
        ps.flee,
        ps.body_camera,
        ps.race,
        ue.unemployment_count,
        ue.unemployment_rate
    FROM policeShootingsNorm as ps
    JOIN usCitiesNorm as usc
    ON ps.state_id = usc.state_id and ps.city = usc.city
    JOIN caUE as ue 
    ON ps.state_id = ue.state_id and usc.county = ue.county
    WHERE usc.state_name = 'California'
"""
caSubsetDF = spark.sql(caSubsetQuery)
caSubsetDF.show(20)