In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fatal-police-shootings-in-the-us/PercentagePeopleBelowPovertyLevel.csv
/kaggle/input/fatal-police-shootings-in-the-us/PercentOver25CompletedHighSchool.csv
/kaggle/input/fatal-police-shootings-in-the-us/MedianHouseholdIncome2015.csv
/kaggle/input/fatal-police-shootings-in-the-us/PoliceKillingsUS.csv
/kaggle/input/fatal-police-shootings-in-the-us/ShareRaceByCity.csv


In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
from IPython.core.interactiveshell import InteractiveShell
# Show the result of every top-level expression in a cell, not only the last one.
# The inspection cells below rely on this to render .head() and .describe() together.
InteractiveShell.ast_node_interactivity = "all"

# **Percent Over 25 Completed High School**

In [13]:
# Load the high-school completion table; the CSV is decoded as Windows-1252.
over_25_completed_hs = pd.read_csv('/kaggle/input/fatal-police-shootings-in-the-us/PercentOver25CompletedHighSchool.csv', encoding="windows-1252")
# Column dtypes and non-null counts.
over_25_completed_hs.info()
# First rows of the table.
over_25_completed_hs.head()
# Per-column summary statistics.
over_25_completed_hs.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29329 entries, 0 to 29328
Data columns (total 3 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Geographic Area       29329 non-null  object
 1   City                  29329 non-null  object
 2   percent_completed_hs  29329 non-null  object
dtypes: object(3)
memory usage: 687.5+ KB


Unnamed: 0,Geographic Area,City,percent_completed_hs
0,AL,Abanda CDP,21.2
1,AL,Abbeville city,69.1
2,AL,Adamsville city,78.9
3,AL,Addison town,81.4
4,AL,Akron town,68.6


Unnamed: 0,Geographic Area,City,percent_completed_hs
count,29329,29329,29329
unique,51,24255,728
top,PA,Franklin city,100
freq,1762,16,1301


1. percent_completed_hs will need to be converted to numeric values
2. Since the missing entries are few compared to the total size of the dataset, we simply drop them.

In [None]:
# Drop rows whose percent_completed_hs is the '-' placeholder, convert the
# column to float64, report the row counts and save the cleaned table.
is_missing = over_25_completed_hs['percent_completed_hs'] == '-'
print('Before: {} entries'.format(len(over_25_completed_hs)))
print('Entries with missing values: {}'.format(int(is_missing.sum())))

# dropping rows with missing percent_completed_hs
# .loc + .astype build a new frame, so no column is assigned on a filtered
# slice (which would raise SettingWithCopyWarning).
over_25_completed_hs = (
    over_25_completed_hs
    .loc[~is_missing]
    .astype({'percent_completed_hs': 'float64'})
)
print('After: {} entries'.format(len(over_25_completed_hs)))

# Relative path: written to the current working directory instead of the
# filesystem root.
over_25_completed_hs.to_csv('cleaned_over_25_completed_hs.csv')

# Percentage People Below Poverty Level

In [None]:
# Load the poverty table, drop rows whose poverty_rate is the '-' placeholder
# and convert the column to float64.
# The filter and conversion are chained so that each step returns a new frame;
# assigning a column on a filtered slice would raise SettingWithCopyWarning.
people_below_poverty = (
    pd.read_csv(
        '/kaggle/input/fatal-police-shootings-in-the-us/PercentagePeopleBelowPovertyLevel.csv',
        encoding="windows-1252",
    )
    .loc[lambda frame: frame['poverty_rate'] != '-']
    .astype({'poverty_rate': 'float64'})
)

In [None]:
# Inspect the cleaned poverty table: dtypes and non-null counts, first rows,
# then per-column summary statistics.
people_below_poverty.info()
people_below_poverty.head()
people_below_poverty.describe()

# Median Household Income 2015

In [5]:
# Load the 2015 median household income table and keep only rows whose
# 'Median Income' is a plain number, then convert the column to float64.
median_household_income = pd.read_csv('/kaggle/input/fatal-police-shootings-in-the-us/MedianHouseholdIncome2015.csv', encoding="windows-1252")

print('Cleaning data...')
print('Before: {} entries'.format(len(median_household_income)))

income_text = median_household_income['Median Income']
# One mask replaces the chain of successive filters. Rows are kept only if the
# value is present, is not the '(X)' marker, and contains neither '-' (the bare
# '-' placeholder and values such as 2500-) nor '+' (values such as 250000+).
# Both substring checks are literal matches, not regular expressions.
is_plain_number = (
    income_text.notna()
    & (income_text != '(X)')
    & ~income_text.str.contains('-', regex=False, na=False)
    & ~income_text.str.contains('+', regex=False, na=False)
)

# .loc + .astype return a new frame, avoiding the SettingWithCopyWarning that
# assigning a column on a filtered slice would raise.
median_household_income = (
    median_household_income
    .loc[is_plain_number]
    .astype({'Median Income': 'float64'})
)
print('After: {} entries'.format(len(median_household_income)))

Cleaning data...
Before: 29322 entries
After: 27385 entries


In [6]:
# Inspect the cleaned income table: dtypes and non-null counts, first rows,
# then summary statistics of the numeric 'Median Income' column.
median_household_income.info()
median_household_income.head()
median_household_income.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27385 entries, 0 to 29320
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Geographic Area  27385 non-null  object 
 1   City             27385 non-null  object 
 2   Median Income    27385 non-null  float64
dtypes: float64(1), object(2)
memory usage: 855.8+ KB


Unnamed: 0,Geographic Area,City,Median Income
0,AL,Abanda CDP,11207.0
1,AL,Abbeville city,25615.0
2,AL,Adamsville city,42575.0
3,AL,Addison town,37083.0
4,AL,Akron town,21667.0


Unnamed: 0,Median Income
count,27385.0
mean,51177.890268
std,24887.323132
min,4511.0
25%,35625.0
50%,45305.0
75%,59483.0
max,244083.0


# Share Race By City

In [None]:
# Read the per-city race share table; the CSV is decoded as Windows-1252.
share_race_by_city = pd.read_csv(
    '/kaggle/input/fatal-police-shootings-in-the-us/ShareRaceByCity.csv',
    encoding="windows-1252",
)

In [None]:
# Drop rows carrying the '(X)' marker and convert every share column to float64.
share_columns = [
    'share_white',
    'share_black',
    'share_native_american',
    'share_asian',
    'share_hispanic',
]

# Check all share columns for the marker, not just share_white: a marker left
# in any of the other columns would make the float conversion below fail.
has_marker = share_race_by_city[share_columns].eq('(X)').any(axis=1)

# .loc + .astype return a new frame, so no column is assigned on a filtered
# slice (which would raise SettingWithCopyWarning).
share_race_by_city = (
    share_race_by_city
    .loc[~has_marker]
    .astype({column: 'float64' for column in share_columns})
)

In [None]:
# Inspect the cleaned race share table: dtypes and non-null counts, first rows,
# then per-column summary statistics.
share_race_by_city.info()
share_race_by_city.head()
share_race_by_city.describe()

In [None]:
# Box plot of the five race share columns, one box per column.
race_share_columns = [
    'share_white',
    'share_black',
    'share_native_american',
    'share_asian',
    'share_hispanic',
]
plt.figure(figsize=(10, 10))
sns.boxplot(data=share_race_by_city[race_share_columns], whis=1)

# Police Killing US

In [None]:
# Read the police killings table (decoded as Windows-1252) and discard every
# row that has at least one missing value.
police_killing_us = pd.read_csv(
    '/kaggle/input/fatal-police-shootings-in-the-us/PoliceKillingsUS.csv',
    encoding="windows-1252",
).dropna()

In [None]:
# Inspect the police killings table: dtypes and non-null counts, first rows,
# default summary statistics, then summary statistics of the object columns.
police_killing_us.info()
police_killing_us.head()
police_killing_us.describe()
police_killing_us.describe(include='object')

In [None]:
# Grid of age density plots: one row per gender value, one column per race value.
g = sns.FacetGrid(police_killing_us, row='gender', col='race')
g.map(sns.kdeplot, 'age')

# Show the rows where gender is 'F' and race is 'A'.
gender_is_f = police_killing_us['gender'] == 'F'
race_is_a = police_killing_us['race'] == 'A'
police_killing_us.loc[gender_is_f & race_is_a]

# Combine

The city names in the police_killing_us dataset differ from those in the other datasets, so it cannot be joined with them.

In [None]:
# Combine the four demographic tables into one frame keyed by state and city.
join_keys = ['Geographic Area', 'City']

# share_race_by_city spells its state column 'Geographic area' (lowercase "a").
# Rename it first so all four tables carry identical index level names and the
# index join matches on both state and city.
other_tables = [
    people_below_poverty.set_index(join_keys),
    median_household_income.set_index(join_keys),
    share_race_by_city
    .rename(columns={'Geographic area': 'Geographic Area'})
    .set_index(join_keys),
]
demography = (
    over_25_completed_hs
    .set_index(join_keys)
    .join(other=other_tables)
    .reset_index()
)

# Inspect the combined table: dtypes, first rows, numeric and object summaries.
demography.info()
demography.head()
demography.describe()
demography.describe(include=['object'])

In [None]:
# Heatmap of pairwise correlations between the numeric demographic columns.
# The state and city columns are strings; they are excluded explicitly because
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
sns.heatmap(demography.select_dtypes(include='number').corr())

In [None]:
# Show the first rows of the police killings table again for reference.
police_killing_us.head()

In [None]:
# Rows per state in the demography table (one row per listed city).
no_city_in_each_state = demography.groupby('Geographic Area').size()
# Rows per state in the police killings table.
death_in_each_state = police_killing_us.groupby('state').size()

# Ratio of the two counts, aligned on the state code.
death_per_city = death_in_each_state.div(no_city_in_each_state)

# since we are working with states, exclude DC, remaining = 50 states
death_per_city = death_per_city[death_per_city.index != 'DC']

In [None]:
# Bar chart of deaths per city for each state, highest value first.
death_per_city_ranked = death_per_city.sort_values(ascending=False)

plt.figure(figsize=(30, 10))
plt.title('Death per city in each state', fontsize=30)
sns.barplot(x=death_per_city_ranked.index, y=death_per_city_ranked)

Predict if a person is black based on demography
Predict the education of the victim.

Proportion of each race among the victims