In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.core.interactiveshell import InteractiveShell
# Display the result of every top-level expression in a cell (not only the last one),
# which is why cells below can show .head() and .describe() output together.
InteractiveShell.ast_node_interactivity = "all"

# **Percent Over 25 Completed High School**

In [35]:
# Read the raw high-school completion table, then show its schema, first rows and summary.
hs_csv_path = '/content/drive/My Drive/data/fatal-police-shootings-in-the-us/PercentOver25CompletedHighSchool.csv'
over_25_completed_hs = pd.read_csv(hs_csv_path, encoding="windows-1252")
over_25_completed_hs.info()
over_25_completed_hs.head()
over_25_completed_hs.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29329 entries, 0 to 29328
Data columns (total 3 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Geographic Area       29329 non-null  object
 1   City                  29329 non-null  object
 2   percent_completed_hs  29329 non-null  object
dtypes: object(3)
memory usage: 687.5+ KB


Unnamed: 0,Geographic Area,City,percent_completed_hs
0,AL,Abanda CDP,21.2
1,AL,Abbeville city,69.1
2,AL,Adamsville city,78.9
3,AL,Addison town,81.4
4,AL,Akron town,68.6


Unnamed: 0,Geographic Area,City,percent_completed_hs
count,29329,29329,29329
unique,51,24255,728
top,PA,Franklin city,100
freq,1762,16,1301


1. percent_completed_hs will need to be converted to numeric values
2. Since the missing entries are few compared to the total size of the dataset, we are just going to drop them.

In [36]:
# '-' is the placeholder the raw file uses for a missing percentage.
hs_missing_mask = over_25_completed_hs['percent_completed_hs'] == '-'
print('Before: {} entries'.format(len(over_25_completed_hs)))
print('Entries with missing values: {}'.format(hs_missing_mask.sum()))

# Drop the placeholder rows and convert the column in one non-mutating step.
# astype() returns a new frame, so we never assign into a filtered slice
# (the pattern that triggers pandas' SettingWithCopyWarning).
over_25_completed_hs = (
    over_25_completed_hs
    .loc[~hs_missing_mask]
    .astype({'percent_completed_hs': 'float64'})
)
print('After: {} entries'.format(len(over_25_completed_hs)))

Before: 29329 entries
Entries with missing values: 197
After: 29132 entries


# **Percentage People Below Poverty Level**

In [37]:
# Read the raw poverty-rate table, then show its schema, first rows and summary.
poverty_csv_path = '/content/drive/My Drive/data/fatal-police-shootings-in-the-us/PercentagePeopleBelowPovertyLevel.csv'
people_below_poverty = pd.read_csv(poverty_csv_path, encoding="windows-1252")
people_below_poverty.info()
people_below_poverty.head()
people_below_poverty.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29329 entries, 0 to 29328
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Geographic Area  29329 non-null  object
 1   City             29329 non-null  object
 2   poverty_rate     29329 non-null  object
dtypes: object(3)
memory usage: 687.5+ KB


Unnamed: 0,Geographic Area,City,poverty_rate
0,AL,Abanda CDP,78.8
1,AL,Abbeville city,29.1
2,AL,Adamsville city,25.5
3,AL,Addison town,30.7
4,AL,Akron town,42.0


Unnamed: 0,Geographic Area,City,poverty_rate
count,29329,29329,29329
unique,51,24255,771
top,PA,Franklin city,0
freq,1762,16,1464


1. poverty_rate will need to be converted to numeric values
2. since the missing entries are minor compared to the total size of the dataset, we are just going to drop them

In [38]:
# '-' is the placeholder the raw file uses for a missing poverty rate.
poverty_missing_mask = people_below_poverty['poverty_rate'] == '-'
print('Before: {} entries'.format(len(people_below_poverty)))
print('Entries with missing values: {}'.format(poverty_missing_mask.sum()))

# Drop the placeholder rows and convert the column in one non-mutating step.
# astype() returns a new frame, so we never assign into a filtered slice
# (the pattern that triggers pandas' SettingWithCopyWarning).
people_below_poverty = (
    people_below_poverty
    .loc[~poverty_missing_mask]
    .astype({'poverty_rate': 'float64'})
)
print('After: {} entries'.format(len(people_below_poverty)))

Before: 29329 entries
Entries with missing values: 201
After: 29128 entries


# **Median Household Income 2015**

In [39]:
# Read the raw median-income table, then show its schema, first rows and summary.
income_csv_path = '/content/drive/My Drive/data/fatal-police-shootings-in-the-us/MedianHouseholdIncome2015.csv'
median_household_income = pd.read_csv(income_csv_path, encoding="windows-1252")
median_household_income.info()
median_household_income.head()
median_household_income.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29322 entries, 0 to 29321
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Geographic Area  29322 non-null  object
 1   City             29322 non-null  object
 2   Median Income    29271 non-null  object
dtypes: object(3)
memory usage: 687.4+ KB


Unnamed: 0,Geographic Area,City,Median Income
0,AL,Abanda CDP,11207
1,AL,Abbeville city,25615
2,AL,Adamsville city,42575
3,AL,Addison town,37083
4,AL,Akron town,21667


Unnamed: 0,Geographic Area,City,Median Income
count,29322,29322,29271
unique,51,24249,14592
top,PA,Franklin city,(X)
freq,1762,16,1113




1.   '(X)', '-', and na will need to be removed
2.   median income will be converted to numeric values
3.   Values '250000+' and '2500-' are safe to drop since they are just outliers and cause issues when manipulating the data.



In [40]:
print('Cleaning data...')
print('Before: {} entries'.format(len(median_household_income)))

income_text = median_household_income['Median Income']

# Rows with no usable value: the '(X)' and '-' placeholders, and real NaNs.
income_is_placeholder = income_text.isin(['(X)', '-']) | income_text.isna()

# Open-ended values (the '250000+' / '2500-' style entries) cannot be cast to a number.
# Both checks are literal substring tests (regex=False); na=False keeps NaN rows out of
# this mask so they are handled only by the placeholder mask above.
income_is_open_ended = (
    income_text.str.contains('-', regex=False, na=False)
    | income_text.str.contains('+', regex=False, na=False)
)

# Filter once and convert with astype(), which returns a new frame; this avoids assigning
# into a filtered slice (the pattern that triggers pandas' SettingWithCopyWarning).
median_household_income = (
    median_household_income
    .loc[~(income_is_placeholder | income_is_open_ended)]
    .astype({'Median Income': 'float64'})
)
print('After: {} entries'.format(len(median_household_income)))

Cleaning data...
Before: 29322 entries
After: 27385 entries


# **Share Race By City**

In [41]:
# Read the raw race-share table, then show its schema, first rows and summary.
race_csv_path = '/content/drive/My Drive/data/fatal-police-shootings-in-the-us/ShareRaceByCity.csv'
share_race_by_city = pd.read_csv(race_csv_path, encoding="windows-1252")
share_race_by_city.info()
share_race_by_city.head()
share_race_by_city.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29268 entries, 0 to 29267
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Geographic area        29268 non-null  object
 1   City                   29268 non-null  object
 2   share_white            29268 non-null  object
 3   share_black            29268 non-null  object
 4   share_native_american  29268 non-null  object
 5   share_asian            29268 non-null  object
 6   share_hispanic         29268 non-null  object
dtypes: object(7)
memory usage: 1.6+ MB


Unnamed: 0,Geographic area,City,share_white,share_black,share_native_american,share_asian,share_hispanic
0,AL,Abanda CDP,67.2,30.2,0.0,0.0,1.6
1,AL,Abbeville city,54.4,41.4,0.1,1.0,3.1
2,AL,Adamsville city,52.3,44.9,0.5,0.3,2.3
3,AL,Addison town,99.1,0.1,0.0,0.1,0.4
4,AL,Akron town,13.2,86.5,0.0,0.0,0.3


Unnamed: 0,Geographic area,City,share_white,share_black,share_native_american,share_asian,share_hispanic
count,29268,29268,29268,29268,29268,29268,29268
unique,51,24219,997,926,628,409,956
top,PA,Franklin city,100,0,0,0,0
freq,1764,16,1051,6587,6930,8537,2489




1.   Dropping '(X)'
2.   Convert share_white, share_black, share_native_american, share_asian, share_hispanic to numeric values



In [42]:
print('Before: {} entries.'.format(len(share_race_by_city)))

share_columns = ['share_white', 'share_black', 'share_native_american', 'share_asian', 'share_hispanic']

# '(X)' is the placeholder for an unavailable share. Check every share column rather than
# only share_white, so a placeholder in any of them is dropped instead of crashing the cast.
share_is_placeholder = (share_race_by_city[share_columns] == '(X)').any(axis=1)

# Filter once and convert all share columns together. astype() returns a new frame, so we
# never assign into a filtered slice (the pattern behind pandas' SettingWithCopyWarning).
share_race_by_city = (
    share_race_by_city
    .loc[~share_is_placeholder]
    .astype(dict.fromkeys(share_columns, 'float64'))
)

print('After: {} entries.'.format(len(share_race_by_city)))

Before: 29268 entries.
After: 29248 entries.


# **Police Killing US**

In [43]:
# Read the raw shootings table, then show its schema, first rows, and summaries of the
# numeric and the text columns.
shootings_csv_path = '/content/drive/My Drive/data/fatal-police-shootings-in-the-us/FatalPoliceShootingUS.csv'
police_killing_us = pd.read_csv(shootings_csv_path, encoding="windows-1252")
police_killing_us.info()
police_killing_us.head()
police_killing_us.describe()
police_killing_us.describe(include='object')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2535 entries, 0 to 2534
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       2535 non-null   int64  
 1   name                     2535 non-null   object 
 2   date                     2535 non-null   object 
 3   manner_of_death          2535 non-null   object 
 4   armed                    2526 non-null   object 
 5   age                      2458 non-null   float64
 6   gender                   2535 non-null   object 
 7   race                     2340 non-null   object 
 8   city                     2535 non-null   object 
 9   state                    2535 non-null   object 
 10  signs_of_mental_illness  2535 non-null   bool   
 11  threat_level             2535 non-null   object 
 12  flee                     2470 non-null   object 
 13  body_camera              2535 non-null   bool   
dtypes: bool(2), float64(1), 

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera
0,3,Tim Elliot,02/01/15,shot,gun,53.0,M,A,Shelton,WA,True,attack,Not fleeing,False
1,4,Lewis Lee Lembke,02/01/15,shot,gun,47.0,M,W,Aloha,OR,False,attack,Not fleeing,False
2,5,John Paul Quintero,03/01/15,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,False,other,Not fleeing,False
3,8,Matthew Hoffman,04/01/15,shot,toy weapon,32.0,M,W,San Francisco,CA,True,attack,Not fleeing,False
4,9,Michael Rodriguez,04/01/15,shot,nail gun,39.0,M,H,Evans,CO,False,attack,Not fleeing,False


Unnamed: 0,id,age
count,2535.0,2458.0
mean,1445.731755,36.60537
std,794.25949,13.030774
min,3.0,6.0
25%,768.5,26.0
50%,1453.0,34.0
75%,2126.5,45.0
max,2822.0,91.0


Unnamed: 0,name,date,manner_of_death,armed,gender,race,city,state,threat_level,flee
count,2535,2535,2535,2526,2535,2340,2535,2535,2535,2470
unique,2481,879,2,68,2,6,1417,51,3,4
top,TK TK,21/12/16,shot,gun,M,W,Los Angeles,CA,attack,Not fleeing
freq,49,8,2363,1398,2428,1201,39,424,1611,1695




1.   Dropping na



In [44]:
# Keep only shooting records that have a value in every column.
police_killing_us = police_killing_us.dropna(axis=0, how='any')

# **Combining**

Creating 'demographic' dataset, which is the combination of 'people_below_poverty', 'median_household_income', 'share_race_by_city', and 'over_25_completed_hs'

In [45]:
# All four demographic tables are keyed by (state abbreviation, place name).
demographic_keys = ['Geographic Area', 'City']

# ShareRaceByCity spells the state column 'Geographic area' (lower-case 'a'). Normalise it
# before indexing so every frame carries identically named index levels; pandas can match
# MultiIndex levels by name, so a mismatch risks aligning on 'City' alone.
demographic_parts = [
    people_below_poverty.set_index(demographic_keys),
    median_household_income.set_index(demographic_keys),
    share_race_by_city
    .rename(columns={'Geographic area': 'Geographic Area'})
    .set_index(demographic_keys),
]

# Left join onto the high-school table: every place with a completion rate is kept, and
# values missing from the other tables become NaN.
demographic = (
    over_25_completed_hs
    .set_index(demographic_keys)
    .join(other=demographic_parts)
    .reset_index()
)
demographic.info()
demographic.head()
demographic.describe()
demographic.describe(include=['object'])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29132 entries, 0 to 29131
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Geographic Area        29132 non-null  object 
 1   City                   29132 non-null  object 
 2   percent_completed_hs   29132 non-null  float64
 3   poverty_rate           29125 non-null  float64
 4   Median Income          27383 non-null  float64
 5   share_white            28930 non-null  float64
 6   share_black            28930 non-null  float64
 7   share_native_american  28930 non-null  float64
 8   share_asian            28930 non-null  float64
 9   share_hispanic         28930 non-null  float64
dtypes: float64(8), object(2)
memory usage: 2.2+ MB


Unnamed: 0,Geographic Area,City,percent_completed_hs,poverty_rate,Median Income,share_white,share_black,share_native_american,share_asian,share_hispanic
0,AL,Abanda CDP,21.2,78.8,11207.0,67.2,30.2,0.0,0.0,1.6
1,AL,Abbeville city,69.1,29.1,25615.0,54.4,41.4,0.1,1.0,3.1
2,AL,Adamsville city,78.9,25.5,42575.0,52.3,44.9,0.5,0.3,2.3
3,AL,Addison town,81.4,30.7,37083.0,99.1,0.1,0.0,0.1,0.4
4,AL,Akron town,68.6,42.0,21667.0,13.2,86.5,0.0,0.0,0.3


Unnamed: 0,percent_completed_hs,poverty_rate,Median Income,share_white,share_black,share_native_american,share_asian,share_hispanic
count,29132.0,29125.0,27383.0,28930.0,28930.0,28930.0,28930.0,28930.0
mean,85.733805,16.444045,51177.670672,83.190149,6.882655,2.856685,1.547159,9.203426
std,11.875153,13.176623,24887.873873,21.649996,15.673815,12.627056,4.293338,17.269697
min,0.0,0.0,4511.0,0.0,0.0,0.0,0.0,0.0
25%,81.0,7.1,35625.0,78.5,0.1,0.1,0.0,1.2
50%,88.4,13.7,45305.0,92.5,0.8,0.3,0.4,2.9
75%,93.5,22.5,59482.0,96.9,4.3,0.9,1.2,8.0
max,100.0,100.0,244083.0,100.0,100.0,100.0,67.1,100.0


Unnamed: 0,Geographic Area,City
count,29132,29132
unique,51,24089
top,PA,Franklin city
freq,1761,16


In [46]:
# Peek at the shootings table to compare its city/state columns with 'demographic'.
police_killing_us.head(n=5)

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera
0,3,Tim Elliot,02/01/15,shot,gun,53.0,M,A,Shelton,WA,True,attack,Not fleeing,False
1,4,Lewis Lee Lembke,02/01/15,shot,gun,47.0,M,W,Aloha,OR,False,attack,Not fleeing,False
2,5,John Paul Quintero,03/01/15,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,False,other,Not fleeing,False
3,8,Matthew Hoffman,04/01/15,shot,toy weapon,32.0,M,W,San Francisco,CA,True,attack,Not fleeing,False
4,9,Michael Rodriguez,04/01/15,shot,nail gun,39.0,M,H,Evans,CO,False,attack,Not fleeing,False


In [47]:
# Number of demographic rows per state, smallest first.
rows_per_state = demographic.groupby('Geographic Area').size()
rows_per_state.sort_values()

Geographic Area
DC       1
RI      36
DE      77
NH      97
VT     121
NV     130
ME     130
CT     144
HI     151
WY     186
ID     225
MA     245
UT     318
AK     336
MS     362
MT     363
OR     374
SD     384
SC     395
ND     400
WV     404
TN     430
NM     432
AZ     444
CO     452
LA     473
MD     516
KY     539
AR     541
NJ     543
NE     577
AL     584
VA     593
WA     619
GA     624
KS     670
IN     680
MI     692
NC     738
OK     739
WI     777
MN     903
FL     915
IA    1003
MO    1024
NY    1195
OH    1213
IL    1365
CA    1501
TX    1710
PA    1761
dtype: int64

In [48]:
# Place names recorded for Rhode Island in 'demographic', in alphabetical order.
is_rhode_island = demographic['Geographic Area'] == 'RI'
demographic.loc[is_rhode_island, 'City'].sort_values()

23159                Ashaway CDP
23160               Bradford CDP
23161               Carolina CDP
23162         Central Falls city
23163            Charlestown CDP
23164              Chepachet CDP
23165              Clayville CDP
23166              Cranston city
23167        Cumberland Hill CDP
23168       East Providence city
23169          Foster Center CDP
23170                 Greene CDP
23171             Greenville CDP
23172                Harmony CDP
23173            Harrisville CDP
23174            Hope Valley CDP
23175              Hopkinton CDP
23176               Kingston CDP
23177               Melville CDP
23178            Misquamicut CDP
23179      Narragansett Pier CDP
23181           Newport East CDP
23180               Newport city
23182                Pascoag CDP
23183             Pawtucket city
23184            Providence city
23185          Quonochontaug CDP
23186               Tiverton CDP
23187           Valley Falls CDP
23188    Wakefield-Peacedale CDP
23189     

In [49]:
# City names recorded for Rhode Island in the shootings table, for comparison.
shooting_in_rhode_island = police_killing_us['state'] == 'RI'
police_killing_us.loc[shooting_in_rhode_island, 'city']

1226    Pawtucket
1431     Tiverton
Name: city, dtype: object

We can see that the city names in 'police_killing_us' differ from those in 'demographic' only in that the latter carry a place-type suffix such as 'CDP', 'city' or 'town'. We are going to remove those suffixes.

In [50]:
# Count the last whitespace-separated word of each place name to see which suffixes occur.
city_name_tokens = demographic['City'].str.split()
city_name_tokens.str.get(-1).value_counts()

city            10156
CDP              9512
town             4345
village          3757
borough          1209
County)           119
government          9
(balance)           8
municipality        4
Counties)           3
County              3
City                2
Princeton           1
Bow                 1
county              1
Washington          1
corporation         1
Name: City, dtype: int64

In [51]:
# Strip the place-type suffix (' city', ' CDP', ' town') together with anything that
# follows it, so demographic place names line up with the bare names in police_killing_us.
# regex=True is passed explicitly: the pattern relies on '.*', and pandas >= 2.0 treats
# str.replace patterns as literal text by default, which would leave the names unchanged.
# One alternation truncates at the earliest suffix, matching the three sequential passes.
# NOTE: ' borough' and ' village' suffixes are not stripped (that step was left disabled).
demographic['City'] = demographic['City'].str.replace(r' (?:city|CDP|town).*', '', regex=True)

In [52]:
# Give demographic's key columns the same names police_killing_us uses.
key_column_renames = {'Geographic Area': 'state', 'City': 'city'}
demographic = demographic.rename(columns=key_column_renames)
demographic['city'].head()

# Attach shooting records to each demographic row on (state, city). DataFrame.join is a
# left join by default, so every demographic row is kept whether or not it has a shooting.
shooting_join_keys = ['state', 'city']
demographic_by_place = demographic.set_index(shooting_join_keys)
shootings_by_place = police_killing_us.set_index(shooting_join_keys)
fatal_police_shooting = demographic_by_place.join(shootings_by_place).reset_index()
fatal_police_shooting.head()
fatal_police_shooting.info()

0        Abanda
1     Abbeville
2    Adamsville
3       Addison
4         Akron
Name: city, dtype: object

Unnamed: 0,state,city,percent_completed_hs,poverty_rate,Median Income,share_white,share_black,share_native_american,share_asian,share_hispanic,id,name,date,manner_of_death,armed,age,gender,race,signs_of_mental_illness,threat_level,flee,body_camera
0,AK,Adak,93.4,39.3,78500.0,19.6,4.0,5.5,52.5,8.9,,,,,,,,,,,,
1,AK,Akhiok,62.5,40.5,26250.0,8.5,1.4,50.7,1.4,11.3,,,,,,,,,,,,
2,AK,Akiachak,76.7,26.1,38750.0,3.5,0.2,95.1,0.2,0.2,,,,,,,,,,,,
3,AK,Akiak,81.3,31.3,42000.0,5.2,0.0,92.8,0.0,0.3,,,,,,,,,,,,
4,AK,Akutan,73.0,16.1,24750.0,23.3,17.9,5.5,43.3,20.8,,,,,,,,,,,,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29969 entries, 0 to 29968
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   state                    29969 non-null  object 
 1   city                     29969 non-null  object 
 2   percent_completed_hs     29969 non-null  float64
 3   poverty_rate             29962 non-null  float64
 4   Median Income            28207 non-null  float64
 5   share_white              29767 non-null  float64
 6   share_black              29767 non-null  float64
 7   share_native_american    29767 non-null  float64
 8   share_asian              29767 non-null  float64
 9   share_hispanic           29767 non-null  float64
 10  id                       1958 non-null   float64
 11  name                     1958 non-null   object 
 12  date                     1958 non-null   object 
 13  manner_of_death          1958 non-null   object 
 14  armed                 

In [53]:
# Show the combined frame, then the number of cleaned shooting records for comparison.
fatal_police_shooting
print(police_killing_us.shape[0])

Unnamed: 0,state,city,percent_completed_hs,poverty_rate,Median Income,share_white,share_black,share_native_american,share_asian,share_hispanic,id,name,date,manner_of_death,armed,age,gender,race,signs_of_mental_illness,threat_level,flee,body_camera
0,AK,Adak,93.4,39.3,78500.0,19.6,4.0,5.5,52.5,8.9,,,,,,,,,,,,
1,AK,Akhiok,62.5,40.5,26250.0,8.5,1.4,50.7,1.4,11.3,,,,,,,,,,,,
2,AK,Akiachak,76.7,26.1,38750.0,3.5,0.2,95.1,0.2,0.2,,,,,,,,,,,,
3,AK,Akiak,81.3,31.3,42000.0,5.2,0.0,92.8,0.0,0.3,,,,,,,,,,,,
4,AK,Akutan,73.0,16.1,24750.0,23.3,17.9,5.5,43.3,20.8,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29964,WY,Woods Landing-Jelm,100.0,18.6,,95.9,0.0,0.0,2.1,0.0,,,,,,,,,,,,
29965,WY,Worland,85.6,15.3,41523.0,89.9,0.3,1.3,0.6,16.6,,,,,,,,,,,,
29966,WY,Wright,89.2,5.9,77114.0,94.5,0.1,1.4,0.2,6.2,,,,,,,,,,,,
29967,WY,Y-O Ranch,100.0,0.0,,92.8,1.5,2.6,0.0,11.8,,,,,,,,,,,,


2254


# **Output**
Even though we can merge all the data into one single dataset, 'demographic' and 'police_killing_us' hold distinct contexts and can be analyzed separately. Therefore, I'm going to output both of them, along with the combined and individually cleaned datasets:
1.   demographic.csv
2.   police_killing_us.csv
3.   combined.csv (joined data between the other two)
4.   cleaned-MedianHouseholdIncome2015.csv
5.   cleaned-PercentOver25CompletedHighSchool.csv
6.   cleaned-PercentagePeopleBelowPoverty.csv
7.   cleaned-PoliceKillingsUS.csv
8.   cleaned-ShareRaceByCity.csv


In [54]:
# Write every cleaned/combined frame to the cleaned-data folder, without the index column.
# police_killing_us is deliberately written under two file names.
cleaned_data_dir = '/content/drive/My Drive/data/cleaned-data'
csv_exports = [
    ('demographic.csv', demographic),
    ('police_killing_us.csv', police_killing_us),
    ('combined.csv', fatal_police_shooting),
    ('cleaned-MedianHouseholdIncome2015.csv', median_household_income),
    ('cleaned-PercentOver25CompletedHighSchool.csv', over_25_completed_hs),
    ('cleaned-PercentagePeopleBelowPoverty.csv', people_below_poverty),
    ('cleaned-PoliceKillingsUS.csv', police_killing_us),
    ('cleaned-ShareRaceByCity.csv', share_race_by_city),
]
for csv_name, frame in csv_exports:
    frame.to_csv('{}/{}'.format(cleaned_data_dir, csv_name), index=False)