In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# **Percent Over 25 Completed High School**

In [39]:
# Load the raw high-school completion table (file is windows-1252 encoded),
# then show its schema, its first rows, and a per-column summary.
completed_hs_csv = '/content/drive/My Drive/data/fatal-police-shootings-in-the-us/PercentOver25CompletedHighSchool.csv'
over_25_completed_hs = pd.read_csv(completed_hs_csv, encoding="windows-1252")

over_25_completed_hs.info()
over_25_completed_hs.head()
over_25_completed_hs.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29329 entries, 0 to 29328
Data columns (total 3 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Geographic Area       29329 non-null  object
 1   City                  29329 non-null  object
 2   percent_completed_hs  29329 non-null  object
dtypes: object(3)
memory usage: 687.5+ KB


Unnamed: 0,Geographic Area,City,percent_completed_hs
0,AL,Abanda CDP,21.2
1,AL,Abbeville city,69.1
2,AL,Adamsville city,78.9
3,AL,Addison town,81.4
4,AL,Akron town,68.6


Unnamed: 0,Geographic Area,City,percent_completed_hs
count,29329,29329,29329
unique,51,24255,728
top,PA,Franklin city,100
freq,1762,16,1301


1. `percent_completed_hs` needs to be converted to numeric values.
2. Missing entries (recorded as '-') are few compared to the total size of the dataset, so we simply drop them.

In [26]:
# Clean the high-school completion table: drop the rows whose percentage is
# the '-' placeholder, convert the column to float, and save the result.
print('Before: {} entries'.format(len(over_25_completed_hs)))

# Compute the placeholder mask once and reuse it for the count and the filter.
hs_is_missing = over_25_completed_hs['percent_completed_hs'] == '-'
print('Entries with missing values: {}'.format(hs_is_missing.sum()))

# dropping rows with missing percent_completed_hs
# DataFrame.astype with a column mapping returns a new frame, so the
# conversion never assigns into a filtered slice (the pattern that triggers
# SettingWithCopyWarning).
over_25_completed_hs = over_25_completed_hs[~hs_is_missing].astype({'percent_completed_hs': 'float64'})
print('After: {} entries'.format(len(over_25_completed_hs)))

# NOTE(review): other cleaning cells in this notebook write under
# '/content/drive/My Drive/data/cleaned-data/'; confirm this path is intended.
over_25_completed_hs.to_csv('/content/data/cleaned-data/cleaned-PercentOver25CompletedHighSchool.csv')

Before: 29329 entries
Entries with missing values: 197
After: 29132 entries


# **Percentage People Below Poverty Level**

In [40]:
# Load the raw poverty-rate table (file is windows-1252 encoded), then show
# its schema, its first rows, and a per-column summary.
poverty_csv = '/content/drive/My Drive/data/fatal-police-shootings-in-the-us/PercentagePeopleBelowPovertyLevel.csv'
people_below_poverty = pd.read_csv(poverty_csv, encoding="windows-1252")

people_below_poverty.info()
people_below_poverty.head()
people_below_poverty.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29329 entries, 0 to 29328
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Geographic Area  29329 non-null  object
 1   City             29329 non-null  object
 2   poverty_rate     29329 non-null  object
dtypes: object(3)
memory usage: 687.5+ KB


Unnamed: 0,Geographic Area,City,poverty_rate
0,AL,Abanda CDP,78.8
1,AL,Abbeville city,29.1
2,AL,Adamsville city,25.5
3,AL,Addison town,30.7
4,AL,Akron town,42.0


Unnamed: 0,Geographic Area,City,poverty_rate
count,29329,29329,29329
unique,51,24255,771
top,PA,Franklin city,0
freq,1762,16,1464


1. `poverty_rate` needs to be converted to numeric values.
2. Missing entries (recorded as '-') are few compared to the total size of the dataset, so we simply drop them.

In [30]:
# Clean the poverty-rate table: drop the rows whose rate is the '-'
# placeholder, convert the column to float, and save the result.
print('Before: {} entries'.format(len(people_below_poverty)))

# Compute the placeholder mask once and reuse it for the count and the filter.
poverty_is_missing = people_below_poverty['poverty_rate'] == '-'
print('Entries with missing values: {}'.format(poverty_is_missing.sum()))

# DataFrame.astype with a column mapping returns a new frame, so the
# conversion never assigns into a filtered slice; the previous
# slice-then-assign form emitted SettingWithCopyWarning.
people_below_poverty = people_below_poverty[~poverty_is_missing].astype({'poverty_rate': 'float64'})
print('After: {} entries'.format(len(people_below_poverty)))

# NOTE(review): other cleaning cells in this notebook write under
# '/content/drive/My Drive/data/cleaned-data/'; confirm this path is intended.
people_below_poverty.to_csv('/content/data/cleaned-data/cleaned-PercentagePeopleBelowPoverty.csv')

Before: 29329 entries
Entries with missing values: 201
After: 29128 entries


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


# **Median Household Income 2015**

In [55]:
# Load the raw 2015 median household income table (file is windows-1252
# encoded), then show its schema, its first rows, and a per-column summary.
median_income_csv = '/content/drive/My Drive/data/fatal-police-shootings-in-the-us/MedianHouseholdIncome2015.csv'
median_household_income = pd.read_csv(median_income_csv, encoding="windows-1252")

median_household_income.info()
median_household_income.head()
median_household_income.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29322 entries, 0 to 29321
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Geographic Area  29322 non-null  object
 1   City             29322 non-null  object
 2   Median Income    29271 non-null  object
dtypes: object(3)
memory usage: 687.4+ KB


Unnamed: 0,Geographic Area,City,Median Income
0,AL,Abanda CDP,11207
1,AL,Abbeville city,25615
2,AL,Adamsville city,42575
3,AL,Addison town,37083
4,AL,Akron town,21667


Unnamed: 0,Geographic Area,City,Median Income
count,29322,29322,29271
unique,51,24249,14592
top,PA,Franklin city,(X)
freq,1762,16,1113




1.   Rows whose median income is '(X)', '-', or missing (NA) will be removed
2.   Median income will be converted to numeric values
3.   The open-ended values 250000+ and 2500- are safe to drop, since they are just outliers and cause issues when manipulating the data.



In [56]:
# Clean the median household income table: keep only rows whose income is a
# plain number, convert the column to float, and save the result.
print('Cleaning data...')
print('Before: {} entries'.format(len(median_household_income)))

income_text = median_household_income['Median Income']

# One explicit mask instead of five successive re-filterings. A row is kept
# when its income is:
#   * present (not NaN),
#   * not the '(X)' placeholder,
#   * free of '-' (covers the bare '-' placeholder and the 2500- bucket),
#   * free of '+' (covers the 250000+ bucket).
# na=False makes the string tests well defined on missing values, and
# regex=False treats '-' and '+' as literal characters.
has_usable_income = (
    income_text.notna()
    & (income_text != '(X)')
    & ~income_text.str.contains('-', regex=False, na=False)
    & ~income_text.str.contains('+', regex=False, na=False)
)

# DataFrame.astype with a column mapping returns a new frame, so the
# conversion never assigns into a filtered slice (the pattern that triggers
# SettingWithCopyWarning).
median_household_income = median_household_income[has_usable_income].astype({'Median Income': 'float64'})
print('After: {} entries'.format(len(median_household_income)))
median_household_income.to_csv('/content/drive/My Drive/data/cleaned-data/cleaned-MedianHouseholdIncome2015.csv')

Cleaning data...
Before: 29322 entries
After: 27385 entries


# **Share Race By City**

In [41]:
# Load the raw race-share-by-city table (file is windows-1252 encoded), then
# show its schema, its first rows, and a per-column summary.
share_race_csv = '/content/drive/My Drive/data/fatal-police-shootings-in-the-us/ShareRaceByCity.csv'
share_race_by_city = pd.read_csv(share_race_csv, encoding="windows-1252")

share_race_by_city.info()
share_race_by_city.head()
share_race_by_city.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29268 entries, 0 to 29267
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Geographic area        29268 non-null  object
 1   City                   29268 non-null  object
 2   share_white            29268 non-null  object
 3   share_black            29268 non-null  object
 4   share_native_american  29268 non-null  object
 5   share_asian            29268 non-null  object
 6   share_hispanic         29268 non-null  object
dtypes: object(7)
memory usage: 1.6+ MB


Unnamed: 0,Geographic area,City,share_white,share_black,share_native_american,share_asian,share_hispanic
0,AL,Abanda CDP,67.2,30.2,0.0,0.0,1.6
1,AL,Abbeville city,54.4,41.4,0.1,1.0,3.1
2,AL,Adamsville city,52.3,44.9,0.5,0.3,2.3
3,AL,Addison town,99.1,0.1,0.0,0.1,0.4
4,AL,Akron town,13.2,86.5,0.0,0.0,0.3


Unnamed: 0,Geographic area,City,share_white,share_black,share_native_american,share_asian,share_hispanic
count,29268,29268,29268,29268,29268,29268,29268
unique,51,24219,997,926,628,409,956
top,PA,Franklin city,100,0,0,0,0
freq,1764,16,1051,6587,6930,8537,2489




1.   Drop rows marked '(X)'
2.   Convert share_white, share_black, share_native_american, share_asian, share_hispanic to numeric values



In [47]:
# Clean the race-share table: drop the rows marked '(X)', convert every share
# column to float, and save the result.
print('Before: {} entries.'.format(len(share_race_by_city)))

share_columns = [
    'share_white',
    'share_black',
    'share_native_american',
    'share_asian',
    'share_hispanic',
]

# Rows are selected on share_white only, as before; the conversion below
# raises if any other share column still holds a non-numeric value.
has_share_values = share_race_by_city['share_white'] != '(X)'

# A single astype call with a column mapping replaces five copy-pasted
# assignments and returns a new frame, so nothing is assigned into a filtered
# slice (the pattern that triggers SettingWithCopyWarning).
share_race_by_city = share_race_by_city[has_share_values].astype(dict.fromkeys(share_columns, 'float64'))

print('After: {} entries.'.format(len(share_race_by_city)))
share_race_by_city.to_csv('/content/drive/My Drive/data/cleaned-data/cleaned-ShareRaceByCity.csv')

Before: 29248 entries.
After: 29248 entries.


  res_values = method(rvalues)


# **Police Killing US**

In [49]:
# Load the raw police-killings table (file is windows-1252 encoded), then show
# its schema, its first rows, and summaries of the numeric and the object
# (text) columns.
police_killings_csv = '/content/drive/My Drive/data/fatal-police-shootings-in-the-us/PoliceKillingsUS.csv'
police_killing_us = pd.read_csv(police_killings_csv, encoding="windows-1252")

police_killing_us.info()
police_killing_us.head()
police_killing_us.describe()
police_killing_us.describe(include='object')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2535 entries, 0 to 2534
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       2535 non-null   int64  
 1   name                     2535 non-null   object 
 2   date                     2535 non-null   object 
 3   manner_of_death          2535 non-null   object 
 4   armed                    2526 non-null   object 
 5   age                      2458 non-null   float64
 6   gender                   2535 non-null   object 
 7   race                     2340 non-null   object 
 8   city                     2535 non-null   object 
 9   state                    2535 non-null   object 
 10  signs_of_mental_illness  2535 non-null   bool   
 11  threat_level             2535 non-null   object 
 12  flee                     2470 non-null   object 
 13  body_camera              2535 non-null   bool   
dtypes: bool(2), float64(1), 

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera
0,3,Tim Elliot,02/01/15,shot,gun,53.0,M,A,Shelton,WA,True,attack,Not fleeing,False
1,4,Lewis Lee Lembke,02/01/15,shot,gun,47.0,M,W,Aloha,OR,False,attack,Not fleeing,False
2,5,John Paul Quintero,03/01/15,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,False,other,Not fleeing,False
3,8,Matthew Hoffman,04/01/15,shot,toy weapon,32.0,M,W,San Francisco,CA,True,attack,Not fleeing,False
4,9,Michael Rodriguez,04/01/15,shot,nail gun,39.0,M,H,Evans,CO,False,attack,Not fleeing,False


Unnamed: 0,id,age
count,2535.0,2458.0
mean,1445.731755,36.60537
std,794.25949,13.030774
min,3.0,6.0
25%,768.5,26.0
50%,1453.0,34.0
75%,2126.5,45.0
max,2822.0,91.0


Unnamed: 0,name,date,manner_of_death,armed,gender,race,city,state,threat_level,flee
count,2535,2535,2535,2526,2535,2340,2535,2535,2535,2470
unique,2481,879,2,68,2,6,1417,51,3,4
top,TK TK,24/01/17,shot,gun,M,W,Los Angeles,CA,attack,Not fleeing
freq,49,8,2363,1398,2428,1201,39,424,1611,1695




1.   Drop rows with missing (NA) values



In [0]:
# Keep only the records that have no missing value in any column, then save
# the cleaned table.
police_killing_us = police_killing_us.dropna(axis=0, how='any')
cleaned_police_killings_csv = '/content/drive/My Drive/data/cleaned-data/cleaned-PoliceKillingsUS.csv'
police_killing_us.to_csv(cleaned_police_killings_csv)