In [1101]:
import itertools

import numpy
import pandas as pd
import random
import sklearn
from sklearn.model_selection import train_test_split
import datetime
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sb

# import graphviz

In [1102]:
"""
Our first goal is going to be data cleaning and munging. We want to take care of all null values
first.
"""

df_training = pd.read_csv('police_killings_train.csv')
df_test = pd.read_csv('police_killings_test.csv')
df_race_city = pd.read_csv('share_race_by_city.csv')
df_income = pd.read_csv('income.csv', encoding="ANSI")
df_poverty = pd.read_csv('poverty.csv', encoding="ANSI")
df_education = pd.read_csv('education.csv', encoding="ANSI")

df_test.info()
df_training

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 507 entries, 0 to 506
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       507 non-null    int64  
 1   name                     507 non-null    object 
 2   date                     507 non-null    object 
 3   manner_of_death          507 non-null    object 
 4   armed                    504 non-null    object 
 5   age                      467 non-null    float64
 6   gender                   507 non-null    object 
 7   race                     403 non-null    object 
 8   city                     507 non-null    object 
 9   state                    507 non-null    object 
 10  signs_of_mental_illness  507 non-null    bool   
 11  threat_level             507 non-null    object 
 12  flee                     469 non-null    object 
 13  body_camera              507 non-null    bool   
dtypes: bool(2), float64(1), in

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera
0,3,Tim Elliot,02/01/15,shot,gun,53.0,M,A,Shelton,WA,True,attack,Not fleeing,False
1,4,Lewis Lee Lembke,02/01/15,shot,gun,47.0,M,W,Aloha,OR,False,attack,Not fleeing,False
2,5,John Paul Quintero,03/01/15,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,False,other,Not fleeing,False
3,8,Matthew Hoffman,04/01/15,shot,toy weapon,32.0,M,W,San Francisco,CA,True,attack,Not fleeing,False
4,9,Michael Rodriguez,04/01/15,shot,nail gun,39.0,M,H,Evans,CO,False,attack,Not fleeing,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023,2256,Jeremy Lopez-Robledo,24/01/17,shot,knife,29.0,M,H,Las Cruces,NM,True,attack,Foot,True
2024,2257,Jonathan David Sper,24/01/17,shot,unarmed,30.0,M,W,Algoma Township,MI,True,attack,Not fleeing,False
2025,2258,Jose Efrain Rodriguez,24/01/17,shot and Tasered,gun,18.0,M,H,Lancaster City,PA,False,attack,Not fleeing,False
2026,2259,Ramon Milanez,24/01/17,shot,gun,32.0,M,H,Kuna,ID,False,attack,Car,False


In [1103]:
"""
We will combine the training and tesing dataset to make cleaning easier and more accurate
"""

df_total = df_training.append(df_test, ignore_index=True)
df_total

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera
0,3,Tim Elliot,02/01/15,shot,gun,53.0,M,A,Shelton,WA,True,attack,Not fleeing,False
1,4,Lewis Lee Lembke,02/01/15,shot,gun,47.0,M,W,Aloha,OR,False,attack,Not fleeing,False
2,5,John Paul Quintero,03/01/15,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,False,other,Not fleeing,False
3,8,Matthew Hoffman,04/01/15,shot,toy weapon,32.0,M,W,San Francisco,CA,True,attack,Not fleeing,False
4,9,Michael Rodriguez,04/01/15,shot,nail gun,39.0,M,H,Evans,CO,False,attack,Not fleeing,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2530,2822,Rodney E. Jacobs,28/07/17,shot,gun,31.0,M,,Kansas City,MO,False,attack,Not fleeing,False
2531,2813,TK TK,28/07/17,shot,vehicle,,M,,Albuquerque,NM,False,attack,Car,False
2532,2818,Dennis W. Robinson,29/07/17,shot,gun,48.0,M,,Melba,ID,False,attack,Car,False
2533,2817,Isaiah Tucker,31/07/17,shot,vehicle,28.0,M,B,Oshkosh,WI,False,attack,Car,True


In [1104]:
print(df_income.info())
df_race_city.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29322 entries, 0 to 29321
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Geographic Area  29322 non-null  object
 1   City             29322 non-null  object
 2   Median Income    29271 non-null  object
dtypes: object(3)
memory usage: 687.4+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29268 entries, 0 to 29267
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Geographic area        29268 non-null  object
 1   City                   29268 non-null  object
 2   share_white            29268 non-null  object
 3   share_black            29268 non-null  object
 4   share_native_american  29268 non-null  object
 5   share_asian            29268 non-null  object
 6   share_hispanic         29268 non-null  object
dtypes: object(7)
memory usage: 1.6+ MB


In [1105]:
"""
There are many missing fields across all the supporting/optional datasets, in order to resolve this we need
to merge these datasets and then make the necessary imputation.
1. All data sets have a state and city type columns, merge those columns such that they can all have a column
to for indexing.
2. Get the list of cities within the mandatory dataset, we need to confirm that the supporting dataset
has these cities and if not then we need to impute.
"""
#make a list of unique cities, and include the state so we don't deal with same city names of different states
df_total['city'] = df_total[['state', 'city']].apply("-".join,axis=1)
df_race_city['city'] = df_race_city[['Geographic area', 'City']].apply("-".join,axis=1)
df_income['city'] = df_income[['Geographic Area', 'City']].apply("-".join,axis=1)
df_poverty['city'] = df_poverty[['Geographic Area', 'City']].apply("-".join,axis=1)
df_education['city'] = df_education[['Geographic Area', 'City']].apply("-".join,axis=1)
df_income = df_income.drop(['Geographic Area', 'City'], axis=1)
df_poverty = df_poverty.drop(columns=['Geographic Area', 'City'])
df_education = df_education.drop(columns=['Geographic Area', 'City'])
cities = df_total['city'].tolist()
cities =list(set(cities))
print(len(cities))
cities

1534


['TN-Columbia',
 'GA-Sylvester',
 'IL-Metropolis',
 'AL-Selma',
 'SC-Johnsonville',
 'AL-Lawrence County',
 'TN-Gibson County',
 'NC-Wilkesboro',
 'VA-Stafford',
 'CA-Riverside',
 'PA-Lewistown',
 'FL-Homestead',
 'TN-Bristol',
 'AZ-Surprise',
 'HI-Keaau',
 'AZ-Red Valley',
 'IL-Springfield',
 'OH-Cincinnati',
 'MO-Poplar Bluff',
 'CA-Shafter',
 'PA-Bloomsburg',
 'DE-Middletown',
 'SC-Lexington',
 'TX-Sunset',
 'MN-Columbia Heights',
 'TN-Raleigh',
 'TX-Mesquite',
 'MI-Berrien County',
 'IN-Indianapolis',
 'AL-Brooksville',
 'AL-Muscle Shoals',
 'WI-Monroe',
 'KY-Booneville',
 'NE-Sidney',
 'OH-Kent',
 'CO-Aurora',
 'WI-Millston',
 'UT-Provo',
 'GA-East Point',
 'TX-Rosenberg',
 'MI-Manistee',
 'NM-Alamogordo',
 'AL-Eufaula',
 'CA-Turlock',
 'SD-Rapid City',
 'CA-Desert Edge',
 'IA-Urbandale',
 'MA-Roslindale',
 'NY-Schenectady',
 'MO-Buffalo',
 'CA-Bell',
 'CA-Delhi',
 'KY-Daviess County',
 'CA-Burbank',
 'AR-Bull Shoals',
 'CA-Oxnard',
 'RI-Pawtucket',
 'AL-Phenix City',
 'GA-Commerc

In [1106]:
"""
First lets merge each of the supporting datasets into df_temp.
We noticed that the missing values aren't consistent in the supporting datasets. So we need to change all types of
missing values to NAN such that we can impute using fillna().
"""
print(df_race_city.info())
df_temp = pd.merge(df_race_city, df_income, on='city', how='outer', sort=True)
df_temp = pd.merge(df_temp, df_poverty, on='city', how='outer', sort=True)
df_temp = pd.merge(df_temp, df_education, on='city', how='outer', sort=True)
columns_supp = df_temp.columns.values.tolist()
for i in columns_supp:
    df_temp[i] = df_temp[i].replace('-', '(X)')
    df_temp[i] = df_temp[i].replace('(X)', numpy.NAN)
print(df_temp.info())
df_temp

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29268 entries, 0 to 29267
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Geographic area        29268 non-null  object
 1   City                   29268 non-null  object
 2   share_white            29268 non-null  object
 3   share_black            29268 non-null  object
 4   share_native_american  29268 non-null  object
 5   share_asian            29268 non-null  object
 6   share_hispanic         29268 non-null  object
 7   city                   29268 non-null  object
dtypes: object(8)
memory usage: 1.8+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 29477 entries, 0 to 29476
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Geographic area        29268 non-null  object
 1   City                   29268 non-null  object
 2   share_white        

Unnamed: 0,Geographic area,City,share_white,share_black,share_native_american,share_asian,share_hispanic,city,Median Income,poverty_rate,percent_completed_hs
0,AK,Adak city,19.6,4,5.5,52.5,8.9,AK-Adak city,78500,39.3,93.4
1,AK,Akhiok city,8.5,1.4,50.7,1.4,11.3,AK-Akhiok city,26250,40.5,62.5
2,AK,Akiachak CDP,3.5,0.2,95.1,0.2,0.2,AK-Akiachak CDP,38750,26.1,76.7
3,AK,Akiak city,5.2,0,92.8,0,0.3,AK-Akiak city,42000,31.3,81.3
4,AK,Akutan city,23.3,17.9,5.5,43.3,20.8,AK-Akutan city,24750,16.1,73
...,...,...,...,...,...,...,...,...,...,...,...
29472,WY,Woods Landing-Jelm CDP,95.9,0,0,2.1,0,WY-Woods Landing-Jelm CDP,,18.6,100
29473,WY,Worland city,89.9,0.3,1.3,0.6,16.6,WY-Worland city,41523,15.3,85.6
29474,WY,Wright town,94.5,0.1,1.4,0.2,6.2,WY-Wright town,77114,5.9,89.2
29475,WY,Y-O Ranch CDP,92.8,1.5,2.6,0,11.8,WY-Y-O Ranch CDP,,0,100


In [1107]:
"""
Let's try cleaning the the race of city dataset now viewing the missing datapoints which are denoted at (X).
"""
# df_temp['city'] = df_temp[['Geographic area', 'City']].apply("-".join,axis=1)
# list_race_df = df_temp.values.tolist()
# j = 0
# list_bad_cities = []
# for i in list_race_df:
#     #we can see it is only null when all races are missing
#     if(i.count('(X)') > 0):
#         j+=1
#         list_bad_cities.append(i[1])
#         print(i.count('(X)') )
# print("total cities with bad points = ", j)


"\nLet's try cleaning the the race of city dataset now viewing the missing datapoints which are denoted at (X).\n"

In [1108]:
"""With only 20/29260 holding a row of null values, we shall drop these cities"""
# tempDf_race = df_temp[df_race_city.share_white != '(X)'].copy()
# print(tempDf_race.info())
# df_race_city.info()

'With only 20/29260 holding a row of null values, we shall drop these cities'

In [1109]:
"""
We want to make sure all the cities that are in our mandatory dataset is accounted for in the supporting.
We shall add those cities if they aren't and impute them as nulls
"""
#we can check that all the cities in the total data set is still in the race_city dataset
cities2 = df_temp['city'].tolist()
cities2 =list(set(cities2))
j = 0
missing = 0
cities.sort()
cities2.sort()
rep_val = 0
for i in cities:
    # print(i in cities2)
    # print(i)
    # print(cities2.index(i[0::1]))
    if(i in cities2):
        j+=1
    elif(any(i in string for string in cities2)):
        if len(df_temp[df_temp['city'].str.contains(i) ]) == 1 and sum(i in string for string in cities) == 1:
            print(i)
            df_temp['city'].loc[df_temp['city'].str.contains(i)] = i
            rep_val+=1

        else:
            newDF = {}
            #print(i)
            newDF['Geographic area'] = i[0:2]
            newDF['City'] = i[3:]
            newDF['city'] = i
            ind = 0
            df_temp = df_temp.append(newDF, ignore_index=True)
            cities2 = df_temp['city'].tolist()
            cities2 =list(set(cities2))
            missing+=1

    else:
        newDF = {}
        #print(i)
        newDF['Geographic area'] = i[0:2]
        newDF['City'] = i[3:]
        newDF['city'] = i
        ind = 0
        df_temp = df_temp.append(newDF, ignore_index=True)
        cities2 = df_temp['city'].tolist()
        cities2 =list(set(cities2))
        missing+=1


print(j)
print("Replaced cities ", rep_val)
print(" Was Missing ", missing)
df_temp.drop_duplicates(subset='city', inplace=True)
df_temp.reset_index(inplace=True)
df_temp

AK-Anchorage
AK-Barrow
AK-Big Lake
AK-Fairbanks
AK-Houston
AK-Wasilla
AL-Abbeville
AL-Arab
AL-Ashville
AL-Bay Minette
AL-Birmingham
AL-Clanton
AL-Dadeville
AL-Deer Park
AL-Eufaula
AL-Gadsden
AL-Homewood
AL-Hoover
AL-Huntsville
AL-Meridianville
AL-Millbrook
AL-Mobile
AL-Monroeville
AL-Montgomery
AL-Muscle Shoals
AL-Opelika
AL-Opp
AL-Oxford
AL-Phenix City
AL-Piedmont
AL-Rainbow City
AL-Saraland
AL-Selma
AL-Sylacauga
AL-Tuscaloosa
AL-Tuscumbia
AL-Westover
AR-Austin
AR-Bull Shoals
AR-Cabot
AR-Dover
AR-England
AR-Farmington
AR-Fayetteville
AR-Jonesboro
AR-Little Rock
AR-Manila
AR-Marion
AR-Mena
AR-Mountain Pine
AR-Perryville
AR-Sheridan
AR-Springdale
AR-Strong
AR-West Memphis
AZ-Apache Junction
AZ-Avondale
AZ-Bisbee
AZ-Buckeye
AZ-Bullhead City
AZ-Casa Grande
AZ-Chandler
AZ-Eagar
AZ-Flagstaff
AZ-Florence
AZ-Gilbert
AZ-Glendale
AZ-Golden Shores
AZ-Golden Valley
AZ-Kearny
AZ-Kingman
AZ-Lake Havasu City
AZ-Marana
AZ-Morenci
AZ-Phoenix
AZ-San Manuel
AZ-Scottsdale
AZ-Show Low
AZ-Somerton
AZ-Surpr

Unnamed: 0,index,Geographic area,City,share_white,share_black,share_native_american,share_asian,share_hispanic,city,Median Income,poverty_rate,percent_completed_hs
0,0,AK,Adak city,19.6,4,5.5,52.5,8.9,AK-Adak city,78500,39.3,93.4
1,1,AK,Akhiok city,8.5,1.4,50.7,1.4,11.3,AK-Akhiok city,26250,40.5,62.5
2,2,AK,Akiachak CDP,3.5,0.2,95.1,0.2,0.2,AK-Akiachak CDP,38750,26.1,76.7
3,3,AK,Akiak city,5.2,0,92.8,0,0.3,AK-Akiak city,42000,31.3,81.3
4,4,AK,Akutan city,23.3,17.9,5.5,43.3,20.8,AK-Akutan city,24750,16.1,73
...,...,...,...,...,...,...,...,...,...,...,...,...
29816,29816,WV,Braxton County,,,,,,WV-Braxton County,,,
29817,29817,WV,Gerrardstown,,,,,,WV-Gerrardstown,,,
29818,29818,WV,Hensley,,,,,,WV-Hensley,,,
29819,29819,WV,Petroleum,,,,,,WV-Petroleum,,,


In [1110]:
cities2 = df_temp['city'].tolist()
cities2 =list(set(cities2))
j = 0
for i in cities:
    # print(i in cities2)
    # print(i)
    # print(cities2.index(i[0::1]))
    if(i in  cities2):
        j+=1
#confirming that all the cities were placed in
print(j)

1534


In [1111]:
df_temp.sort_values(by=['city'], inplace=True)
df_temp

Unnamed: 0,index,Geographic area,City,share_white,share_black,share_native_american,share_asian,share_hispanic,city,Median Income,poverty_rate,percent_completed_hs
0,0,AK,Adak city,19.6,4,5.5,52.5,8.9,AK-Adak city,78500,39.3,93.4
1,1,AK,Akhiok city,8.5,1.4,50.7,1.4,11.3,AK-Akhiok city,26250,40.5,62.5
2,2,AK,Akiachak CDP,3.5,0.2,95.1,0.2,0.2,AK-Akiachak CDP,38750,26.1,76.7
3,3,AK,Akiak city,5.2,0,92.8,0,0.3,AK-Akiak city,42000,31.3,81.3
4,4,AK,Akutan city,23.3,17.9,5.5,43.3,20.8,AK-Akutan city,24750,16.1,73
...,...,...,...,...,...,...,...,...,...,...,...,...
29472,29472,WY,Woods Landing-Jelm CDP,95.9,0,0,2.1,0,WY-Woods Landing-Jelm CDP,,18.6,100
29473,29473,WY,Worland city,89.9,0.3,1.3,0.6,16.6,WY-Worland city,41523,15.3,85.6
29474,29474,WY,Wright town,94.5,0.1,1.4,0.2,6.2,WY-Wright town,77114,5.9,89.2
29475,29475,WY,Y-O Ranch CDP,92.8,1.5,2.6,0,11.8,WY-Y-O Ranch CDP,,0,100


In [1112]:
"""
We want to impute the city and our state/Geographic area such that we have a full comprehensive list of both
separately after the merge.
Then we want to get a list of race names within the df_race_city.
"""

df_temp['Geographic area'] = [i[0:2] for i in df_temp['city']]
df_temp['City'] = [i[3:] for i in df_temp['city']]
print(df_temp)
list_states = df_temp['Geographic area'].values.tolist()
list_states = list(set(list_states))
#we expect to see 51 including DC as a state
print(len(list_states))
columns_race = df_temp.columns.values.tolist()
s = 'share'
columns_race = [string for string in columns_race if(s in string)]
columns_race

       index Geographic area                    City share_white share_black  \
0          0              AK               Adak city        19.6           4   
1          1              AK             Akhiok city         8.5         1.4   
2          2              AK            Akiachak CDP         3.5         0.2   
3          3              AK              Akiak city         5.2           0   
4          4              AK             Akutan city        23.3        17.9   
...      ...             ...                     ...         ...         ...   
29472  29472              WY  Woods Landing-Jelm CDP        95.9           0   
29473  29473              WY            Worland city        89.9         0.3   
29474  29474              WY             Wright town        94.5         0.1   
29475  29475              WY           Y-O Ranch CDP        92.8         1.5   
29476  29476              WY              Yoder town        97.4           0   

      share_native_american share_asian

['share_white',
 'share_black',
 'share_native_american',
 'share_asian',
 'share_hispanic']

In [1113]:
"""
Now we can impute the porportion of missing cities based on the state's porportions
"""
for state in list_states:
    j=0
    list_total_races = []
    state_df = df_temp.loc[df_temp['Geographic area'] == state]
    for i in columns_race:
        state_df[i] = state_df[i].astype(float)
        list_total_races.append(state_df[i].sum())
        print(list_total_races[j])
        j+=1
    sum_all_races = sum(list_total_races)
    porportional_share = [i/sum_all_races for i in list_total_races]
    #print(porportional_share, " sum= ", sum(porportional_share))
    j=0
    for i in columns_race:
        state_df[i] = state_df[i].fillna(porportional_share[j])
        j+=1
    df_temp.loc[df_temp['Geographic area'] == state] = state_df
print(df_temp.info())
df_temp



97576.2
568.5999999999999
273.8
404.40000000000003
2844.4
123518.1
6430.800000000001
357.9
1828.2999999999997
7068.1
53454.6
2474.8
10542.1
312.6
4175.5
30833.699999999997
82.7
4332.900000000001
116.4
800.1
52850.7
15077.300000000001
1321.4
684.5
4739.1
10620.1
202.2
719.4000000000001
258.9
1699.3999999999999
48419.1
2321.1000000000004
110.30000000000001
373.3
1168.4
42441.2
8816.6
411.1
258.1
2311.7
38158.3
3132.9
135.8
273.4
1225.3
21790.300000000003
680.9
64.7
693.4
1201.9
29158.5
125.5
1261.7
251.3
2498.2
41909.200000000004
13480.3
381.09999999999997
277.29999999999995
1722.5
38.5
50.7
0.3
3.5
9.1
5794.700000000001
1169.8999999999999
39.800000000000004
152.0
563.7000000000002
62377.899999999994
644.5
1256.9
286.1
3403.1
62744.7
2854.0
745.3
660.3
2446.5
164498.7
5768.7
285.0
1744.6
4820.2
71858.5
724.5
2025.5
611.3
2568.9
32947.2
199.10000000000002
973.3000000000001
519.8
3676.0999999999995
33078.9
113.1
4690.400000000001
124.0
794.4000000000001
30657.300000000003
14556.8
452.8
372

Unnamed: 0,index,Geographic area,City,share_white,share_black,share_native_american,share_asian,share_hispanic,city,Median Income,poverty_rate,percent_completed_hs
0,0,AK,Adak city,19.6,4.0,5.5,52.5,8.9,AK-Adak city,78500,39.3,93.4
1,1,AK,Akhiok city,8.5,1.4,50.7,1.4,11.3,AK-Akhiok city,26250,40.5,62.5
2,2,AK,Akiachak CDP,3.5,0.2,95.1,0.2,0.2,AK-Akiachak CDP,38750,26.1,76.7
3,3,AK,Akiak city,5.2,0.0,92.8,0.0,0.3,AK-Akiak city,42000,31.3,81.3
4,4,AK,Akutan city,23.3,17.9,5.5,43.3,20.8,AK-Akutan city,24750,16.1,73
...,...,...,...,...,...,...,...,...,...,...,...,...
29472,29472,WY,Woods Landing-Jelm CDP,95.9,0.0,0.0,2.1,0.0,WY-Woods Landing-Jelm CDP,,18.6,100
29473,29473,WY,Worland city,89.9,0.3,1.3,0.6,16.6,WY-Worland city,41523,15.3,85.6
29474,29474,WY,Wright town,94.5,0.1,1.4,0.2,6.2,WY-Wright town,77114,5.9,89.2
29475,29475,WY,Y-O Ranch CDP,92.8,1.5,2.6,0.0,11.8,WY-Y-O Ranch CDP,,0,100


In [1114]:
df_temp['Median Income'] = pd.to_numeric(df_temp['Median Income'], errors='coerce')
df_temp['poverty_rate'] = pd.to_numeric(df_temp['poverty_rate'], errors='coerce')
df_temp['percent_completed_hs'] = pd.to_numeric(df_temp['percent_completed_hs'], errors='coerce')
for state in list_states:
    state_df = df_temp.loc[df_temp['Geographic area'] == state]
    state_df['Median Income'] = state_df['Median Income'].fillna(state_df['Median Income'].mean())
    state_df['poverty_rate'] = state_df['poverty_rate'].fillna(state_df['poverty_rate'].mean())
    state_df['percent_completed_hs'] =state_df['percent_completed_hs'].fillna(state_df['percent_completed_hs'].mean())
    df_temp.loc[df_temp['Geographic area'] == state] = state_df
print(df_temp.info())

df_temp

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29821 entries, 0 to 29476
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   index                  29821 non-null  int64  
 1   Geographic area        29821 non-null  object 
 2   City                   29821 non-null  object 
 3   share_white            29821 non-null  object 
 4   share_black            29821 non-null  object 
 5   share_native_american  29821 non-null  object 
 6   share_asian            29821 non-null  object 
 7   share_hispanic         29821 non-null  object 
 8   city                   29821 non-null  object 
 9   Median Income          29821 non-null  float64
 10  poverty_rate           29821 non-null  float64
 11  percent_completed_hs   29821 non-null  float64
dtypes: float64(3), int64(1), object(8)
memory usage: 3.0+ MB
None


Unnamed: 0,index,Geographic area,City,share_white,share_black,share_native_american,share_asian,share_hispanic,city,Median Income,poverty_rate,percent_completed_hs
0,0,AK,Adak city,19.6,4.0,5.5,52.5,8.9,AK-Adak city,78500.000000,39.3,93.4
1,1,AK,Akhiok city,8.5,1.4,50.7,1.4,11.3,AK-Akhiok city,26250.000000,40.5,62.5
2,2,AK,Akiachak CDP,3.5,0.2,95.1,0.2,0.2,AK-Akiachak CDP,38750.000000,26.1,76.7
3,3,AK,Akiak city,5.2,0.0,92.8,0.0,0.3,AK-Akiak city,42000.000000,31.3,81.3
4,4,AK,Akutan city,23.3,17.9,5.5,43.3,20.8,AK-Akutan city,24750.000000,16.1,73.0
...,...,...,...,...,...,...,...,...,...,...,...,...
29472,29472,WY,Woods Landing-Jelm CDP,95.9,0.0,0.0,2.1,0.0,WY-Woods Landing-Jelm CDP,60874.993464,18.6,100.0
29473,29473,WY,Worland city,89.9,0.3,1.3,0.6,16.6,WY-Worland city,41523.000000,15.3,85.6
29474,29474,WY,Wright town,94.5,0.1,1.4,0.2,6.2,WY-Wright town,77114.000000,5.9,89.2
29475,29475,WY,Y-O Ranch CDP,92.8,1.5,2.6,0.0,11.8,WY-Y-O Ranch CDP,60874.993464,0.0,100.0


In [1115]:
#make a list of df fT-oubaor each city
list_df_cities = []
for i in cities:
    list_df_cities.append(df_total.loc[df_total['city'] == i])

In [1116]:
#make a list of race
races = df_total['race'].tolist()
races =list(set(races))
races = races[1::]
races

['W', 'N', 'B', 'A', 'H', 'O']

In [1117]:
mode_total = df_total['race'].mode(dropna=True).tolist()
print(mode_total)
for i in list_df_cities:
    mode = i['race'].mode(dropna=True)
    if(len(mode) == 0):
        i[['race']] = i[['race']].fillna(mode_total[0])
        if(i['race'].isnull().sum() != 0):
            print("null val")
    elif(isinstance(mode, type(i['race']))):
        mode.dropna()
        mode = mode.tolist()
        #if the mode has more than one value then take a random choice between the values
        i[['race']] = i[['race']].fillna(random.choice(tuple(mode)))



    df_total.loc[df_total['city'] ==  i.at[i.first_valid_index(),'city']] = i
df_total.info()

['W']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2535 entries, 0 to 2534
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       2535 non-null   int64  
 1   name                     2535 non-null   object 
 2   date                     2535 non-null   object 
 3   manner_of_death          2535 non-null   object 
 4   armed                    2526 non-null   object 
 5   age                      2458 non-null   float64
 6   gender                   2535 non-null   object 
 7   race                     2535 non-null   object 
 8   city                     2535 non-null   object 
 9   state                    2535 non-null   object 
 10  signs_of_mental_illness  2535 non-null   bool   
 11  threat_level             2535 non-null   object 
 12  flee                     2470 non-null   object 
 13  body_camera              2535 non-null   bool   
dtypes: bool(2), float6

In [1118]:
 # removed all data with NaN in race Column...df_training[df_training['race'].isnull()]
df_total

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera
0,3,Tim Elliot,02/01/15,shot,gun,53.0,M,A,WA-Shelton,WA,True,attack,Not fleeing,False
1,4,Lewis Lee Lembke,02/01/15,shot,gun,47.0,M,W,OR-Aloha,OR,False,attack,Not fleeing,False
2,5,John Paul Quintero,03/01/15,shot and Tasered,unarmed,23.0,M,H,KS-Wichita,KS,False,other,Not fleeing,False
3,8,Matthew Hoffman,04/01/15,shot,toy weapon,32.0,M,W,CA-San Francisco,CA,True,attack,Not fleeing,False
4,9,Michael Rodriguez,04/01/15,shot,nail gun,39.0,M,H,CO-Evans,CO,False,attack,Not fleeing,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2530,2822,Rodney E. Jacobs,28/07/17,shot,gun,31.0,M,B,MO-Kansas City,MO,False,attack,Not fleeing,False
2531,2813,TK TK,28/07/17,shot,vehicle,,M,H,NM-Albuquerque,NM,False,attack,Car,False
2532,2818,Dennis W. Robinson,29/07/17,shot,gun,48.0,M,W,ID-Melba,ID,False,attack,Car,False
2533,2817,Isaiah Tucker,31/07/17,shot,vehicle,28.0,M,B,WI-Oshkosh,WI,False,attack,Car,True


In [1119]:
"""
We noticed that some of the 'age' data is missing.
So we imputed the average age of the people in each city and imputed the average age for the missing ages based on their city.
1. We calculate the total mean age of our total data sample.
2. We group by city, then calculate the mean Age at each city.
3. For each NaN age, we impute the mean age of that city. If that city was not calcuated (meaning this data is the only one from that city), we will use the total mean age.
"""
age_fix = df_total[['id','age','city']].copy()
age_fix = age_fix[age_fix['age'].isnull()].copy()
# 1. Calculate Total Mean Age
total_mean_age = df_total['age'].mean()

# 2. Calculate Mean of Each City
city_mean_ages = df_total.groupby('city')['age'].mean()

#3. Impute Age by the city. If City data does not exist, use Total Mean Age.
for i in age_fix.index:
    city = age_fix['city'][i]
    if city in city_mean_ages and not np.isnan(city_mean_ages[city]):
        age_fix['age'][i] = city_mean_ages[city]
    else:
        age_fix['age'][i] = total_mean_age


# update total list
df_total.update(age_fix)

df_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2535 entries, 0 to 2534
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       2535 non-null   float64
 1   name                     2535 non-null   object 
 2   date                     2535 non-null   object 
 3   manner_of_death          2535 non-null   object 
 4   armed                    2526 non-null   object 
 5   age                      2535 non-null   float64
 6   gender                   2535 non-null   object 
 7   race                     2535 non-null   object 
 8   city                     2535 non-null   object 
 9   state                    2535 non-null   object 
 10  signs_of_mental_illness  2535 non-null   bool   
 11  threat_level             2535 non-null   object 
 12  flee                     2470 non-null   object 
 13  body_camera              2535 non-null   bool   
dtypes: bool(2), float64(2), 

In [1120]:
"""
We noticed there were missing values for 'armed' and 'flee'. For these, we imputed the most frequent data that appeared.
These are listed in the variables flee_mode and armed_mode
"""
flee_mode = df_total['flee'].agg(pd.Series.mode).values.tolist()[0]
armed_mode = df_total['armed'].agg(pd.Series.mode).values.tolist()[0]
df_total['flee'].fillna(flee_mode, inplace=True)
df_total['armed'].fillna(armed_mode, inplace=True)
df_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2535 entries, 0 to 2534
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       2535 non-null   float64
 1   name                     2535 non-null   object 
 2   date                     2535 non-null   object 
 3   manner_of_death          2535 non-null   object 
 4   armed                    2535 non-null   object 
 5   age                      2535 non-null   float64
 6   gender                   2535 non-null   object 
 7   race                     2535 non-null   object 
 8   city                     2535 non-null   object 
 9   state                    2535 non-null   object 
 10  signs_of_mental_illness  2535 non-null   bool   
 11  threat_level             2535 non-null   object 
 12  flee                     2535 non-null   object 
 13  body_camera              2535 non-null   bool   
dtypes: bool(2), float64(2), 

In [1121]:
"""
****GIVE EXPLANATION OF OUR ENTIRE PROCESS
########################### END OF DATA MUNGING##########################
"""


'\n****GIVE EXPLANATION OF OUR ENTIRE PROCESS\n########################### END OF DATA MUNGING##########################\n'

In [1122]:
"""
Data Analysis
Exploratory Data Analysis (EDA 10 Points)
"""
df_total

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera
0,3.0,Tim Elliot,02/01/15,shot,gun,53.000000,M,A,WA-Shelton,WA,True,attack,Not fleeing,False
1,4.0,Lewis Lee Lembke,02/01/15,shot,gun,47.000000,M,W,OR-Aloha,OR,False,attack,Not fleeing,False
2,5.0,John Paul Quintero,03/01/15,shot and Tasered,unarmed,23.000000,M,H,KS-Wichita,KS,False,other,Not fleeing,False
3,8.0,Matthew Hoffman,04/01/15,shot,toy weapon,32.000000,M,W,CA-San Francisco,CA,True,attack,Not fleeing,False
4,9.0,Michael Rodriguez,04/01/15,shot,nail gun,39.000000,M,H,CO-Evans,CO,False,attack,Not fleeing,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2530,2822.0,Rodney E. Jacobs,28/07/17,shot,gun,31.000000,M,B,MO-Kansas City,MO,False,attack,Not fleeing,False
2531,2813.0,TK TK,28/07/17,shot,vehicle,31.153846,M,H,NM-Albuquerque,NM,False,attack,Car,False
2532,2818.0,Dennis W. Robinson,29/07/17,shot,gun,48.000000,M,W,ID-Melba,ID,False,attack,Car,False
2533,2817.0,Isaiah Tucker,31/07/17,shot,vehicle,28.000000,M,B,WI-Oshkosh,WI,False,attack,Car,True


In [1123]:
"""
Q: Which state has the most fatal police shootings? Which city is the most dangerous?
Create a collapsed dataframe of state. Create a collapsed data frame of Cities.
"""
# Make a copy of Total Data For City!
df_aux = df_total.copy()
df_aux.sort_values(by=['city'])

# The following dropped Columns are not used when data is collapsed as City
df_aux.drop(columns=['id', 'name', 'date', 'age', 'state'], inplace=True)
df_aux.set_index('city', inplace=True)
df_aux = df_aux.astype(str)
cols = df_aux.columns.tolist() # Move 'arms' to end of list
cols.append(cols.pop(cols.index('armed')))
df_aux = df_aux[cols]

# Get a list of 'df_aux' column names
aux_cols = df_aux.columns

# Get a list of the index cities (Sorted)
list_of_cities = df_aux.index.unique().tolist()
list_of_cities.sort()

# For each column, change the string such that  " 'COLUMN_NAME': + str + (s) " is the result for each string.
# This is important so that when we build/update our resulting Dataframe, it will have unique values per catagory.
for i in list_of_cities:
    for j in aux_cols:
        df_aux.at[i, j] =  j.upper() + ': ' + df_aux.at[i, j]

# Make the df_city_stats Columns
freq_list_cols = ['Total Cases']
for i in aux_cols:
    freq_list_cols = freq_list_cols + df_aux[i].unique().tolist()


# Create Dataframe to build up
df_city_stats = pd.DataFrame(index=list_of_cities, columns=freq_list_cols)
df_city_stats = df_city_stats.fillna(0)
for i in list_of_cities: # indexs
    temp = len(df_aux[df_aux.index == i])
    df_city_stats.loc[i]['Total Cases'] = temp
    for j in aux_cols:
        temp = df_aux[df_aux.index == i][j].value_counts()
        df_city_stats.loc[i].update(temp)

df_city_stats

Unnamed: 0,Total Cases,MANNER_OF_DEATH: shot,MANNER_OF_DEATH: shot and Tasered,GENDER: M,GENDER: F,RACE: A,RACE: W,RACE: H,RACE: B,RACE: O,...,ARMED: metal rake,ARMED: crowbar,ARMED: oar,ARMED: machete and gun,ARMED: tire iron,ARMED: air conditioner,ARMED: pole and knife,ARMED: baseball bat and bottle,ARMED: fireworks,ARMED: pen
AK-Anchorage,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AK-Barrow,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AK-Big Lake,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AK-Fairbanks,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AK-Houston,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WY-Cheyenne,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
WY-Douglas,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
WY-Gillette,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
WY-Laramie,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [1124]:
# Make a copy of Total Data For State!
df_aux = df_total.copy()
df_aux.sort_values(by=['state'])

# The following dropped Columns are not used when data is collapsed as City
df_aux.drop(columns=['id', 'name', 'date', 'age', 'city'], inplace=True)
df_aux.set_index('state', inplace=True)
df_aux = df_aux.astype(str)
cols = df_aux.columns.tolist() # Move 'arms' to end of list
cols.append(cols.pop(cols.index('armed')))
df_aux = df_aux[cols]

# Get a list of 'df_aux' column names
aux_cols = df_aux.columns

# Get a list of the index state (Sorted)
list_of_states = df_aux.index.unique().tolist()
list_of_states.sort()

# For each column, change the string such that  " 'COLUMN_NAME': + str + (s) " is the result for each string.
# This is important so that when we build/update our resulting Dataframe, it will have unique values per catagory.
for i in list_of_states:
    for j in aux_cols:
        df_aux.at[i, j] =  j.upper() + ': ' + df_aux.at[i, j]

# Make the df_city_stats Columns
freq_list_cols = ['Total Cases']
for i in aux_cols:
    freq_list_cols = freq_list_cols + df_aux[i].unique().tolist()


# Create Dataframe to build up
df_state_stats = pd.DataFrame(index=list_of_states, columns=freq_list_cols)
df_state_stats = df_state_stats.fillna(0)
for i in list_of_states: # indexs
    temp = len(df_aux[df_aux.index == i])
    df_state_stats.loc[i]['Total Cases'] = temp
    for j in aux_cols:
        temp = df_aux[df_aux.index == i][j].value_counts()
        df_state_stats.loc[i].update(temp)

df_state_stats

Unnamed: 0,Total Cases,MANNER_OF_DEATH: shot,MANNER_OF_DEATH: shot and Tasered,GENDER: M,GENDER: F,RACE: A,RACE: W,RACE: H,RACE: B,RACE: O,...,ARMED: metal rake,ARMED: crowbar,ARMED: oar,ARMED: machete and gun,ARMED: tire iron,ARMED: air conditioner,ARMED: pole and knife,ARMED: baseball bat and bottle,ARMED: fireworks,ARMED: pen
AK,15,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AL,50,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AR,26,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AZ,118,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CA,424,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CO,74,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CT,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DC,11,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DE,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
FL,154,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
"""
Now that we have the most accurate imputation using all of the support data. We want to hold a dataframe that contains cities that are also in the main data frame.
"""
j=0
df_support = pd.DataFrame(columns=columns_supp)
for i in cities:
    newDf= df_temp[df_temp['city'] == i].copy()
    if(len(newDf) > 1):
        print(i)
        print(newDf.info())
        print(newDf)
    df_support = df_support.append(newDf, ignore_index=True)
    df_support.drop_duplicates(inplace=True)
    j+=1
print(j)
print(df_support.info())
print(len(cities))
df_support

In [None]:
"""
Q: Which state has the most fatal police shootings? Which city is the most dangerous?
Create a collapsed dataframe of state. Create a collapsed data frame of Cities.
"""
# Make a copy of Total Data For City!
df_aux = df_total.copy()
df_aux.sort_values(by=['city'])

# The following dropped Columns are not used when data is collapsed as City
df_aux.drop(columns=['id', 'name', 'date', 'age', 'state'], inplace=True)
df_aux.set_index('city', inplace=True)
df_aux = df_aux.astype(str)
cols = df_aux.columns.tolist() # Move 'arms' to end of list
cols.append(cols.pop(cols.index('armed')))
df_aux = df_aux[cols]

# Get a list of 'df_aux' column names
aux_cols = df_aux.columns

# Get a list of the index cities (Sorted)
list_of_cities = df_aux.index.unique().tolist()
list_of_cities.sort()

# For each column, change the string such that  " 'COLUMN_NAME': + str + (s) " is the result for each string.
# This is important so that when we build/update our resulting Dataframe, it will have unique values per catagory.
for i in list_of_cities:
    for j in aux_cols:
        df_aux.at[i, j] =  j.upper() + ': ' + df_aux.at[i, j]

# Make the df_city_stats Columns
freq_list_cols = ['Total Cases']
for i in aux_cols:
    freq_list_cols = freq_list_cols + df_aux[i].unique().tolist()


# Create Dataframe to build up
df_city_stats = pd.DataFrame(index=list_of_cities, columns=freq_list_cols)
df_city_stats = df_city_stats.fillna(0)
for i in list_of_cities: # indexs
    temp = len(df_aux[df_aux.index == i])
    df_city_stats.loc[i]['Total Cases'] = temp
    for j in aux_cols:
        temp = df_aux[df_aux.index == i][j].value_counts()
        df_city_stats.loc[i].update(temp)

df_city_stats

In [None]:
# Make a copy of Total Data For State!
df_aux = df_total.copy()
df_aux.sort_values(by=['state'])

# The following dropped Columns are not used when data is collapsed as City
df_aux.drop(columns=['id', 'name', 'date', 'age', 'city'], inplace=True)
df_aux.set_index('state', inplace=True)
df_aux = df_aux.astype(str)
cols = df_aux.columns.tolist() # Move 'arms' to end of list
cols.append(cols.pop(cols.index('armed')))
df_aux = df_aux[cols]

# Get a list of 'df_aux' column names
aux_cols = df_aux.columns

# Get a list of the index state (Sorted)
list_of_states = df_aux.index.unique().tolist()
list_of_states.sort()

# For each column, change the string such that  " 'COLUMN_NAME': + str + (s) " is the result for each string.
# This is important so that when we build/update our resulting Dataframe, it will have unique values per catagory.
for i in list_of_states:
    for j in aux_cols:
        df_aux.at[i, j] =  j.upper() + ': ' + df_aux.at[i, j]

# Make the df_city_stats Columns
freq_list_cols = ['Total Cases']
for i in aux_cols:
    freq_list_cols = freq_list_cols + df_aux[i].unique().tolist()


# Create Dataframe to build up
df_state_stats = pd.DataFrame(index=list_of_states, columns=freq_list_cols)
df_state_stats = df_state_stats.fillna(0)
for i in list_of_states: # indexs
    temp = len(df_aux[df_aux.index == i])
    df_state_stats.loc[i]['Total Cases'] = temp
    for j in aux_cols:
        temp = df_aux[df_aux.index == i][j].value_counts()
        df_state_stats.loc[i].update(temp)

df_state_stats

In [None]:
'''
The goal is to make a visualization of our data so we can find out which state in the set has the most fatal police shootings

Create a bar graph using the df_state_stats DataFrame to visualize the trend of fatal shootings per state
'''

plt.bar(df_state_stats.index, df_state_stats['Total Cases'])
plt.title('Total Cases of Fatal Shootings by State')
plt.xlabel('State')
plt.ylabel('Number of Fatal Shootings')
plt.show()

In [None]:
'''
The above bar graph is extremely packed and the majority of the values are very small, so they don't matter to us when trying to determine which states have the highest number of fatal shootings. Let's repeat the process using only the states with number of killings greater than 50. With this, we can see the data better.
'''

df_state_stats_temp = df_state_stats[df_state_stats['Total Cases'] > 50]
# df_state_stats_temp

# plt.bar(df_state_stats_temp.index, df_state_stats_temp['Total Cases'])
# plt.title('Total Cases of Fatal Shootings by State')
# plt.xlabel('State')
# plt.ylabel('Number of Fatal Shootings')
# plt.show()

x_list = list(df_state_stats_temp.index)
y_list = list(df_state_stats_temp['Total Cases'])
plt.bar(x_list, y_list)
plt.title('Total Cases of Fatal Shootings by State')
plt.xlabel('State')
plt.ylabel('Number of Fatal Shootings')
for i in range(len(x_list)):
    plt.text(x_list[i], y_list[i], y_list[i], ha='center')
plt.show()

In [None]:
'''
The graph clearly shows that CA is the state with the highest number of fatal police shootings (over 400), followed by TX with slightly over 200 fatal shootings.

We can confirm this conclusion using code by extracting the maximum value of 'Total Cases' from the DataFrame and seeing which state it is associated with. 
The line of code below accomplishes this and does indeed confirm that CA is the state with the highest number of fatal shootings (424). 
'''
s = df_state_stats.loc[df_state_stats['Total Cases'].idxmax()]
s.name, s['Total Cases']

In [None]:
df_state_stats.columns

In [None]:
'''
The goal is to make a visualization of our data so we can find out which city in the set is the most dangerous. We can determine this by comparing the number of fatal shootings 

Create a bar graph using the df_city_stats DataFrame to visualize the trend of fatal shootings per city
'''

plt.bar(df_city_stats.index, df_city_stats['Total Cases'])
plt.title('Total Cases of Fatal Shootings by City')
plt.xlabel('City')
plt.ylabel('Number of Fatal Shootings')
plt.show()

In [None]:
'''
The above bar graph is extremely packed and the majority of the values are very small, so they don't matter to us when trying to determine which cities have the highest number of fatal shootings. Let's repeat the process using only the cities with number of killings greater than 10. With this, we can see the data better.
'''

df_city_stats_temp = df_city_stats[df_city_stats['Total Cases'] > 10]
# df_city_stats_temp

plt.bar(df_city_stats_temp.index, df_city_stats_temp['Total Cases'])
plt.title('Total Cases of Fatal Shootings by City')
plt.xlabel('City')
plt.ylabel('Number of Fatal Shootings')
plt.show()

In [None]:
'''
The graph above looks a bit better, but it's still hard to distinguish the cities because of how packed the data is. We can further eliminate some data from our graph to make it easier to look at, so let's only consider those cities with number of fatal shootings above 20 and see if that works out better.
'''

df_city_stats_temp = df_city_stats[df_city_stats['Total Cases'] > 20]
# df_city_stats_temp

x_list = list(df_city_stats_temp.index)
y_list = list(df_city_stats_temp['Total Cases'])
plt.bar(x_list, y_list)
plt.title('Total Cases of Fatal Shootings by City')
plt.xlabel('City')
plt.ylabel('Number of Fatal Shootings')
for i in range(len(x_list)):
    plt.text(x_list[i], y_list[i], y_list[i], ha='center')
plt.show()

In [None]:
'''
This graph very clearly shows that Los Angeles, CA is the city with the most fatal shootings, which we can use as our basis to claim that Los Angeles is the most dangerous of the cities on this list. It is followed closely by Phoenix, AZ and Houston, TX.
'''

In [None]:
'''
The goal is to find out what the most common way of being armed is. We can do this by iterating through every index and sum up the values in each column for method of arms

We can store the results in a dictionary
'''

armed_counts = dict()
armed_cols = [c for c in df_state_stats.columns if c[0:5] == "ARMED"]

for index, row in df_state_stats.iterrows():
    for col in armed_cols:
        arm_type = col[7:]
        armed_counts[arm_type] = armed_counts.get(arm_type, 0) + row[col]

'''
Now the armed_counts dictionary contains each way of being armed along with its corresponding number of appearances.

We can sort the dictionary by value in non-increasing order to determine what the most common ways of being armed are.

By doing this, we can see that the most common way of being armed is with a gun (1407 occurrences), followed by knife with 373 occurrences.
'''

armed_counts_list = sorted(armed_counts.items(), key=lambda x:x[1], reverse=True)
armed_counts_list

In [None]:
'''
We can also use a bar graph to visualize this data.
'''
plt.bar(armed_counts.keys(), armed_counts.values())
plt.title("Number of Occurrences for Each Method of being Armed")
plt.xlabel("Method of Arms")
plt.ylabel("Number of Occurrences")
plt.show()

In [None]:
armed_counts_reduced = dict()

for tup in armed_counts_list:
    if (tup[1] > 10):
        armed_counts_reduced[tup[0]] = tup[1]

armed_counts_reduced

In [None]:
'''
We can also use a bar graph to visualize this data.
'''
# plt.bar(armed_counts_reduced.keys(), armed_counts_reduced.values())
# plt.title("Number of Occurrences for Each Method of being Armed")
# plt.xlabel("Method of Arms")
# plt.ylabel("Number of Occurrences")
# plt.show()


x_list = list(armed_counts_reduced.keys())
y_list = list(armed_counts_reduced.values())
plt.bar(x_list, y_list)
plt.title("Number of Occurrences for Each Method of being Armed")
plt.xlabel("Method of Arms")
plt.ylabel("Number of Occurrences")
for i in range(len(x_list)):
    plt.text(x_list[i], y_list[i], y_list[i], ha='center')
plt.show()

In [None]:
'''
The graph above clearly shows that guns are the most common by a very wide margin, followed by knives in second and then vehicles in third place.
Interestingly, we can also see here that the 4th most common occurrence is actually "unarmed" - no weapons/items 
'''

In [None]:
'''
Now let's look at the age distribution of all the victims
'''

all_hist = sb.histplot(df_total['age'], bins=20, binrange=(0,100))
all_hist.set_title("Age Distribution of All Victims")

In [None]:
# we can look at the stats summary for the ages across all races
df_total['age'].describe()

# Overall mean age: 36.56
# Overall age stdev: 12.88
# Overall min. age: 6
# Overall median age: 35
# Overall max. age: 91

In [None]:
df_a = df_total[df_total['race'] == 'A']
a_hist = sb.histplot(df_a['age'], bins=20, binrange=(0,100))
a_hist.set_title("Age Distribution of Asian Victims")

In [None]:
# we can look at the stats summary for the ages across Asian people
df_a['age'].describe()

# Asian mean age: 36.76
# Asian age stdev: 11.55
# Asian min. age: 15
# Asian median age: 35
# Asian max. age: 61

In [None]:
df_w = df_total[df_total['race'] == 'W']
w_hist = sb.histplot(df_w['age'], bins=20, binrange=(0,100))
w_hist.set_title("Age Distribution of White Victims")

In [None]:
# we can look at the stats summary for the ages across White people
df_w['age'].describe()

# White mean age: 40.27
# White age stdev: 13.39
# White min. age: 6
# White median age: 38
# White max. age: 91

In [None]:
df_b = df_total[df_total['race'] == 'B']
b_hist = sb.histplot(df_b['age'], bins=20, binrange=(0,100))
b_hist.set_title("Age Distribution of Black Victims")

In [None]:
# we can look at the stats summary for the ages across Black people
df_b['age'].describe()

# Black mean age: 31.64
# Black age stdev: 10.84
# Black min. age: 13
# Black median age: 30
# Black max. age: 77

In [None]:
df_h = df_total[df_total['race'] == 'H']
a_hist = sb.histplot(df_h['age'], bins=20, binrange=(0,100))
a_hist.set_title("Age Distribution of Hispanic Victims")

In [None]:
# we can look at the stats summary for the ages across Hispanic people
df_h['age'].describe()

# Hispanic mean age: 33.07
# Hispanic age stdev: 10.83
# Hispanic min. age: 14
# Hispanic median age: 31.57
# Hispanic max. age: 80

In [None]:
df_n = df_total[df_total['race'] == 'N']
a_hist = sb.histplot(df_n['age'], bins=20, binrange=(0,100))
a_hist.set_title("Age Distribution of Native American Victims")

In [None]:
# we can look at the stats summary for the ages across Native people
df_n['age'].describe()

# Native mean age: 30.38
# Native age stdev: 7.97
# Native min. age: 19
# Native median age: 29
# Native max. age: 49

In [None]:
df_o = df_total[df_total['race'] == 'O']
a_hist = sb.histplot(df_o['age'], bins=20, binrange=(0,100))
a_hist.set_title("Age Distribution of Other Race Victims")

In [None]:
# we can look at the stats summary for the ages across Other races
df_o['age'].describe()

# Other races mean age: 33.79
# Other races age stdev: 11.61
# Other races min. age: 18
# Other races median age: 30
# Other races max. age: 56

In [None]:
'''
Now let's graph the means and medians of the ages for each race and compare them
'''


races = ["Asian", "White", "Black", "Hispanic", "Natives", "Other"]
# mean_list, median_list = [], []
mean_list = [df_a['age'].mean(), df_w['age'].mean(), df_b['age'].mean(), df_h['age'].mean(), df_n['age'].mean(), df_o['age'].mean()]
median_list = [df_a['age'].median(), df_w['age'].median(), df_b['age'].median(), df_h['age'].median(), df_n['age'].median(), df_o['age'].median()]

x_axis = np.arange(len(races))

plt.bar(x_axis - 0.2, mean_list, width = 0.4, label = 'Mean')
plt.bar(x_axis + 0.2, median_list, width = 0.4, label = 'Median')

plt.title("Mean/Median of Ages for Each Race")
plt.xlabel("Race")
plt.ylabel("Value")

plt.xticks(x_axis, races)

plt.legend()

plt.show()

In [None]:
'''
The graph above shows that Native American victims had the lowest mean and median ages, while White victims had the highest mean and median ages.  
'''

In [None]:
list(set(sorted(df_total['age'].unique())))
df_total['race'].unique()

In [None]:
'''
Let's get the total number of people killed per race.
'''

race_count = df_total.groupby('race')['race'].size()
race_count 

In [None]:
x_list = race_count.index.tolist()
y_list = race_count.tolist()
plt.bar(x_list, y_list)
plt.title("Number of People Killed per Race")
plt.xlabel("Race")
plt.ylabel("Number of People Killed")
for i in range(len(x_list)):
    plt.text(x_list[i], y_list[i], y_list[i], ha='center')
plt.show()

In [None]:
'''
Now we can find the ratio of deaths for each race over the total amount of deaths and visualize that data as well
'''

total_deaths = race_count.sum()
x_list = race_count.index.tolist()
y_list = [round((y / total_deaths), 4) for y in race_count.tolist()]
plt.bar(x_list, y_list)
plt.title("Ratio of People Killed per Race")
plt.xlabel("Race")
plt.ylabel("Ratio of People Killed")
for i in range(len(x_list)):
    plt.text(x_list[i], y_list[i], y_list[i], ha='center')
plt.show()

In [None]:
'''
The same proportion data can be visualized better as a pie chart. The pie chart below shows that white people make up the majority of the deaths in our dataset.  
'''

fig1, ax1 = plt.subplots()
ax1.pie(y_list, labels=x_list, autopct='%1.1f%%')
ax1.axis('equal')
plt.title('Pie Chart Showing Ratio of Deaths for Each Race')
plt.show()

In [None]:
df_temp

In [None]:
"""
In order to find the impact of deaths proportionate to its respective race, we need to use the supporting dataset.
We must find the 1/race proportion that race holds in order to see the impact of a person's life proportionate to the race population.
1. Find total proportion each race holds in the united states
2. take 1/proportion
3. Multiply it by the rate of death for that race
4. Compare the impact values to see how much each death impacts one race vs another
"""
list_total_races = []
j = 0
columns_race.sort()
print(columns_race)
for i in columns_race:
    newDF = df_temp[i].astype(float)
    list_total_races.append(newDF.sum())
    print(list_total_races[j])
    j+=1
sum_all_races = sum(list_total_races)
proportional_share = [i/sum_all_races for i in list_total_races]
proportional_share = [1/i for i in proportional_share]
print(proportional_share)
del y_list[4]
print(y_list)
res_race_porportion = []
j = 0
for i in proportional_share:
    x = y_list[j]
    j+=1
    res_race_porportion.append(i*x)
res_race_porportion


In [None]:
import warnings
warnings.filterwarnings("ignore")

cities.sort()
columns_race.sort()
races = df_total['race'].tolist()
races =list(set(races))
races.remove('O')
races.sort()
df_total['Median Income'] = np.nan
df_total['poverty_rate'] = np.nan
df_total['percent_completed_hs'] = np.nan
df_total_temp = df_total.copy()
for i in cities:
    temp = df_total_temp.loc[df_total_temp['city'] == i]
    supptemp = df_support.loc[df_support['city'] == i]
    support_proportion_race = {}
    ind = 0
    for j in columns_race:
        temp2 = temp.loc[temp['race'] == races[ind]]
        support_proportion_race['Median Income'] = supptemp['Median Income'].iat[0] * supptemp[j].iat[0]/100
        support_proportion_race['poverty_rate'] = supptemp['poverty_rate'].iat[0] * supptemp[j].iat[0]/100
        support_proportion_race['percent_completed_hs'] = supptemp['percent_completed_hs'].iat[0] * supptemp[j].iat[0]/100
        #tempDF = pd.DataFrame(support_proportion_race)
        # tempDF = pd.concat([tempDF]*len(temp2), ignore_index=True)
        #print(tempDF.info())
        # print(temp2.info())
        temp2['Median Income'] = temp2['Median Income'].fillna(support_proportion_race['Median Income'])
        temp2['poverty_rate']= temp2['poverty_rate'].fillna(support_proportion_race['poverty_rate'])
        temp2['percent_completed_hs'] = temp2['percent_completed_hs'].fillna(support_proportion_race['percent_completed_hs'])
        #print(temp2.info())
        temp.loc[temp['race'] == races[ind]] = temp2
        #print(temp.info())
        ind+=1
        #print(temp.info())
    temp = temp.fillna(0)
    df_total_temp.loc[df_total_temp['city'] == i] = temp

df_total = df_total_temp
df_total

In [None]:
df_total_copy = df_total.copy()

# pd.get_dummies(df_total_copy, columns=['manner_of_death', 'armed', 'gender', 'city', ])
df_temp = df_total_copy.drop(columns=['id', 'name', 'date', 'state'])
# df_temp

df_temp = pd.get_dummies(df_temp, columns=['manner_of_death', 'armed', 'gender', 'city', 'signs_of_mental_illness', 'threat_level', 'flee', 'body_camera'])

training_es_x = df_temp.iloc[0:2028].drop(columns=['race'])
training_es_y = df_temp.iloc[0:2028]['race']

testing_es_x = df_temp.iloc[2028:].drop(columns=['race'])
testing_es_y = df_temp.iloc[2028:]['race']

testing_es_x

In [None]:
df_temp['race'].unique()

In [None]:
model = LogisticRegression(max_iter=3000)

model.fit(training_es_x, training_es_y)
predict_train = model.predict(training_es_x)

predict_test = model.predict(testing_es_x)
predict_test

In [None]:
testing_es_x

In [None]:
set(predict_test)

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# f1_score(testing_es_y, predict_test, average=None)

accuracy_test = accuracy_score(testing_es_y, predict_test)
# accuracy_test = accuracy_score(training_es_y, predict_train)
accuracy_test
# This is the accuracy of the Logistic Regression Classifier on the testing data



In [None]:
tmodel = DecisionTreeClassifier()
# tmodel.fit(training_es_x, training_es_y)
# predict_train_tree = tmodel
tmodel.fit(training_es_x, training_es_y)
predict_tree = tmodel.predict(training_es_x)

predict_test_tree = tmodel.predict(testing_es_x)
accuracy_test_tree = accuracy_score(testing_es_y, predict_test_tree)

accuracy_test_tree
# This is the accuracy of the Decision Tree Classifier on the testing data

In [None]:
# sklearn.tree.plot_tree(tmodel)

plt.figure(figsize=(120,120))
sklearn.tree.plot_tree(tmodel, fontsize=10, feature_names=testing_es_x.columns, class_names=testing_es_y.unique().tolist(), filled=True)

In [None]:
from sklearn.naive_bayes import GaussianNB

bmodel = GaussianNB()

bmodel.fit(training_es_x, training_es_y)

In [None]:
predict_test_b = bmodel.predict(testing_es_x)
predict_test_b 

In [None]:
accuracy_test_b = accuracy_score(testing_es_y, predict_test_b)
accuracy_test_b 
# This is the accuracy of the Naive Bayes model on the testing data

In [None]:
from sklearn.neighbors import KNeighborsClassifier

model_k = KNeighborsClassifier()

model_k.fit(training_es_x, training_es_y)
predict_train_k = model_k.predict(training_es_x)

# f1_score(training_es_y, predict_train_k, average=None)


In [None]:
predict_test_k = model_k.predict(testing_es_x)
accuracy_test_k = accuracy_score(testing_es_y, predict_test_k)
accuracy_test_k
# This is the accuracy of the K Neighbors Classifier on the testing data

In [None]:
predict_test_k 

In [271]:
accuracy_test_b = accuracy_score(testing_es_y, predict_test_b)
accuracy_test_b 
# This is the accuracy of the Naive Bayes model on the testing data

0.6015779092702169

In [272]:
from sklearn.neighbors import KNeighborsClassifier

model_k = KNeighborsClassifier()

model_k.fit(training_es_x, training_es_y)
predict_train_k = model_k.predict(training_es_x)

# f1_score(training_es_y, predict_train_k, average=None)


In [273]:
predict_test_k = model_k.predict(testing_es_x)
accuracy_test_k = accuracy_score(testing_es_y, predict_test_k)
accuracy_test_k
# This is the accuracy of the K Neighbors Classifier on the testing data

0.7731755424063116

In [274]:
predict_test_k 

array(['H', 'W', 'B', 'W', 'B', 'W', 'H', 'W', 'W', 'W', 'B', 'W', 'H',
       'B', 'W', 'H', 'W', 'B', 'B', 'B', 'B', 'W', 'W', 'H', 'B', 'W',
       'B', 'W', 'B', 'B', 'B', 'B', 'W', 'B', 'B', 'B', 'B', 'B', 'B',
       'W', 'W', 'H', 'W', 'W', 'W', 'B', 'B', 'W', 'B', 'B', 'B', 'W',
       'B', 'W', 'W', 'W', 'W', 'W', 'W', 'W', 'W', 'B', 'W', 'W', 'B',
       'W', 'W', 'W', 'B', 'W', 'W', 'W', 'B', 'W', 'W', 'B', 'H', 'B',
       'W', 'W', 'W', 'B', 'B', 'B', 'W', 'B', 'A', 'W', 'H', 'W', 'W',
       'W', 'B', 'B', 'H', 'B', 'B', 'W', 'B', 'B', 'W', 'W', 'H', 'B',
       'H', 'H', 'W', 'H', 'B', 'H', 'W', 'W', 'W', 'B', 'W', 'B', 'B',
       'W', 'B', 'W', 'B', 'H', 'H', 'W', 'W', 'B', 'W', 'W', 'H', 'H',
       'W', 'W', 'W', 'W', 'W', 'B', 'W', 'W', 'W', 'N', 'H', 'B', 'H',
       'W', 'H', 'W', 'W', 'W', 'W', 'W', 'B', 'W', 'B', 'B', 'H', 'W',
       'B', 'H', 'W', 'B', 'W', 'W', 'B', 'W', 'B', 'W', 'H', 'W', 'B',
       'B', 'B', 'B', 'B', 'H', 'B', 'W', 'B', 'H', 'W', 'W', 'W