In [262]:
import itertools
import pandas as pd
import random
import sklearn
from sklearn.model_selection import train_test_split
import datetime
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sb

In [263]:
"""
Our first goal is going to be data cleaning and munging. We want to take care of all null values
first.
"""

df_training = pd.read_csv('police_killings_train.csv')
df_test = pd.read_csv('police_killings_test.csv')
df_race_city = pd.read_csv('share_race_by_city.csv')
df_income = pd.read_csv('income.csv', encoding="ANSI")
df_poverty = pd.read_csv('poverty.csv', encoding="ANSI")
df_education = pd.read_csv('education.csv', encoding="ANSI")

df_test.info()
df_training

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 507 entries, 0 to 506
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       507 non-null    int64  
 1   name                     507 non-null    object 
 2   date                     507 non-null    object 
 3   manner_of_death          507 non-null    object 
 4   armed                    504 non-null    object 
 5   age                      467 non-null    float64
 6   gender                   507 non-null    object 
 7   race                     403 non-null    object 
 8   city                     507 non-null    object 
 9   state                    507 non-null    object 
 10  signs_of_mental_illness  507 non-null    bool   
 11  threat_level             507 non-null    object 
 12  flee                     469 non-null    object 
 13  body_camera              507 non-null    bool   
dtypes: bool(2), float64(1), in

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera
0,3,Tim Elliot,02/01/15,shot,gun,53.0,M,A,Shelton,WA,True,attack,Not fleeing,False
1,4,Lewis Lee Lembke,02/01/15,shot,gun,47.0,M,W,Aloha,OR,False,attack,Not fleeing,False
2,5,John Paul Quintero,03/01/15,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,False,other,Not fleeing,False
3,8,Matthew Hoffman,04/01/15,shot,toy weapon,32.0,M,W,San Francisco,CA,True,attack,Not fleeing,False
4,9,Michael Rodriguez,04/01/15,shot,nail gun,39.0,M,H,Evans,CO,False,attack,Not fleeing,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023,2256,Jeremy Lopez-Robledo,24/01/17,shot,knife,29.0,M,H,Las Cruces,NM,True,attack,Foot,True
2024,2257,Jonathan David Sper,24/01/17,shot,unarmed,30.0,M,W,Algoma Township,MI,True,attack,Not fleeing,False
2025,2258,Jose Efrain Rodriguez,24/01/17,shot and Tasered,gun,18.0,M,H,Lancaster City,PA,False,attack,Not fleeing,False
2026,2259,Ramon Milanez,24/01/17,shot,gun,32.0,M,H,Kuna,ID,False,attack,Car,False


In [264]:
"""
We will combine the training and tesing dataset to make cleaning easier and more accurate
"""

df_total = df_training.append(df_test, ignore_index=True)
df_total

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera
0,3,Tim Elliot,02/01/15,shot,gun,53.0,M,A,Shelton,WA,True,attack,Not fleeing,False
1,4,Lewis Lee Lembke,02/01/15,shot,gun,47.0,M,W,Aloha,OR,False,attack,Not fleeing,False
2,5,John Paul Quintero,03/01/15,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,False,other,Not fleeing,False
3,8,Matthew Hoffman,04/01/15,shot,toy weapon,32.0,M,W,San Francisco,CA,True,attack,Not fleeing,False
4,9,Michael Rodriguez,04/01/15,shot,nail gun,39.0,M,H,Evans,CO,False,attack,Not fleeing,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2530,2822,Rodney E. Jacobs,28/07/17,shot,gun,31.0,M,,Kansas City,MO,False,attack,Not fleeing,False
2531,2813,TK TK,28/07/17,shot,vehicle,,M,,Albuquerque,NM,False,attack,Car,False
2532,2818,Dennis W. Robinson,29/07/17,shot,gun,48.0,M,,Melba,ID,False,attack,Car,False
2533,2817,Isaiah Tucker,31/07/17,shot,vehicle,28.0,M,B,Oshkosh,WI,False,attack,Car,True


In [265]:
"""
Race is one of the fields that have missing fields, we will use the race_city dataset to help
get an insight into which race to pick. First we ant to get the mode of the race's by city and impute
it directly in. Then we want to get the mode of the race and place it in, if there is more than one
mode then use race_city and take the race of highest population
"""
#make a list of unique cities
cities = df_total['city'].tolist()
cities =list(set(cities))
cities

['Lewisburg',
 'Clarksville',
 'Lisle',
 'Plantation',
 'Los Angeles',
 'Oxford',
 'Richland',
 'Mascotte',
 'Ladera Heights',
 'Las Vegas',
 'North Las Vegas',
 'Friendswood',
 'Moultrie',
 'Fairfield',
 'Montgomery',
 'Draper',
 'Tarboro',
 'Pacific Beach',
 'Eau Claire',
 'Douglas City',
 'Concord',
 'Milcreek',
 'Norwalk',
 'Germantown',
 'Elkins',
 'Visalia',
 'St. Paul',
 'Meadow Bridge',
 'Lafayette',
 'Crescent City',
 'Somerton',
 'Brick Township',
 'Jonesboro',
 'Belle Glade',
 'Ideal Beach',
 'Mankato',
 'Wilderville',
 'Deerfield',
 'Sumas',
 'Marana',
 'Neenah',
 'Sidney',
 'Huntley',
 'Nacogdoches',
 'Cedartown',
 'Mashantucket',
 'Boulder City',
 'Denham Springs',
 'Lovington',
 'Kokomo',
 'Bellville',
 'Brookhaven',
 'Sumter',
 'Ventnor',
 'New York',
 'Lompoc',
 'Cape Canaveral',
 'Wilkesboro',
 'San Ysidro',
 'Naples',
 'Osteen',
 'Wood County',
 'Memphis',
 'Freeport',
 'Old Saybrook',
 'Rotterdam',
 'Englewood',
 'Orrington',
 'Edgewood',
 'Tyrone',
 'Midland',
 'Sa

In [266]:
"""
Let's try cleaning the the race of city dataset now viewing the missing datapoints which are denoted at (X).
"""
list_race_df = df_race_city.values.tolist()
j = 0
list_bad_cities = []
for i in list_race_df:
    #we can see it is only null when all races are missing
    if(i.count('(X)') > 0):
        j+=1
        list_bad_cities.append(i[1])
        print(i.count('(X)'))
print("total cities with bad points = ", j)


5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
total cities with bad points =  20


In [267]:
"""With only 20/29260 holding a row of null values, we shall drop these cities"""
tempDf_race = df_race_city[df_race_city.share_white != '(X)'].copy()
print(tempDf_race.info())
df_race_city.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29248 entries, 0 to 29267
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Geographic area        29248 non-null  object
 1   City                   29248 non-null  object
 2   share_white            29248 non-null  object
 3   share_black            29248 non-null  object
 4   share_native_american  29248 non-null  object
 5   share_asian            29248 non-null  object
 6   share_hispanic         29248 non-null  object
dtypes: object(7)
memory usage: 1.8+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29268 entries, 0 to 29267
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Geographic area        29268 non-null  object
 1   City                   29268 non-null  object
 2   share_white            29268 non-null  object
 3   share_black         

In [268]:
#we can check that all the cities in the total data set is still in the race_city dataset

cities2 = tempDf_race['City'].tolist()
cities2 =list(set(cities2))
j = 0
for i in cities:
    if(i in cities2):
        print(i)
        j+=1
print(j)
print(list_bad_cities)

Carson City
1
['Chisana CDP', 'Flat CDP', 'Mertarvik CDP', 'Almanor CDP', 'Caribou CDP', 'Cedar Slope CDP', 'Silver City CDP', 'Sugarloaf Mountain Park CDP', 'University of California Merced CDP', 'Valley Wells CDP', 'Southfield CDP', 'Florida village', 'Goss town', 'Lakeside city', 'Greenhorn city', 'Laredo Ranchettes West CDP', 'Pueblo East CDP', 'Valle Hermoso CDP', 'Valle Verde CDP', 'Table Rock CDP']


In [269]:
#make a list of df for each city
list_df_cities = []
for i in cities:
    list_df_cities.append(df_total.loc[df_total['city'] == i])

In [270]:
#make a list of race
races = df_total['race'].tolist()
races =list(set(races))
races = races[1::]
races

['W', 'O', 'H', 'N', 'B', 'A']

In [271]:
mode_total = df_total['race'].mode(dropna=True).tolist()
print(mode_total)
for i in list_df_cities:
    mode = i['race'].mode(dropna=True)
    if(len(mode) == 0):
        i[['race']] = i[['race']].fillna(mode_total[0])
        if(i['race'].isnull().sum() != 0):
            print("null val")
    elif(isinstance(mode, type(i['race']))):
        mode.dropna()
        mode = mode.tolist()
        #if the mode has more than one value then take a random choice between the values
        i[['race']] = i[['race']].fillna(random.choice(tuple(mode)))



    df_total.loc[df_total['city'] ==  i.at[i.first_valid_index(),'city']] = i
df_total.info()

['W']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2535 entries, 0 to 2534
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       2535 non-null   int64  
 1   name                     2535 non-null   object 
 2   date                     2535 non-null   object 
 3   manner_of_death          2535 non-null   object 
 4   armed                    2526 non-null   object 
 5   age                      2458 non-null   float64
 6   gender                   2535 non-null   object 
 7   race                     2535 non-null   object 
 8   city                     2535 non-null   object 
 9   state                    2535 non-null   object 
 10  signs_of_mental_illness  2535 non-null   bool   
 11  threat_level             2535 non-null   object 
 12  flee                     2470 non-null   object 
 13  body_camera              2535 non-null   bool   
dtypes: bool(2), float64(1), 

In [272]:
 # removed all data wiht NaN in race Column...df_training[df_training['race'].isnull()]
df_total

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera
0,3,Tim Elliot,02/01/15,shot,gun,53.0,M,A,Shelton,WA,True,attack,Not fleeing,False
1,4,Lewis Lee Lembke,02/01/15,shot,gun,47.0,M,W,Aloha,OR,False,attack,Not fleeing,False
2,5,John Paul Quintero,03/01/15,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,False,other,Not fleeing,False
3,8,Matthew Hoffman,04/01/15,shot,toy weapon,32.0,M,W,San Francisco,CA,True,attack,Not fleeing,False
4,9,Michael Rodriguez,04/01/15,shot,nail gun,39.0,M,H,Evans,CO,False,attack,Not fleeing,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2530,2822,Rodney E. Jacobs,28/07/17,shot,gun,31.0,M,B,Kansas City,MO,False,attack,Not fleeing,False
2531,2813,TK TK,28/07/17,shot,vehicle,,M,H,Albuquerque,NM,False,attack,Car,False
2532,2818,Dennis W. Robinson,29/07/17,shot,gun,48.0,M,W,Melba,ID,False,attack,Car,False
2533,2817,Isaiah Tucker,31/07/17,shot,vehicle,28.0,M,B,Oshkosh,WI,False,attack,Car,True


In [273]:
"""
Fix the 'race' column
We want to impute the NaN in the 'race' column with the most frequent race that is reported in the same city.
First get a list of cities,
second, get the mode for each city.
"""
training_race_noNaN = df_training[df_training['race'].notnull()].copy()

training_cities = training_race_noNaN['city'].unique()

training_race_noNaN

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera
0,3,Tim Elliot,02/01/15,shot,gun,53.0,M,A,Shelton,WA,True,attack,Not fleeing,False
1,4,Lewis Lee Lembke,02/01/15,shot,gun,47.0,M,W,Aloha,OR,False,attack,Not fleeing,False
2,5,John Paul Quintero,03/01/15,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,False,other,Not fleeing,False
3,8,Matthew Hoffman,04/01/15,shot,toy weapon,32.0,M,W,San Francisco,CA,True,attack,Not fleeing,False
4,9,Michael Rodriguez,04/01/15,shot,nail gun,39.0,M,H,Evans,CO,False,attack,Not fleeing,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023,2256,Jeremy Lopez-Robledo,24/01/17,shot,knife,29.0,M,H,Las Cruces,NM,True,attack,Foot,True
2024,2257,Jonathan David Sper,24/01/17,shot,unarmed,30.0,M,W,Algoma Township,MI,True,attack,Not fleeing,False
2025,2258,Jose Efrain Rodriguez,24/01/17,shot and Tasered,gun,18.0,M,H,Lancaster City,PA,False,attack,Not fleeing,False
2026,2259,Ramon Milanez,24/01/17,shot,gun,32.0,M,H,Kuna,ID,False,attack,Car,False


In [274]:
training_race_noNaN.groupby('city')['race'].agg(pd.Series.mode).to_frame()

Unnamed: 0_level_0,race
city,Unnamed: 1_level_1
Abingdon,W
Acworth,W
Addison,H
Aiken,W
Aitkin,W
...,...
York,W
York County,B
Yuma,H
Zanesville,W


In [48]:
training_race_noNaN.groupby('city')['race'].agg(pd.Series.mode).to_frame()

Unnamed: 0_level_0,race
city,Unnamed: 1_level_1
Abingdon,W
Acworth,W
Addison,H
Aiken,W
Aitkin,W
...,...
York,W
York County,B
Yuma,H
Zanesville,W
