In [1]:
import itertools
import pandas as pd
import random
import sklearn
from sklearn.model_selection import train_test_split
import datetime
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sb

In [2]:
"""
Our first goal is going to be data cleaning and munging. We want to take care of all null values first.
"""

df_training = pd.read_csv('police_killings_train.csv')
df_test = pd.read_csv('police_killings_test.csv')
df_race_city = pd.read_csv('share_race_by_city.csv')
df_income = pd.read_csv('income.csv', encoding="ANSI")
df_poverty = pd.read_csv('poverty.csv', encoding="ANSI")
df_education = pd.read_csv('education.csv', encoding="ANSI")

df_test.info()
df_training

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 507 entries, 0 to 506
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       507 non-null    int64  
 1   name                     507 non-null    object 
 2   date                     507 non-null    object 
 3   manner_of_death          507 non-null    object 
 4   armed                    504 non-null    object 
 5   age                      467 non-null    float64
 6   gender                   507 non-null    object 
 7   race                     403 non-null    object 
 8   city                     507 non-null    object 
 9   state                    507 non-null    object 
 10  signs_of_mental_illness  507 non-null    bool   
 11  threat_level             507 non-null    object 
 12  flee                     469 non-null    object 
 13  body_camera              507 non-null    bool   
dtypes: bool(2), float64(1), in

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera
0,3,Tim Elliot,02/01/15,shot,gun,53.0,M,A,Shelton,WA,True,attack,Not fleeing,False
1,4,Lewis Lee Lembke,02/01/15,shot,gun,47.0,M,W,Aloha,OR,False,attack,Not fleeing,False
2,5,John Paul Quintero,03/01/15,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,False,other,Not fleeing,False
3,8,Matthew Hoffman,04/01/15,shot,toy weapon,32.0,M,W,San Francisco,CA,True,attack,Not fleeing,False
4,9,Michael Rodriguez,04/01/15,shot,nail gun,39.0,M,H,Evans,CO,False,attack,Not fleeing,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023,2256,Jeremy Lopez-Robledo,24/01/17,shot,knife,29.0,M,H,Las Cruces,NM,True,attack,Foot,True
2024,2257,Jonathan David Sper,24/01/17,shot,unarmed,30.0,M,W,Algoma Township,MI,True,attack,Not fleeing,False
2025,2258,Jose Efrain Rodriguez,24/01/17,shot and Tasered,gun,18.0,M,H,Lancaster City,PA,False,attack,Not fleeing,False
2026,2259,Ramon Milanez,24/01/17,shot,gun,32.0,M,H,Kuna,ID,False,attack,Car,False


In [3]:
"""
We will combine the training and tesing dataset to make cleaning easier and more accurate
"""

df_total = df_training.append(df_test, ignore_index=True)
df_total

  df_total = df_training.append(df_test, ignore_index=True)


Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera
0,3,Tim Elliot,02/01/15,shot,gun,53.0,M,A,Shelton,WA,True,attack,Not fleeing,False
1,4,Lewis Lee Lembke,02/01/15,shot,gun,47.0,M,W,Aloha,OR,False,attack,Not fleeing,False
2,5,John Paul Quintero,03/01/15,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,False,other,Not fleeing,False
3,8,Matthew Hoffman,04/01/15,shot,toy weapon,32.0,M,W,San Francisco,CA,True,attack,Not fleeing,False
4,9,Michael Rodriguez,04/01/15,shot,nail gun,39.0,M,H,Evans,CO,False,attack,Not fleeing,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2530,2822,Rodney E. Jacobs,28/07/17,shot,gun,31.0,M,,Kansas City,MO,False,attack,Not fleeing,False
2531,2813,TK TK,28/07/17,shot,vehicle,,M,,Albuquerque,NM,False,attack,Car,False
2532,2818,Dennis W. Robinson,29/07/17,shot,gun,48.0,M,,Melba,ID,False,attack,Car,False
2533,2817,Isaiah Tucker,31/07/17,shot,vehicle,28.0,M,B,Oshkosh,WI,False,attack,Car,True


In [4]:
print(df_income.info())
df_race_city.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29322 entries, 0 to 29321
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Geographic Area  29322 non-null  object
 1   City             29322 non-null  object
 2   Median Income    29271 non-null  object
dtypes: object(3)
memory usage: 687.4+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29268 entries, 0 to 29267
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Geographic area        29268 non-null  object
 1   City                   29268 non-null  object
 2   share_white            29268 non-null  object
 3   share_black            29268 non-null  object
 4   share_native_american  29268 non-null  object
 5   share_asian            29268 non-null  object
 6   share_hispanic         29268 non-null  object
dtypes: object(7)
memory usage: 1.6+ MB


In [5]:
"""
Race is one of the fields that have missing fields, we will use the df_race_city dataset to help
get an insight into which race to impute.

In order to determine the race to impute we must:
1. At each missing race, determine the most frequent race of that specific city (mode).
2. If the mode returns more than 1 race, use the race with the highest percentage distribution in that city. Otherwise, use the returned race.
3. Impute the race into the missing data cell.
i.e.: City 'Zion' returns a mode of ['W', 'B'] which indicates we have as many 'W' White and 'B' Black people datapoints for the city of 'Zion'. Using the supporting dataframe from df_race_city, 'Zion' has a distribution of 'W' as 0.6, 'B' as 0.

"""
#make a list of unique cities, and include the state so we don't deal with same city names of different states
df_total['city'] = df_total[['state', 'city']].apply("-".join,axis=1)
cities = df_total['city'].tolist()
cities =list(set(cities))
print(len(cities))
cities

1534


['OK-Norwood',
 'MS-St. Martin',
 'CA-Bellflower',
 'AL-Killeen',
 'OK-Schulter',
 'RI-Tiverton',
 'GA-Loganville',
 'TX-San Antonio',
 'FL-Vero Beach',
 'CA-Saratoga',
 'WA-Lynnwood',
 'OK-Pryor',
 'HI-Keaau',
 'ME-Portland',
 'HI-Kahuku',
 'WA-Wapato',
 'GA-East Point',
 'NV-Pahrump',
 'MI-Cato Township',
 'VT-Burlington',
 'GA-Stockbridge',
 'AK-Big Lake',
 'GA-Cartersville',
 'SC-Hardeeville',
 'CA-Orange',
 'CO-Fort Collins',
 'TX-Azle',
 'TX-Reno',
 'MN-St. Cloud',
 'MO-Sullivan',
 'NC-Salisbury',
 'CO-Trinidad',
 'FL-Pensacola',
 'KY-Barbourville',
 'VA-Hopewell',
 'NY-Staten Island',
 'CO-Colorado City',
 'MA-Spencer',
 'CA-Walnut Creek',
 'WA-East Wenatchee',
 'FL-Jacksonville',
 'AL-Clay',
 'SD-Mitchell',
 'GA-Powder Springs',
 'TX-Winnie',
 'FL-Tallahassee',
 'TX-Lake Jackson',
 'RI-Pawtucket',
 'AR-Dover',
 'MO-Blue Springs',
 'MN-Morris',
 'NV-Reno',
 'MO-Sedalia',
 'MI-Holland Township',
 'GA-Riverdale',
 'NV-Boulder City',
 'PA-Fort Littleton',
 'GA-Tyrone',
 'CT-Bolton'

In [6]:
"""
Let's try cleaning the the race of city dataset now viewing the missing datapoints which are denoted at (X).
"""
df_race_city['city'] = df_race_city[['Geographic area', 'City']].apply("-".join,axis=1)
list_race_df = df_race_city.values.tolist()
j = 0
list_bad_cities = []
for i in list_race_df:
    #we can see it is only null when all races are missing
    if(i.count('(X)') > 0):
        j+=1
        list_bad_cities.append(i[1])
        print(i.count('(X)'))
print("total cities with bad points = ", j)


5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
total cities with bad points =  20


In [7]:
"""With only 20/29260 holding a row of null values, we shall drop these cities"""
tempDf_race = df_race_city[df_race_city.share_white != '(X)'].copy()
print(tempDf_race.info())
df_race_city.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29248 entries, 0 to 29267
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Geographic area        29248 non-null  object
 1   City                   29248 non-null  object
 2   share_white            29248 non-null  object
 3   share_black            29248 non-null  object
 4   share_native_american  29248 non-null  object
 5   share_asian            29248 non-null  object
 6   share_hispanic         29248 non-null  object
 7   city                   29248 non-null  object
dtypes: object(8)
memory usage: 2.0+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29268 entries, 0 to 29267
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Geographic area        29268 non-null  object
 1   City                   29268 non-null  object
 2   share_white         

In [8]:
list_total_races = []
columns_race = tempDf_race.columns.values.tolist()
s = 'share'
columns_race = [string for string in columns_race if(s in string)]
columns_race

['share_white',
 'share_black',
 'share_native_american',
 'share_asian',
 'share_hispanic']

In [9]:
temp = ["ok", "o", "ok", " sdos", 2, 3, 4, 5, 6, 6]
j = 0
for i in temp:
    if(isinstance(i,str)):
        j+=1
print( j )
for i in columns_race:
    j = 0
    for k in tempDf_race[i].values.tolist():
        if(isinstance(k,str)):
            j+=1
    print(j)
    tempDf_race[i] = tempDf_race[i].astype(float)

    list_total_races.append(tempDf_race[i].sum().astype(float))
list_total_races[0]

4
29248
29248
29248
29248
29248


2433809.0

In [10]:
#we can check that all the cities in the total data set is still in the race_city dataset
cities2 = tempDf_race['city'].tolist()
cities2 =list(set(cities2))
j = 0
for i in cities:
    # print(i in cities2)
    # print(i)
    # print(cities2.index(i[0::1]))
    if(any(i in string for string in cities2)):
        #print(i)
        j+=1
    else:
        print(i)
        print(cities2.index(i[0::1]))

print(j)
print(list_bad_cities)

OK-Norwood


ValueError: 'OK-Norwood' is not in list

In [None]:
#make a list of df fT-oubaor each city
list_df_cities = []
for i in cities:
    list_df_cities.append(df_total.loc[df_total['city'] == i])

In [None]:
#make a list of race
races = df_total['race'].tolist()
races =list(set(races))
races = races[1::]
races

In [None]:
mode_total = df_total['race'].mode(dropna=True).tolist()
print(mode_total)
for i in list_df_cities:
    mode = i['race'].mode(dropna=True)
    if(len(mode) == 0):
        i[['race']] = i[['race']].fillna(mode_total[0])
        if(i['race'].isnull().sum() != 0):
            print("null val")
    elif(isinstance(mode, type(i['race']))):
        mode.dropna()
        mode = mode.tolist()
        #if the mode has more than one value then take a random choice between the values
        i[['race']] = i[['race']].fillna(random.choice(tuple(mode)))



    df_total.loc[df_total['city'] ==  i.at[i.first_valid_index(),'city']] = i
df_total.info()

In [None]:
 # removed all data wiht NaN in race Column...df_training[df_training['race'].isnull()]
df_total

In [None]:
"""
We noticed that some of the 'age' data is missing.
So we imputed the average age of the people in each city and imputed the average age for the missing ages based on their city.
1. We calculate the total mean age of our total data sample.
2. We group by city, then calculate the mean Age at each city.
3. For each NaN age, we impute the mean age of that city. If that city was not calcuated (meaning this data is the only one from that city), we will use the total mean age.
"""
age_fix = df_total[['id','age','city']].copy()
age_fix = age_fix[age_fix['age'].isnull()].copy()
# 1. Calculate Total Mean Age
total_mean_age = df_total['age'].mean()

# 2. Calculate Mean of Each City
city_mean_ages = df_total.groupby('city')['age'].mean()

#3. Impute Age by the city. If City data does not exist, use Total Mean Age.
for i in age_fix.index:
    city = age_fix['city'][i]
    if city in city_mean_ages and not np.isnan(city_mean_ages[city]):
        age_fix['age'][i] = city_mean_ages[city]
    else:
        age_fix['age'][i] = total_mean_age


# update total list
df_total.update(age_fix)

df_total.info()

In [None]:
"""
Fix the 'race' column
We want to impute the NaN in the 'race' column with the most frequent race that is reported in the same city.
First get a list of cities,
second, get the mode for each city.
"""
training_race_noNaN = df_training[df_training['race'].notnull()].copy()

training_cities = training_race_noNaN['city'].unique()

training_race_noNaN

In [None]:
training_race_noNaN.groupby('city')['race'].agg(pd.Series.mode).to_frame()

In [None]:
training_race_noNaN.groupby('city')['race'].agg(pd.Series.mode).to_frame()