In [717]:
import itertools

import numpy
import pandas as pd
import random
import sklearn
from sklearn.model_selection import train_test_split
import datetime
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sb

In [718]:
"""
Our first goal is going to be data cleaning and munging. We want to take care of all null values
first.
"""

df_training = pd.read_csv('police_killings_train.csv')
df_test = pd.read_csv('police_killings_test.csv')
df_race_city = pd.read_csv('share_race_by_city.csv')
df_income = pd.read_csv('income.csv', encoding="ANSI")
df_poverty = pd.read_csv('poverty.csv', encoding="ANSI")
df_education = pd.read_csv('education.csv', encoding="ANSI")

df_test.info()
df_training

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 507 entries, 0 to 506
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       507 non-null    int64  
 1   name                     507 non-null    object 
 2   date                     507 non-null    object 
 3   manner_of_death          507 non-null    object 
 4   armed                    504 non-null    object 
 5   age                      467 non-null    float64
 6   gender                   507 non-null    object 
 7   race                     403 non-null    object 
 8   city                     507 non-null    object 
 9   state                    507 non-null    object 
 10  signs_of_mental_illness  507 non-null    bool   
 11  threat_level             507 non-null    object 
 12  flee                     469 non-null    object 
 13  body_camera              507 non-null    bool   
dtypes: bool(2), float64(1), in

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera
0,3,Tim Elliot,02/01/15,shot,gun,53.0,M,A,Shelton,WA,True,attack,Not fleeing,False
1,4,Lewis Lee Lembke,02/01/15,shot,gun,47.0,M,W,Aloha,OR,False,attack,Not fleeing,False
2,5,John Paul Quintero,03/01/15,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,False,other,Not fleeing,False
3,8,Matthew Hoffman,04/01/15,shot,toy weapon,32.0,M,W,San Francisco,CA,True,attack,Not fleeing,False
4,9,Michael Rodriguez,04/01/15,shot,nail gun,39.0,M,H,Evans,CO,False,attack,Not fleeing,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023,2256,Jeremy Lopez-Robledo,24/01/17,shot,knife,29.0,M,H,Las Cruces,NM,True,attack,Foot,True
2024,2257,Jonathan David Sper,24/01/17,shot,unarmed,30.0,M,W,Algoma Township,MI,True,attack,Not fleeing,False
2025,2258,Jose Efrain Rodriguez,24/01/17,shot and Tasered,gun,18.0,M,H,Lancaster City,PA,False,attack,Not fleeing,False
2026,2259,Ramon Milanez,24/01/17,shot,gun,32.0,M,H,Kuna,ID,False,attack,Car,False


In [719]:
"""
We will combine the training and tesing dataset to make cleaning easier and more accurate
"""

df_total = df_training.append(df_test, ignore_index=True)
df_total

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera
0,3,Tim Elliot,02/01/15,shot,gun,53.0,M,A,Shelton,WA,True,attack,Not fleeing,False
1,4,Lewis Lee Lembke,02/01/15,shot,gun,47.0,M,W,Aloha,OR,False,attack,Not fleeing,False
2,5,John Paul Quintero,03/01/15,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,False,other,Not fleeing,False
3,8,Matthew Hoffman,04/01/15,shot,toy weapon,32.0,M,W,San Francisco,CA,True,attack,Not fleeing,False
4,9,Michael Rodriguez,04/01/15,shot,nail gun,39.0,M,H,Evans,CO,False,attack,Not fleeing,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2530,2822,Rodney E. Jacobs,28/07/17,shot,gun,31.0,M,,Kansas City,MO,False,attack,Not fleeing,False
2531,2813,TK TK,28/07/17,shot,vehicle,,M,,Albuquerque,NM,False,attack,Car,False
2532,2818,Dennis W. Robinson,29/07/17,shot,gun,48.0,M,,Melba,ID,False,attack,Car,False
2533,2817,Isaiah Tucker,31/07/17,shot,vehicle,28.0,M,B,Oshkosh,WI,False,attack,Car,True


In [720]:
print(df_income.info())
df_race_city.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29322 entries, 0 to 29321
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Geographic Area  29322 non-null  object
 1   City             29322 non-null  object
 2   Median Income    29271 non-null  object
dtypes: object(3)
memory usage: 687.4+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29268 entries, 0 to 29267
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Geographic area        29268 non-null  object
 1   City                   29268 non-null  object
 2   share_white            29268 non-null  object
 3   share_black            29268 non-null  object
 4   share_native_american  29268 non-null  object
 5   share_asian            29268 non-null  object
 6   share_hispanic         29268 non-null  object
dtypes: object(7)
memory usage: 1.6+ MB


In [721]:
"""
There are many missing fields across all the supporting/optional datasets, in order to resolve this we need
to merge these datasets and then make the necessary imputation.
1. All data sets have a state and city type columns, merge those columns such that they can all have a column
to for indexing.
2. Get the list of cities within the mandatory dataset, we need to confirm that the supporting dataset
has these cities and if not then we need to impute.
"""
#make a list of unique cities, and include the state so we don't deal with same city names of different states
df_total['city'] = df_total[['state', 'city']].apply("-".join,axis=1)
df_race_city['city'] = df_race_city[['Geographic area', 'City']].apply("-".join,axis=1)
df_income['city'] = df_income[['Geographic Area', 'City']].apply("-".join,axis=1)
df_poverty['city'] = df_poverty[['Geographic Area', 'City']].apply("-".join,axis=1)
df_education['city'] = df_education[['Geographic Area', 'City']].apply("-".join,axis=1)
df_income = df_income.drop(['Geographic Area', 'City'], axis=1)
df_poverty = df_poverty.drop(columns=['Geographic Area', 'City'])
df_education = df_education.drop(columns=['Geographic Area', 'City'])
cities = df_total['city'].tolist()
cities =list(set(cities))
print(len(cities))
cities

1534


['TN-Columbia',
 'GA-Sylvester',
 'IL-Metropolis',
 'AL-Selma',
 'SC-Johnsonville',
 'AL-Lawrence County',
 'TN-Gibson County',
 'NC-Wilkesboro',
 'VA-Stafford',
 'CA-Riverside',
 'PA-Lewistown',
 'FL-Homestead',
 'TN-Bristol',
 'AZ-Surprise',
 'HI-Keaau',
 'AZ-Red Valley',
 'IL-Springfield',
 'OH-Cincinnati',
 'MO-Poplar Bluff',
 'CA-Shafter',
 'PA-Bloomsburg',
 'DE-Middletown',
 'SC-Lexington',
 'TX-Sunset',
 'MN-Columbia Heights',
 'TN-Raleigh',
 'TX-Mesquite',
 'MI-Berrien County',
 'IN-Indianapolis',
 'AL-Brooksville',
 'AL-Muscle Shoals',
 'WI-Monroe',
 'KY-Booneville',
 'NE-Sidney',
 'OH-Kent',
 'CO-Aurora',
 'WI-Millston',
 'UT-Provo',
 'GA-East Point',
 'TX-Rosenberg',
 'MI-Manistee',
 'NM-Alamogordo',
 'AL-Eufaula',
 'CA-Turlock',
 'SD-Rapid City',
 'CA-Desert Edge',
 'IA-Urbandale',
 'MA-Roslindale',
 'NY-Schenectady',
 'MO-Buffalo',
 'CA-Bell',
 'CA-Delhi',
 'KY-Daviess County',
 'CA-Burbank',
 'AR-Bull Shoals',
 'CA-Oxnard',
 'RI-Pawtucket',
 'AL-Phenix City',
 'GA-Commerc

In [722]:
"""
First lets merge each of the supporting datasets into df_temp.
We noticed that the missing values aren't consistent in the supporting datasets. So we need to change all types of
missing values to NAN such that we can impute using fillna().
"""
print(df_race_city.info())
df_temp = pd.merge(df_race_city, df_income, on='city', how='outer', sort=True)
df_temp = pd.merge(df_temp, df_poverty, on='city', how='outer', sort=True)
df_temp = pd.merge(df_temp, df_education, on='city', how='outer', sort=True)
columns_supp = df_temp.columns.values.tolist()
for i in columns_supp:
    df_temp[i] = df_temp[i].replace('-', '(X)')
    df_temp[i] = df_temp[i].replace('(X)', numpy.NAN)
print(df_temp.info())
df_temp

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29268 entries, 0 to 29267
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Geographic area        29268 non-null  object
 1   City                   29268 non-null  object
 2   share_white            29268 non-null  object
 3   share_black            29268 non-null  object
 4   share_native_american  29268 non-null  object
 5   share_asian            29268 non-null  object
 6   share_hispanic         29268 non-null  object
 7   city                   29268 non-null  object
dtypes: object(8)
memory usage: 1.8+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 29477 entries, 0 to 29476
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Geographic area        29268 non-null  object
 1   City                   29268 non-null  object
 2   share_white        

Unnamed: 0,Geographic area,City,share_white,share_black,share_native_american,share_asian,share_hispanic,city,Median Income,poverty_rate,percent_completed_hs
0,AK,Adak city,19.6,4,5.5,52.5,8.9,AK-Adak city,78500,39.3,93.4
1,AK,Akhiok city,8.5,1.4,50.7,1.4,11.3,AK-Akhiok city,26250,40.5,62.5
2,AK,Akiachak CDP,3.5,0.2,95.1,0.2,0.2,AK-Akiachak CDP,38750,26.1,76.7
3,AK,Akiak city,5.2,0,92.8,0,0.3,AK-Akiak city,42000,31.3,81.3
4,AK,Akutan city,23.3,17.9,5.5,43.3,20.8,AK-Akutan city,24750,16.1,73
...,...,...,...,...,...,...,...,...,...,...,...
29472,WY,Woods Landing-Jelm CDP,95.9,0,0,2.1,0,WY-Woods Landing-Jelm CDP,,18.6,100
29473,WY,Worland city,89.9,0.3,1.3,0.6,16.6,WY-Worland city,41523,15.3,85.6
29474,WY,Wright town,94.5,0.1,1.4,0.2,6.2,WY-Wright town,77114,5.9,89.2
29475,WY,Y-O Ranch CDP,92.8,1.5,2.6,0,11.8,WY-Y-O Ranch CDP,,0,100


In [723]:
"""
Let's try cleaning the the race of city dataset now viewing the missing datapoints which are denoted at (X).
"""
# df_temp['city'] = df_temp[['Geographic area', 'City']].apply("-".join,axis=1)
# list_race_df = df_temp.values.tolist()
# j = 0
# list_bad_cities = []
# for i in list_race_df:
#     #we can see it is only null when all races are missing
#     if(i.count('(X)') > 0):
#         j+=1
#         list_bad_cities.append(i[1])
#         print(i.count('(X)') )
# print("total cities with bad points = ", j)


"\nLet's try cleaning the the race of city dataset now viewing the missing datapoints which are denoted at (X).\n"

In [724]:
"""With only 20/29260 holding a row of null values, we shall drop these cities"""
# tempDf_race = df_temp[df_race_city.share_white != '(X)'].copy()
# print(tempDf_race.info())
# df_race_city.info()

'With only 20/29260 holding a row of null values, we shall drop these cities'

In [725]:
"""
We want to impute the city and our state/Geographic area such that we have a full comprehensive list of both
separately after the merge.
Then we want to get a list of race names within the df_race_city.
"""

df_temp['Geographic area'] = [i[0:2] for i in df_temp['city']]
df_temp['City'] = [i[3:] for i in df_temp['city']]
print(df_temp)
list_states = df_temp['Geographic area'].values.tolist()
list_states = list(set(list_states))
#we expect to see 51 including DC as a state
print(len(list_states))
columns_race = df_temp.columns.values.tolist()
s = 'share'
columns_race = [string for string in columns_race if(s in string)]
columns_race

      Geographic area                    City share_white share_black  \
0                  AK               Adak city        19.6           4   
1                  AK             Akhiok city         8.5         1.4   
2                  AK            Akiachak CDP         3.5         0.2   
3                  AK              Akiak city         5.2           0   
4                  AK             Akutan city        23.3        17.9   
...               ...                     ...         ...         ...   
29472              WY  Woods Landing-Jelm CDP        95.9           0   
29473              WY            Worland city        89.9         0.3   
29474              WY             Wright town        94.5         0.1   
29475              WY           Y-O Ranch CDP        92.8         1.5   
29476              WY              Yoder town        97.4           0   

      share_native_american share_asian share_hispanic  \
0                       5.5        52.5            8.9   
1      

['share_white',
 'share_black',
 'share_native_american',
 'share_asian',
 'share_hispanic']

In [726]:
"""
Now we can impute the porportion of missing cities based on the state's porportions
"""
for state in list_states:
    j=0
    list_total_races = []
    state_df = df_temp.loc[df_temp['Geographic area'] == state]
    for i in columns_race:
        state_df[i] = state_df[i].astype(float)
        list_total_races.append(state_df[i].sum())
        print(list_total_races[j])
        j+=1
    sum_all_races = sum(list_total_races)
    porportional_share = [i/sum_all_races for i in list_total_races]
    #print(porportional_share, " sum= ", sum(porportional_share))
    j=0
    for i in columns_race:
        state_df[i] = state_df[i].fillna(porportional_share[j])
        j+=1
    df_temp.loc[df_temp['Geographic area'] == state] = state_df
print(df_temp.info())
df_temp



97576.2
568.5999999999999
273.8
404.40000000000003
2844.4
[0.959758978787694, 0.005592746544123288, 0.0026930953284927126, 0.0039776762265977105, 0.0279775031130923]  sum=  1.0
123518.1
6430.8
357.9
1828.3
7068.1
[0.8873222742005931, 0.046197213857152704, 0.002571061584791154, 0.013134037148571295, 0.05077541320889175]  sum=  1.0
53454.6
2474.8
10542.1
312.6
4175.5
[0.7533103343310841, 0.03487618306754829, 0.14856481716356912, 0.0044053235925794396, 0.058843341845218965]  sum=  0.9999999999999998
30833.699999999997
82.7
4332.9
116.4
800.1
[0.8525651305929912, 0.0022866907409762817, 0.11980655757649493, 0.0032185103053160727, 0.02212311078422156]  sum=  1.0
52850.7
15077.300000000001
1321.4
684.5
4739.1
[0.7077618416295046, 0.20191099862065273, 0.017695820443801642, 0.009166633187363571, 0.06346470611867744]  sum=  1.0
10620.1
202.2
719.4000000000001
258.90000000000003
1699.3999999999999
[0.7866740740740741, 0.014977777777777777, 0.05328888888888889, 0.01917777777777778, 0.1258814814814

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


42441.2
8816.599999999999
411.1
258.1
2311.7
[0.7824892558265594, 0.1625518310726474, 0.007579458947209282, 0.004758594877827088, 0.042620859275756984]  sum=  1.0
38158.3
3132.8999999999996
135.8
273.4
1225.3000000000002
[0.888938328320796, 0.072984249528837, 0.0031636059516792965, 0.006369144824662147, 0.028544671374025346]  sum=  0.9999999999999998
21790.299999999996
680.9000000000001
64.7
693.4000000000001
1201.8999999999999
[0.8919046137725529, 0.027870100527194738, 0.002648253053472609, 0.028381741379874918, 0.049195291266904605]  sum=  0.9999999999999998
29158.5
125.5
1261.7
251.29999999999998
2498.2000000000003
[0.8757568658545377, 0.0037693120930344316, 0.03789435113770154, 0.007547634493860977, 0.07503183642086549]  sum=  1.0000000000000002
41909.2
13480.3
381.1
277.3
1722.5
[0.725444172101976, 0.23334268068076383, 0.006596803899574869, 0.0048000360045975105, 0.02981630731308767]  sum=  0.9999999999999999
38.5
50.7
0.3
3.5
9.1
[0.3770812928501469, 0.4965719882468169, 0.0029382

Unnamed: 0,Geographic area,City,share_white,share_black,share_native_american,share_asian,share_hispanic,city,Median Income,poverty_rate,percent_completed_hs
0,AK,Adak city,19.6,4.0,5.5,52.5,8.9,AK-Adak city,78500,39.3,93.4
1,AK,Akhiok city,8.5,1.4,50.7,1.4,11.3,AK-Akhiok city,26250,40.5,62.5
2,AK,Akiachak CDP,3.5,0.2,95.1,0.2,0.2,AK-Akiachak CDP,38750,26.1,76.7
3,AK,Akiak city,5.2,0.0,92.8,0.0,0.3,AK-Akiak city,42000,31.3,81.3
4,AK,Akutan city,23.3,17.9,5.5,43.3,20.8,AK-Akutan city,24750,16.1,73
...,...,...,...,...,...,...,...,...,...,...,...
29472,WY,Woods Landing-Jelm CDP,95.9,0.0,0.0,2.1,0.0,WY-Woods Landing-Jelm CDP,,18.6,100
29473,WY,Worland city,89.9,0.3,1.3,0.6,16.6,WY-Worland city,41523,15.3,85.6
29474,WY,Wright town,94.5,0.1,1.4,0.2,6.2,WY-Wright town,77114,5.9,89.2
29475,WY,Y-O Ranch CDP,92.8,1.5,2.6,0.0,11.8,WY-Y-O Ranch CDP,,0,100


In [626]:
df_temp['Median Income'] = pd.to_numeric(df_temp['Median Income'], errors='coerce')
df_temp['Median Income'] = df_temp['Median Income'].fillna(df_temp['Median Income'].mean())
print(df_temp.info())

df_temp['poverty_rate'] = pd.to_numeric(df_temp['poverty_rate'], errors='coerce')
df_temp['poverty_rate'] = df_temp['poverty_rate'].fillna(df_temp['poverty_rate'].mean())
print(df_temp.info())

df_temp['percent_completed_hs'] = pd.to_numeric(df_temp['percent_completed_hs'], errors='coerce')
df_temp['percent_completed_hs'] = df_temp['percent_completed_hs'].fillna(df_temp['percent_completed_hs'].mean())
print(df_temp.info())

df_temp

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29477 entries, 0 to 29476
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Geographic area        29268 non-null  object 
 1   City                   29268 non-null  object 
 2   share_white            29477 non-null  float64
 3   share_black            29477 non-null  float64
 4   share_native_american  29477 non-null  float64
 5   share_asian            29477 non-null  float64
 6   share_hispanic         29477 non-null  float64
 7   city                   29477 non-null  object 
 8   Median Income          29477 non-null  float64
 9   poverty_rate           29128 non-null  object 
 10  percent_completed_hs   29132 non-null  object 
dtypes: float64(6), object(5)
memory usage: 2.7+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 29477 entries, 0 to 29476
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype

Unnamed: 0,Geographic area,City,share_white,share_black,share_native_american,share_asian,share_hispanic,city,Median Income,poverty_rate,percent_completed_hs
0,AK,Adak city,19.6,4.0,5.5,52.5,8.9,AK-Adak city,78500.000000,39.3,93.4
1,AK,Akhiok city,8.5,1.4,50.7,1.4,11.3,AK-Akhiok city,26250.000000,40.5,62.5
2,AK,Akiachak CDP,3.5,0.2,95.1,0.2,0.2,AK-Akiachak CDP,38750.000000,26.1,76.7
3,AK,Akiak city,5.2,0.0,92.8,0.0,0.3,AK-Akiak city,42000.000000,31.3,81.3
4,AK,Akutan city,23.3,17.9,5.5,43.3,20.8,AK-Akutan city,24750.000000,16.1,73.0
...,...,...,...,...,...,...,...,...,...,...,...
29472,WY,Woods Landing-Jelm CDP,95.9,0.0,0.0,2.1,0.0,WY-Woods Landing-Jelm CDP,51177.890268,18.6,100.0
29473,WY,Worland city,89.9,0.3,1.3,0.6,16.6,WY-Worland city,41523.000000,15.3,85.6
29474,WY,Wright town,94.5,0.1,1.4,0.2,6.2,WY-Wright town,77114.000000,5.9,89.2
29475,WY,Y-O Ranch CDP,92.8,1.5,2.6,0.0,11.8,WY-Y-O Ranch CDP,51177.890268,0.0,100.0


In [563]:
#we can check that all the cities in the total data set is still in the race_city dataset
cities2 = tempDf_race['city'].tolist()
cities2 =list(set(cities2))
j = 0
for i in cities:
    # print(i in cities2)
    # print(i)
    # print(cities2.index(i[0::1]))
    if(any(i in string for string in cities2)):
        #print(i)
        j+=1
    else:
        """
        If the
        """
        newDF = {}
        newDF['Geographic area'] = i[0:2]
        newDF['City'] = i[3:]
        ind = 0
        for share in porportional_share:
            newDF[columns_race[ind]] = share
            ind+=1
        newDF['city'] = i
        print(newDF)
        tempDf_race = tempDf_race.append(newDF, ignore_index=True)
        cities2 = tempDf_race['city'].tolist()
        cities2 =list(set(cities2))
        print(any(i in string for string in cities2))
        j+=1

print(j)
print(list_bad_cities)

1534
['Alatna CDP', 'Alcan Border CDP', 'Aleneva CDP', 'Attu Station CDP', 'Beluga CDP', 'Birch Creek CDP', 'Central CDP', 'Chase CDP', 'Chenega CDP', 'Chicken CDP', 'Chiniak CDP', 'Chisana CDP', 'Chistochina CDP', 'Chitina CDP', 'Clam Gulch CDP', 'Coldfoot CDP', 'Dot Lake CDP', 'Dry Creek CDP', 'Eagle Village CDP', 'Edna Bay CDP', '(X)', 'Eureka Roadhouse CDP', 'Excursion Inlet CDP', 'Ferry CDP', 'Flat CDP', 'Fort Greely CDP', 'Four Mile Road CDP', 'Game Creek CDP', 'Glennallen CDP', 'Gulkana CDP', 'Halibut Cove CDP', 'Healy Lake CDP', 'Hobart Bay CDP', 'Hope CDP', 'Hyder CDP', 'Ivanof Bay CDP', 'Kenny Lake CDP', 'Lake Louise CDP', 'Lime Village CDP', 'Livengood CDP', 'Loring CDP', 'Lowell Point CDP', 'Lutak CDP', 'McCarthy CDP', 'Mendeltna CDP', 'Mertarvik CDP', 'Nabesna CDP', 'Naukati Bay CDP', 'Oscarville CDP', 'Paxson CDP', '(X)', 'Petersburg city', 'Petersville CDP', 'Point Baker CDP', 'Point Possession CDP', 'Pope-Vannoy Landing CDP', 'Port Clarence CDP', 'Port Protection CDP', 

In [444]:
#make a list of df fT-oubaor each city
list_df_cities = []
for i in cities:
    list_df_cities.append(df_total.loc[df_total['city'] == i])

In [445]:
#make a list of race
races = df_total['race'].tolist()
races =list(set(races))
races = races[1::]
races

['N', 'B', 'A', 'H', 'O']

In [446]:
mode_total = df_total['race'].mode(dropna=True).tolist()
print(mode_total)
for i in list_df_cities:
    mode = i['race'].mode(dropna=True)
    if(len(mode) == 0):
        i[['race']] = i[['race']].fillna(mode_total[0])
        if(i['race'].isnull().sum() != 0):
            print("null val")
    elif(isinstance(mode, type(i['race']))):
        mode.dropna()
        mode = mode.tolist()
        #if the mode has more than one value then take a random choice between the values
        i[['race']] = i[['race']].fillna(random.choice(tuple(mode)))



    df_total.loc[df_total['city'] ==  i.at[i.first_valid_index(),'city']] = i
df_total.info()

['W']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2535 entries, 0 to 2534
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       2535 non-null   float64
 1   name                     2535 non-null   object 
 2   date                     2535 non-null   object 
 3   manner_of_death          2535 non-null   object 
 4   armed                    2535 non-null   object 
 5   age                      2535 non-null   float64
 6   gender                   2535 non-null   object 
 7   race                     2535 non-null   object 
 8   city                     2535 non-null   object 
 9   state                    2535 non-null   object 
 10  signs_of_mental_illness  2535 non-null   bool   
 11  threat_level             2535 non-null   object 
 12  flee                     2535 non-null   object 
 13  body_camera              2535 non-null   bool   
dtypes: bool(2), float64(2), 

In [447]:
 # removed all data wiht NaN in race Column...df_training[df_training['race'].isnull()]
df_total

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera
0,3.0,Tim Elliot,02/01/15,shot,gun,53.000000,M,A,WA-Shelton,WA,True,attack,Not fleeing,False
1,4.0,Lewis Lee Lembke,02/01/15,shot,gun,47.000000,M,W,OR-Aloha,OR,False,attack,Not fleeing,False
2,5.0,John Paul Quintero,03/01/15,shot and Tasered,unarmed,23.000000,M,H,KS-Wichita,KS,False,other,Not fleeing,False
3,8.0,Matthew Hoffman,04/01/15,shot,toy weapon,32.000000,M,W,CA-San Francisco,CA,True,attack,Not fleeing,False
4,9.0,Michael Rodriguez,04/01/15,shot,nail gun,39.000000,M,H,CO-Evans,CO,False,attack,Not fleeing,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2530,2822.0,Rodney E. Jacobs,28/07/17,shot,gun,31.000000,M,B,MO-Kansas City,MO,False,attack,Not fleeing,False
2531,2813.0,TK TK,28/07/17,shot,vehicle,31.153846,M,H,NM-Albuquerque,NM,False,attack,Car,False
2532,2818.0,Dennis W. Robinson,29/07/17,shot,gun,48.000000,M,W,ID-Melba,ID,False,attack,Car,False
2533,2817.0,Isaiah Tucker,31/07/17,shot,vehicle,28.000000,M,B,WI-Oshkosh,WI,False,attack,Car,True


In [448]:
"""
We noticed that some of the 'age' data is missing.
So we imputed the average age of the people in each city and imputed the average age for the missing ages based on their city.
1. We calculate the total mean age of our total data sample.
2. We group by city, then calculate the mean Age at each city.
3. For each NaN age, we impute the mean age of that city. If that city was not calcuated (meaning this data is the only one from that city), we will use the total mean age.
"""
age_fix = df_total[['id','age','city']].copy()
age_fix = age_fix[age_fix['age'].isnull()].copy()
# 1. Calculate Total Mean Age
total_mean_age = df_total['age'].mean()

# 2. Calculate Mean of Each City
city_mean_ages = df_total.groupby('city')['age'].mean()

#3. Impute Age by the city. If City data does not exist, use Total Mean Age.
for i in age_fix.index:
    city = age_fix['city'][i]
    if city in city_mean_ages and not np.isnan(city_mean_ages[city]):
        age_fix['age'][i] = city_mean_ages[city]
    else:
        age_fix['age'][i] = total_mean_age


# update total list
df_total.update(age_fix)

df_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2535 entries, 0 to 2534
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       2535 non-null   float64
 1   name                     2535 non-null   object 
 2   date                     2535 non-null   object 
 3   manner_of_death          2535 non-null   object 
 4   armed                    2535 non-null   object 
 5   age                      2535 non-null   float64
 6   gender                   2535 non-null   object 
 7   race                     2535 non-null   object 
 8   city                     2535 non-null   object 
 9   state                    2535 non-null   object 
 10  signs_of_mental_illness  2535 non-null   bool   
 11  threat_level             2535 non-null   object 
 12  flee                     2535 non-null   object 
 13  body_camera              2535 non-null   bool   
dtypes: bool(2), float64(2), 

In [449]:
"""
We noticed there were missing values for 'armed' and 'flee'. For these, we imputed the most frequent data that appeared.
These are listed in the variables flee_mode and armed_mode
"""
flee_mode = df_total['flee'].agg(pd.Series.mode).values.tolist()[0]
armed_mode = df_total['armed'].agg(pd.Series.mode).values.tolist()[0]
df_total['flee'].fillna(flee_mode, inplace=True)
df_total['armed'].fillna(armed_mode, inplace=True)
df_total

TypeError: "value" parameter must be a scalar or dict, but you passed a "list"

In [None]:
"""
Fix the 'race' column
We want to impute the NaN in the 'race' column with the most frequent race that is reported in the same city.
First get a list of cities,
second, get the mode for each city.
"""
training_race_noNaN = df_training[df_training['race'].notnull()].copy()

training_cities = training_race_noNaN['city'].unique()

training_race_noNaN

In [None]:
training_race_noNaN.groupby('city')['race'].agg(pd.Series.mode).to_frame()

In [None]:
training_race_noNaN.groupby('city')['race'].agg(pd.Series.mode).to_frame()