In [439]:
import itertools

import numpy
import pandas as pd
import random
import sklearn
from sklearn.model_selection import train_test_split
import datetime
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sb

In [440]:
"""
Our first goal is going to be data cleaning and munging. We want to take care of all null values
first.
"""

df_training = pd.read_csv('police_killings_train.csv')
df_test = pd.read_csv('police_killings_test.csv')
df_race_city = pd.read_csv('share_race_by_city.csv')
df_income = pd.read_csv('income.csv', encoding="ANSI")
df_poverty = pd.read_csv('poverty.csv', encoding="ANSI")
df_education = pd.read_csv('education.csv', encoding="ANSI")

df_test.info()
df_training

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 507 entries, 0 to 506
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       507 non-null    int64  
 1   name                     507 non-null    object 
 2   date                     507 non-null    object 
 3   manner_of_death          507 non-null    object 
 4   armed                    504 non-null    object 
 5   age                      467 non-null    float64
 6   gender                   507 non-null    object 
 7   race                     403 non-null    object 
 8   city                     507 non-null    object 
 9   state                    507 non-null    object 
 10  signs_of_mental_illness  507 non-null    bool   
 11  threat_level             507 non-null    object 
 12  flee                     469 non-null    object 
 13  body_camera              507 non-null    bool   
dtypes: bool(2), float64(1), in

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera
0,3,Tim Elliot,02/01/15,shot,gun,53.0,M,A,Shelton,WA,True,attack,Not fleeing,False
1,4,Lewis Lee Lembke,02/01/15,shot,gun,47.0,M,W,Aloha,OR,False,attack,Not fleeing,False
2,5,John Paul Quintero,03/01/15,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,False,other,Not fleeing,False
3,8,Matthew Hoffman,04/01/15,shot,toy weapon,32.0,M,W,San Francisco,CA,True,attack,Not fleeing,False
4,9,Michael Rodriguez,04/01/15,shot,nail gun,39.0,M,H,Evans,CO,False,attack,Not fleeing,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023,2256,Jeremy Lopez-Robledo,24/01/17,shot,knife,29.0,M,H,Las Cruces,NM,True,attack,Foot,True
2024,2257,Jonathan David Sper,24/01/17,shot,unarmed,30.0,M,W,Algoma Township,MI,True,attack,Not fleeing,False
2025,2258,Jose Efrain Rodriguez,24/01/17,shot and Tasered,gun,18.0,M,H,Lancaster City,PA,False,attack,Not fleeing,False
2026,2259,Ramon Milanez,24/01/17,shot,gun,32.0,M,H,Kuna,ID,False,attack,Car,False


In [441]:
"""
We will combine the training and tesing dataset to make cleaning easier and more accurate
"""

df_total = df_training.append(df_test, ignore_index=True)
df_total

  df_total = df_training.append(df_test, ignore_index=True)


Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera
0,3,Tim Elliot,02/01/15,shot,gun,53.0,M,A,Shelton,WA,True,attack,Not fleeing,False
1,4,Lewis Lee Lembke,02/01/15,shot,gun,47.0,M,W,Aloha,OR,False,attack,Not fleeing,False
2,5,John Paul Quintero,03/01/15,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,False,other,Not fleeing,False
3,8,Matthew Hoffman,04/01/15,shot,toy weapon,32.0,M,W,San Francisco,CA,True,attack,Not fleeing,False
4,9,Michael Rodriguez,04/01/15,shot,nail gun,39.0,M,H,Evans,CO,False,attack,Not fleeing,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2530,2822,Rodney E. Jacobs,28/07/17,shot,gun,31.0,M,,Kansas City,MO,False,attack,Not fleeing,False
2531,2813,TK TK,28/07/17,shot,vehicle,,M,,Albuquerque,NM,False,attack,Car,False
2532,2818,Dennis W. Robinson,29/07/17,shot,gun,48.0,M,,Melba,ID,False,attack,Car,False
2533,2817,Isaiah Tucker,31/07/17,shot,vehicle,28.0,M,B,Oshkosh,WI,False,attack,Car,True


In [442]:
print(df_income.info())
df_race_city.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29322 entries, 0 to 29321
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Geographic Area  29322 non-null  object
 1   City             29322 non-null  object
 2   Median Income    29271 non-null  object
dtypes: object(3)
memory usage: 687.4+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29268 entries, 0 to 29267
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Geographic area        29268 non-null  object
 1   City                   29268 non-null  object
 2   share_white            29268 non-null  object
 3   share_black            29268 non-null  object
 4   share_native_american  29268 non-null  object
 5   share_asian            29268 non-null  object
 6   share_hispanic         29268 non-null  object
dtypes: object(7)
memory usage: 1.6+ MB


In [443]:
"""
There are many missing fields across all the supporting/optional datasets, in order to resolve this we need
to merge these datasets and then make the necessary imputation.
1. All data sets have a state and city type columns, merge those columns such that they can all have a column
to for indexing.
2. Get the list of cities within the mandatory dataset, we need to confirm that the supporting dataset
has these cities and if not then we need to impute.
"""
#make a list of unique cities, and include the state so we don't deal with same city names of different states
df_total['city'] = df_total[['state', 'city']].apply("-".join,axis=1)
df_race_city['city'] = df_race_city[['Geographic area', 'City']].apply("-".join,axis=1)
df_income['city'] = df_income[['Geographic Area', 'City']].apply("-".join,axis=1)
df_poverty['city'] = df_poverty[['Geographic Area', 'City']].apply("-".join,axis=1)
df_education['city'] = df_education[['Geographic Area', 'City']].apply("-".join,axis=1)
df_income = df_income.drop(['Geographic Area', 'City'], axis=1)
df_poverty = df_poverty.drop(columns=['Geographic Area', 'City'])
df_education = df_education.drop(columns=['Geographic Area', 'City'])
cities = df_total['city'].tolist()
cities =list(set(cities))
print(len(cities))
cities

1534


['MN-Austin',
 'OH-Findlay',
 'TX-Amarillo',
 'CA-Atascadero',
 'TX-Carrollton',
 'TN-Humboldt',
 'IL-Arcola',
 'TN-Knoxville',
 'CA-North Hills',
 'FL-Melbourne',
 'AZ-Bisbee',
 'CA-South Lake Tahoe',
 'WV-Elkins',
 'CA-Temecula',
 'GA-Lizella',
 'MT-Billings',
 'TX-Mabank',
 'WV-Meadow Bridge',
 'GA-Riverdale',
 'FL-Barberville',
 'CA-Delhi',
 'TX-Abilene',
 'TX-Pharr',
 'WV-Weirton',
 'CT-Mashantucket',
 'IN-French Lick',
 'CA-Vallejo',
 'IL-Forest Park',
 'CO-Brighton',
 'CA-Madera',
 'NJ-Paterson',
 'NC-Rutherfordton',
 'FL-Miami Gardens',
 'LA-Livingston Parish',
 'MA-Brockton',
 'CA-Selma',
 'IL-Chicago',
 'MD-Bel Air',
 'CO-New Castle',
 'KY-Berea',
 'AR-Benton',
 'OK-Coal County',
 'AZ-Buckeye',
 'UT-Milcreek',
 'NY-New Paltz',
 'PA-Somerset',
 'FL-Holiday',
 'KS-Medicine Lodge',
 'IN-Lafayette',
 'FL-Floral City',
 'AL-Rainbow City',
 'GA-Ludowici',
 'VA-Winchester',
 'NM-Los Lunas',
 'NC-Stoneville',
 'TX-Bellville',
 'WA-Olympia',
 'GA-Hampton',
 'FL-Jacksonville',
 'OR-Sea

In [444]:
"""
First lets merge each of the supporting datasets into df_temp.
We noticed that the missing values aren't consistent in the supporting datasets. So we need to change all types of
missing values to NAN such that we can impute using fillna().
"""
print(df_race_city.info())
df_temp = pd.merge(df_race_city, df_income, on='city', how='outer', sort=True)
df_temp = pd.merge(df_temp, df_poverty, on='city', how='outer', sort=True)
df_temp = pd.merge(df_temp, df_education, on='city', how='outer', sort=True)
columns_supp = df_temp.columns.values.tolist()
for i in columns_supp:
    df_temp[i] = df_temp[i].replace('-', '(X)')
    df_temp[i] = df_temp[i].replace('(X)', numpy.NAN)
print(df_temp.info())
df_temp

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29268 entries, 0 to 29267
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Geographic area        29268 non-null  object
 1   City                   29268 non-null  object
 2   share_white            29268 non-null  object
 3   share_black            29268 non-null  object
 4   share_native_american  29268 non-null  object
 5   share_asian            29268 non-null  object
 6   share_hispanic         29268 non-null  object
 7   city                   29268 non-null  object
dtypes: object(8)
memory usage: 1.8+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 29477 entries, 0 to 29476
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Geographic area        29268 non-null  object
 1   City                   29268 non-null  object
 2   share_white        

Unnamed: 0,Geographic area,City,share_white,share_black,share_native_american,share_asian,share_hispanic,city,Median Income,poverty_rate,percent_completed_hs
0,AK,Adak city,19.6,4,5.5,52.5,8.9,AK-Adak city,78500,39.3,93.4
1,AK,Akhiok city,8.5,1.4,50.7,1.4,11.3,AK-Akhiok city,26250,40.5,62.5
2,AK,Akiachak CDP,3.5,0.2,95.1,0.2,0.2,AK-Akiachak CDP,38750,26.1,76.7
3,AK,Akiak city,5.2,0,92.8,0,0.3,AK-Akiak city,42000,31.3,81.3
4,AK,Akutan city,23.3,17.9,5.5,43.3,20.8,AK-Akutan city,24750,16.1,73
...,...,...,...,...,...,...,...,...,...,...,...
29472,WY,Woods Landing-Jelm CDP,95.9,0,0,2.1,0,WY-Woods Landing-Jelm CDP,,18.6,100
29473,WY,Worland city,89.9,0.3,1.3,0.6,16.6,WY-Worland city,41523,15.3,85.6
29474,WY,Wright town,94.5,0.1,1.4,0.2,6.2,WY-Wright town,77114,5.9,89.2
29475,WY,Y-O Ranch CDP,92.8,1.5,2.6,0,11.8,WY-Y-O Ranch CDP,,0,100


In [445]:
"""
Let's try cleaning the the race of city dataset now viewing the missing datapoints which are denoted at (X).
"""
# df_temp['city'] = df_temp[['Geographic area', 'City']].apply("-".join,axis=1)
# list_race_df = df_temp.values.tolist()
# j = 0
# list_bad_cities = []
# for i in list_race_df:
#     #we can see it is only null when all races are missing
#     if(i.count('(X)') > 0):
#         j+=1
#         list_bad_cities.append(i[1])
#         print(i.count('(X)') )
# print("total cities with bad points = ", j)


"\nLet's try cleaning the the race of city dataset now viewing the missing datapoints which are denoted at (X).\n"

In [446]:
"""With only 20/29260 holding a row of null values, we shall drop these cities"""
# tempDf_race = df_temp[df_race_city.share_white != '(X)'].copy()
# print(tempDf_race.info())
# df_race_city.info()

'With only 20/29260 holding a row of null values, we shall drop these cities'

In [447]:
"""
We want to make sure all the cities that are in our mandatory dataset is accounted for in the supporting.
We shall add those cities if they aren't and impute them as nulls
"""
#we can check that all the cities in the total data set is still in the race_city dataset
cities2 = df_temp['city'].tolist()
cities2 =list(set(cities2))
j = 0
missing = 0
cities.sort()
cities2.sort()
rep_val = 0
for i in cities:
    # print(i in cities2)
    # print(i)
    # print(cities2.index(i[0::1]))
    if(i in cities2):
        j+=1
    elif(any(i in string for string in cities2)):
        if len(df_temp[df_temp['city'].str.contains(i) ]) == 1 and sum(i in string for string in cities) == 1:
            print(i)
            df_temp['city'].loc[df_temp['city'].str.contains(i)] = i
            rep_val+=1

        else:
            newDF = {}
            #print(i)
            newDF['Geographic area'] = i[0:2]
            newDF['City'] = i[3:]
            newDF['city'] = i
            ind = 0
            df_temp = df_temp.append(newDF, ignore_index=True)
            cities2 = df_temp['city'].tolist()
            cities2 =list(set(cities2))
            missing+=1

    else:
        newDF = {}
        #print(i)
        newDF['Geographic area'] = i[0:2]
        newDF['City'] = i[3:]
        newDF['city'] = i
        ind = 0
        df_temp = df_temp.append(newDF, ignore_index=True)
        cities2 = df_temp['city'].tolist()
        cities2 =list(set(cities2))
        missing+=1


print(j)
print("Replaced cities ", rep_val)
print(" Was Missing ", missing)
df_temp.drop_duplicates(subset='city', inplace=True)
df_temp.reset_index(inplace=True)
df_temp

CA-North Hills
GA-Lizella
FL-Barberville


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


LA-Livingston Parish
OK-Coal County
UT-Milcreek
WA-Beaver


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


TX-Maryneal
NJ-Brick Township
NC-Union Grove
PA-Huntingdon County


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


OR-Colton
OK-Okmulgee County
ID-Hammett
VA-Aldie


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


MI-Hessel
NY-Berne
OH-Geneva Township
CA-South Los Angeles


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


TX-Grand Prarie
IL-Lawndale
MO-St Louis
OR-Josephine County


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


FL-Opa-Locka
OH-Madison Township
CT-Bolton


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


CA-Point Loma
ID-Jefferson County
CA-Venice
GA-Bonaire


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


CA-Kings County
GA-Paulding County
CA-Van Nuys
VA-Powhatan County


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


AR-Desha
SC-Chesterfield County
CA-Pinion Hills
FL-Deland


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


TX-Myra
PA-Upper Darby
WV-Augusta


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


OR-Wilderville
LA-Ouachita Parish
IL-Shawnee National Forest
KY-Pine Top


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


CA-Hollywood Hills
TX-Henderson County
MA-Roslindale
PA-Forks Township


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


CA-Weitchpec
MO-North St. Louis
MO-Big Bear
AZ-Red Valley


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


NM-Laguna Pueblo
ME-Belgrade
MI-Cato Township


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


VA-Norman
OH-Mt. Auburn
KY-Simpsonsville
CA-Boyle Heights


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


KY-Daviess County
NC-Iredell County
WA-Hunters


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


CA-Happy Valley
CO-Caon City
PA-Straban Township
OR-Wolf Creek
VA-York County


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


PA-Union Township
CT-Oakdale
OH-Jasper
TN-Mt. Pleasant

  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)



VA-Grayson County
MI-Algoma Township


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


WV-Hensley
FL-Cantonment
PA-Canaan Township
KY-East Berstadt
AL-Brooksville


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


TN-West Knox
CA-Pacific Beach
CA-Clearlake Park
TX-Oak Cliff


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


OK-Bunch
AZ-La Paz County
MI-Roxand Township
HI-Papaaloa


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


NJ-Little Egg Harbor Township
OK-Rufe
KY-Chavies
TN-Decatur County


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


NJ-Cranbury Township
NY-Coney Island
ND-Wood Lake
AL-Macon County
FL-Little Havana


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


CA-Wilmington
ME-Smyrna
MO-Franklin County
CA-Chatsworth


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


MI-Brighton Township
WV-Gerrardstown
TN-Gibson County
NY-Clarendon


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


CT-Fairfield
KY-Fisty
MS-Barton
LA-Pride


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


PA-York County
FL-Orange County
CA-Hollywood
PA-Lower Mount Bethel
TN-Antioch


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


ME-Arundel
CO-Bailey
TX-Ellis County
AL-Lawrence County
MI-Holland Township


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


GA-Cobb County
NJ-Maurice River
CA-Watsonsville
CO-Montrose County
CA-City Terrace
KY-Gunlock
PA-Lancaster City
MI-Ishpeming Township
FL-Lake Asbury
PA-Penn Township


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


FL-Carollwood
WI-Geneva
NY-Brooklyn


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


TX-Bexar County
LA-Ragley
TN-Counce


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


NJ-Ideal Beach
OH-Sylvania Township
TX-Wood County
MI-Bloomfield Township


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


NH-Bath
MD-Mt Airy
AL-Washington County
CA-Studio City


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


OR-Clackamas County
AL-Eastaboga
CO-Park County
NC-Anson County


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


TX-Fuqua
OH-Howland Township
GA-Eden
CA-Panorama City
NH-Merrimack


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


WA-Thurston County
OK-Pottawatomie County
TX-North Laredo
OK-Tom


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


AK-Kenai Peninsula
LA-East Baton Rouge
CA-Olympic Valley
GA-Rockville


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


AK-Spenard
SC-Edmund
FL-Osteen
CA-San Ysidro
KY-Trosper


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


WI-Corning
NY-Glenville
NC-Ashe County


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


HI-Honolulu
HI-Maui
MS-Ruth
MI-Berrien County


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


NJ-Manchester
UT-Aragonite
FL-Jacksonsville
NJ-Little Egg Harbor
NC-Gaston County


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


AL-Baldwin County
NJ-Lyndhurst
VA-Arvonia
CA-University City


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


CA-East Hollywood
UT-Logan Canyon
OH-Colebrook Township
TX-Leon County


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


WA-Muckleshoot Indian Reservation
NY-Queens
OK-Norwood
AL-Killeen


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


CA-San Diego County
MO-McDowell


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


AR-Sims
VA-Woodford
PA-Germantown


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


LA-Lakes Charles
MA-Tewksbury


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


NY-Edinburg
CA-Sylmar
MO-Lampe
TN-Reagan


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


OK-Logan County
NM-Pueblo of Laguna
NH-Newton
ME-Vassalboro

  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)



GA-Whitfield County
AL-Jackson County
OH-Perry Township
MI-Columbia Township


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


IL-Joilet
OH-Southington
NY-Staten Island


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


MI-Eaton Rapids Township
MD-Abingdon
CA-McKinneyville
TN-Whitehaven
LA-Evangeline Parish


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


NJ-Winslow
VA-Scott County
GA-Barrow County
WV-Braxton County


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


FL-Weeki Wachi
PA-Mount Washington
FL-Putnam Hall
CA-Barona Indian Reservation


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


FL-Hog Valley
PA-Fort Littleton
CA-Siskiyou County
OH-St. Clair Township
WV-Petroleum


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


FL-Lakes Wales
ME-Orrington
TN-Raleigh
FL-Ft. Lauderdale
TX-Campbellton
NV-Jean
1300
 Was Missing  234


  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)
  df_temp = df_temp.append(newDF, ignore_index=True)


Unnamed: 0,Geographic area,City,share_white,share_black,share_native_american,share_asian,share_hispanic,city,Median Income,poverty_rate,percent_completed_hs
0,AK,Adak city,19.6,4,5.5,52.5,8.9,AK-Adak city,78500,39.3,93.4
1,AK,Akhiok city,8.5,1.4,50.7,1.4,11.3,AK-Akhiok city,26250,40.5,62.5
2,AK,Akiachak CDP,3.5,0.2,95.1,0.2,0.2,AK-Akiachak CDP,38750,26.1,76.7
3,AK,Akiak city,5.2,0,92.8,0,0.3,AK-Akiak city,42000,31.3,81.3
4,AK,Akutan city,23.3,17.9,5.5,43.3,20.8,AK-Akutan city,24750,16.1,73
...,...,...,...,...,...,...,...,...,...,...,...
29706,ME,Orrington,,,,,,ME-Orrington,,,
29707,TN,Raleigh,,,,,,TN-Raleigh,,,
29708,FL,Ft. Lauderdale,,,,,,FL-Ft. Lauderdale,,,
29709,TX,Campbellton,,,,,,TX-Campbellton,,,


In [None]:
cities2 = df_temp['city'].tolist()
cities2 =list(set(cities2))
j = 0
for i in cities:
    # print(i in cities2)
    # print(i)
    # print(cities2.index(i[0::1]))
    if(i in  cities2):
        j+=1
#confirming that all the cities were placed in
print(j)

In [448]:
df_temp.sort_values(by=['city'], inplace=True)
df_temp

Unnamed: 0,Geographic area,City,share_white,share_black,share_native_american,share_asian,share_hispanic,city,Median Income,poverty_rate,percent_completed_hs
0,AK,Adak city,19.6,4,5.5,52.5,8.9,AK-Adak city,78500,39.3,93.4
1,AK,Akhiok city,8.5,1.4,50.7,1.4,11.3,AK-Akhiok city,26250,40.5,62.5
2,AK,Akiachak CDP,3.5,0.2,95.1,0.2,0.2,AK-Akiachak CDP,38750,26.1,76.7
3,AK,Akiak city,5.2,0,92.8,0,0.3,AK-Akiak city,42000,31.3,81.3
4,AK,Akutan city,23.3,17.9,5.5,43.3,20.8,AK-Akutan city,24750,16.1,73
...,...,...,...,...,...,...,...,...,...,...,...
29472,WY,Woods Landing-Jelm CDP,95.9,0,0,2.1,0,WY-Woods Landing-Jelm CDP,,18.6,100
29473,WY,Worland city,89.9,0.3,1.3,0.6,16.6,WY-Worland city,41523,15.3,85.6
29474,WY,Wright town,94.5,0.1,1.4,0.2,6.2,WY-Wright town,77114,5.9,89.2
29475,WY,Y-O Ranch CDP,92.8,1.5,2.6,0,11.8,WY-Y-O Ranch CDP,,0,100


In [449]:
"""
We want to impute the city and our state/Geographic area such that we have a full comprehensive list of both
separately after the merge.
Then we want to get a list of race names within the df_race_city.
"""

df_temp['Geographic area'] = [i[0:2] for i in df_temp['city']]
df_temp['City'] = [i[3:] for i in df_temp['city']]
print(df_temp)
list_states = df_temp['Geographic area'].values.tolist()
list_states = list(set(list_states))
#we expect to see 51 including DC as a state
print(len(list_states))
columns_race = df_temp.columns.values.tolist()
s = 'share'
columns_race = [string for string in columns_race if(s in string)]
columns_race

      Geographic area                    City share_white share_black  \
0                  AK               Adak city        19.6           4   
1                  AK             Akhiok city         8.5         1.4   
2                  AK            Akiachak CDP         3.5         0.2   
3                  AK              Akiak city         5.2           0   
4                  AK             Akutan city        23.3        17.9   
...               ...                     ...         ...         ...   
29472              WY  Woods Landing-Jelm CDP        95.9           0   
29473              WY            Worland city        89.9         0.3   
29474              WY             Wright town        94.5         0.1   
29475              WY           Y-O Ranch CDP        92.8         1.5   
29476              WY              Yoder town        97.4           0   

      share_native_american share_asian share_hispanic  \
0                       5.5        52.5            8.9   
1      

['share_white',
 'share_black',
 'share_native_american',
 'share_asian',
 'share_hispanic']

In [450]:
"""
Now we can impute the porportion of missing cities based on the state's porportions
"""
for state in list_states:
    j=0
    list_total_races = []
    state_df = df_temp.loc[df_temp['Geographic area'] == state]
    for i in columns_race:
        state_df[i] = state_df[i].astype(float)
        list_total_races.append(state_df[i].sum())
        print(list_total_races[j])
        j+=1
    sum_all_races = sum(list_total_races)
    porportional_share = [i/sum_all_races for i in list_total_races]
    #print(porportional_share, " sum= ", sum(porportional_share))
    j=0
    for i in columns_race:
        state_df[i] = state_df[i].fillna(porportional_share[j])
        j+=1
    df_temp.loc[df_temp['Geographic area'] == state] = state_df
print(df_temp.info())
df_temp



108949.3
4081.1
2612.2
8441.4
44949.2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df[i] = state_df[i].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df[i] = state_df[i].fillna(porportional_share[j])


5038.799999999999
161.7
58.60000000000001
3873.9
1564.1
41909.2
13480.3
381.09999999999997
277.3
1722.5
51732.09999999999
895.9000000000001
2355.2
1930.8000000000002
6662.1
38109.200000000004
1169.0
80.30000000000001
136.8
362.59999999999997
5794.700000000001
1169.8999999999999
39.800000000000004
152.0
563.7000000000002
44084.799999999996
3801.4
140.10000000000002
3239.9
6062.0
9185.4
68.7
27.6
128.89999999999998
167.5
62377.9
644.5
1256.9
286.1
3403.1
164498.7
5768.7
285.0
1744.6
4820.2
71858.5
724.5
2025.5
611.3
2568.8999999999996
33078.9
113.1
4690.400000000001
124.0
794.4000000000001
21790.3
680.9000000000001
64.7
693.4000000000001
1201.9
37356.3
10082.3
176.60000000000002
1543.7
3072.5
38158.3
3132.8999999999996
135.8
273.4
1225.3000000000002
37009.8
143.29999999999998
2137.3
117.7
731.0
39182.3
19176.6
188.0
933.7
4015.8999999999996
48419.1
2321.1000000000004
110.30000000000001
373.3
1168.4
32947.2
199.10000000000002
973.3000000000001
519.8
3676.0999999999995
28980.4
295.5
6795.8

Unnamed: 0,Geographic area,City,share_white,share_black,share_native_american,share_asian,share_hispanic,city,Median Income,poverty_rate,percent_completed_hs
0,AK,Adak city,19.6,4.0,5.5,52.5,8.9,AK-Adak city,78500,39.3,93.4
1,AK,Akhiok city,8.5,1.4,50.7,1.4,11.3,AK-Akhiok city,26250,40.5,62.5
2,AK,Akiachak CDP,3.5,0.2,95.1,0.2,0.2,AK-Akiachak CDP,38750,26.1,76.7
3,AK,Akiak city,5.2,0.0,92.8,0.0,0.3,AK-Akiak city,42000,31.3,81.3
4,AK,Akutan city,23.3,17.9,5.5,43.3,20.8,AK-Akutan city,24750,16.1,73
...,...,...,...,...,...,...,...,...,...,...,...
29472,WY,Woods Landing-Jelm CDP,95.9,0.0,0.0,2.1,0.0,WY-Woods Landing-Jelm CDP,,18.6,100
29473,WY,Worland city,89.9,0.3,1.3,0.6,16.6,WY-Worland city,41523,15.3,85.6
29474,WY,Wright town,94.5,0.1,1.4,0.2,6.2,WY-Wright town,77114,5.9,89.2
29475,WY,Y-O Ranch CDP,92.8,1.5,2.6,0.0,11.8,WY-Y-O Ranch CDP,,0,100


In [451]:
df_temp['Median Income'] = pd.to_numeric(df_temp['Median Income'], errors='coerce')
df_temp['poverty_rate'] = pd.to_numeric(df_temp['poverty_rate'], errors='coerce')
df_temp['percent_completed_hs'] = pd.to_numeric(df_temp['percent_completed_hs'], errors='coerce')
for state in list_states:
    state_df = df_temp.loc[df_temp['Geographic area'] == state]
    state_df['Median Income'] = state_df['Median Income'].fillna(state_df['Median Income'].mean())
    state_df['poverty_rate'] = state_df['poverty_rate'].fillna(state_df['poverty_rate'].mean())
    state_df['percent_completed_hs'] =state_df['percent_completed_hs'].fillna(state_df['percent_completed_hs'].mean())
    df_temp.loc[df_temp['Geographic area'] == state] = state_df
print(df_temp.info())

df_temp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df['Median Income'] = state_df['Median Income'].fillna(state_df['Median Income'].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df['poverty_rate'] = state_df['poverty_rate'].fillna(state_df['poverty_rate'].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df['percent_

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29711 entries, 0 to 29476
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Geographic area        29711 non-null  object 
 1   City                   29711 non-null  object 
 2   share_white            29711 non-null  object 
 3   share_black            29711 non-null  object 
 4   share_native_american  29711 non-null  object 
 5   share_asian            29711 non-null  object 
 6   share_hispanic         29711 non-null  object 
 7   city                   29711 non-null  object 
 8   Median Income          29711 non-null  float64
 9   poverty_rate           29711 non-null  float64
 10  percent_completed_hs   29711 non-null  float64
dtypes: float64(3), object(8)
memory usage: 2.7+ MB
None


Unnamed: 0,Geographic area,City,share_white,share_black,share_native_american,share_asian,share_hispanic,city,Median Income,poverty_rate,percent_completed_hs
0,AK,Adak city,19.6,4.0,5.5,52.5,8.9,AK-Adak city,78500.000000,39.3,93.4
1,AK,Akhiok city,8.5,1.4,50.7,1.4,11.3,AK-Akhiok city,26250.000000,40.5,62.5
2,AK,Akiachak CDP,3.5,0.2,95.1,0.2,0.2,AK-Akiachak CDP,38750.000000,26.1,76.7
3,AK,Akiak city,5.2,0.0,92.8,0.0,0.3,AK-Akiak city,42000.000000,31.3,81.3
4,AK,Akutan city,23.3,17.9,5.5,43.3,20.8,AK-Akutan city,24750.000000,16.1,73.0
...,...,...,...,...,...,...,...,...,...,...,...
29472,WY,Woods Landing-Jelm CDP,95.9,0.0,0.0,2.1,0.0,WY-Woods Landing-Jelm CDP,60874.993464,18.6,100.0
29473,WY,Worland city,89.9,0.3,1.3,0.6,16.6,WY-Worland city,41523.000000,15.3,85.6
29474,WY,Wright town,94.5,0.1,1.4,0.2,6.2,WY-Wright town,77114.000000,5.9,89.2
29475,WY,Y-O Ranch CDP,92.8,1.5,2.6,0.0,11.8,WY-Y-O Ranch CDP,60874.993464,0.0,100.0


In [452]:
#make a list of df fT-oubaor each city
list_df_cities = []
for i in cities:
    list_df_cities.append(df_total.loc[df_total['city'] == i])

In [453]:
#make a list of race
races = df_total['race'].tolist()
races =list(set(races))
races = races[1::]
races

['N', 'H', 'A', 'O', 'W', 'B']

In [454]:
mode_total = df_total['race'].mode(dropna=True).tolist()
print(mode_total)
for i in list_df_cities:
    mode = i['race'].mode(dropna=True)
    if(len(mode) == 0):
        i[['race']] = i[['race']].fillna(mode_total[0])
        if(i['race'].isnull().sum() != 0):
            print("null val")
    elif(isinstance(mode, type(i['race']))):
        mode.dropna()
        mode = mode.tolist()
        #if the mode has more than one value then take a random choice between the values
        i[['race']] = i[['race']].fillna(random.choice(tuple(mode)))



    df_total.loc[df_total['city'] ==  i.at[i.first_valid_index(),'city']] = i
df_total.info()

['W']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  i[['race']] = i[['race']].fillna(random.choice(tuple(mode)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  i[['race']] = i[['race']].fillna(random.choice(tuple(mode)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  i[['race']] = i[['race']].fillna(random.choice(tuple(mode)))
A value is trying to b

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2535 entries, 0 to 2534
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       2535 non-null   int64  
 1   name                     2535 non-null   object 
 2   date                     2535 non-null   object 
 3   manner_of_death          2535 non-null   object 
 4   armed                    2526 non-null   object 
 5   age                      2458 non-null   float64
 6   gender                   2535 non-null   object 
 7   race                     2535 non-null   object 
 8   city                     2535 non-null   object 
 9   state                    2535 non-null   object 
 10  signs_of_mental_illness  2535 non-null   bool   
 11  threat_level             2535 non-null   object 
 12  flee                     2470 non-null   object 
 13  body_camera              2535 non-null   bool   
dtypes: bool(2), float64(1), 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  i[['race']] = i[['race']].fillna(random.choice(tuple(mode)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  i[['race']] = i[['race']].fillna(random.choice(tuple(mode)))


In [None]:
 # removed all data with NaN in race Column...df_training[df_training['race'].isnull()]
df_total

In [None]:
"""
We noticed that some of the 'age' data is missing.
So we imputed the average age of the people in each city and imputed the average age for the missing ages based on their city.
1. We calculate the total mean age of our total data sample.
2. We group by city, then calculate the mean Age at each city.
3. For each NaN age, we impute the mean age of that city. If that city was not calcuated (meaning this data is the only one from that city), we will use the total mean age.
"""
age_fix = df_total[['id','age','city']].copy()
age_fix = age_fix[age_fix['age'].isnull()].copy()
# 1. Calculate Total Mean Age
total_mean_age = df_total['age'].mean()

# 2. Calculate Mean of Each City
city_mean_ages = df_total.groupby('city')['age'].mean()

#3. Impute Age by the city. If City data does not exist, use Total Mean Age.
for i in age_fix.index:
    city = age_fix['city'][i]
    if city in city_mean_ages and not np.isnan(city_mean_ages[city]):
        age_fix['age'][i] = city_mean_ages[city]
    else:
        age_fix['age'][i] = total_mean_age


# update total list
df_total.update(age_fix)

df_total.info()

In [None]:
"""
We noticed there were missing values for 'armed' and 'flee'. For these, we imputed the most frequent data that appeared.
These are listed in the variables flee_mode and armed_mode
"""
flee_mode = df_total['flee'].agg(pd.Series.mode).values.tolist()[0]
armed_mode = df_total['armed'].agg(pd.Series.mode).values.tolist()[0]
df_total['flee'].fillna(flee_mode, inplace=True)
df_total['armed'].fillna(armed_mode, inplace=True)
df_total.info()

In [None]:
"""
****GIVE EXPLANATION OF OUR ENTIRE PROCESS
########################### END OF DATA MUNGING##########################
"""


In [None]:
"""
Data Analysis
Exploratory Data Analysis (EDA 10 Points)
"""
df_total

In [None]:
"""
Q: Which state has the most fatal police shootings? Which city is the most dangerous?
Create a collapsed dataframe of state. Create a collapsed data frame of Cities.
"""
# Make a copy of Total Data For City!
df_aux = df_total.copy()
df_aux.sort_values(by=['city'])

# The following dropped Columns are not used when data is collapsed as City
df_aux.drop(columns=['id', 'name', 'date', 'age', 'state'], inplace=True)
df_aux.set_index('city', inplace=True)
df_aux = df_aux.astype(str)
cols = df_aux.columns.tolist() # Move 'arms' to end of list
cols.append(cols.pop(cols.index('armed')))
df_aux = df_aux[cols]

# Get a list of 'df_aux' column names
aux_cols = df_aux.columns

# Get a list of the index cities (Sorted)
list_of_cities = df_aux.index.unique().tolist()
list_of_cities.sort()

# For each column, change the string such that  " 'COLUMN_NAME': + str + (s) " is the result for each string.
# This is important so that when we build/update our resulting Dataframe, it will have unique values per catagory.
for i in list_of_cities:
    for j in aux_cols:
        df_aux.at[i, j] =  j.upper() + ': ' + df_aux.at[i, j]

# Make the df_city_stats Columns
freq_list_cols = ['Total Cases']
for i in aux_cols:
    freq_list_cols = freq_list_cols + df_aux[i].unique().tolist()


# Create Dataframe to build up
df_city_stats = pd.DataFrame(index=list_of_cities, columns=freq_list_cols)
df_city_stats = df_city_stats.fillna(0)
for i in list_of_cities: # indexs
    temp = len(df_aux[df_aux.index == i])
    df_city_stats.loc[i]['Total Cases'] = temp
    for j in aux_cols:
        temp = df_aux[df_aux.index == i][j].value_counts()
        df_city_stats.loc[i].update(temp)

df_city_stats

In [None]:
# Make a copy of Total Data For State!
df_aux = df_total.copy()
df_aux.sort_values(by=['state'])

# The following dropped Columns are not used when data is collapsed as City
df_aux.drop(columns=['id', 'name', 'date', 'age', 'city'], inplace=True)
df_aux.set_index('state', inplace=True)
df_aux = df_aux.astype(str)
cols = df_aux.columns.tolist() # Move 'arms' to end of list
cols.append(cols.pop(cols.index('armed')))
df_aux = df_aux[cols]

# Get a list of 'df_aux' column names
aux_cols = df_aux.columns

# Get a list of the index state (Sorted)
list_of_states = df_aux.index.unique().tolist()
list_of_states.sort()

# For each column, change the string such that  " 'COLUMN_NAME': + str + (s) " is the result for each string.
# This is important so that when we build/update our resulting Dataframe, it will have unique values per catagory.
for i in list_of_states:
    for j in aux_cols:
        df_aux.at[i, j] =  j.upper() + ': ' + df_aux.at[i, j]

# Make the df_city_stats Columns
freq_list_cols = ['Total Cases']
for i in aux_cols:
    freq_list_cols = freq_list_cols + df_aux[i].unique().tolist()


# Create Dataframe to build up
df_state_stats = pd.DataFrame(index=list_of_states, columns=freq_list_cols)
df_state_stats = df_state_stats.fillna(0)
for i in list_of_states: # indexs
    temp = len(df_aux[df_aux.index == i])
    df_state_stats.loc[i]['Total Cases'] = temp
    for j in aux_cols:
        temp = df_aux[df_aux.index == i][j].value_counts()
        df_state_stats.loc[i].update(temp)

df_state_stats

In [None]:
"""
Now that we have the most accurate imputation using all of the support data. We want to hold a dataframe that contains cities that are also in the main data frame.
"""
j=0
df_support = pd.DataFrame(columns=columns_supp)
for i in cities:
    newDf= df_temp[df_temp['city'] == i].copy()
    if(len(newDf) > 1):
        print(i)
        print(newDf.info())
        print(newDf)
    df_support = df_support.append(newDf, ignore_index=True)
    df_support.drop_duplicates(inplace=True)
    j+=1
print(j)
print(df_support.info())
print(len(cities))
df_support

In [473]:
"""
Q: Which state has the most fatal police shootings? Which city is the most dangerous?
Create a collapsed dataframe of state. Create a collapsed data frame of Cities.
"""
# Make a copy of Total Data For City!
df_aux = df_total.copy()
df_aux.sort_values(by=['city'])

# The following dropped Columns are not used when data is collapsed as City
df_aux.drop(columns=['id', 'name', 'date', 'age', 'state'], inplace=True)
df_aux.set_index('city', inplace=True)
df_aux = df_aux.astype(str)
cols = df_aux.columns.tolist() # Move 'arms' to end of list
cols.append(cols.pop(cols.index('armed')))
df_aux = df_aux[cols]

# Get a list of 'df_aux' column names
aux_cols = df_aux.columns

# Get a list of the index cities (Sorted)
list_of_cities = df_aux.index.unique().tolist()
list_of_cities.sort()

# For each column, change the string such that  " 'COLUMN_NAME': + str + (s) " is the result for each string.
# This is important so that when we build/update our resulting Dataframe, it will have unique values per catagory.
for i in list_of_cities:
    for j in aux_cols:
        df_aux.at[i, j] =  j.upper() + ': ' + df_aux.at[i, j]

# Make the df_city_stats Columns
freq_list_cols = ['Total Cases']
for i in aux_cols:
    freq_list_cols = freq_list_cols + df_aux[i].unique().tolist()


# Create Dataframe to build up
df_city_stats = pd.DataFrame(index=list_of_cities, columns=freq_list_cols)
df_city_stats = df_city_stats.fillna(0)
for i in list_of_cities: # indexs
    temp = len(df_aux[df_aux.index == i])
    df_city_stats.loc[i]['Total Cases'] = temp
    for j in aux_cols:
        temp = df_aux[df_aux.index == i][j].value_counts()
        df_city_stats.loc[i].update(temp)

df_city_stats

Unnamed: 0,Total Cases,MANNER_OF_DEATH: shot,MANNER_OF_DEATH: shot and Tasered,GENDER: M,GENDER: F,RACE: A,RACE: W,RACE: H,RACE: B,RACE: O,...,ARMED: metal rake,ARMED: crowbar,ARMED: oar,ARMED: machete and gun,ARMED: tire iron,ARMED: air conditioner,ARMED: pole and knife,ARMED: baseball bat and bottle,ARMED: fireworks,ARMED: pen
AK-Anchorage,3,3,0,3,0,0,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AK-Barrow,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AK-Big Lake,1,1,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
AK-Fairbanks,5,4,1,5,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AK-Houston,1,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WY-Cheyenne,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
WY-Douglas,2,2,0,2,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
WY-Gillette,1,0,1,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
WY-Laramie,1,1,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [474]:
# Make a copy of Total Data For State!
df_aux = df_total.copy()
df_aux.sort_values(by=['state'])

# The following dropped Columns are not used when data is collapsed as City
df_aux.drop(columns=['id', 'name', 'date', 'age', 'city'], inplace=True)
df_aux.set_index('state', inplace=True)
df_aux = df_aux.astype(str)
cols = df_aux.columns.tolist() # Move 'arms' to end of list
cols.append(cols.pop(cols.index('armed')))
df_aux = df_aux[cols]

# Get a list of 'df_aux' column names
aux_cols = df_aux.columns

# Get a list of the index state (Sorted)
list_of_states = df_aux.index.unique().tolist()
list_of_states.sort()

# For each column, change the string such that  " 'COLUMN_NAME': + str + (s) " is the result for each string.
# This is important so that when we build/update our resulting Dataframe, it will have unique values per catagory.
for i in list_of_states:
    for j in aux_cols:
        df_aux.at[i, j] =  j.upper() + ': ' + df_aux.at[i, j]

# Make the df_city_stats Columns
freq_list_cols = ['Total Cases']
for i in aux_cols:
    freq_list_cols = freq_list_cols + df_aux[i].unique().tolist()


# Create Dataframe to build up
df_state_stats = pd.DataFrame(index=list_of_states, columns=freq_list_cols)
df_state_stats = df_state_stats.fillna(0)
for i in list_of_states: # indexs
    temp = len(df_aux[df_aux.index == i])
    df_state_stats.loc[i]['Total Cases'] = temp
    for j in aux_cols:
        temp = df_aux[df_aux.index == i][j].value_counts()
        df_state_stats.loc[i].update(temp)

df_state_stats

Unnamed: 0,Total Cases,MANNER_OF_DEATH: shot,MANNER_OF_DEATH: shot and Tasered,GENDER: M,GENDER: F,RACE: A,RACE: W,RACE: H,RACE: B,RACE: O,...,ARMED: metal rake,ARMED: crowbar,ARMED: oar,ARMED: machete and gun,ARMED: tire iron,ARMED: air conditioner,ARMED: pole and knife,ARMED: baseball bat and bottle,ARMED: fireworks,ARMED: pen
AK,15,14,1,14,1,0,7,0,1,0,...,0,0,0,0,0,0,0,0,0,0
AL,50,47,3,46,4,0,36,0,14,0,...,0,0,0,0,0,0,0,0,0,0
AR,26,25,1,26,0,0,19,0,7,0,...,0,0,0,0,0,0,0,0,0,0
AZ,118,112,6,108,10,0,67,38,5,0,...,0,0,0,0,0,0,0,0,0,0
CA,424,382,42,402,22,17,146,184,68,8,...,1,0,0,0,0,0,0,1,0,0
CO,74,71,3,72,2,2,39,23,9,0,...,0,0,0,0,0,0,0,0,0,0
CT,9,8,1,9,0,0,7,2,0,0,...,0,0,0,0,0,0,0,0,0,0
DC,11,11,0,9,2,0,1,0,10,0,...,0,0,0,0,0,0,0,0,0,0
DE,8,8,0,8,0,0,4,0,4,0,...,0,0,0,0,0,0,0,0,0,0
FL,154,143,11,148,6,1,83,18,50,2,...,0,0,1,0,0,0,0,0,0,0
