In [1]:
# Preprocessing data before using machine learning model, in this case we will use the police data set

In [2]:
# Importing libraries
import numpy as np
import pandas as pd

In [3]:
# Import police dataset

df=pd.read_csv("police_train.csv")
print (df.shape)

(5416, 14)


# Dataframe info

In [4]:
df.head()

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera
0,3,Tim Elliot,2015-01-02,shot,gun,53.0,M,A,Shelton,WA,True,attack,Not fleeing,False
1,4,Lewis Lee Lembke,2015-01-02,shot,gun,47.0,M,W,Aloha,OR,False,attack,Not fleeing,False
2,5,John Paul Quintero,2015-01-03,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,False,other,Not fleeing,False
3,8,Matthew Hoffman,2015-01-04,shot,toy weapon,32.0,M,W,San Francisco,CA,True,attack,Not fleeing,False
4,9,Michael Rodriguez,2015-01-04,shot,nail gun,39.0,M,H,Evans,CO,False,attack,Not fleeing,False


In [5]:
# Checking the columns
df.columns

Index(['id', 'name', 'date', 'manner_of_death', 'armed', 'age', 'gender',
       'race', 'city', 'state', 'signs_of_mental_illness', 'threat_level',
       'flee', 'body_camera'],
      dtype='object')

In [6]:
# Checking info of data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5416 entries, 0 to 5415
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       5416 non-null   int64  
 1   name                     5416 non-null   object 
 2   date                     5416 non-null   object 
 3   manner_of_death          5416 non-null   object 
 4   armed                    5189 non-null   object 
 5   age                      5181 non-null   float64
 6   gender                   5414 non-null   object 
 7   race                     4895 non-null   object 
 8   city                     5416 non-null   object 
 9   state                    5416 non-null   object 
 10  signs_of_mental_illness  5416 non-null   bool   
 11  threat_level             5416 non-null   object 
 12  flee                     5167 non-null   object 
 13  body_camera              5416 non-null   bool   
dtypes: bool(2), float64(1), 

In [7]:
# Looking for missing values
df.isnull().sum()

id                           0
name                         0
date                         0
manner_of_death              0
armed                      227
age                        235
gender                       2
race                       521
city                         0
state                        0
signs_of_mental_illness      0
threat_level                 0
flee                       249
body_camera                  0
dtype: int64

In [8]:
# Removing all missing data from the gender column it will not affect our data set

df.dropna(subset=['gender'], inplace=True)

In [9]:
# Dealing with the age column, setting the mean for the missing data
age_mask = df['age'].isna()
age_mask.sum()

234

In [10]:
age_median = df['age'].median()
print(age_median)

35.0


In [11]:
# Let's see if we do it right
df['age'].fillna(age_median, inplace=True)
df['age'].isna().sum()

0

In [12]:
# Since the armed column has a udeterminated value, we put all missing values in that category
print(df['armed'].value_counts())

gun                       3060
knife                      790
unarmed                    353
toy weapon                 186
undetermined               164
                          ... 
bow and arrow                1
spear                        1
pen                          1
machete and gun              1
baseball bat and knife       1
Name: armed, Length: 93, dtype: int64


In [13]:
df['armed'].fillna('unarmed', inplace=True)

In [14]:
# Discribing thr race column and making it more readable

df['race'].unique()

array(['A', 'W', 'H', 'B', 'O', nan, 'N'], dtype=object)

In [15]:
df.groupby("race").count()

Unnamed: 0_level_0,id,name,date,manner_of_death,armed,age,gender,city,state,signs_of_mental_illness,threat_level,flee,body_camera
race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
A,93,93,93,93,93,93,93,93,93,93,93,88,93
B,1298,1298,1298,1298,1298,1298,1298,1298,1298,1298,1298,1248,1298
H,902,902,902,902,902,902,902,902,902,902,902,852,902
N,78,78,78,78,78,78,78,78,78,78,78,73,78
O,48,48,48,48,48,48,48,48,48,48,48,47,48
W,2475,2475,2475,2475,2475,2475,2475,2475,2475,2475,2475,2379,2475


In [16]:
df.replace(to_replace = ['A'], value = ['Asian'], inplace = True)
df.replace(to_replace = ['B'], value = ['Black'], inplace = True)
df.replace(to_replace = ['H'], value = ['Hispanic'], inplace = True)
df.replace(to_replace = ['N'], value = ['Native American'], inplace = True)
df.replace(to_replace = ['O'], value = ['Other'], inplace = True)
df.replace(to_replace = ['W'], value = ['White'], inplace = True)

In [17]:
print(df['race'].value_counts())

White              2475
Black              1298
Hispanic            902
Asian                93
Native American      78
Other                48
Name: race, dtype: int64


In [18]:
# Putting all missing values in Other category
df['race'].fillna('Other', inplace=True)

In [19]:
# # Putting all missing values in Other category
df['flee'].unique()

array(['Not fleeing', 'Car', 'Foot', 'Other', nan], dtype=object)

In [20]:
print(df['flee'].value_counts())

Not fleeing    3410
Car             899
Foot            692
Other           164
Name: flee, dtype: int64


In [21]:
df['flee'].fillna('Other', inplace=True)

In [22]:
# We arrange the date column into 3 special columns day, month and year 
df['date']=pd.to_datetime(df['date'])
df['year']=pd.to_datetime(df['date']).dt.year
df['month']=pd.to_datetime(df['date']).dt.month

In [23]:
# Group by manner of death and flee
df.groupby(['manner_of_death','flee'])['flee'].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,unique,top,freq
manner_of_death,flee,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
shot,Car,880,1,Car,880
shot,Foot,659,1,Foot,659
shot,Not fleeing,3213,1,Not fleeing,3213
shot,Other,392,1,Other,392
shot and Tasered,Car,19,1,Car,19
shot and Tasered,Foot,33,1,Foot,33
shot and Tasered,Not fleeing,197,1,Not fleeing,197
shot and Tasered,Other,21,1,Other,21


In [24]:
# Dropping irelevant columns for our prediction

df1=df.drop(['body_camera', 'state', 'city', 'signs_of_mental_illness', 'date'], axis=1)

In [25]:
print(df['threat_level'].value_counts())

attack          3494
other           1681
undetermined     239
Name: threat_level, dtype: int64


In [26]:
# mapping manner of death column

manner_of_death_mapping = {'shot': 0, 'shot and Tasered': 1}

df['manner_of_death'] = df['manner_of_death'].map(manner_of_death_mapping)
df

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera,year,month
0,3,Tim Elliot,2015-01-02,0,gun,53.0,M,Asian,Shelton,WA,True,attack,Not fleeing,False,2015,1
1,4,Lewis Lee Lembke,2015-01-02,0,gun,47.0,M,White,Aloha,OR,False,attack,Not fleeing,False,2015,1
2,5,John Paul Quintero,2015-01-03,1,unarmed,23.0,M,Hispanic,Wichita,KS,False,other,Not fleeing,False,2015,1
3,8,Matthew Hoffman,2015-01-04,0,toy weapon,32.0,M,White,San Francisco,CA,True,attack,Not fleeing,False,2015,1
4,9,Michael Rodriguez,2015-01-04,0,nail gun,39.0,M,Hispanic,Evans,CO,False,attack,Not fleeing,False,2015,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5411,5921,William Slyter,2020-06-13,0,gun,22.0,M,White,Kansas City,MO,False,other,Other,False,2020,6
5412,5922,TK TK,2020-06-13,0,undetermined,35.0,M,Other,San Bernardino,CA,False,attack,Not fleeing,False,2020,6
5413,5924,Nicholas Hirsh,2020-06-15,0,gun,31.0,M,White,Lawrence,KS,False,attack,Car,False,2020,6
5414,5926,TK TK,2020-06-16,0,gun,24.0,M,Other,Beach Park,IL,False,attack,Not fleeing,False,2020,6


In [27]:
# let's take a look of column armed
df.armed.unique()

array(['gun', 'unarmed', 'toy weapon', 'nail gun', 'knife', 'shovel',
       'hammer', 'hatchet', 'undetermined', 'sword', 'machete',
       'box cutter', 'metal object', 'screwdriver', 'lawn mower blade',
       'flagpole', 'guns and explosives', 'cordless drill', 'crossbow',
       'metal pole', 'Taser', 'metal pipe', 'metal hand tool',
       'blunt object', 'metal stick', 'sharp object', 'meat cleaver',
       'carjack', 'chain', "contractor's level", 'unknown weapon',
       'stapler', 'beer bottle', 'bean-bag gun',
       'baseball bat and fireplace poker', 'straight edge razor',
       'gun and knife', 'ax', 'brick', 'baseball bat', 'hand torch',
       'chain saw', 'garden tool', 'scissors', 'pole', 'pick-axe',
       'flashlight', 'vehicle', 'baton', 'spear', 'chair', 'pitchfork',
       'hatchet and gun', 'rock', 'piece of wood', 'bayonet', 'pipe',
       'glass shard', 'motorcycle', 'pepper spray', 'metal rake',
       'crowbar', 'oar', 'machete and gun', 'tire iron',
      

In [28]:
# As we can see it has many unique values, let's try to categorize them

lethal = ['gun', 'hatchet', 'machete', 'guns and explosives', 'gun and knife', 'ax', 'hand torch',
          'chain saw', 'hatchet and gun', 'machete and gun', 'chainsaw', 'gun and sword', 'gun and car',
          'incendiary device', 'gun and vehicle', 'vehicle and gun', 'grenade', 'air pistol', 'vehicle and machete']
semilethal = ['nail gun', 'knife', 'shovel', 'hammer', 'sword', 'lawn mower blade', 'cordless drill', 'crossbow', 'Taser',
              'metal object', 'metal hand tool', 'metal stick', 'sharp object', 'meat cleaver', 'bean-bag gun', 'straight edge razor', 
              'baton', 'spear', 'bayonet', 'crowbar', 'tire iron', 'pole and knife', 'pellet gun', 'BB gun', 'samurai sword', 'bow and arrow',
              'wrench', 'BB gun and vehicle', 'Airsoft pistol', 'baseball bat and knife', 'ice pick', 'car, knife and mace']
nonlethal = ['toy weapon', 'box cutter', 'screwdriver', 'flagpole', 'metal pole', 'pick-axe', 'metal rake', 'metal pipe', 
             'blunt object', 'carjack', 'chain', "contractor's level", 'stapler', 'beer bottle', 'baseball bat and fireplace poker',
             'brick', 'baseball bat', 'garden tool', 'scissors', 'pole', 'flashlight', 'vehicle', 'chair', 'pitchfork', 'rock',
             'piece of wood', 'pipe', 'glass shard', 'motorcycle', 'pepper spray', 'oar', 'air conditioner', 'baseball bat and bottle',
             'fireworks', 'pen', 'walking stick', 'barstool', 'wasp spray']
unarmed = ['unarmed']
unknown = ['claimed to be armed', 'unknown weapon']
undetermined = ['undetermined','undeterminated']

In [29]:
# We have divided the types of weapons into 6 subsets, and we use the replace function to combine these new variables in the existing armed column.
for i in df.armed.unique():
    if i in lethal:
        df.armed = df.armed.replace(i, 'Lethal')
    elif i in semilethal:
        df.armed = df.armed.replace(i, 'Semi-Lethal')
    elif i in nonlethal:
        df.armed = df.armed.replace(i, 'Non-Lethal')
    elif i in unarmed:
        df.armed = df.armed.replace(i, 'Unarmed')
    elif i in unknown:
        df.armed = df.armed.replace(i, 'Unknown')
    elif i in undetermined:
        df.armed = df.armed.replace(i, 'Undetermined')

In [30]:
# Now armed colimn has this values

df.armed.unique()

array(['Lethal', 'Unarmed', 'Non-Lethal', 'Semi-Lethal', 'Undetermined',
       'Unknown'], dtype=object)

In [31]:
# Mapping gender

gender_mapping= {'M': 0, 'F': 1}
df['gender'] = df['gender'].map(gender_mapping)
df

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera,year,month
0,3,Tim Elliot,2015-01-02,0,Lethal,53.0,0,Asian,Shelton,WA,True,attack,Not fleeing,False,2015,1
1,4,Lewis Lee Lembke,2015-01-02,0,Lethal,47.0,0,White,Aloha,OR,False,attack,Not fleeing,False,2015,1
2,5,John Paul Quintero,2015-01-03,1,Unarmed,23.0,0,Hispanic,Wichita,KS,False,other,Not fleeing,False,2015,1
3,8,Matthew Hoffman,2015-01-04,0,Non-Lethal,32.0,0,White,San Francisco,CA,True,attack,Not fleeing,False,2015,1
4,9,Michael Rodriguez,2015-01-04,0,Semi-Lethal,39.0,0,Hispanic,Evans,CO,False,attack,Not fleeing,False,2015,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5411,5921,William Slyter,2020-06-13,0,Lethal,22.0,0,White,Kansas City,MO,False,other,Other,False,2020,6
5412,5922,TK TK,2020-06-13,0,Undetermined,35.0,0,Other,San Bernardino,CA,False,attack,Not fleeing,False,2020,6
5413,5924,Nicholas Hirsh,2020-06-15,0,Lethal,31.0,0,White,Lawrence,KS,False,attack,Car,False,2020,6
5414,5926,TK TK,2020-06-16,0,Lethal,24.0,0,Other,Beach Park,IL,False,attack,Not fleeing,False,2020,6


In [32]:
# Since our mission here is to see are the shootings depend of race let's divide race on two groups

df.race.unique()

array(['Asian', 'White', 'Hispanic', 'Black', 'Other', 'Native American'],
      dtype=object)

In [33]:
# We first determine 2 subsets
white =['White']
nowhite=['Asian', 'Hispanic', 'Black Non-Hispanic', 'Other', 'Native American']

In [34]:
# Secondly we replacing them into Race feature
for i in df.race.unique():
    if i in white:
        df.race = df.race.replace(i, 'White')
    elif i in nowhite:
        df.race = df.race.replace(i, 'No White')

In [35]:
# Now we have only two type of race, those that are white and those that aren't.
df.race.unique()

array(['No White', 'White', 'Black'], dtype=object)

In [36]:
# It's time to map them

df['race']=df['race'].map({'No White':0,'White':1})
df.head(3)

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera,year,month
0,3,Tim Elliot,2015-01-02,0,Lethal,53.0,0,0.0,Shelton,WA,True,attack,Not fleeing,False,2015,1
1,4,Lewis Lee Lembke,2015-01-02,0,Lethal,47.0,0,1.0,Aloha,OR,False,attack,Not fleeing,False,2015,1
2,5,John Paul Quintero,2015-01-03,1,Unarmed,23.0,0,0.0,Wichita,KS,False,other,Not fleeing,False,2015,1


In [37]:
# We have some boolean
df.signs_of_mental_illness.unique()

array([ True, False])

In [38]:
# Converting boolean to integer 
df.signs_of_mental_illness = list(map(int, df.signs_of_mental_illness)) 
  
# Printing result 
print(str(df.signs_of_mental_illness)) 

0       1
1       0
2       0
3       1
4       0
       ..
5411    0
5412    0
5413    0
5414    0
5415    0
Name: signs_of_mental_illness, Length: 5414, dtype: int64


In [39]:
# One more
df.body_camera.unique()

array([False,  True])

In [40]:
# Converting boolean to integer 
df.body_camera = list(map(int, df.body_camera)) 
  
# Printing result 
print(str(df.body_camera)) 


0       0
1       0
2       0
3       0
4       0
       ..
5411    0
5412    0
5413    0
5414    0
5415    0
Name: body_camera, Length: 5414, dtype: int64


In [41]:
# Transforming  features using OneHotEncoder

from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(handle_unknown= 'ignore')
ohe.fit(df.loc[:, ['flee', 'armed', 'threat_level']])

df_enc = pd.DataFrame(ohe.transform(df.loc[:, ['flee', 'armed', 'threat_level']]).toarray(), columns=ohe.get_feature_names())
df_enc.head(5)


Unnamed: 0,x0_Car,x0_Foot,x0_Not fleeing,x0_Other,x1_Lethal,x1_Non-Lethal,x1_Semi-Lethal,x1_Unarmed,x1_Undetermined,x1_Unknown,x2_attack,x2_other,x2_undetermined
0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [42]:
# Dropping duplicates from df
df = df.loc[df.index.drop_duplicates()]

In [43]:
# Let's check our df now
df.head(3)

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera,year,month
0,3,Tim Elliot,2015-01-02,0,Lethal,53.0,0,0.0,Shelton,WA,1,attack,Not fleeing,0,2015,1
1,4,Lewis Lee Lembke,2015-01-02,0,Lethal,47.0,0,1.0,Aloha,OR,0,attack,Not fleeing,0,2015,1
2,5,John Paul Quintero,2015-01-03,1,Unarmed,23.0,0,0.0,Wichita,KS,0,other,Not fleeing,0,2015,1


In [44]:
# Now we will merge this two dataframes  df and df_enc by index
df_2 = pd.concat([df,df_enc], axis=1)
df_2.head(3)

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,...,x0_Other,x1_Lethal,x1_Non-Lethal,x1_Semi-Lethal,x1_Unarmed,x1_Undetermined,x1_Unknown,x2_attack,x2_other,x2_undetermined
0,3.0,Tim Elliot,2015-01-02,0.0,Lethal,53.0,0.0,0.0,Shelton,WA,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,4.0,Lewis Lee Lembke,2015-01-02,0.0,Lethal,47.0,0.0,1.0,Aloha,OR,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,5.0,John Paul Quintero,2015-01-03,1.0,Unarmed,23.0,0.0,0.0,Wichita,KS,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [45]:
# We will remove columns that we have preprocessed and those that carry no weight for our study
df_2.pop('id')
df_2.pop('name')
df_2.pop('armed')
df_2.pop('city')
df_2.pop('state')
df_2.pop('flee')
df_2.pop('threat_level')
df_2.pop('date')
df_2.shape

(5416, 21)

In [46]:
df_2.head(3)

Unnamed: 0,manner_of_death,age,gender,race,signs_of_mental_illness,body_camera,year,month,x0_Car,x0_Foot,...,x0_Other,x1_Lethal,x1_Non-Lethal,x1_Semi-Lethal,x1_Unarmed,x1_Undetermined,x1_Unknown,x2_attack,x2_other,x2_undetermined
0,0.0,53.0,0.0,0.0,1.0,0.0,2015.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,47.0,0.0,1.0,0.0,0.0,2015.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.0,23.0,0.0,0.0,0.0,0.0,2015.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [47]:
df_2.isnull().sum()

manner_of_death               2
age                           2
gender                        2
race                       1300
signs_of_mental_illness       2
body_camera                   2
year                          2
month                         2
x0_Car                        2
x0_Foot                       2
x0_Not fleeing                2
x0_Other                      2
x1_Lethal                     2
x1_Non-Lethal                 2
x1_Semi-Lethal                2
x1_Unarmed                    2
x1_Undetermined               2
x1_Unknown                    2
x2_attack                     2
x2_other                      2
x2_undetermined               2
dtype: int64

In [48]:
# It's time do select dependet and indipendet feature

X = df_2.drop(['race'], axis = 1)
y = df_2['race']
display(X.head(5))
display(y)

Unnamed: 0,manner_of_death,age,gender,signs_of_mental_illness,body_camera,year,month,x0_Car,x0_Foot,x0_Not fleeing,x0_Other,x1_Lethal,x1_Non-Lethal,x1_Semi-Lethal,x1_Unarmed,x1_Undetermined,x1_Unknown,x2_attack,x2_other,x2_undetermined
0,0.0,53.0,0.0,1.0,0.0,2015.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,47.0,0.0,0.0,0.0,2015.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.0,23.0,0.0,0.0,0.0,2015.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.0,32.0,0.0,1.0,0.0,2015.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,39.0,0.0,0.0,0.0,2015.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


0       0.0
1       1.0
2       0.0
3       1.0
4       0.0
       ... 
5411    1.0
5412    0.0
5413    1.0
5414    0.0
5415    0.0
Name: race, Length: 5416, dtype: float64

In [50]:
# Split independent and target variable on train and test dataset
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [51]:
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (4332, 20)
y_train shape: (4332,)
X_test shape: (1084, 20)
y_test shape: (1084,)


In [52]:
# First we will try to normalize data and than we will observe the distribution of data
from sklearn.preprocessing import MinMaxScaler

# fit scaler on training data
norm = MinMaxScaler()

# transform training data
X_train_norm = norm.fit_transform(X_train)

# transform testing dataabs
X_test_norm = norm.transform(X_test)

In [53]:
# Distribution of normalized data train data

col_names = list(X.columns)
df_norm = pd.DataFrame(data = X_train_norm, columns = col_names)
df_norm.head(3)

Unnamed: 0,manner_of_death,age,gender,signs_of_mental_illness,body_camera,year,month,x0_Car,x0_Foot,x0_Not fleeing,x0_Other,x1_Lethal,x1_Non-Lethal,x1_Semi-Lethal,x1_Unarmed,x1_Undetermined,x1_Unknown,x2_attack,x2_other,x2_undetermined
0,0.0,0.258824,0.0,0.0,0.0,0.2,0.272727,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.482353,0.0,0.0,0.0,0.0,0.545455,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.505882,0.0,0.0,0.0,0.8,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [54]:
# Second we will standardize data 
from sklearn.preprocessing import StandardScaler

stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)

# Dist of standardized data
df_std = pd.DataFrame(data = X_train_std, columns = col_names)
df_std.head(3)

Unnamed: 0,manner_of_death,age,gender,signs_of_mental_illness,body_camera,year,month,x0_Car,x0_Foot,x0_Not fleeing,x0_Other,x1_Lethal,x1_Non-Lethal,x1_Semi-Lethal,x1_Unarmed,x1_Undetermined,x1_Unknown,x2_attack,x2_other,x2_undetermined
0,-0.229667,-0.712961,-0.215379,-0.536862,-0.361676,-0.802565,-0.608425,-0.447647,-0.386149,0.769977,-0.285777,-1.213855,-0.293271,2.169376,-0.340091,-0.180054,-0.12247,0.740571,-0.66789,-0.218877
1,-0.229667,0.760132,-0.215379,-0.536862,-0.361676,-1.423707,0.255444,-0.447647,-0.386149,0.769977,-0.285777,-1.213855,-0.293271,-0.460962,2.940388,-0.180054,-0.12247,-1.35031,1.497253,-0.218877
2,-0.229667,0.915194,-0.215379,-0.536862,-0.361676,1.060861,1.695226,-0.447647,-0.386149,0.769977,-0.285777,0.823822,-0.293271,-0.460962,-0.340091,-0.180054,-0.12247,-1.35031,1.497253,-0.218877
