In [54]:
import pandas as pd
import os
from utils import drop_categorical_columns
from utils import missing_values_table
import numpy as np

In [55]:
def drop_label_with_null(df, column_name):
    # Drop rows where the specified column is null
    df_cleaned = df.dropna(subset=[column_name])

    return df_cleaned

In [56]:
original = pd.read_csv("TestDataset/database.csv")
original.head(5)
column_names = [str(i) for i in range(1, len(original.columns) + 1)]
#
# # Assign the generated column names to the DataFrame
#df.columns = column_names
# Replace '?' with NaN in the entire DataFrameN
original.replace('?', np.nan, inplace=True)


In [57]:
missing_values_table(original)

Your selected dataframe has 14 columns.
There are 6 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values,Data Type
race,103,4.8,object
age,43,2.0,float64
flee,36,1.7,object
name,19,0.9,object
armed,6,0.3,object
gender,1,0.0,object


In [58]:
original.head(5)
df=original.copy()
df.columns

Index(['id', 'name', 'date', 'manner_of_death', 'armed', 'age', 'gender',
       'race', 'city', 'state', 'signs_of_mental_illness', 'threat_level',
       'flee', 'body_camera'],
      dtype='object')

In [59]:
df.drop(columns=['name','id'],inplace=True)

In [60]:
original.head(5)

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera
0,3,Tim Elliot,2015-01-02,shot,gun,53.0,M,A,Shelton,WA,True,attack,Not fleeing,False
1,4,Lewis Lee Lembke,2015-01-02,shot,gun,47.0,M,W,Aloha,OR,False,attack,Not fleeing,False
2,5,John Paul Quintero,2015-01-03,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,False,other,Not fleeing,False
3,8,Matthew Hoffman,2015-01-04,shot,toy weapon,32.0,M,W,San Francisco,CA,True,attack,Not fleeing,False
4,9,Michael Rodriguez,2015-01-04,shot,nail gun,39.0,M,H,Evans,CO,False,attack,Not fleeing,False


In [61]:
categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()
print(len(categorical_columns))
for col in categorical_columns:
    df[col] = pd.to_numeric(df[col], errors='ignore')
# after those are taken care of we can drop the columns that are still object
categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()
print(categorical_columns)

9
['date', 'manner_of_death', 'armed', 'gender', 'race', 'city', 'state', 'threat_level', 'flee']


In [62]:
# Convert date and time columns to datetime format
df['date'] = pd.to_datetime(df['date'])


# Extract year, month, day from the date column
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day

# Drop the original date and time columns
df = df.drop(['date'], axis=1)

In [63]:
dropped=drop_label_with_null(df, 'age')
dropped.shape

(2099, 14)

In [64]:
missing_values_table(dropped)

Your selected dataframe has 14 columns.
There are 3 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values,Data Type
race,81,3.9,object
flee,32,1.5,object
armed,5,0.2,object


In [65]:
#others then handle nullls

import pandas as pd
from sklearn.preprocessing import OneHotEncoder
df=dropped
# Select categorical columns
categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()

# Create a copy of the original dataframe to preserve the original columns
df_encoded = df.copy()

# Create an empty DataFrame to store the encoded columns
encoded_data_frames = []

# Define a list of transformers for each categorical column
for col in categorical_columns:
    # Replace null values with "Others" in the original column
    df_encoded[col].fillna("Others", inplace=True)

    # Create dummy variables using OneHotEncoder
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore', drop='first')
    encoded_data = encoder.fit_transform(df_encoded[[col]])

    # Get column names based on the original column and category
    column_names = [category for category in encoder.get_feature_names_out([col])]

    # Create a DataFrame for the encoded data
    encoded_df = pd.DataFrame(encoded_data, columns=column_names, index=df_encoded.index)

    try:
      new_column_name = f"{col}_{'Others'}"
      # Drop the original column and the dummy variables for "Others"

      encoded_df.loc[df[col].isnull(), column_names] = pd.NA
      encoded_df.drop([new_column_name], axis=1, inplace=True)
    except:
      print('This column doesnt have nulls',col)

    # Add the encoded DataFrame to the list
    encoded_data_frames.append(encoded_df)

# Concatenate the original DataFrame and the list of encoded DataFrames
df_encoded = pd.concat([df_encoded] + encoded_data_frames, axis=1)

# Drop the original categorical columns
df_encoded.drop(categorical_columns, axis=1, inplace=True)

# Now, the df_encoded DataFrame contains the changes made to categorical columns, with null values preserved.

This column doesnt have nulls manner_of_death
This column doesnt have nulls armed
This column doesnt have nulls gender
This column doesnt have nulls city
This column doesnt have nulls state
This column doesnt have nulls threat_level




In [66]:
missing_values_table(df_encoded)

Your selected dataframe has 1353 columns.
There are 71 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values,Data Type
race_W,81,3.9,float64
race_O,81,3.9,float64
race_N,81,3.9,float64
race_H,81,3.9,float64
race_B,81,3.9,float64
...,...,...,...
armed_gun and knife,5,0.2,float64
armed_gun,5,0.2,float64
armed_glass shard,5,0.2,float64
armed_garden tool,5,0.2,float64


In [67]:
df_encoded.head(15)

Unnamed: 0,age,signs_of_mental_illness,body_camera,year,month,day,manner_of_death_shot and Tasered,armed_Taser,armed_ax,armed_baseball bat,...,state_VT,state_WA,state_WI,state_WV,state_WY,threat_level_other,threat_level_undetermined,flee_Foot,flee_Not fleeing,flee_Other
0,53.0,True,False,2015,1,2,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,47.0,False,False,2015,1,2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,23.0,False,False,2015,1,3,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,32.0,True,False,2015,1,4,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,39.0,False,False,2015,1,4,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5,18.0,False,False,2015,1,4,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
6,22.0,False,False,2015,1,5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,35.0,False,False,2015,1,6,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8,34.0,False,True,2015,1,6,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
9,47.0,False,False,2015,1,6,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [70]:
# Assuming df is your DataFrame
df_encoded['body_camera'] = df_encoded['body_camera'].astype(int)
df_encoded['signs_of_mental_illness'] = df_encoded['signs_of_mental_illness'].astype(int)

In [71]:
df_encoded.to_csv(os.path.join('Final-Datasets', 'Fatal-Shotting.csv'),index=False)

In [72]:
missing_values_table(df_encoded)

Your selected dataframe has 1353 columns.
There are 71 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values,Data Type
race_W,81,3.9,float64
race_O,81,3.9,float64
race_N,81,3.9,float64
race_H,81,3.9,float64
race_B,81,3.9,float64
...,...,...,...
armed_gun and knife,5,0.2,float64
armed_gun,5,0.2,float64
armed_glass shard,5,0.2,float64
armed_garden tool,5,0.2,float64
