In [1]:
import warnings
warnings.filterwarnings('ignore')

# Matplotlib inline magic command
%matplotlib inline

In [2]:
# Initial imports
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from collections import Counter

In [3]:
# Scikit-Learn Model Selection and Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [4]:
# Scikit-Learn Model imports
from sklearn.linear_model import LinearRegression

In [5]:
# Sci-kit Learn Metrics imports
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [6]:
# Imbalanced Learn imports
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Connect to AWS


In [7]:
# Load the data
file_path = 'Resources/2013-2020_Police_Killings_Revised.xlsx' 
killings_df = pd.read_excel(file_path)                         
killings_df

Unnamed: 0,Victim's age,Victim's gender,Victim's race,Date of Incident (month/day/year),County,Agency responsible for death,Cause of death,Symptoms of mental illness?,Armed/Unarmed Status,Alleged Threat Level (Source: WaPo),Fleeing (Source: WaPo),Body Camera (Source: WaPo),Geography,Encounter Type (DRAFT),Initial Reported Reason for Encounter (DRAFT),Call for Service? (DRAFT)
0,Unknown,Male,Unknown Race,2021-04-18,,Detroit Police Department,Gunshot,,Allegedly Armed,attack,,,,Part 1 Violent Crime,shooting,
1,Unknown,Male,Unknown Race,2021-04-18,,Douglasville Police Department,Gunshot,,,,,,,,,
2,Unknown,Male,Unknown Race,2021-04-18,,Fort Worth Police Department,Gunshot,,Allegedly Armed,,,,,Other Non-Violent Offense,carjacking,
3,Unknown,Male,White,2021-04-18,,Burnsville Police Department,Gunshot,,Allegedly Armed,attack,car,,,Other Non-Violent Offense,carjacking,
4,Unknown,Male,Black,2021-04-17,,Winter Haven Police Department,Gunshot,,,,,,,Domestic disturbance,Domestic disturbance (armed),
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9077,21,Male,White,2013-01-01,Douglas,Douglas County Sheriff's Office,Gunshot,Drug or alcohol use,Allegedly Armed,,,,Rural,,,
9078,26,Male,White,2013-01-01,Blue Earth,Mankato Department of Public Safety,Taser,No,Unarmed/Did Not Have Actual Weapon,,,,Suburban,,,
9079,21,Male,Hispanic,2013-01-01,Pueblo,Pueblo Police Department,Gunshot,No,Allegedly Armed,,,,Suburban,,,
9080,49,Male,Hispanic,2013-01-01,San Juan,Farmington Police Department,Gunshot,No,Allegedly Armed,,,,Urban,,,


In [8]:
# Rename columns
killings_df.rename(columns = {"Victim's age" : 'Victim_Age', "Victim's gender" : 'Victim_Gender', 
                              "Victim's race" : 'Victim_Race', 'Date of Incident (month/day/year)' : 'Date',
                              'Agency responsible for death' : 'Responsible_Agency', 'Cause of death' : 'Cause_of_Death',
                              'A brief description of the circumstances surrounding the death' : 'Brief_Description',
                              'Criminal Charges?' : 'Criminal_Charges', 'Symptoms of mental illness?' : 'Mental_Illness', 
                              'Armed/Unarmed Status' : 'Armed_Status', 'Alleged Threat Level (Source: WaPo)' : 'Threat_Level', 
                              'Fleeing (Source: WaPo)' : 'Fleeing', 'Body Camera (Source: WaPo)' : 'Body_Camera', 
                              'Encounter Type (DRAFT)' : 'Encounter_Type', 
                              'Initial Reported Reason for Encounter (DRAFT)' : 'Initial_Reason_for_Encounter',
                              'Call for Service? (DRAFT)' : 'Call_for_Service'}, inplace = True)

killings_df.head()

Unnamed: 0,Victim_Age,Victim_Gender,Victim_Race,Date,County,Responsible_Agency,Cause_of_Death,Mental_Illness,Armed_Status,Threat_Level,Fleeing,Body_Camera,Geography,Encounter_Type,Initial_Reason_for_Encounter,Call_for_Service
0,Unknown,Male,Unknown Race,2021-04-18,,Detroit Police Department,Gunshot,,Allegedly Armed,attack,,,,Part 1 Violent Crime,shooting,
1,Unknown,Male,Unknown Race,2021-04-18,,Douglasville Police Department,Gunshot,,,,,,,,,
2,Unknown,Male,Unknown Race,2021-04-18,,Fort Worth Police Department,Gunshot,,Allegedly Armed,,,,,Other Non-Violent Offense,carjacking,
3,Unknown,Male,White,2021-04-18,,Burnsville Police Department,Gunshot,,Allegedly Armed,attack,car,,,Other Non-Violent Offense,carjacking,
4,Unknown,Male,Black,2021-04-17,,Winter Haven Police Department,Gunshot,,,,,,,Domestic disturbance,Domestic disturbance (armed),


In [9]:
# Delete Gender column
del killings_df['Victim_Gender']

In [10]:
# Find null values
for column in killings_df.columns:
    print(f'Column {column} has {killings_df[column].isnull().sum()} null values')

Column Victim_Age has 8 null values
Column Victim_Race has 0 null values
Column Date has 0 null values
Column County has 26 null values
Column Responsible_Agency has 4 null values
Column Cause_of_Death has 0 null values
Column Mental_Illness has 69 null values
Column Armed_Status has 6 null values
Column Threat_Level has 2394 null values
Column Fleeing has 3077 null values
Column Body_Camera has 3147 null values
Column Geography has 31 null values
Column Encounter_Type has 4310 null values
Column Initial_Reason_for_Encounter has 4310 null values
Column Call_for_Service has 5539 null values


In [11]:
# Replace empty/blank values for Age, Threat_Level, Fleeing, Body_Camera, Encounter_Type, Initial_Reason_for_Encounter, 
# Call_for_Service and re-verify null values

# Column: Age
killings_df['Victim_Age'].replace({'Unknown': '0'}, inplace = True) # Possible to fill in 'Unknown' with mean
                                                                    # of column?
# Column: Threat_Level
killings_df['Threat_Level'].replace({None: 'undetermined'}, inplace = True)

# Column: Fleeing
killings_df['Fleeing'].replace({None: 'unknown'}, inplace = True)

# Column: Body_Camera
killings_df['Body_Camera'].replace({None: 'unknown'}, inplace = True)

# Column: Encounter_Type
killings_df['Encounter_Type'].replace({None: 'Unknown'}, inplace = True)

# Column: Initial_Reason_for_Encounter
killings_df['Initial_Reason_for_Encounter'].replace({None: 'unknown'}, inplace = True)

# Column: Call_for_Service
killings_df['Call_for_Service'].replace({None: 'Unavailable'}, inplace = True)

# Verify null values
for column in killings_df.columns:
    print(f'Column {column} has {killings_df[column].isnull().sum()} null values')

Column Victim_Age has 8 null values
Column Victim_Race has 0 null values
Column Date has 0 null values
Column County has 26 null values
Column Responsible_Agency has 4 null values
Column Cause_of_Death has 0 null values
Column Mental_Illness has 69 null values
Column Armed_Status has 6 null values
Column Threat_Level has 0 null values
Column Fleeing has 0 null values
Column Body_Camera has 0 null values
Column Geography has 31 null values
Column Encounter_Type has 0 null values
Column Initial_Reason_for_Encounter has 0 null values
Column Call_for_Service has 0 null values


In [12]:
# Drop null values
killings_df = killings_df.dropna(how = 'any')
killings_df = killings_df.reset_index().drop(['index'], axis = 1)
killings_df

Unnamed: 0,Victim_Age,Victim_Race,Date,County,Responsible_Agency,Cause_of_Death,Mental_Illness,Armed_Status,Threat_Level,Fleeing,Body_Camera,Geography,Encounter_Type,Initial_Reason_for_Encounter,Call_for_Service
0,46,Hispanic,2021-04-14,Harris,Harris County Sheriff's Office,"Gunshot, Taser",Yes,Allegedly Armed,other,unknown,no,Suburban,Mental Health/Welfare Check,mental health crisis,Yes
1,48,Unknown Race,2021-04-10,Los Angeles,San Fernando Police Department,Gunshot,No,Allegedly Armed,attack,not fleeing,no,Urban,Part 1 Violent Crime,gunshots,Yes
2,0,Unknown Race,2021-04-10,Winnebago,Winnebago County Sheriff's Office,Gunshot,No,Unclear,undetermined,unknown,no,Suburban,Domestic disturbance,domestic disturbance,No
3,0,Unknown Race,2021-04-10,Jefferson,Jefferson Parish Sheriff's Office,Gunshot,No,Allegedly Armed,attack,unknown,no,Urban,Domestic disturbance,domestic disturbance,Yes
4,50,Unknown Race,2021-04-09,Clark,U.S. Marshals Service,Gunshot,No,Allegedly Armed,attack,not fleeing,no,Suburban,Other Crimes Against People,warrant (threatening judge),No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8969,21,White,2013-01-01,Douglas,Douglas County Sheriff's Office,Gunshot,Drug or alcohol use,Allegedly Armed,undetermined,unknown,unknown,Rural,Unknown,unknown,Unavailable
8970,26,White,2013-01-01,Blue Earth,Mankato Department of Public Safety,Taser,No,Unarmed/Did Not Have Actual Weapon,undetermined,unknown,unknown,Suburban,Unknown,unknown,Unavailable
8971,21,Hispanic,2013-01-01,Pueblo,Pueblo Police Department,Gunshot,No,Allegedly Armed,undetermined,unknown,unknown,Suburban,Unknown,unknown,Unavailable
8972,49,Hispanic,2013-01-01,San Juan,Farmington Police Department,Gunshot,No,Allegedly Armed,undetermined,unknown,unknown,Urban,Unknown,unknown,Unavailable


In [None]:
killings_df['Brief_Description'][4532]

In [13]:
# Inspect column data types
killings_df.dtypes

Victim_Age                              object
Victim_Race                             object
Date                            datetime64[ns]
County                                  object
Responsible_Agency                      object
Cause_of_Death                          object
Mental_Illness                          object
Armed_Status                            object
Threat_Level                            object
Fleeing                                 object
Body_Camera                             object
Geography                               object
Encounter_Type                          object
Initial_Reason_for_Encounter            object
Call_for_Service                        object
dtype: object

In [14]:
# Change Victim_Age data type from object to int
killings_df['Victim_Age'] = killings_df['Victim_Age'].astype(str).astype(int)
killings_df.dtypes

Victim_Age                               int32
Victim_Race                             object
Date                            datetime64[ns]
County                                  object
Responsible_Agency                      object
Cause_of_Death                          object
Mental_Illness                          object
Armed_Status                            object
Threat_Level                            object
Fleeing                                 object
Body_Camera                             object
Geography                               object
Encounter_Type                          object
Initial_Reason_for_Encounter            object
Call_for_Service                        object
dtype: object

In [15]:
# Extract year from Date and create new column Year
killings_df['Year'] = pd.DatetimeIndex(killings_df['Date']).year

# Extract month from Date and create new column Month
killings_df['Month'] = pd.DatetimeIndex(killings_df['Date']).month

# Extract day from Date and create new column Day
killings_df['Day'] = pd.DatetimeIndex(killings_df['Date']).day

# Get day of week from Date and create new column Day_of_Week
killings_df['Day_of_Week'] = killings_df['Date'].dt.day_name()

In [22]:
# Get holiday from Date and create new column Holiday
dr = pd.date_range(start='2013-01-01', end='2021-04-18')
df = pd.DataFrame()
df['Date'] = dr

cal = calendar()
holidays = cal.holidays(start=dr.min(), end=dr.max())

killings_df['Holiday'] = killings_df['Date'].dt.date.astype('datetime64').isin(holidays)

In [24]:
#killings_df

In [16]:
# Use get_dummies() to transform text values into numerical values
killings_encoded = pd.get_dummies(killings_df, columns = 
                                  ['Cause_of_Death', 'Mental_Illness', 'Armed_Status', 'Threat_Level', 'Fleeing', 'Body_Camera', 
                                   'Geography'])

killings_encoded.head()

# We are keeping race because that is what our model will be running against, right?  Age was converted to an 
# int and Date is now datetime
# Are we deleting the columns that are not in killings_encoded then?  I took notes on this during the tutor
# session, but they do not make sense to me

Unnamed: 0,Victim_Age,Victim_Race,Date,County,Responsible_Agency,Encounter_Type,Initial_Reason_for_Encounter,Call_for_Service,Year,Month,...,Body_Camera_Yes,Body_Camera_cell phone video,Body_Camera_no,Body_Camera_surveillance video,Body_Camera_unknown,Body_Camera_yes,Geography_Rural,Geography_Suburban,Geography_Undetermined,Geography_Urban
0,46,Hispanic,2021-04-14,Harris,Harris County Sheriff's Office,Mental Health/Welfare Check,mental health crisis,Yes,2021,4,...,0,0,1,0,0,0,0,1,0,0
1,48,Unknown Race,2021-04-10,Los Angeles,San Fernando Police Department,Part 1 Violent Crime,gunshots,Yes,2021,4,...,0,0,1,0,0,0,0,0,0,1
2,0,Unknown Race,2021-04-10,Winnebago,Winnebago County Sheriff's Office,Domestic disturbance,domestic disturbance,No,2021,4,...,0,0,1,0,0,0,0,1,0,0
3,0,Unknown Race,2021-04-10,Jefferson,Jefferson Parish Sheriff's Office,Domestic disturbance,domestic disturbance,Yes,2021,4,...,0,0,1,0,0,0,0,0,0,1
4,50,Unknown Race,2021-04-09,Clark,U.S. Marshals Service,Other Crimes Against People,warrant (threatening judge),No,2021,4,...,0,0,1,0,0,0,0,1,0,0


## Split Data into Training and Testing Sets

In [None]:
# Create features
X = pd.get_dummies(killings_df.drop(columns = ['race']))

# Create target
y = pd.DataFrame(killings_df['race'])

In [None]:
X.describe()

In [None]:
# Check balance of target values
y['race'].value_counts()

In [None]:
# Train, test, and split model
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state = 78,
                                                    stratify = y)

y_train.shape

In [None]:
# Creating a StandardScaler instance
scaler = StandardScaler()

# Fitting the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

# Scaling the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# ML model code
# Still need to pick a model

In [None]:
# Calculate balanced accuracy score
rf_y_pred = rf_model.predict(X_test)
balanced_accuracy_score(y_test, rf_y_pred)

In [None]:
# Display confusion matrix
rf_cm = confusion_matrix(y_test, rf_y_pred)
rf_cm

In [None]:
# Print imbalanced classification report
print(classification_report_imbalanced(y_test, rf_y_pred))

In [None]:
# List feature sorted in descending order by feature importance - is this needed?
sorted(zip(rf_model.feature_importances_, X.columns), reverse = True)

In [None]:
# Model prediction
print(rf_y_pred)