# Machine Learning Model Mockup

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Initial imports
import pandas as pd
import numpy as np
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [4]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import plot_confusion_matrix

In [5]:
# Load the data
file_path = 'Resources/2013-2020_Killings_by_PD.csv'
killings_df = pd.read_csv(file_path)
killings_df

Unnamed: 0,State,City,PD,ORI,Black People Killed by Police (1/1/2013-12/31/2020),Hispanic People Killed by Police (1/1/2013-12/31/2020),Native American People Killed by Police (1/1/2013-12/31/2020),Asian People Killed by Police (1/1/2013-12/31/2020),Pacific Islanders Killed by Police (1/1/2013-12/31/2020),White People Killed by Police (1/1/2013-12/31/2020),...,Violent Crime Rate,2013 Total Arrests (UCR Data),2014 Total Arrests,2015 Total Arrests,2016 Total Arrests,2017 Total Arrests,2018 Total Arrests,2019 Total Arrests,Estimated Average Arrests per Year 2013-2019,Killings by Police per 10k Arrests
0,New Mexico,Albuquerque,Albuquerque Police Department,NM0010100,3.0,22.0,,,,12.0,...,11.2,27700.0,25447.0,22126.0,20341.0,21134.0,22247.0,22118.0,22236.0,17.1
1,California,Anaheim,Anaheim Police Department,CA0300100,3.0,8.0,,,,3.0,...,3.4,7891.0,8137.0,8381.0,9415.0,8869.0,10019.0,9645.0,9078.0,15.4
2,Alaska,Anchorage,Anchorage Police Department,AK0010100,1.0,1.0,3.0,,1.0,6.0,...,10.9,17601.0,14748.0,14387.0,13294.0,13871.0,13973.0,13655.0,13988.0,11.4
3,Texas,Arlington,Arlington Police Department,TX2200100,7.0,1.0,,2.0,,3.0,...,5.0,17258.0,16222.0,13999.0,11024.0,8354.0,9552.0,10319.0,11578.0,11.2
4,Georgia,Atlanta,Atlanta Police Department,GAAPD0000,16.0,,,,,1.0,...,9.8,30505.0,26958.0,24601.0,22161.0,22581.0,,,24075.0,7.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101,United States,Nationwide Average,Nationwide Average,,2190.0,1509.0,121.0,132.0,49.0,3803.0,...,,,,,,,,,,
102,,,,,,,,,,,...,,,,,,,,,,
103,,,,,,,,,,,...,,,,,,,,,,
104,,,,,,,,,,,...,,,,,,,,,,


## Preprocess Data

In [7]:
# Rename DataFrame columns
killings_df.columns = ['State', 'City', 'PD', 'ORI', 'Black_Killed, Hispanic_Killed', 'Native_Am_Killed', 
                       'Asian_Killed', 'Pacific_Islander_Killed', 'White_Killed', 'Unknown_Race_Killed', 
                       'All_Killed', 'Allegedly_Armed_Killed', 'Unarmed_Killed', 'Unclear_Armed_Status_Killed',
                       'Armed_with_Vehicle_Killed', 'Total_Population', 'Total_Black', 'Total_White',
                       'Total_Native_Am', 'Total_Asian', 'Total_Hawaiian', 'Total_Asian_Pac_Isl', 'Other',
                       'Two_Or', 'Total_Hispanic', 'Black_White_Dissimilarity_Index(2010)',
                       'Avg_Annual_Police_Homicide', 'Avg_Annual_Police_Homicide_Black', 
                       'Avg_Annual_Police_Homicide_White', 'Avg_Annual_Police_Homicide_Hispanic', 
                       'Black_White_Disparity', 'Hispanic_White_Disparity', 'Violent_Crimes_2013',
                       'Violent_Crimes_2014', 'Violent_Crimes_2015', 'Violent_Crimes_2016', 'Violent_Crimes_2017',
                       'Violent_Crimes_2018', 'Violent_Crimes_2019', 'Average_Violent_Crimes_2013-2019',
                       'Violent_Crime_Rate', '2013_Total_Arrests', '2014_Total_Arrests', '2014_Total_Arrests',
                       '2015_Total_Arrests', '2016_Total_Arrests', '2017_Total_Arrests', '2018_Total_Arrests',
                       '2019_Total_Arrests', 'Est_Avg_Arrests_per_Year(2013-2019)', 
                       'Killings_by_Police_per_10k_Arrests'
                      ]
killings_df.head(5)

Unnamed: 0,State,City,PD,ORI,"Black_Killed, Hispanic_Killed",Native_Am_Killed,Asian_Killed,Pacific_Islander_Killed,White_Killed,Unknown_Race_Killed,...,2013_Total_Arrests,2014_Total_Arrests,2014_Total_Arrests.1,2015_Total_Arrests,2016_Total_Arrests,2017_Total_Arrests,2018_Total_Arrests,2019_Total_Arrests,Est_Avg_Arrests_per_Year(2013-2019),Killings_by_Police_per_10k_Arrests
0,New Mexico,Albuquerque,Albuquerque Police Department,NM0010100,3.0,22.0,,,,12.0,...,11.2,27700.0,25447.0,22126.0,20341.0,21134.0,22247.0,22118.0,22236.0,17.1
1,California,Anaheim,Anaheim Police Department,CA0300100,3.0,8.0,,,,3.0,...,3.4,7891.0,8137.0,8381.0,9415.0,8869.0,10019.0,9645.0,9078.0,15.4
2,Alaska,Anchorage,Anchorage Police Department,AK0010100,1.0,1.0,3.0,,1.0,6.0,...,10.9,17601.0,14748.0,14387.0,13294.0,13871.0,13973.0,13655.0,13988.0,11.4
3,Texas,Arlington,Arlington Police Department,TX2200100,7.0,1.0,,2.0,,3.0,...,5.0,17258.0,16222.0,13999.0,11024.0,8354.0,9552.0,10319.0,11578.0,11.2
4,Georgia,Atlanta,Atlanta Police Department,GAAPD0000,16.0,,,,,1.0,...,9.8,30505.0,26958.0,24601.0,22161.0,22581.0,,,24075.0,7.5


In [8]:
# Get column data types
killings_df.dtypes

State                                     object
City                                      object
PD                                        object
ORI                                       object
Black_Killed, Hispanic_Killed            float64
Native_Am_Killed                         float64
Asian_Killed                             float64
Pacific_Islander_Killed                  float64
White_Killed                             float64
Unknown_Race_Killed                      float64
All_Killed                               float64
Allegedly_Armed_Killed                   float64
Unarmed_Killed                           float64
Unclear_Armed_Status_Killed              float64
Armed_with_Vehicle_Killed                float64
Total_Population                         float64
Total_Black                               object
Total_White                               object
Total_Native_Am                           object
Total_Asian                               object
Total_Hawaiian      

In [11]:
# Find null values
for column in killings_df.columns:
    print(f'Column {column} has {killings_df[column].isnull().sum()} null values')

Column State has 4 null values
Column City has 3 null values
Column PD has 4 null values
Column ORI has 6 null values
Column Black_Killed, Hispanic_Killed has 11 null values
Column Native_Am_Killed has 31 null values
Column Asian_Killed has 94 null values
Column Pacific_Islander_Killed has 73 null values
Column White_Killed has 96 null values
Column Unknown_Race_Killed has 13 null values
Column All_Killed has 48 null values
Column Allegedly_Armed_Killed has 4 null values
Column Unarmed_Killed has 7 null values
Column Unclear_Armed_Status_Killed has 23 null values
Column Armed_with_Vehicle_Killed has 50 null values
Column Total_Population has 52 null values
Column Total_Black has 4 null values
Column Total_White has 4 null values
Column Total_Native_Am has 4 null values
Column Total_Asian has 5 null values
Column Total_Hawaiian has 5 null values
Column Total_Asian_Pac_Isl has 5 null values
Column Other has 5 null values
Column Two_Or has 5 null values
Column Total_Hispanic has 5 null va

In [None]:
# Replace object NaNs with 'unknown'


In [None]:
# Replace float64 NaNs with '0'