# Machine Learning Model Mockup

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Initial imports
import pandas as pd
import numpy as np
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt

In [3]:
# scikit-learn imports
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [4]:
# scikit-learn metrics imports
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import plot_confusion_matrix

In [7]:
# Load the data
file_path = 'Resources/2013-2020_Killings_by_PD_Cleaned.csv' 
killings_df = pd.read_csv(file_path)                         
killings_df

# CSV in this code manually cleaned in Excel to save time
# Will need to use '2013-2020_Killings_by_PD.csv' in final code
# Cleaning this data will require for loops

Unnamed: 0,State,City,PD,ORI,Black People Killed by Police (1/1/2013-12/31/2020),Hispanic People Killed by Police (1/1/2013-12/31/2020),Native American People Killed by Police (1/1/2013-12/31/2020),Asian People Killed by Police (1/1/2013-12/31/2020),Pacific Islanders Killed by Police (1/1/2013-12/31/2020),White People Killed by Police (1/1/2013-12/31/2020),...,Violent Crime Rate,2013 Total Arrests (UCR Data),2014 Total Arrests,2015 Total Arrests,2016 Total Arrests,2017 Total Arrests,2018 Total Arrests,2019 Total Arrests,Estimated Average Arrests per Year 2013-2019,Killings by Police per 10k Arrests
0,New Mexico,Albuquerque,Albuquerque Police Department,NM0010100,3.0,22.0,,,,12.0,...,11.2,27700.0,25447.0,22126.0,20341.0,21134.0,22247.0,22118.0,22236.0,17.1
1,California,Anaheim,Anaheim Police Department,CA0300100,3.0,8.0,,,,3.0,...,3.4,7891.0,8137.0,8381.0,9415.0,8869.0,10019.0,9645.0,9078.0,15.4
2,Alaska,Anchorage,Anchorage Police Department,AK0010100,1.0,1.0,3.0,,1.0,6.0,...,10.9,17601.0,14748.0,14387.0,13294.0,13871.0,13973.0,13655.0,13988.0,11.4
3,Texas,Arlington,Arlington Police Department,TX2200100,7.0,1.0,,2.0,,3.0,...,5.0,17258.0,16222.0,13999.0,11024.0,8354.0,9552.0,10319.0,11578.0,11.2
4,Georgia,Atlanta,Atlanta Police Department,GAAPD0000,16.0,,,,,1.0,...,9.8,30505.0,26958.0,24601.0,22161.0,22581.0,,,24075.0,7.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,Virginia,Virginia Beach,Virginia Beach Police Department,VA1280000,4.0,,,,,2.0,...,1.4,24319.0,23199.0,20188.0,19139.0,18603.0,17133.0,17082.0,19224.0,4.2
98,Kansas,Wichita,Wichita Police Department,KS0870300,1.0,3.0,,,,11.0,...,10.3,10566.0,9322.0,9307.0,9120.0,8322.0,3937.0,9507.0,8253.0,18.2
99,North Carolina,Winston-Salem,Winston-Salem Police Department,NC0340200,3.0,,,,,,...,6.4,19744.0,18875.0,16792.0,15432.0,,12786.0,,15971.0,1.9
100,United States,Big Cities Average,Big Cities Average,,850.0,554.0,21.0,51.0,19.0,586.0,...,7.2,2440564.0,2285601.0,2144017.0,2007193.0,1908113.0,1880088.0,1562378.0,20840.0,


## Preprocess Data

In [8]:
# Rename DataFrame columns
killings_df.columns = ['State', 'City', 'PD', 'ORI', 'Black_Killed, Hispanic_Killed', 'Native_Am_Killed', 
                       'Asian_Killed', 'Pacific_Islander_Killed', 'White_Killed', 'Unknown_Race_Killed', 
                       'All_Killed', 'Allegedly_Armed_Killed', 'Unarmed_Killed', 'Unclear_Armed_Status_Killed',
                       'Armed_with_Vehicle_Killed', 'Total_Population', 'Total_Black', 'Total_White',
                       'Total_Native_Am', 'Total_Asian', 'Total_Hawaiian', 'Total_Asian_Pac_Isl', 'Other',
                       'Two_Or', 'Total_Hispanic', 'Black_White_Dissimilarity_Index(2010)',
                       'Avg_Annual_Police_Homicide', 'Avg_Annual_Police_Homicide_Black', 
                       'Avg_Annual_Police_Homicide_White', 'Avg_Annual_Police_Homicide_Hispanic', 
                       'Black_White_Disparity', 'Hispanic_White_Disparity', 'Violent_Crimes_2013',
                       'Violent_Crimes_2014', 'Violent_Crimes_2015', 'Violent_Crimes_2016', 'Violent_Crimes_2017',
                       'Violent_Crimes_2018', 'Violent_Crimes_2019', 'Average_Violent_Crimes_2013-2019',
                       'Violent_Crime_Rate', '2013_Total_Arrests', '2014_Total_Arrests', '2014_Total_Arrests',
                       '2015_Total_Arrests', '2016_Total_Arrests', '2017_Total_Arrests', '2018_Total_Arrests',
                       '2019_Total_Arrests', 'Est_Avg_Arrests_per_Year(2013-2019)', 
                       'Killings_by_Police_per_10k_Arrests'
                      ]
killings_df.head(5)

Unnamed: 0,State,City,PD,ORI,"Black_Killed, Hispanic_Killed",Native_Am_Killed,Asian_Killed,Pacific_Islander_Killed,White_Killed,Unknown_Race_Killed,...,2013_Total_Arrests,2014_Total_Arrests,2014_Total_Arrests.1,2015_Total_Arrests,2016_Total_Arrests,2017_Total_Arrests,2018_Total_Arrests,2019_Total_Arrests,Est_Avg_Arrests_per_Year(2013-2019),Killings_by_Police_per_10k_Arrests
0,New Mexico,Albuquerque,Albuquerque Police Department,NM0010100,3.0,22.0,,,,12.0,...,11.2,27700.0,25447.0,22126.0,20341.0,21134.0,22247.0,22118.0,22236.0,17.1
1,California,Anaheim,Anaheim Police Department,CA0300100,3.0,8.0,,,,3.0,...,3.4,7891.0,8137.0,8381.0,9415.0,8869.0,10019.0,9645.0,9078.0,15.4
2,Alaska,Anchorage,Anchorage Police Department,AK0010100,1.0,1.0,3.0,,1.0,6.0,...,10.9,17601.0,14748.0,14387.0,13294.0,13871.0,13973.0,13655.0,13988.0,11.4
3,Texas,Arlington,Arlington Police Department,TX2200100,7.0,1.0,,2.0,,3.0,...,5.0,17258.0,16222.0,13999.0,11024.0,8354.0,9552.0,10319.0,11578.0,11.2
4,Georgia,Atlanta,Atlanta Police Department,GAAPD0000,16.0,,,,,1.0,...,9.8,30505.0,26958.0,24601.0,22161.0,22581.0,,,24075.0,7.5


In [9]:
# Get column data types
killings_df.dtypes

State                                     object
City                                      object
PD                                        object
ORI                                       object
Black_Killed, Hispanic_Killed            float64
Native_Am_Killed                         float64
Asian_Killed                             float64
Pacific_Islander_Killed                  float64
White_Killed                             float64
Unknown_Race_Killed                      float64
All_Killed                               float64
Allegedly_Armed_Killed                     int64
Unarmed_Killed                           float64
Unclear_Armed_Status_Killed              float64
Armed_with_Vehicle_Killed                float64
Total_Population                         float64
Total_Black                               object
Total_White                               object
Total_Native_Am                           object
Total_Asian                               object
Total_Hawaiian      

In [10]:
# Find null values
for column in killings_df.columns:
    print(f'Column {column} has {killings_df[column].isnull().sum()} null values')

Column State has 0 null values
Column City has 0 null values
Column PD has 0 null values
Column ORI has 2 null values
Column Black_Killed, Hispanic_Killed has 7 null values
Column Native_Am_Killed has 27 null values
Column Asian_Killed has 90 null values
Column Pacific_Islander_Killed has 69 null values
Column White_Killed has 92 null values
Column Unknown_Race_Killed has 9 null values
Column All_Killed has 44 null values
Column Allegedly_Armed_Killed has 0 null values
Column Unarmed_Killed has 3 null values
Column Unclear_Armed_Status_Killed has 19 null values
Column Armed_with_Vehicle_Killed has 46 null values
Column Total_Population has 48 null values
Column Total_Black has 0 null values
Column Total_White has 0 null values
Column Total_Native_Am has 0 null values
Column Total_Asian has 1 null values
Column Total_Hawaiian has 1 null values
Column Total_Asian_Pac_Isl has 1 null values
Column Other has 1 null values
Column Two_Or has 1 null values
Column Total_Hispanic has 1 null valu

In [11]:
# Replace object NaNs with 'unknown'


In [12]:
# Replace float64 NaNs with '0'

In [13]:
# Drop unnecessary columns
del killings_df['ORI']
killings_df.head()

Unnamed: 0,State,City,PD,"Black_Killed, Hispanic_Killed",Native_Am_Killed,Asian_Killed,Pacific_Islander_Killed,White_Killed,Unknown_Race_Killed,All_Killed,...,2013_Total_Arrests,2014_Total_Arrests,2014_Total_Arrests.1,2015_Total_Arrests,2016_Total_Arrests,2017_Total_Arrests,2018_Total_Arrests,2019_Total_Arrests,Est_Avg_Arrests_per_Year(2013-2019),Killings_by_Police_per_10k_Arrests
0,New Mexico,Albuquerque,Albuquerque Police Department,3.0,22.0,,,,12.0,1.0,...,11.2,27700.0,25447.0,22126.0,20341.0,21134.0,22247.0,22118.0,22236.0,17.1
1,California,Anaheim,Anaheim Police Department,3.0,8.0,,,,3.0,,...,3.4,7891.0,8137.0,8381.0,9415.0,8869.0,10019.0,9645.0,9078.0,15.4
2,Alaska,Anchorage,Anchorage Police Department,1.0,1.0,3.0,,1.0,6.0,4.0,...,10.9,17601.0,14748.0,14387.0,13294.0,13871.0,13973.0,13655.0,13988.0,11.4
3,Texas,Arlington,Arlington Police Department,7.0,1.0,,2.0,,3.0,,...,5.0,17258.0,16222.0,13999.0,11024.0,8354.0,9552.0,10319.0,11578.0,11.2
4,Georgia,Atlanta,Atlanta Police Department,16.0,,,,,1.0,1.0,...,9.8,30505.0,26958.0,24601.0,22161.0,22581.0,,,24075.0,7.5


## Split Data into Training and Testing

In [14]:
# Create features
X = pd.get_dummies(killings_df.drop(columns = ['Total_Black']))

# Create target
y = pd.DataFrame(killings_df['Total_Black'])

In [15]:
# Calculate stats
X.describe()

Unnamed: 0,"Black_Killed, Hispanic_Killed",Native_Am_Killed,Asian_Killed,Pacific_Islander_Killed,White_Killed,Unknown_Race_Killed,All_Killed,Allegedly_Armed_Killed,Unarmed_Killed,Unclear_Armed_Status_Killed,...,Killings_by_Police_per_10k_Arrests_8.3,Killings_by_Police_per_10k_Arrests_8.5,Killings_by_Police_per_10k_Arrests_8.6,Killings_by_Police_per_10k_Arrests_8.7,Killings_by_Police_per_10k_Arrests_9,Killings_by_Police_per_10k_Arrests_9.2,Killings_by_Police_per_10k_Arrests_9.5,Killings_by_Police_per_10k_Arrests_9.6,Killings_by_Police_per_10k_Arrests_9.7,Killings_by_Police_per_10k_Arrests_9.8
count,95.0,75.0,12.0,33.0,10.0,93.0,58.0,102.0,99.0,83.0,...,102.0,102.0,102.0,102.0,102.0,102.0,102.0,102.0,102.0,102.0
mean,40.947368,34.893333,13.583333,7.090909,8.7,53.494624,21.034483,129.27451,16.656566,3.831325,...,0.009804,0.009804,0.009804,0.009804,0.019608,0.009804,0.009804,0.009804,0.009804,0.009804
std,239.194409,184.043368,34.272858,24.031703,15.477941,397.663273,125.595699,889.816937,17.442712,3.708327,...,0.099015,0.099015,0.099015,0.099015,0.139333,0.099015,0.099015,0.099015,0.099015,0.099015
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,1.0,1.75,1.0,1.0,2.0,1.0,9.0,7.0,1.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5.0,4.0,2.0,1.0,1.0,4.0,2.0,16.0,10.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,11.5,9.5,3.25,2.0,9.25,10.0,3.0,27.0,20.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2190.0,1509.0,121.0,132.0,49.0,3803.0,952.0,8756.0,103.0,21.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state = 1)

y_train.shape

(76, 1)

## Oversampling

### Naive Random Oversampling