In [1]:
import warnings
warnings.filterwarnings('ignore')

# Matplotlib inline magic command
%matplotlib inline

In [3]:
# Initial imports
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

import numpy as np
import seaborn as sns
from collections import Counter
import psycopg2
from config import db_password
import json
from pandas.io.json import json_normalize
import mysql.connector
from sqlalchemy import create_engine
from os import walk
import time

import matplotlib as mpl
import matplotlib.pyplot as plt

In [4]:
# Scikit-Learn Model Selection and Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [5]:
# Scikit-Learn Model imports
from sklearn.linear_model import LinearRegression

In [6]:
# Scikit-Learn Metrics imports
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix
from imblearn.metrics import classification_report_imbalanced

ModuleNotFoundError: No module named 'imblearn'

In [8]:
# Imbalanced Learn imports
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.over_sampling import RandomOverSampler

ModuleNotFoundError: No module named 'imblearn'

In [None]:
# Connect to AWS
def getDBengine():
    """Create a SQLAlchemy engine for the PoliceKillings PostgreSQL database.

    The password comes from ``db_password`` (imported from the local ``config``
    module at the top of the notebook) rather than a string literal.

    Returns:
        The engine object produced by ``create_engine``.
    """
    # Local import so this block does not rely on a new top-of-notebook import.
    from urllib.parse import quote_plus

    user = 'postgres'
    host = 'policekillings.cv95lgysyvwq.us-east-1.rds.amazonaws.com'
    port = 5432
    database = 'PoliceKillings'

    # Percent-encode the credentials so characters such as '@', ':' or '/'
    # cannot corrupt the connection URL.
    # 'postgresql://' is the dialect name SQLAlchemy accepts; the legacy
    # 'postgres://' alias is rejected by SQLAlchemy 1.4 and later.
    url = (
        f'postgresql://{quote_plus(user)}:{quote_plus(db_password)}'
        f'@{host}:{port}/{database}'
    )

    # pool_recycle=3600 replaces pooled connections older than one hour.
    return create_engine(url, pool_recycle=3600, echo=False)

In [None]:
# Load database into DataFrame
def getData1():
    """Load the ``public.police_description`` table into a DataFrame.

    Returns:
        A DataFrame with the query result, or an empty DataFrame when the
        query or the connection fails (best-effort behaviour is kept, but the
        failure is now reported instead of being swallowed silently).
    """
    s_statement = "SELECT * FROM public.police_description"
    try:
        # The engine factory in this notebook is getDBengine(); no getDBengine1
        # is defined in the visible cells, so calling it raised a NameError that
        # the bare `except` hid, and the function always returned an empty frame.
        return pd.read_sql(s_statement, con=getDBengine())
    except Exception as err:
        # Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit
        # still propagate. Printed rather than warned because the notebook
        # silences warnings in its first cell.
        print(f'Could not load public.police_description: {err!r}')
        return pd.DataFrame()

In [None]:
# Load the data
# NOTE(review): no visible cell assigns killings_df, yet every cell below reads it.
# The Excel load here is commented out and getData1() is never called in the
# visible cells -- confirm which source is intended and assign killings_df here.
#file_path = 'Resources/2013-2020_Police_Killings_Revised.xlsx' 
#killings_df = pd.read_excel(file_path)                         
#killings_df

In [None]:
# Map the long source headers to short snake-case column names
column_renames = {
    "Victim's age": 'Victim_Age',
    "Victim's gender": 'Victim_Gender',
    "Victim's race": 'Victim_Race',
    'Date of Incident (month/day/year)': 'Date',
    'Agency responsible for death': 'Responsible_Agency',
    'Cause of death': 'Cause_of_Death',
    'A brief description of the circumstances surrounding the death': 'Brief_Description',
    'Criminal Charges?': 'Criminal_Charges',
    'Symptoms of mental illness?': 'Mental_Illness',
    'Armed/Unarmed Status': 'Armed_Status',
    'Alleged Threat Level (Source: WaPo)': 'Threat_Level',
    'Fleeing (Source: WaPo)': 'Fleeing',
    'Body Camera (Source: WaPo)': 'Body_Camera',
    'Encounter Type (DRAFT)': 'Encounter_Type',
    'Initial Reported Reason for Encounter (DRAFT)': 'Initial_Reason_for_Encounter',
    'Call for Service? (DRAFT)': 'Call_for_Service',
}

# Applied in place so the existing killings_df object carries the new names
killings_df.rename(columns=column_renames, inplace=True)

killings_df.head()

In [None]:
# Delete Gender column
# errors='ignore' keeps this cell re-runnable: a bare `del` raises KeyError the
# second time the cell runs because the column is already gone.
killings_df = killings_df.drop(columns=['Victim_Gender'], errors='ignore')

In [None]:
# Report how many missing entries each column contains
for col_name in killings_df.columns:
    missing_count = killings_df[col_name].isna().sum()
    print(f'Column {col_name} has {missing_count} null values')

In [None]:
# Replace empty/blank values for Age, Threat_Level, Fleeing, Body_Camera, Encounter_Type,
# Initial_Reason_for_Encounter and Call_for_Service, then re-verify null values.
#
# The results are assigned back instead of calling
# `killings_df[col].replace(..., inplace=True)`: an in-place call on a selected
# column is not guaranteed to write back to the frame under pandas
# Copy-on-Write, and this notebook silences the warning that would report it.

# Column: Age -- the 'Unknown' placeholder becomes '0' so the column can be cast to int later.
# TODO: consider filling 'Unknown' with the column mean instead of '0'.
killings_df['Victim_Age'] = killings_df['Victim_Age'].replace({'Unknown': '0'})

# Label written wherever one of these columns has a missing value.
missing_value_labels = {
    'Threat_Level': 'undetermined',
    'Fleeing': 'unknown',
    'Body_Camera': 'unknown',
    'Encounter_Type': 'Unknown',
    'Initial_Reason_for_Encounter': 'unknown',
    'Call_for_Service': 'Unavailable',
}

# fillna with a dict fills each listed column with its own label and leaves
# every other column untouched.
killings_df = killings_df.fillna(value=missing_value_labels)

# Verify null values
for column in killings_df.columns:
    print(f'Column {column} has {killings_df[column].isnull().sum()} null values')

In [None]:
# Discard every row that still has a missing value, then renumber the rows
killings_df = (
    killings_df
    .dropna(how='any')
    .reset_index()
    .drop(columns=['index'])
)
killings_df

In [None]:
# Show the Brief_Description text of the row labelled 4532
killings_df.loc[4532, 'Brief_Description']

In [None]:
# Inspect column data types (one dtype per column of killings_df)
killings_df.dtypes

In [None]:
# Convert Victim_Age to int, going through str first
age_as_text = killings_df['Victim_Age'].astype(str)
killings_df['Victim_Age'] = age_as_text.astype(int)

# Confirm the new dtype
killings_df.dtypes

In [None]:
# Parse Date once so every derived column comes from the same datetime values.
# Previously Year/Month/Day re-parsed the column three times, while Day_of_Week
# used the .dt accessor directly, which fails unless Date is already datetime64.
killings_df['Date'] = pd.to_datetime(killings_df['Date'])
incident_dates = killings_df['Date']

# Extract year from Date and create new column Year
killings_df['Year'] = incident_dates.dt.year

# Extract month from Date and create new column Month
killings_df['Month'] = incident_dates.dt.month

# Extract day from Date and create new column Day
killings_df['Day'] = incident_dates.dt.day

# Get day of week from Date and create new column Day_of_Week
killings_df['Day_of_Week'] = incident_dates.dt.day_name()

In [None]:
# Get holiday from Date and create new column Holiday (True when the incident
# date is a US federal holiday).
# NOTE: only holidays inside this window are generated; incidents dated outside
# it are always flagged False.
HOLIDAY_RANGE_START = pd.Timestamp('2013-01-01')
HOLIDAY_RANGE_END = pd.Timestamp('2021-04-18')

cal = calendar()
holidays = cal.holidays(start=HOLIDAY_RANGE_START, end=HOLIDAY_RANGE_END)

# normalize() resets the time of day to midnight so each incident compares
# equal to the midnight holiday timestamps. This replaces
# `.dt.date.astype('datetime64')`, whose unit-less dtype string is rejected by
# pandas 2.x. The unused scratch DataFrame from the old cell is dropped.
incident_days = pd.to_datetime(killings_df['Date']).dt.normalize()
killings_df['Holiday'] = incident_days.isin(holidays)

In [None]:
#killings_df

In [None]:
# One-hot encode the listed text columns with get_dummies()
categorical_columns = [
    'Cause_of_Death',
    'Mental_Illness',
    'Armed_Status',
    'Threat_Level',
    'Fleeing',
    'Body_Camera',
    'Geography',
]
killings_encoded = pd.get_dummies(killings_df, columns=categorical_columns)

killings_encoded.head()

# We are keeping race because that is what our model will be running against, right?  Age was converted to an 
# int and Date is now datetime
# Are we deleting the columns that are not in killings_encoded then?  I took notes on this during the tutor
# session, but they do not make sense to me

## Split Data into Training and Testing Sets

In [None]:
# The rename step maps "Victim's race" to 'Victim_Race', so a column called
# 'race' does not exist after it and the original lookup raised KeyError.
# Fall back to 'race' only if the frame really has no 'Victim_Race' column.
target_column = 'Victim_Race' if 'Victim_Race' in killings_df.columns else 'race'

# Create features: every column except the target, one-hot encoded
# NOTE(review): this also encodes any remaining free-text columns and keeps
# Date as-is -- confirm which columns should actually be model features.
X = pd.get_dummies(killings_df.drop(columns=[target_column]))

# Create target as a one-column DataFrame. It is exposed under the name 'race'
# because later cells index it as y['race'].
y = killings_df[[target_column]].rename(columns={target_column: 'race'})

In [None]:
# Summary statistics for the feature frame X
X.describe()

In [None]:
# Check balance of target values: number of rows for each distinct value in y['race']
y['race'].value_counts()

In [None]:
# Split features and target into train/test partitions, stratified on the target;
# random_state is fixed so the split is repeatable
split_parts = train_test_split(X, y, random_state=78, stratify=y)
X_train, X_test, y_train, y_test = split_parts

y_train.shape

In [None]:
# Build the scaler and fit it on the training split only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to the training split and then the test split
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [None]:
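# ML model code
# TODO: a model still needs to be chosen and fitted here; the cells below expect
# the fitted model to be bound to the name `rf_model`.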

In [None]:
# Predict on the test split, then compute the balanced accuracy score
# NOTE(review): rf_model is not defined in any visible cell, and this predicts on
# the unscaled X_test although X_test_scaled is prepared earlier -- confirm.
rf_y_pred = rf_model.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=rf_y_pred)

In [None]:
# Build the confusion matrix from test labels vs. predictions and display it
rf_cm = confusion_matrix(y_true=y_test, y_pred=rf_y_pred)
rf_cm

In [None]:
# Build the imbalanced classification report, then print it
imbalanced_report = classification_report_imbalanced(y_test, rf_y_pred)
print(imbalanced_report)

In [None]:
# Pair each importance with its feature name and list them highest first - is this needed?
importance_by_feature = list(zip(rf_model.feature_importances_, X.columns))
importance_by_feature.sort(reverse=True)
importance_by_feature

In [None]:
# Model prediction: print the predictions computed earlier by rf_model.predict(X_test)
print(rf_y_pred)