The following packages/modules are used:
- pandas
- XGBoost
- joblib
- numpy

We also use functions from the NLP_Preprocessing file.

In [17]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier
import joblib

## 1. Load Features

Load the feature matrix produced by the inference feature-engineering step.

In [18]:
# Read the inference feature matrix. The CSV has no header row, so pandas
# labels the columns 0..n-1 until they are renamed further down.
features = pd.read_csv(filepath_or_buffer='inference_features.csv', header=None)

In [19]:
# Load the pickled vectorizer and take its vocabulary; these words become the
# headers of the text-feature columns.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load a
# 'countvectorizer.pkl' that comes from a trusted source.
cv = joblib.load(filename='countvectorizer.pkl')
words = cv.get_feature_names_out()

In [20]:
# Labels for the non-text feature columns, listed in the order they are
# expected to follow the vocabulary columns in the feature matrix.
# NOTE(review): this order presumably mirrors the training-time layout -- confirm.
names = ['victim_age_1','subject_age_1','East', 'North', 'precinct_OOJ', 'South', 'Southwest', 'West', 'precinct_Unknown',
 'Female','Gender Diverse (gender non-conforming and/or transgender)', 'Male', 'Vic_Gender_Unknown',
 'American Indian or Alaska Native', 'Asian', 'Black or African American', 'Native Hawaiian or Other Pacific Islander',
 'Vic_Race_Unknown', 'White', 'Hispanic Or Latino', 'Not Hispanic Or Latino', 'Vic_Ethni_Unknown',
 'subject_American Indian or Alaska Native', 'subject_Asian', 'subject_Black or African American',
 'subject_Native Hawaiian or Other Pacific Islander', 'subject_Sub_Race_Unknown', 'subject_White',
 'subject_Female', 'subject_Gender Diverse (gender non-conforming and/or transgender)',
 'subject_Male', 'subject_Sub_Gender_Unknown', 'subject_Hispanic Or Latino', 'subject_Not Hispanic Or Latino',
 'subject_Sub_Ethni_Unknown', 'B1', 'B2', 'B3', 'C1', 'C2', 'C3', 'D1', 'D2', 'D3', 'E1', 'E2', 'E3', 'F1', 'F2',
 'F3', 'G1', 'G2', 'G3', 'H1', 'H2', 'H3', 'J1', 'J2', 'J3', 'K1', 'K2', 'K3', 'L1', 'L2', 'L3', 'M1', 'M2', 'M3',
 'N1', 'N2', 'N3', 'O1', 'O2', 'O3', 'Q1', 'Q2', 'Q3', 'R1', 'R2', 'R3', 'S1', 'S2', 'S3', 'U1', 'U2', 'U3',
 'beat_Unknown', 'W1', 'W2', 'W3', 'beat_OOJ']

# Full header: vocabulary words first, then the hand-listed feature names.
col_names = np.concatenate((words, names))

# zip() silently stops at the shorter input, which would leave columns
# unlabelled (or mislabelled) without any error. Fail loudly instead.
if len(col_names) != features.shape[1]:
    raise ValueError(
        f"Expected {len(col_names)} feature columns but 'features' has "
        f"{features.shape[1]}; column labels would be misaligned."
    )

# rename() returns a new frame, so no `inplace` flag is needed. The original
# passed `inplace=True` to dict() by mistake, which only added a stray
# {'inplace': True} entry to the rename mapping.
features = features.rename(columns=dict(zip(features.columns, col_names)))

## Read Trained Model and Generate Predictions

In [21]:
# Create an empty Booster, then populate it from the saved model file.
bst = xgb.Booster()
bst.load_model(fname='xgboost_model')

In [22]:
# Score every row with the loaded booster.
# Build the DMatrix from the model's input columns only: `features` gains a
# "prediction" column below (and "pred_optimal" in a later cell), so passing the
# whole frame would feed those outputs back into the model if this cell is re-run.
dfeat = xgb.DMatrix(features[col_names])
preds = bst.predict(dfeat)

# Keep the raw model output next to the features it was computed from.
features["prediction"] = preds

In [23]:
# Load the optimal classification threshold; the CSV has no header row.
ot = pd.read_csv(filepath_or_buffer='optimal_threshold.csv', header=None)

In [24]:
# The threshold is the top-left cell of the header-less CSV. Read it once,
# positionally, instead of re-evaluating `ot.iloc[0][0]` (which builds a row
# Series) for every single prediction.
threshold = ot.iloc[0, 0]

# 1 when the score reaches the threshold, otherwise 0. Converted back to a
# plain list of ints so `preds_optimal` keeps its original type.
preds_optimal = (np.asarray(preds) >= threshold).astype(int).tolist()
features['pred_optimal'] = preds_optimal

In [25]:
# Load the report identifiers (reporting_event_number and report_id) used to
# identify which reports received a positive prediction.
ids = pd.read_csv(filepath_or_buffer='inference_reports.csv')

## Save Reports with Positive Predictions

In [34]:
# Keep only the rows whose thresholded prediction is positive (1).
pos_preds = features.loc[features['pred_optimal'].eq(1)]

In [35]:
# Save the reports with positive predictions to provide to the bias crime unit.
# The join is on the row index, so it assumes 'inference_reports.csv' lists the
# reports in the same order as the feature matrix -- TODO confirm upstream.
# Identifier columns already present (from a previous run of this cell) are
# dropped first; otherwise re-running would raise a column-overlap error in join().
pos_preds = pos_preds.drop(columns=ids.columns, errors='ignore').join(ids, how='left')
pos_preds[['report_id', 'reporting_event_number']].to_csv('pos_preds.csv', index=False)