## Questions

If someone is stopped, can we predict the probability that they will be frisked?

If someone is stopped, can we predict the probability that they will be arrested?

If someone is stopped and frisked, can we predict the probability that they will be arrested?

How can we minimize the damage that this unconstitutional law inflicts on its citizens?

What observed characteristics lead to arrests?

We chose to look only at features that were available to the officer before the stop was initiated.

Our objective is to maximize the precision of stop-and-frisk procedures.

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.model_selection import train_test_split
import data_cleaner as dc
import data_modeler as dm
import importlib

# Re-import the local helper modules so that edits made to them on disk take
# effect in this kernel without a restart.
for helper_module in (dc, dm):
    importlib.reload(helper_module)

# Seed NumPy's global random generator so steps that draw from it are repeatable.
np.random.seed(5)

In [3]:
# Load the complete dataset through the project's data_cleaner helper; `df` is
# the frame that every later cell samples from.
df = dc.load_full_sqf()

In [4]:
# df = dc.load_full_sqf(force=True)

In [5]:
# filespecs = dc.load_filespecs()
# pd.options.display.max_rows=112
# pd.options.display.max_colwidth=100
# pd.options.display.max_columns=113
# filespecs[2016][['Variable', 'Label']]

We only looked at 0.1% of the data. We limited our features to those that could be determined before the stop was initiated.

In [6]:
# Draw a 0.1% random sample of the rows to keep experimentation fast.
df_sample = df.sample(frac=0.001)

# Target vector: the `arstmade` column of the sampled rows.
y = df_sample['arstmade']

# Feature frame limited to the columns listed in dm.PRE_STOP_OBSERVABLES.
X_pre_so = df_sample[dm.PRE_STOP_OBSERVABLES]

We split the data into categorical features and yes/no (Y/N) features.

In [7]:
# Partition the pre-stop features by dtype: int8 columns (the yes/no flags)
# in one frame, pandas categoricals in the other.
X_yn = X_pre_so.select_dtypes(include=['int8'])
X_category = X_pre_so.select_dtypes(include=['category'])

Fill in missing values in X_category and one hot encode categorical features.

In [8]:
# Replace missing values in the categorical frame. The 'NoVal' level that shows
# up in the encoded column names is presumably the placeholder used here —
# TODO confirm in data_modeler.
X_cat_nonan = dm.fill_NaNs(X_category) # fills in NaNs
# One-hot encode; the helper hands back the encoded matrix and its feature names.
X_ohe, ohe_feat_names = dm.categorical_encoder(X_cat_nonan)
# .toarray() densifies the encoder output (presumably a SciPy sparse matrix —
# TODO confirm) so it can be wrapped in a DataFrame.
X_ohe_df = pd.DataFrame(X_ohe.toarray())

In [9]:
ohe_feat_names

array(['x0_NoVal', 'x1_F', 'x1_M', 'x1_NoVal', 'x1_Z', 'x2_H', 'x2_NoVal',
       'x2_P', 'x2_T', 'x3_H', 'x3_M', 'x3_NoVal', 'x3_T', 'x3_U', 'x3_Z',
       'x4_BA', 'x4_BK', 'x4_BL', 'x4_BR', 'x4_DY', 'x4_GY', 'x4_NoVal',
       'x4_RA', 'x4_RD', 'x4_SN', 'x4_SP', 'x4_XX', 'x4_ZZ', 'x5_BK',
       'x5_BL', 'x5_BR', 'x5_DF', 'x5_GR', 'x5_GY', 'x5_HA', 'x5_NoVal',
       'x5_XX', 'x5_Z', 'x6_A', 'x6_B', 'x6_I', 'x6_NoVal', 'x6_P',
       'x6_Q', 'x6_U', 'x6_W', 'x6_X', 'x6_Z', 'x7_I', 'x7_NoVal', 'x7_O',
       'x8_L', 'x8_NoVal', 'x8_R'], dtype=object)

In [10]:
# Label the one-hot columns with the encoder's feature names. This mutates
# X_ohe_df in place, so the frame differs before and after this cell runs.
X_ohe_df.columns = ohe_feat_names
X_ohe_df.head()

Unnamed: 0,x0_NoVal,x1_F,x1_M,x1_NoVal,x1_Z,x2_H,x2_NoVal,x2_P,x2_T,x3_H,...,x6_U,x6_W,x6_X,x6_Z,x7_I,x7_NoVal,x7_O,x8_L,x8_NoVal,x8_R
0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [11]:
X_yn.head()

Unnamed: 0,ac_stsnd,cs_casng,cs_drgtr,offunif,cs_other,cs_bulge,cs_cloth,cs_furtv,cs_descr,ac_other,cs_objcs,cs_vcrim,cs_lkout,ac_incid,ac_time
4835191,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1
1680638,0,1,0,0,0,0,0,1,0,0,0,0,1,1,1
4650672,0,0,0,1,0,1,0,1,0,0,0,1,0,1,0
4470588,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0
726114,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0


Merge X_ohe and X_yn

In [12]:
# Join the one-hot block and the yes/no flag block side by side (column-wise).
# The result is a plain NumPy array, so it carries no column labels.
X_mergred = np.hstack((X_ohe_df, X_yn))

In [14]:
# Column labels for the merged matrix, in the order the two blocks were joined:
# one-hot feature names first, then the yes/no flag names.
all_names = [*ohe_feat_names.tolist(), *X_yn.columns.tolist()]
all_names

['x0_NoVal',
 'x1_F',
 'x1_M',
 'x1_NoVal',
 'x1_Z',
 'x2_H',
 'x2_NoVal',
 'x2_P',
 'x2_T',
 'x3_H',
 'x3_M',
 'x3_NoVal',
 'x3_T',
 'x3_U',
 'x3_Z',
 'x4_BA',
 'x4_BK',
 'x4_BL',
 'x4_BR',
 'x4_DY',
 'x4_GY',
 'x4_NoVal',
 'x4_RA',
 'x4_RD',
 'x4_SN',
 'x4_SP',
 'x4_XX',
 'x4_ZZ',
 'x5_BK',
 'x5_BL',
 'x5_BR',
 'x5_DF',
 'x5_GR',
 'x5_GY',
 'x5_HA',
 'x5_NoVal',
 'x5_XX',
 'x5_Z',
 'x6_A',
 'x6_B',
 'x6_I',
 'x6_NoVal',
 'x6_P',
 'x6_Q',
 'x6_U',
 'x6_W',
 'x6_X',
 'x6_Z',
 'x7_I',
 'x7_NoVal',
 'x7_O',
 'x8_L',
 'x8_NoVal',
 'x8_R',
 'ac_stsnd',
 'cs_casng',
 'cs_drgtr',
 'offunif',
 'cs_other',
 'cs_bulge',
 'cs_cloth',
 'cs_furtv',
 'cs_descr',
 'ac_other',
 'cs_objcs',
 'cs_vcrim',
 'cs_lkout',
 'ac_incid',
 'ac_time']

In [15]:
# Split the merged feature matrix and the target, passing y as the stratify
# argument (load_split presumably forwards it to train_test_split so both
# parts keep the same class balance — TODO confirm in data_modeler).
merged_split = dm.load_split(X_mergred, y, stratify=y)
# Fit the project's random-forest helper on that split.
merged_rf = dm.run_rf(merged_split)



In [16]:
# Let pandas print up to 77 rows so the transposed table is not truncated.
pd.set_option('display.max_rows', 77)

# One-row frame of the forest's feature importances, labelled by feature name;
# transposed for display so that each feature becomes a row.
rf_feature_df = pd.DataFrame(
    [merged_rf.feature_importances_],
    columns=all_names,
)
rf_feature_df.T

Unnamed: 0,0
x0_NoVal,0.0
x1_F,0.0
x1_M,0.0
x1_NoVal,0.0
x1_Z,0.0
x2_H,0.023857
x2_NoVal,0.0
x2_P,0.0
x2_T,0.0
x3_H,0.0


# Everything after this point is garbage

## Data Cleaning

Zeros were used to fill in missing data.

The data was separated into pre-stop features and during-stop features

In [9]:
# Re-sample at 1% of the rows (this rebinds df_sample, replacing the earlier
# 0.1% sample) and list the available columns.
df_sample = df.sample(frac=.01)
df_sample.columns

Index(['year', 'pct', 'ser_num', 'datestop', 'timestop', 'recstat', 'inout',
       'trhsloc', 'perobs', 'crimsusp',
       ...
       'dettypcm', 'linecm', 'detailcm', 'datetimestop', 'height', 'wepfound',
       'forceuse', 'month', 'day', 'detail1_'],
      dtype='object', length=118)

In [10]:
# Take every categorical column of the current sample (not only the pre-stop
# subset) and rebind y to the arstmade column of the same sample.
X_category = df_sample.select_dtypes(include='category')
y = df_sample.arstmade
X_category.head()

Unnamed: 0,recstat,inout,trhsloc,typeofid,officrid,offverb,offshld,sex,race,haircolr,eyecolor,build,addrtyp,rescode,city,sector,dettypcm,forceuse,month,day
3657824,A,O,P,P,,,,M,B,BK,BR,T,R,,BRONX,T,CM,,,
1884111,A,I,,P,,,,M,B,BK,BR,M,L,,MANHATTAN,J,CM,,,
4174492,1,I,T,V,I,V,S,M,B,BK,BR,T,R,,MANHATTAN,F,CM,,,
1497686,A,O,,P,,V,S,M,W,BL,BL,M,L,,QUEENS,K,CM,,,
2073468,A,O,,P,,,,M,W,GY,BL,M,L,,QUEENS,E,CM,,,


In [12]:
# All int8 columns of the current sample.
# NOTE(review): per the displayed output this frame includes arstmade itself,
# i.e. the prediction target, so it must not be used as a feature matrix as-is.
X_yn = df_sample.select_dtypes(include='int8')
X_yn.head()

Unnamed: 0,explnstp,othpers,arstmade,sumissue,offunif,frisked,searched,contrabn,adtlrept,pistol,...,rf_knowl,ac_stsnd,ac_other,sb_hdobj,sb_outln,sb_admis,sb_other,rf_furt,rf_bulg,wepfound
3657824,1,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1884111,1,1,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4174492,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1497686,1,1,1,0,0,1,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2073468,1,0,0,0,1,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [56]:
# Fit the forest on the one-hot features only and show the raw importances.
# NOTE(review): in top-to-bottom order X_ohe_df was built from the earlier 0.1%
# sample while y now comes from the 1% sample, so their lengths would not
# match — this cell appears to rely on out-of-order execution; confirm before reuse.
merged_split = dm.load_split(X_ohe_df, y, stratify=y)
merged_rf = dm.run_rf(merged_split)
merged_rf.feature_importances_



array([0.        , 0.        , 0.        , 0.12522809, 0.        ,
       0.03404644, 0.00372565, 0.00248739, 0.03915427, 0.        ,
       0.        , 0.        , 0.01539468, 0.        , 0.        ,
       0.        , 0.        , 0.00150529, 0.        , 0.        ,
       0.        , 0.14217938, 0.02959635, 0.        , 0.        ,
       0.011777  , 0.        , 0.        , 0.        , 0.00451466,
       0.        , 0.        , 0.06355916, 0.08270336, 0.04746247,
       0.        , 0.        , 0.04818323, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.03735241, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.0480778 , 0.09267573, 0.04809112,
       0.12228553, 0.        ])

In [11]:
# Fill missing categorical values, then one-hot encode them.
X_cat_nonan = dm.fill_NaNs(X_category) # fills in NaNs
# categorical_encoder returns (encoded matrix, feature names) — that is how it
# is unpacked where the pre-stop features are encoded earlier in this notebook.
# Unpack both here as well; binding the whole tuple to X_ohe would pass a tuple
# on to load_split instead of the feature matrix.
X_ohe, ohe_feat_names = dm.categorical_encoder(X_cat_nonan) # One-hot encodes and fit_transforms

In [211]:
# encoder.get_feature_names()

In [215]:
# Split the one-hot encoded features and the target, passing y as the
# stratify argument.
SD_split = dm.load_split(X_ohe, y, stratify=y)

In [216]:
def run_rf(split, **kwargs):
    """Fit a random forest on SMOTE-oversampled training data.

    Parameters
    ----------
    split : mapping
        Must provide the keys 'X_train' and 'y_train'.
    **kwargs
        Extra keyword arguments forwarded to RandomForestClassifier.
        ``max_depth`` defaults to 2 when not supplied, which matches the
        previous hard-coded behaviour.

    Returns
    -------
    RandomForestClassifier
        The estimator fitted on the resampled training data.
    """
    # Imported here because SMOTE is not part of the notebook's import cell and
    # is otherwise only imported further down; this keeps the function usable
    # on a fresh kernel.
    from imblearn.over_sampling import SMOTE

    smote = SMOTE()
    # fit_resample is the supported name; the older fit_sample alias has been
    # removed from imbalanced-learn.
    X_train_resampled, y_train_resampled = smote.fit_resample(
        split['X_train'], split['y_train']
    )

    # Caller-supplied settings win over the default shallow depth.
    rf_params = {'max_depth': 2, **kwargs}
    rf = RandomForestClassifier(**rf_params)
    rf.fit(X_train_resampled, y_train_resampled)
    return rf

In [217]:
# Fit using the notebook-local run_rf (not dm.run_rf); the trailing expression
# echoes the fitted estimator and its parameters.
SD_rf = run_rf(SD_split)
SD_rf



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=2, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [220]:
# Reload data_modeler and refit with its run_rf instead of the notebook-local
# one; this rebinds SD_rf to the new estimator.
importlib.reload(dm)
SD_rf = dm.run_rf(SD_split)
SD_rf



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=2, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [218]:
SD_rf.feature_importances_

array([0.        , 0.        , 0.        , 0.        , 0.04843267,
       0.        , 0.12388989, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.00508404, 0.        , 0.07752745, 0.        ,
       0.09610462, 0.        , 0.02978267, 0.        , 0.00120612,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.0945771 , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.06190992, 0.        ,
       0.07712812, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.00234777, 0.        , 0.05907976, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.00772282, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [133]:
"""Transform your data"""
# Cell-local import: SMOTE is not part of the notebook's import cell.
from imblearn.over_sampling import SMOTE

# --- Split (stratified on the target) ---
# NOTE(review): X_category_nona is not defined anywhere in the visible notebook
# (the closest name is X_cat_nonan), and `encoder` used below is only created
# in a later cell. This cell therefore depends on kernel state from another
# session — confirm the intended inputs before relying on it.
X_train, X_test, y_train, y_test = train_test_split(X_category_nona, y,
                                                    stratify=y)

# --- Oversample the training portion only ---
# fit_resample is the supported name; the older fit_sample alias has been
# removed from imbalanced-learn.
smt = SMOTE()
X_train, y_train = smt.fit_resample(X_train, y_train)

# --- Model: shallow random forest ---
rf = RandomForestClassifier(max_depth=2)
rf.fit(X_train, y_train)
print(rf.feature_importances_)
# One-row frame of importances labelled with the encoder's feature names.
rf_feature_df = pd.DataFrame([rf.feature_importances_], columns=encoder.get_feature_names().tolist())

# --- Evaluate on the untouched test portion ---
predictions = rf.predict(X_test)
# Balanced accuracy is reported rather than plain accuracy.
print(balanced_accuracy_score(y_test, predictions))

[0.         0.         0.00844523 0.         0.         0.04383786
 0.         0.05966516 0.         0.07265088 0.03215144 0.00200409
 0.         0.         0.03083676 0.         0.         0.
 0.         0.04328193 0.         0.         0.         0.02407044
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.03399767 0.         0.
 0.0251278  0.         0.         0.         0.03556214 0.
 0.         0.         0.         0.         0.00901606 0.10063131
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.08367829 0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.04608461
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.09155477 0.001755
 0.         0.         0.         0.         0.0162644  0.
 0.         0.         0.06424732 0.         0.         0.
 0.1133605



In [146]:
# Raise the row display limit to 135 so the transposed importance table is
# shown without truncation.
pd.set_option('display.max_rows', 135)
rf_feature_df.T

Unnamed: 0,0
x0_1,0.0
x0_9,0.0
x0_A,0.008445
x0_APP,0.0
x0_NoVal,0.0
x1_I,0.043838
x1_NoVal,0.0
x1_O,0.059665
x2_H,0.0
x2_NoVal,0.072651


In [89]:
# # Feature Engineering
# df_sample['height'] = 12*df_sample['ht_feet'] + df_sample['ht_inch']
# df_sample = df_sample.drop(columns=['ht_feet', 'ht_inch'])
# df_sample.columns

In [16]:
# Drop columns that are missing a lot of data
dropped_columns = ['month', 'day', 'forceuse', 'detail1_', 'arstoffn', 'officrid',
                  'offverb', 'offshld', 'rescode', 'premtype', 'aptnum', 'state',
                  'zip', 'beat', 'post', 'crimsusp', 'stinter', 'stname', 'crossst',
                  'premname', 'sumoffen', 'othfeatr', 'addrnum', 'repcmd','dob',
                  'revcmd', 'datetimestop']
df_sample = df.sample(frac=.001)
df_sample = df_sample.drop(columns=dropped_columns)
y = df_sample.arstmade
X = df_sample.drop(columns='arstmade')
X_category = X.select_dtypes(include ='category')
categories = X_category.columns.to_list()
X_non_category = X.drop(columns=categories)

In [79]:
#df1 = df.select_dtypes([np.int, np.float])

# for i, col in enumerate(X_category_nona.columns):
#     plt.figure(i)
#     sns.countplot(x=col, data=X_category_nona)

Based on selected features, can we predict the probability of someone being arrested?

In [23]:
y.value_counts()

0    4796
1     290
Name: arstmade, dtype: int64

In [26]:
# Only works if there are no NaNs
# sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
# sel.fit_transform(X_non_category)
# sel

In [29]:
# Disabled: print the value counts of every non-categorical column.
# (Index with the loop variable `feature`, not the literal string 'feature'.)
# for feature in X_non_category.columns:
#     print(X_non_category[feature].value_counts())

### Let's try modeling the categorical data

In [25]:
# Minimal categorical experiment: use only eye colour and sex as predictors.
# Keep those two columns plus the target and drop rows with a missing value
# in any of the three.
df_model = df_sample[['eyecolor', 'sex', 'arstmade']].dropna()
X_model = df_model[['eyecolor', 'sex']]
y_model = df_model['arstmade']
print(len(y_model))

# OneHotEncoder is imported directly from sklearn.preprocessing in the import
# cell; the module name `preprocessing` itself is never imported, so
# `preprocessing.OneHotEncoder()` raised NameError on a fresh kernel.
encoder = OneHotEncoder()
X_model = encoder.fit_transform(X_model)

5086


In [93]:
"""Transform your data"""
# --- Split (stratified on the target) ---
X_train, X_test, y_train, y_test = train_test_split(
    X_model, y_model, stratify=y_model
)

# --- Model: shallow random forest; no resampling in this variant ---
rf = RandomForestClassifier(max_depth=2)
rf.fit(X_train, y_train)
print(rf.feature_importances_)

# --- Evaluate on the held-out portion ---
predictions = rf.predict(X_test)
# NOTE(review): this reports plain accuracy, whereas the SMOTE variant of this
# pipeline reports balanced accuracy; with a heavily imbalanced target the two
# are not comparable.
print(accuracy_score(y_test, predictions))

NameError: name 'X_model' is not defined

In [27]:
y

4870252    0
2615585    0
1564042    0
791830     0
1225627    0
          ..
3373136    0
1442388    0
4787054    0
209046     0
2844916    0
Name: arstmade, Length: 5086, dtype: int8

### Race Key Resource

https://www.icpsr.umich.edu/icpsrweb/NACJD/studies/21660/datadocumentation#

A Asian/Pacific Islander

B Black

I American Indian/Alaskan Native

P Black-Hispanic

Q White-Hispanic

W White

X Unknown

Z Other

## Haircolor Key Resource
https://www.icpsr.umich.edu/icpsrweb/NACJD/studies/21660/datadocumentation#

BA Bald

BK Black

BL Blond

BR Brown

GY Gray

RD Red

SP Salt and Pepper

WH White

XX Unknown

ZZ Other

## Eyecolor Key Resource
https://www.icpsr.umich.edu/icpsrweb/NACJD/studies/21660/datadocumentation#

BK Black

BL Blue

BR Brown

DF Two Different

GR Green

GY Gray

XX Unknown

ZZ Other

## Build Key Resource
https://www.icpsr.umich.edu/icpsrweb/NACJD/studies/21660/datadocumentation#

H Heavy

M Medium

T Thin

U Muscular

Z Unknown