We need to separate the columns into two groups: the pre-stop observables (e.g. reason for stop, age / weight / build, precinct, xcoord/ycoord)
and the during-stop features.

In [32]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
import data_cleaner as dc
import data_modeler as dm

In [2]:
# Load the working DataFrame through the project-local data_cleaner helper.
# NOTE(review): presumably this is the full stop-question-frisk dataset — confirm in data_cleaner.
df = dc.load_full_sqf()

In [3]:
# Keep an untouched copy of everything that was loaded, then continue with a
# 5,000-row sample so the modeling cells below run quickly.
full_df = df.copy()

# Sample from full_df (not from df itself) and pin the seed so that a
# Restart & Run All draws the same rows and reproduces the outputs below.
df = full_df.sample(n=5000, random_state=42)

In [5]:
# Summary statistics for the numeric columns of the sampled frame.
df.describe()

Unnamed: 0,year,pct,ser_num,perobs,perstop,explnstp,othpers,arstmade,sumissue,compyear,...,premtype,state,addrpct,beat,xcoord,ycoord,linecm,detailcm,wepfound,detail1_
count,5000.0,5000.0,5000.0,4985.0,4999.0,5000.0,5000.0,5000.0,5000.0,4977.0,...,0.0,0.0,4201.0,1449.0,4051.0,4051.0,3673.0,4201.0,5000.0,528.0
mean,2008.572,67.9548,6015.8436,2.63585,5.521104,0.997,0.2336,0.0592,0.0688,0.0,...,,,67.820757,8.021394,1003828.0,205340.278697,0.971958,38.875982,0.0008,37.301136
std,2.880985,33.199325,6048.825102,6.014787,6.103409,0.054696,0.423163,0.236022,0.253139,0.0,...,,,33.200308,6.515511,38756.88,28963.999764,0.165117,26.067844,0.028276,25.790541
min,2003.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,1.0,1.0,593380.0,121098.0,0.0,8.0,0.0,8.0
25%,2006.0,41.0,1702.75,1.0,3.0,1.0,0.0,0.0,0.0,0.0,...,,,41.0,4.0,996146.0,183729.0,1.0,20.0,0.0,20.0
50%,2009.0,73.0,4285.5,1.0,5.0,1.0,0.0,0.0,0.0,0.0,...,,,73.0,7.0,1005373.0,200207.0,1.0,28.0,0.0,28.0
75%,2011.0,102.0,8365.0,2.0,5.0,1.0,0.0,0.0,0.0,0.0,...,,,101.0,10.0,1017308.0,232339.0,1.0,46.0,0.0,46.0
max,2018.0,123.0,91411.0,200.0,300.0,1.0,1.0,1.0,1.0,0.0,...,,,123.0,62.0,1062915.0,268642.0,1.0,113.0,1.0,112.0


In [9]:
# Target variable: the `arstmade` column of the sampled frame.
y = df['arstmade']

# Modeling on the yes-no columns

In [48]:
# Feature matrix: every int8 column except the target itself.
X_y_n = (
    df.select_dtypes(include='int8')
      .drop('arstmade', axis='columns')
)

In [49]:
# Split features and target via the project-local data_modeler helper.
# The result is indexed below with the keys 'X_train', 'y_train', 'X_test'
# and 'y_test'.
# NOTE(review): split ratio and seeding are decided inside dm.load_split — confirm there.
split = dm.load_split(X_y_n, y)

## Balance classes

In [50]:
# Share of each class in the training target, to see how imbalanced it is.
split['y_train'].value_counts(normalize=True)

0    0.941867
1    0.058133
Name: arstmade, dtype: float64

In [51]:
# Oversample the minority class in the training split only, so the test
# split keeps its original class balance.
# A fixed random_state makes the synthetic rows reproducible across runs.
smote = SMOTE(random_state=42)

# `fit_resample` is the supported API; `fit_sample` was a deprecated alias
# that newer imbalanced-learn releases no longer provide.
# NOTE(review): SMOTE interpolates between neighbours, so the yes/no features
# may take non-binary values in the synthetic rows — confirm this is acceptable.
X_train_resampled, y_train_resampled = smote.fit_resample(split['X_train'], split['y_train'])

# Sanity check: both classes should now have the same number of rows.
y_train_resampled.value_counts()

1    3532
0    3532
Name: arstmade, dtype: int64

In [43]:
# L1-penalised logistic regression; 'saga' is used because it supports the
# L1 penalty. The solver shuffles the data, so random_state is pinned to keep
# the fitted coefficients reproducible between runs.
logit = LogisticRegression(solver='saga', penalty='l1', max_iter=4000, random_state=42)

In [52]:
# Train on the SMOTE-balanced data, but score against the original
# (imbalanced) training split.
logit.fit(X_train_resampled, y_train_resampled)
y_hat_train = logit.predict(split['X_train'])

# |actual - predicted| is 0 for a correct prediction and 1 for a miss.
residuals = (split['y_train'] - y_hat_train).abs()

# Report hits/misses first as raw counts, then as proportions.
for as_share in (False, True):
    print(pd.Series(residuals).value_counts(normalize=as_share))

0    3406
1     344
Name: arstmade, dtype: int64
0    0.908267
1    0.091733
Name: arstmade, dtype: float64


In [53]:
# Number of training rows the model assigns to each predicted class.
pd.Series(y_hat_train).value_counts()

0    3304
1     446
dtype: int64

In [54]:
# Score the fitted model on the held-out test split.
y_hat_test = logit.predict(split['X_test'])

# |actual - predicted| is 0 for a correct prediction and 1 for a miss.
residuals = (split['y_test'] - y_hat_test).abs()

# Report hits/misses first as raw counts, then as proportions.
for as_share in (False, True):
    print(pd.Series(residuals).value_counts(normalize=as_share))

0    1131
1     119
Name: arstmade, dtype: int64
0    0.9048
1    0.0952
Name: arstmade, dtype: float64


In [55]:
# Raw fitted coefficients (one row, one entry per feature); zeros are
# features the L1 penalty dropped.
logit.coef_

array([[-0.49144642, -1.46957627, -2.42386536, -0.80574694,  1.90023864,
         4.31529434,  4.30774961, -0.74686696,  5.29947862,  0.        ,
         0.        ,  1.6295691 ,  0.        ,  0.10195139, -0.12430863,
         0.        , -0.70115563,  0.        , -0.61178456,  0.        ,
         2.53323551,  0.        , -1.49778969, -1.40750321, -0.015672  ,
        -0.67855971, -0.4235051 , -1.76313173, -0.66321882, -0.80287405,
        -0.24625685, -0.73195728, -2.11418972, -1.84113804, -0.82595638,
        -0.33452918, -2.01869254, -0.47348931, -0.70723881, -1.24120284,
         0.01281269, -0.65584233, -2.29074084, -1.98967813, -1.63384225,
        -1.18490896, -0.52818738, -0.64484247, -1.16738197, -0.51512953,
        -3.2273605 , -3.36836732, -3.83719605, -1.35230389,  0.17073368,
        -0.93166047, -1.76038318,  0.        ]])

In [56]:
# Feature names of the training matrix; used below to label the coefficients.
split['X_train'].columns

Index(['explnstp', 'othpers', 'sumissue', 'offunif', 'frisked', 'searched',
       'contrabn', 'adtlrept', 'pistol', 'riflshot', 'asltweap', 'knifcuti',
       'machgun', 'othrweap', 'pf_hands', 'pf_wall', 'pf_grnd', 'pf_drwep',
       'pf_ptwep', 'pf_baton', 'pf_hcuff', 'pf_pepsp', 'pf_other', 'radio',
       'ac_rept', 'ac_inves', 'rf_vcrim', 'rf_othsw', 'ac_proxm', 'rf_attir',
       'cs_objcs', 'cs_descr', 'cs_casng', 'cs_lkout', 'rf_vcact', 'cs_cloth',
       'cs_drgtr', 'ac_evasv', 'ac_assoc', 'cs_furtv', 'rf_rfcmp', 'ac_cgdir',
       'rf_verbl', 'cs_vcrim', 'cs_bulge', 'cs_other', 'ac_incid', 'ac_time',
       'rf_knowl', 'ac_stsnd', 'ac_other', 'sb_hdobj', 'sb_outln', 'sb_admis',
       'sb_other', 'rf_furt', 'rf_bulg', 'wepfound'],
      dtype='object')

In [58]:
# Label each fitted coefficient with its feature name, then transpose so the
# table reads one feature per row.
coef = pd.DataFrame(logit.coef_, columns=split['X_train'].columns)
coef.T

Unnamed: 0,0
explnstp,-0.491446
othpers,-1.469576
sumissue,-2.423865
offunif,-0.805747
frisked,1.900239
searched,4.315294
contrabn,4.30775
adtlrept,-0.746867
pistol,5.299479
riflshot,0.0


In [60]:
# Cross-tabulate summons issued vs. arrest made to sanity-check the negative
# `sumissue` coefficient.
# `.size()` counts rows per group directly. The previous `.count().iloc[:, 0]`
# counted non-null values of whichever column came first, which would
# under-count silently if that column ever contained NaNs.
df.groupby(by=['sumissue', 'arstmade']).size()

sumissue  arstmade
0         0           4372
          1            284
1         0            332
          1             12
Name: year, dtype: int64