### This notebook loads the dataset, applies filters and preprocessing, runs a hyperparameter sweep over model classes, and saves the trained models. Additionally, it contains some analyses for the selective labels project. Artifacts and checkpoints are logged for reproducibility.

In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from scipy import stats
import seaborn as sns
import os
import os.path as osp
import sys
import pickle
import joblib
from collections import Counter
from itertools import product
import torch
import pdb
import random
import tables
from sklearn.linear_model import LogisticRegression, LinearRegression
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report, average_precision_score,\
balanced_accuracy_score
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.calibration import CalibratedClassifierCV
import wandb
from wandb.lightgbm import wandb_callback, log_summary
from dill.source import getsource
from dill import detect
import functools

### Set the random seed, change to the project directory, and set the output directory

In [2]:
# Seed NumPy's global random generator so np.random-based steps repeat on a fresh run.
SEED=90210
np.random.seed(SEED)
# Project root and output directory, published as environment variables;
# later cells read OUT_PATH through os.environ.get('OUT_PATH').
os.environ['USER_PATH']='/share/pierson/selective_labels_data/hirid_data_analysis/richras_dir/learning_from_doctor_and_patient/'
os.environ['OUT_PATH']='/share/pierson/selective_labels_data/hirid_data_analysis/richras_dir/learning_from_doctor_and_patient/output_directory'

In [3]:
# Switch the working directory to the project root, then import the project-local
# save/load helpers (presumably AnalysisFuncs lives in that directory -- confirm).
os.chdir('/share/pierson/selective_labels_data/hirid_data_analysis/richras_dir/learning_from_doctor_and_patient/')
from AnalysisFuncs import saveFile, loadFile

In [4]:
%reload_ext autoreload
%autoreload 2

### Load the raw stop-and-frisk dataset from the output directory (`OUT_PATH/stop_and_frisk`)

In [5]:
# The raw pickle is stored at $OUT_PATH/stop_and_frisk/stop_and_frisk.pkl.
raw_data_path = osp.join(
    os.environ.get('OUT_PATH'),
    'stop_and_frisk',
    'stop_and_frisk.pkl',
)
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
with open(raw_data_path, "rb") as input_file:
    dict_raw_sf = pickle.load(input_file)
# The stop records are stored under the 'stops' key of the loaded dict.
df_raw_sf = dict_raw_sf['stops']

### Create a helper that returns a function's source code as a string

In [6]:
def function_to_string(fn):
    """Return the source text of ``fn``.

    The code object of ``fn`` is obtained with ``dill.detect.code`` and passed
    to ``dill.source.getsource``.
    """
    code_obj = detect.code(fn)
    return getsource(code_obj)

### Filter for years 2008-2011

In [7]:
def filter_df(df, group_vals, groupName):
    """Keep only the rows of ``df`` whose ``groupName`` value is in ``group_vals``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to filter; it is not modified.
    group_vals : list-like
        Values of ``df[groupName]`` to keep.
    groupName : str
        Name of the column to test.

    Returns
    -------
    pd.DataFrame
        The matching rows in their original order, with a fresh 0..n-1 index.
    """
    # A 1-D boolean Series is the idiomatic row mask; building it from a
    # one-column DataFrame yields a 2-D array, which is a fragile indexer.
    keep = df[groupName].isin(group_vals)
    return df.loc[keep].reset_index(drop=True)

# Restrict the raw stops to the study years and confirm that no other year is left.
years = [2008, 2009, 2010, 2011]
df_raw_sf = filter_df(df_raw_sf, years, 'year')
assert df_raw_sf['year'].isin(years).all()


In [8]:
def dropNa(df, feats_to_drop):
    """
    Remove every row that has a missing value in any of the listed columns.

    The columns are processed in the given order. Before each column is
    filtered, the number of remaining rows that are missing it is printed.
    The filtered frame is returned with its original index labels; the
    input frame is not modified.
    """
    for feat in feats_to_drop:
        missing = pd.isna(df[feat])
        print(f" number of rows with {feat} value as nan :{missing.sum()}")
        df = df.loc[~missing]
        # Sanity check: nothing missing is left in this column.
        assert df[feat].isna().sum()==0
    return df


In [9]:
# Columns that must be present: any row missing one of them is discarded.
feats_to_drop = [
    'suspect.race',
    'suspect.sex',
    'time',
    'date',
    'suspect.hair',
    'suspected.crime',
    'suspect.weight',
]
df_raw_sf = dropNa(df_raw_sf, feats_to_drop)


 number of rows with suspect.race value as nan :16777
 number of rows with suspect.sex value as nan :38834
 number of rows with time value as nan :382
 number of rows with date value as nan :0
 number of rows with suspect.hair value as nan :2803
 number of rows with suspected.crime value as nan :49
 number of rows with suspect.weight value as nan :1637


In [10]:
#bin age
def binCols(df, bins, colName):
    """
    Discretise ``df[colName]`` into the intervals delimited by ``bins``.

    Returns the result of ``pd.cut`` (one interval label per row);
    ``df`` itself is left unchanged.
    """
    column = df[colName]
    return pd.cut(column, bins=bins)
# Age-bucket edges: (0, 18], (18, 25], (25, 32], (32, 40], (40, inf].
bins=[0, 18, 25, 32, 40, np.inf]
df_raw_sf["suspect.age"] = binCols(df_raw_sf, bins, "suspect.age")
# Every age must have landed in a bucket. pd.cut yields NaN for a value outside
# (0, inf] or for a missing age, so a direct not-null check states the intent
# (comparing the interval labels against the numeric edges does not).
assert df_raw_sf["suspect.age"].notna().all()


In [11]:
def convertDateTime(df):
    """
    Parse the ``time`` and ``date`` columns and derive ``hour`` and ``month``.

    ``time`` values are read as ``%H:%M`` and ``date`` values as ``%Y-%m-%d``
    (each value is converted to ``str`` first). Two integer columns are added:
    ``month`` (from ``date``) and ``hour`` (from ``time``).

    Returns a new frame. The input is copied first, so the caller's frame,
    which may be a filtered slice of another frame, is never written to.
    Raises ``ValueError`` if a value does not match the expected format.
    """
    df = df.copy()
    # Vectorised parsing replaces a per-row datetime.strptime call.
    df['time'] = pd.to_datetime(df['time'].astype(str), format="%H:%M")
    df['date'] = pd.to_datetime(df['date'].astype(str), format="%Y-%m-%d")
    # Cast to int64 so the dtype does not depend on the pandas version.
    df['month'] = df['date'].dt.month.astype('int64')
    df['hour'] = df['time'].dt.hour.astype('int64')
    return df
df_raw_sf=convertDateTime(df_raw_sf)
# Calendar months run 1-12 and clock hours 0-23, so check exactly those ranges
# (np.arange(0, 13) / np.arange(0, 25) would also accept month 0 and hour 24).
assert df_raw_sf['month'].between(1, 12).all()
assert df_raw_sf['hour'].between(0, 23).all()


In [12]:
def replaceRace(df, race_dict):
    """
    Rename the values of ``suspect.race`` according to ``race_dict`` and drop
    every row whose (renamed) race is not one of the dict's target names.

    Returns a new frame with a fresh 0..n-1 index; ``df`` is not modified.
    """
    renamed = df.replace({"suspect.race": race_dict})
    categories = list(set(race_dict.values()))
    keep = renamed['suspect.race'].isin(categories)
    df = renamed.loc[keep].reset_index(drop=True)
    # Only the target race names may remain after filtering.
    assert df['suspect.race'].isin(categories).all(), pd.unique(df['suspect.race'])
    return df
# Collapse the raw race labels into three groups; rows with any other label are dropped.
race_dict = {
    'black': 'Black',
    'black hispanic': 'Hispanic',
    'white': 'White',
    'white hispanic': 'Hispanic',
}
df_raw_sf = replaceRace(df_raw_sf, race_dict)

In [13]:
# Inspect the data: per-year sum and row count of the frisked / found.weapon flags.
outcome_cols = ["frisked", "found.weapon"]
df_frisked_by_year = (
    df_raw_sf
    .groupby("year")[outcome_cols]
    .agg(['sum', 'count'])
)
df_frisked_by_year

Unnamed: 0_level_0,frisked,frisked,found.weapon,found.weapon
Unnamed: 0_level_1,sum,count,sum,count
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2008,271494,494495,6192,494495
2009,307784,537882,6860,537882
2010,314049,552720,7159,552720
2011,353828,627987,7335,627987


In [14]:
def get_split_features(columns):
    """Return the list of input features used for modelling.

    The names follow
    https://github.dev/stanford-policylab/risk-adjusted-regression/tree/master/src

    ``columns`` is accepted but not used. A new list is built on every call.
    """
    return [
        # suspect / stop context
        "suspect.race",
        "suspected.crime",
        "year",
        "month",
        "hour",
        "precinct",
        "location.housing",
        "suspect.sex",
        "suspect.age",
        "suspect.height",
        "suspect.weight",
        "suspect.hair",
        "suspect.eye",
        "suspect.build",
        # "additional.*" flags
        "additional.report",
        "additional.investigation",
        "additional.proximity",
        "additional.evasive",
        "additional.associating",
        "additional.direction",
        "additional.highcrime",
        "additional.time",
        "additional.sights",
        "additional.other",
        # "stopped.bc.*" flags
        "stopped.bc.object",
        "stopped.bc.desc",
        "stopped.bc.casing",
        "stopped.bc.lookout",
        "stopped.bc.clothing",
        "stopped.bc.drugs",
        "stopped.bc.furtive",
        "stopped.bc.violent",
        "stopped.bc.bulge",
        "stopped.bc.other",
    ]
# Fixed list of model input columns (the columns argument is not used by get_split_features).
feats_to_use=get_split_features(df_raw_sf.columns)

In [15]:
# T is the 'frisked' flag and D is the 'found.weapon' flag.
df_raw_sf['T'] = df_raw_sf['frisked']
df_raw_sf['D'] = df_raw_sf['found.weapon']
was_frisked = df_raw_sf['T'] == 1
weapon_found = df_raw_sf['D'] == 1
df_raw_sf['D_and_T'] = was_frisked & weapon_found
#mark D as missing (NA) when T==0
not_frisked = df_raw_sf['T'] == 0
df_raw_sf.loc[not_frisked, 'D'] = np.nan
# Describe the 'id' column for every (T, D) combination, keeping the D=NaN group.
df_T_D_grouped = (
    df_raw_sf
    .groupby(['T', 'D'], dropna=False)['id']
    .describe()
    .reset_index()
)
df_T_D_grouped

Unnamed: 0,T,D,count,mean,std,min,25%,50%,75%,max
0,False,,965929.0,2174656.0,699869.991268,978589.0,1564735.0,2174917.0,2786568.0,3387061.0
1,True,0.0,1221293.0,2188759.0,689845.048987,978588.0,1598209.0,2184756.0,2781628.0,3387066.0
2,True,1.0,25862.0,2159603.0,681127.156531,978628.0,1572639.0,2156164.5,2730445.5,3386674.0


In [16]:
# Share of stops with T=1, and share of those T=1 stops with D=1.
frisked_rows = df_raw_sf.loc[df_raw_sf['T']==1]
n_frisked = frisked_rows['T'].sum()
print(f" percent of T=1:{n_frisked*100/len(df_raw_sf):.3f} %")
print(f" percent of D=1|T=1:{frisked_rows['D'].sum()*100/n_frisked:.3f} %")

 percent of T=1:56.354 %
 percent of D=1|T=1:2.074 %


In [17]:
# Categorical columns; these are the ones one-hot encoded by get_ohe below.
cat_cols = [
    'suspect.race',
    'suspected.crime',
    'year',
    'month',
    'hour',
    'precinct',
    'location.housing',
    'suspect.sex',
    'suspect.age',
    'suspect.hair',
    'suspect.eye',
    'suspect.build',
]

In [18]:
# Renumber the rows 0..n-1, then keep only the model inputs plus the three label columns.
df_raw_sf = df_raw_sf.reset_index(drop=True)
model_columns = feats_to_use + ['T', 'D', 'D_and_T']
df_sf = df_raw_sf[model_columns]
df_sf

Unnamed: 0,suspect.race,suspected.crime,year,month,hour,precinct,location.housing,suspect.sex,suspect.age,suspect.height,...,stopped.bc.lookout,stopped.bc.clothing,stopped.bc.drugs,stopped.bc.furtive,stopped.bc.violent,stopped.bc.bulge,stopped.bc.other,T,D,D_and_T
0,Black,cpw,2008,1,1,42,neither,male,"(18.0, 25.0]",6.000000,...,False,False,False,True,False,False,True,True,0.0,False
1,Hispanic,burglary,2008,1,20,108,neither,male,"(18.0, 25.0]",5.583333,...,False,False,False,False,False,False,True,False,,False
2,Black,robbery,2008,1,1,100,housing,male,"(0.0, 18.0]",6.000000,...,False,False,False,False,False,False,False,True,0.0,False
3,Black,robbery,2008,1,19,103,neither,male,"(18.0, 25.0]",6.083333,...,True,False,False,True,False,False,False,True,0.0,False
4,Black,criminal trespass,2008,1,21,71,housing,male,"(0.0, 18.0]",5.750000,...,True,False,False,False,False,False,True,False,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2213079,Hispanic,cpw,2011,12,23,115,neither,male,"(18.0, 25.0]",5.500000,...,False,False,False,True,True,False,False,True,0.0,False
2213080,Black,cpw,2011,12,23,23,housing,male,"(18.0, 25.0]",5.750000,...,False,False,False,True,False,False,False,True,0.0,False
2213081,Black,cpw,2011,12,23,40,neither,male,"(18.0, 25.0]",5.750000,...,False,False,False,True,False,False,False,True,0.0,False
2213082,Black,cpw,2011,12,23,40,neither,male,"(18.0, 25.0]",5.833333,...,False,False,False,True,False,False,False,True,0.0,False


In [19]:
def getTrainTestIdx(df, train_size=0.5):
    """Randomly partition the index labels of ``df`` into two disjoint groups.

    ``int(len(df) * train_size)`` labels are drawn without replacement with
    ``np.random.choice`` (global NumPy random state); the remaining labels
    form the second group, returned in sorted order by ``np.setdiff1d``.

    Returns ``(train_idxs, test_idxs)`` as NumPy arrays of index labels.
    """
    all_labels = np.array(df.index)
    n_train = int(len(all_labels) * train_size)
    chosen = np.random.choice(all_labels, size=n_train, replace=False)
    held_out = np.setdiff1d(all_labels, chosen)
    # The two groups must not share a label.
    assert np.intersect1d(chosen, held_out).size == 0
    return chosen, held_out
# getTrainTestIdx returns index *labels*, but they are used as *positions*
# (df_sf.iloc[...]) here and in later cells. The two only coincide when the
# index is 0..n-1, so fail loudly if that ever stops being true.
assert df_sf.index.equals(pd.RangeIndex(len(df_sf)))

# 50% test; the other half is split again 90/10 into train and validation.
train_idxs, test_idxs = getTrainTestIdx(df_sf, 0.5)
train_idxs, val_idxs = getTrainTestIdx(df_sf.iloc[train_idxs], 0.9)
assert len(np.intersect1d(train_idxs, test_idxs))==0
assert len(np.intersect1d(train_idxs, val_idxs))==0
assert len(np.intersect1d(val_idxs, test_idxs))==0
tr=len(train_idxs)
v=len(val_idxs)
t=len(test_idxs)
# The three splits together must cover every row exactly once.
_n_split_total = tr+v+t
assert _n_split_total == len(df_sf)
print(f"train %:{tr*100/_n_split_total:.2f}, val %:{v*100/_n_split_total:.2f}, test %:{t*100/_n_split_total:.2f}")

train %:45.00, val %:5.00, test %:50.00


In [20]:
def normalize_data(df,  train_idxs, columns_to_norm):
    """Normalizes the data as Goel et al do - continuous features only.

    For each column in ``columns_to_norm`` the mean (``np.mean``) and standard
    deviation (``np.std``) are computed on the rows ``df.iloc[train_idxs]`` as
    they were before any normalisation, and the whole column is replaced by
    ``(value - mean) / std``.

    ``df`` is modified in place and also returned.
    """
    train_snapshot = df.iloc[train_idxs].copy()
    train_stats = {
        name: (np.mean(train_snapshot[name]), np.std(train_snapshot[name]))
        for name in columns_to_norm
    }
    for name in columns_to_norm:
        center, scale = train_stats[name]
        df[name] = (df[name] - center) / scale
    return df

# Only the two continuous features are standardised. A copy of df_sf is passed in
# because normalize_data writes into the frame it receives.
columns_to_norm = ['suspect.height', 'suspect.weight']
df_sf = normalize_data(df_sf.copy(),  train_idxs, columns_to_norm)

In [21]:
def get_ohe(df, cat_feats):
    """One-hot encode the columns listed in ``cat_feats``.

    Each listed column is replaced by boolean indicator columns named
    ``<column>_<value>``; the first level of every column is dropped
    (``drop_first=True``). Columns that are not encoded keep their relative
    order and come first, followed by the indicator columns in the order of
    ``cat_feats``.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.
    cat_feats : list-like of str
        Names of the columns to encode.

    Returns
    -------
    pd.DataFrame
        The encoded frame (``df`` itself when ``cat_feats`` is empty).
    """
    cat_feats = list(cat_feats)
    if not cat_feats:
        return df
    # A single get_dummies call encodes all columns at once, instead of
    # dropping and re-joining the whole frame once per feature.
    return pd.get_dummies(df, columns=cat_feats, drop_first=True, dtype=bool)
# Replace every categorical column in cat_cols with its indicator columns.
df_sf=get_ohe(df_sf, cat_cols)

In [22]:
# Every column except the three label columns is a model feature,
# and no feature column may contain a missing value.
label_cols = {'T', 'D', 'D_and_T'}
features = []
for c in df_sf.columns:
    if c in label_cols:
        continue
    assert df_sf[c].isna().sum()==0
    features.append(c)

In [23]:
# Show the final feature names and how many there are.
print(features, len(features))

['suspect.height', 'suspect.weight', 'additional.report', 'additional.investigation', 'additional.proximity', 'additional.evasive', 'additional.associating', 'additional.direction', 'additional.highcrime', 'additional.time', 'additional.sights', 'additional.other', 'stopped.bc.object', 'stopped.bc.desc', 'stopped.bc.casing', 'stopped.bc.lookout', 'stopped.bc.clothing', 'stopped.bc.drugs', 'stopped.bc.furtive', 'stopped.bc.violent', 'stopped.bc.bulge', 'stopped.bc.other', 'suspect.race_Hispanic', 'suspect.race_White', 'suspected.crime_abortion', 'suspected.crime_absconding', 'suspected.crime_adultery', 'suspected.crime_aggravated assault', 'suspected.crime_aggravated harassment', 'suspected.crime_aggravated sexual abuse', 'suspected.crime_arson', 'suspected.crime_assault', 'suspected.crime_auto stripping', 'suspected.crime_bigamy', 'suspected.crime_bribe receiving', 'suspected.crime_bribery', 'suspected.crime_burglary', 'suspected.crime_coercion', 'suspected.crime_computer tampering', '

In [24]:
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else np.nan


def _as_percent_frame(ratios):
    """Turn a ``{race: fraction}`` mapping into a one-row frame of percentages (2 decimals)."""
    frame = pd.DataFrame(ratios, index=[0])
    return (frame * 100).round(decimals=2)


def logDisparateImpact(df, races=('Black', 'Hispanic', 'White')):
    """Print and return per-race stop, frisk and weapon-recovery percentages.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``suspect.race``, ``T``, ``frisked`` and ``D``.
        ``T`` must not contain missing values.
    races : sequence of str, optional
        Race labels to report, in output-column order.

    Returns
    -------
    tuple of four one-row DataFrames (index ``[0]``, one column per race),
    all expressed in percent and rounded to two decimals:

    ``df_stops``
        rows of that race / all rows.
    ``df_frisked_stop``
        sum of ``frisked`` for that race / rows of that race.
    ``df_weapons_frisk``
        sum of ``D`` over rows of that race with ``T == True`` / number of such rows.
    ``df_weapons_stop``
        sum of ``D`` over rows of that race with ``T == True`` / rows of that race.

    A ratio whose denominator is zero is reported as NaN.
    """
    # out of all the ppl stopped, how many were whites, black, hispanic
    assert df['T'].isna().sum()==0 # there are as many T values as the number of stopped records
    print(f" records by race for T variable : \n {df.groupby(['suspect.race'])['T'].agg(['sum', 'count'])}")
    # out of all the ppl frisked, how many were whites, black, hispanic
    print(f" frisked records by race:{df.groupby(['suspect.race'])[['frisked']].describe()}")

    n_total = len(df)
    stops, frisked_stop, weapons_frisk, weapons_stop = {}, {}, {}, {}
    for race in races:
        race_rows = df.loc[df['suspect.race'] == race]
        frisked_rows = race_rows.loc[race_rows['T'] == True]
        n_race = len(race_rows)
        # D is only summed over frisked rows; the two weapon rates differ in the denominator.
        weapons_found = frisked_rows['D'].sum()

        stops[race] = _safe_ratio(n_race, n_total)
        frisked_stop[race] = _safe_ratio(race_rows['frisked'].sum(), n_race)
        weapons_frisk[race] = _safe_ratio(weapons_found, len(frisked_rows))
        weapons_stop[race] = _safe_ratio(weapons_found, n_race)

    df_stops = _as_percent_frame(stops)
    print(f" % of stops by race: \n {df_stops.iloc[0]}")

    df_frisked_stop = _as_percent_frame(frisked_stop)
    print(f" % of frisked by race for police stops: \n {df_frisked_stop.iloc[0]}")

    df_weapons_frisk = _as_percent_frame(weapons_frisk)
    print(f" % of those where weapons were found by race for police frisks : \n {df_weapons_frisk.iloc[0]}")

    df_weapons_stop = _as_percent_frame(weapons_stop)
    print(f" % of those where weapons were found by race for police stops : \n {df_weapons_stop.iloc[0]}")
    return df_stops, df_frisked_stop, df_weapons_frisk, df_weapons_stop

In [25]:
# Run on the cleaned raw frame, which still has the 'suspect.race' and 'frisked' columns.
df_stops, df_frisked_stop, df_weapons_frisk, df_weapons_stop=logDisparateImpact(df_raw_sf)

 records by race for T variable : 
                  sum    count
suspect.race                 
Black         708820  1236755
Hispanic      439646   752364
White          98689   223965
 frisked records by race:              frisked                      
                count unique    top    freq
suspect.race                               
Black         1236755      2   True  708820
Hispanic       752364      2   True  439646
White          223965      2  False  125276
 % of stops by race: 
 Black       55.88
Hispanic    34.00
White       10.12
Name: 0, dtype: float64
 % of frisked by race for police stops: 
 Black       57.31
Hispanic    58.44
White       44.06
Name: 0, dtype: float64
 % of those where weapons were found by race for police frisks : 
 Black       1.83
Hispanic    2.11
White       3.69
Name: 0, dtype: float64
 % of those where weapons were found by race for police stops : 
 Black       1.05
Hispanic    1.23
White       1.62
Name: 0, dtype: float64


In [26]:
# Display the processed frame (normalised continuous columns plus indicator columns).
df_sf

Unnamed: 0,suspect.height,suspect.weight,additional.report,additional.investigation,additional.proximity,additional.evasive,additional.associating,additional.direction,additional.highcrime,additional.time,...,suspect.eye_maroon,suspect.eye_other,suspect.eye_pink,suspect.eye_two different,suspect.eye_unknown,suspect.eye_violet,suspect.build_medium,suspect.build_muscular,suspect.build_thin,suspect.build_unknown
0,1.051721,-0.278049,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,-0.510943,0.695568,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,1.051721,0.359838,True,False,True,False,False,False,False,True,...,False,False,False,False,False,False,True,False,False,False
3,1.364253,0.527703,False,False,False,True,False,False,True,False,...,False,False,False,False,False,False,True,False,False,False
4,0.114123,-0.815217,False,False,False,False,False,False,True,True,...,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2213079,-0.823476,0.359838,False,True,False,False,False,True,True,True,...,False,False,False,False,False,False,True,False,False,False
2213080,0.114123,-0.311622,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
2213081,0.114123,1.031298,False,False,True,False,False,False,True,True,...,False,False,False,False,False,False,True,False,False,False
2213082,0.426655,1.031298,False,False,True,False,False,False,True,True,...,False,False,False,False,False,False,True,False,False,False


In [27]:
# All processed artefacts for this dataset go under $OUT_PATH/stop_and_frisk.
processed_data_path=osp.join(os.environ.get('OUT_PATH'), 'stop_and_frisk')
# exist_ok avoids the check-then-create race of "if not exists: makedirs".
os.makedirs(processed_data_path, exist_ok=True)
# Encoded/normalised model frame and the cleaned raw frame, both without the index column.
df_sf.to_csv(osp.join(processed_data_path, 'processed_df.csv'), index=False)
df_raw_sf.to_csv(osp.join(processed_data_path, 'df_cleaned.csv'), index=False)
# NOTE(review): this file name has no leading '/', unlike the '/...npy' names passed to
# saveFile in later cells -- confirm how saveFile combines directory and file name.
saveFile(processed_data_path, features, 'features.pkl')

## Initialize wandb run for storing data artifacts

In [28]:
# Number of rows in the processed frame.
len(df_sf)

2213084

In [29]:
def _features_as_float(rows):
    """Return the `features` columns of `rows` as a float ndarray (bools become 0.0/1.0)."""
    return np.array(rows[features].reset_index(drop=True).astype(float))


def _save_arrays(subdir, named_arrays):
    """Pass each (file_name, array) pair to saveFile, targeting processed_data_path/subdir."""
    out_dir = osp.join(processed_data_path, subdir)
    for file_name, array in named_arrays:
        saveFile(out_dir, array, file_name)


# Row subsets for the three splits; the *_idxs arrays are used as positions into df_sf.
_train_rows = df_sf.iloc[train_idxs]
_val_rows = df_sf.iloc[val_idxs]
_test_rows = df_sf.iloc[test_idxs]

# ---- Task 1: predict T from the features, using every row -------------------
# Real NaN checks: pd.isna(...).isnull() is always False, and np.isnan on an
# array that was already cast to int can never fire.
assert not _train_rows[features].isna().values.any()
assert df_sf['T'].isna().sum()==0

train_X = _features_as_float(_train_rows)
train_y_T = _train_rows['T'].values.astype(int)

val_X = _features_as_float(_val_rows)
val_y_T = _val_rows['T'].values.astype(int)

# Train and validation rows stacked together (train first).
train_cross_val_X = np.concatenate((train_X, val_X))
train_cross_val_y_T = np.concatenate((train_y_T, val_y_T))

test_X = _features_as_float(_test_rows)
test_y_T = _test_rows['T'].values.astype(int)

# NOTE(review): these file names start with '/', unlike 'features.pkl' in the earlier
# cell -- confirm how saveFile combines directory and file name.
_save_arrays('predict_T', [
    ('/train_X.npy', train_X),
    ('/train_y_T.npy', train_y_T),
    ('/val_X.npy', val_X),
    ('/val_y_T.npy', val_y_T),
    ('/train_cross_val_X.npy', train_cross_val_X),
    ('/train_cross_val_y_T.npy', train_cross_val_y_T),
    ('/test_X.npy', test_X),
    ('/test_y_T.npy', test_y_T),
])

# ---- Task 2: predict D among rows with T == 1 -------------------------------
np.testing.assert_array_equal(df_sf.index, df_raw_sf.index)
_train_frisked = _train_rows.loc[_train_rows['T']==1]
_val_frisked = _val_rows.loc[_val_rows['T']==1]
_test_frisked = _test_rows.loc[_test_rows['T']==1]
# D must be observed for every T == 1 row; casting a NaN to int would silently produce garbage.
for _rows in (_train_frisked, _val_frisked, _test_frisked):
    assert not _rows['D'].isna().any()

train_X_D_given_T = _features_as_float(_train_frisked)
train_y_D_given_T = _train_frisked['D'].values.astype(int)

val_X_D_given_T = _features_as_float(_val_frisked)
val_y_D_given_T = _val_frisked['D'].values.astype(int)

test_X_D_given_T = _features_as_float(_test_frisked)
test_y_D_given_T = _test_frisked['D'].values.astype(int)

train_cross_val_X_D_given_T = np.concatenate((train_X_D_given_T, val_X_D_given_T))
train_cross_val_y_D_given_T = np.concatenate((train_y_D_given_T, val_y_D_given_T))

_save_arrays('predict_D_given_T', [
    ('/train_X_D_given_T.npy', train_X_D_given_T),
    ('/train_y_D_given_T.npy', train_y_D_given_T),
    ('/val_X_D_given_T.npy', val_X_D_given_T),
    ('/val_y_D_given_T.npy', val_y_D_given_T),
    ('/train_cross_val_X_D_given_T.npy', train_cross_val_X_D_given_T),
    ('/train_cross_val_y_D_given_T.npy', train_cross_val_y_D_given_T),
    ('/test_X_D_given_T.npy', test_X_D_given_T),
    ('/test_y_D_given_T.npy', test_y_D_given_T),
])

# ---- Task 3: predict the joint label D_and_T, using every row ---------------
# Same feature matrices as task 1; only the labels differ.
train_y_D_and_T = _train_rows['D_and_T'].values.astype(int)
val_y_D_and_T = _val_rows['D_and_T'].values.astype(int)
train_cross_val_y_D_and_T = np.concatenate((train_y_D_and_T, val_y_D_and_T))
test_y_D_and_T = _test_rows['D_and_T'].values.astype(int)

_save_arrays('predict_D_and_T', [
    ('/train_X.npy', train_X),
    ('/train_y_D_and_T.npy', train_y_D_and_T),
    ('/val_X.npy', val_X),
    ('/val_y_D_and_T.npy', val_y_D_and_T),
    ('/train_cross_val_X.npy', train_cross_val_X),
    ('/train_cross_val_y_D_and_T.npy', train_cross_val_y_D_and_T),
    ('/test_X.npy', test_X),
    ('/test_y_D_and_T.npy', test_y_D_and_T),
])

# Release the temporary row subsets; the arrays above are what later cells use.
del _train_rows, _val_rows, _test_rows, _train_frisked, _val_frisked, _test_frisked, _rows

In [30]:
# Persist the test-split row indices so the split can be reused.
# NOTE(review): the file name starts with '/', unlike 'features.pkl' -- confirm saveFile's path handling.
saveFile(processed_data_path, test_idxs, '/test_idxs.npy')

In [31]:
# Persist the train- and validation-split row indices as well.
saveFile(processed_data_path, train_idxs, '/train_idxs.npy')
saveFile(processed_data_path, val_idxs, '/val_idxs.npy')

In [32]:
# List every column of the processed frame (features and the T / D / D_and_T labels).
df_sf.columns.values

array(['suspect.height', 'suspect.weight', 'additional.report',
       'additional.investigation', 'additional.proximity',
       'additional.evasive', 'additional.associating',
       'additional.direction', 'additional.highcrime', 'additional.time',
       'additional.sights', 'additional.other', 'stopped.bc.object',
       'stopped.bc.desc', 'stopped.bc.casing', 'stopped.bc.lookout',
       'stopped.bc.clothing', 'stopped.bc.drugs', 'stopped.bc.furtive',
       'stopped.bc.violent', 'stopped.bc.bulge', 'stopped.bc.other', 'T',
       'D', 'D_and_T', 'suspect.race_Hispanic', 'suspect.race_White',
       'suspected.crime_abortion', 'suspected.crime_absconding',
       'suspected.crime_adultery', 'suspected.crime_aggravated assault',
       'suspected.crime_aggravated harassment',
       'suspected.crime_aggravated sexual abuse', 'suspected.crime_arson',
       'suspected.crime_assault', 'suspected.crime_auto stripping',
       'suspected.crime_bigamy', 'suspected.crime_bribe receiving