In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from scipy import stats
import seaborn as sns
import os
import os.path as osp
import sys
import pickle
import joblib
from collections import Counter
from itertools import product
import torch
import pdb
import random
import tables
from sklearn.linear_model import LogisticRegression, LinearRegression
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report, average_precision_score,\
balanced_accuracy_score
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.calibration import CalibratedClassifierCV
import wandb
from wandb.lightgbm import wandb_callback, log_summary
from dill.source import getsource
from dill import detect
import functools

### set the seeds and change to current directory + set the output directory

In [2]:
SEED=90210
np.random.seed(SEED)
os.chdir('/share/pierson/selective_labels_data/hirid_data_analysis/richras_dir/learning_from_doctor_and_patient/ParksInspection/')
os.environ['OUT_PATH']='/share/pierson/selective_labels_data/hirid_data_analysis/richras_dir/learning_from_doctor_and_patient/output_directory'

In [3]:
def function_to_string(fn):
    return getsource(detect.code(fn)) 

### Load the dataset initially from the current directory

In [4]:
from dataCleaningScripts import synthesize_dataset

In [5]:
data_raw = pd.read_csv('./aggregated_dataset_12-30-2022.csv', index_col=0)

In [6]:
data_raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 464817 entries, 0 to 464816
Columns: 114 entries, index to SRCommunityBoardCleaned
dtypes: float64(50), int64(7), object(57)
memory usage: 407.8+ MB


In [7]:
data_raw.head()

Unnamed: 0,index,IncidentGlobalID,SRID,SRGlobalID,SRClosedDate,SRCreatedDate,CreatedDate,SRUpdatedDate,SRCategory,SRtype,...,month,year,year_month_day,year_month_reports,hanging_blocking_dummy,SRtypeCleaned,SRCategoryCleaned,SRSourceCleaned,Descriptor1Cleaned,SRCommunityBoardCleaned
0,0,0000100E-70B3-4F0E-A2FD-A5A7F0B152BA,17944200,0000100E-70B3-4F0E-A2FD-A5A7F0B152BA,2020-07-08 00:40:00,2020-06-30 03:03:00,2020-06-30 03:03:00,07/08/2020 00:40:00,Hazard,Limb Down - Sidewalk,...,6,2020,2020-06-30,7919,0,Limb Down - Sidewalk,Hazard,3-1-1 Call Center,Branch or Limb Has Fallen Down,318.0
1,1,00002A8F-3226-4C3C-99DA-7747FC434225,12185633,00002A8F-3226-4C3C-99DA-7747FC434225,2019-04-02 02:24:43,2018-08-15 23:42:00,2018-08-15 23:42:00,04/02/2019 06:24:43,Remove Tree,Dead Tree Older Than 2 Years,...,8,2018,2018-08-15,7450,0,Dead Tree Older Than 2 Years,Remove Tree,3-1-1 Call Center,Planted More Than 2 Years Ago,502.0
2,2,00004AB2-9974-4447-9658-EDF40EFB4D0B,14836810,00004AB2-9974-4447-9658-EDF40EFB4D0B,2021-10-28 13:58:39,2019-05-01 23:52:00,2019-05-01 23:52:00,10/28/2021 13:58:39,Plant Tree,Street Tree,...,5,2019,2019-05-01,7126,0,Street Tree,Plant Tree,Department of Parks and Recreation - Public We...,For One Address,104.0
3,3,00005428-D9AC-4891-BE3F-5C67419CE54E,759045,00005428-D9AC-4891-BE3F-5C67419CE54E,2015-12-06 13:56:00,2015-07-10 21:58:00,2015-07-10 21:58:00,12/06/2015 13:56:00,Hazard,Hanging Limb from Wires,...,7,2015,2015-07-10,7308,1,Hanging Limb from Wires,Hazard,3-1-1 Call Center,Branch Cracked and Will Fall,403.0
4,4,00005AC4-66EE-4F62-B6BB-85776F9BBE4B,5327162,00005AC4-66EE-4F62-B6BB-85776F9BBE4B,2020-11-21 11:11:11,2016-09-09 15:12:04,2016-09-09 15:12:04,11/21/2020 11:11:11,Remove Tree,Dead Tree Older Than 2 Years,...,9,2016,2016-09-09,5530,0,Dead Tree Older Than 2 Years,Remove Tree,3-1-1 Call Center,Planted More Than 2 Years Ago,204.0


In [8]:
pd.set_option('display.max_rows', None)
data_raw.iloc[0]

index                                                                                  0
IncidentGlobalID                                    0000100E-70B3-4F0E-A2FD-A5A7F0B152BA
SRID                                                                            17944200
SRGlobalID                                          0000100E-70B3-4F0E-A2FD-A5A7F0B152BA
SRClosedDate                                                         2020-07-08 00:40:00
SRCreatedDate                                                        2020-06-30 03:03:00
CreatedDate                                                          2020-06-30 03:03:00
SRUpdatedDate                                                        07/08/2020 00:40:00
SRCategory                                                                        Hazard
SRtype                                                              Limb Down - Sidewalk
SRPriority                                                                       Routine
SRSource             

### base rates without any data preprocessing

In [9]:
print(f" percent of T=1:{data_raw.loc[data_raw['INSPcreated']==1]['INSPcreated'].sum()*100/len(data_raw):.3f} %")
print(f" percent of D=1|T=1:{data_raw.loc[data_raw['INSPcreated']==1]['high_risk_label'].sum()*100/data_raw.loc[data_raw['INSPcreated']==1]['INSPcreated'].sum():.3f} %")

 percent of T=1:65.843 %
 percent of D=1|T=1:18.703 %


In [10]:
# check that all records have some value for Inspection created (this is the T variable)
assert pd.isna(data_raw["INSPcreated"]).sum()==0

In [11]:
data_raw[["INSPcreated","high_risk_label"]].head()

Unnamed: 0,INSPcreated,high_risk_label
0,1,0.0
1,1,0.0
2,1,
3,1,
4,1,1.0


In [12]:
len(data_raw.loc[(data_raw['INSPcreated']==1) & pd.isna(data_raw["high_risk_label"])])

153833

In [13]:
pd.isna(data_raw.loc[data_raw['INSPcreated']==1]['high_risk_label']).sum()

153833

In [14]:
# length of rows where T=1, but for some reason D is Nan
len(data_raw[data_raw["INSPcreated"] & pd.isna(data_raw["high_risk_label"])].index)

153833

In [15]:
# drop rows which have T=1 but D=Nan
len_before=len(data_raw)
print(f"length before dropping:{len_before}")
data_cleaned = data_raw.drop(data_raw[data_raw["INSPcreated"] & pd.isna(data_raw["high_risk_label"])].index)
print(f"length after dropping:{len(data_cleaned)}")
print(f"% dropped rows which have T=1 but D=Nan:{(len_before-len(data_cleaned))/len_before}")
assert pd.isna(data_cleaned.loc[data_cleaned['INSPcreated']==1]['high_risk_label']).sum()==0

length before dropping:464817
length after dropping:310984
% dropped rows which have T=1 but D=Nan:0.33095390228842747


In [16]:
pd.isna(data_cleaned.loc[data_cleaned['INSPcreated']==1]['high_risk_label']).sum()

0

In [17]:
data_cleaned.reset_index(drop=True, inplace=True)
print(len(data_cleaned))

310984


In [18]:
# check base rates now
print(f" percent of T=1:{data_cleaned.loc[data_cleaned['INSPcreated']==1]['INSPcreated'].sum()*100/len(data_cleaned):.3f} %")
print(f" percent of D=1|T=1:{data_cleaned.loc[data_cleaned['INSPcreated']==1]['high_risk_label'].sum()*100/data_cleaned.loc[data_cleaned['INSPcreated']==1]['INSPcreated'].sum():.3f} %")

 percent of T=1:48.947 %
 percent of D=1|T=1:37.604 %


In [19]:
df_T_D_grouped = data_cleaned.groupby(["INSPcreated", "high_risk_label"], dropna=False)['IncidentGlobalID'].describe().reset_index()
df_T_D_grouped

Unnamed: 0,INSPcreated,high_risk_label,count,unique,top,freq
0,0,,158768,158768,CDF5FBB6-7961-4F4C-9895-91298EE3B8C3,1
1,1,0.0,94977,94977,6F4E1198-B200-4A2A-994F-70447B398618,1
2,1,1.0,57239,57239,FD316C22-C5B4-4982-977B-97F9049BACC0,1


In [20]:
feats_to_use=   ['SRtypeCleaned',
            'SRCategoryCleaned',
            'SRSourceCleaned',
            'BoroughCode',
            'SRCommunityBoardCleaned',
            'SRPriority',
            'Descriptor1Cleaned',
            'hanging_blocking_dummy',
            'year_month_reports',
            'year_month',
             'population'
            , 'income_per_capita'
            , 'median_rent_as_pct_household_income'
            , 'median_value_usd'
            , 'frac_white'
            , 'frac_black'
            , 'frac_native'
            , 'frac_asian'
            , 'frac_hawaiian_islander'
            , 'frac_other'
            , 'frac_hispanic'
            , 'frac_family'
            , 'frac_single'
            , 'frac_grad_degree'
            , 'frac_college_degree'
            , 'frac_no_hs_degree'
            , 'frac_public_assistance'
            , 'frac_rented'
            , 'logmedianhouseholdincome'
            , 'logdensity'
            ]        

In [21]:
print(f" Before dropping rows with all values nan:{len(data_cleaned)}")
data_cleaned=data_cleaned[~pd.isna(data_cleaned)].reset_index(drop=True)
print(f"After:{len(data_cleaned)}")

 Before dropping rows with all values nan:310984
After:310984


In [22]:
for f in feats_to_use:
    print(f" feature: {f} unique values are :{data_cleaned[f].unique()}")

 feature: SRtypeCleaned unique values are :['Limb Down - Sidewalk' 'Dead Tree Older Than 2 Years' 'Street Tree'
 'Poor Condition Hollow' 'Clear Utilities - Power Lines'
 'Poor Condition Cracked' 'Other' 'Tree Down - Other' 'Tree Down - House'
 'Tree Uprooted' 'Hanging Limb Other' 'Limb Down - Other'
 'Limb Down - Driveway' 'Branches Damaged' 'Tree Leaning'
 'Unauthorized Tree Pruning' 'Clear Road' 'Limb Down - Street'
 'Prune Dead Wood' 'Hanging Limb from other Branches' 'Blocked Sewer'
 'Tree Down - Sidewalk' 'Clear Building' 'Root Damage'
 'Dead Tree Planted Within 2 Years' 'Sidewalk Consultation'
 'Limb Down - Car' 'Clear Utilities - Other' 'Tree Down - Street'
 'Unauthorized Tree Removal' 'Hanging Limb from Wires' 'Tree Down - Car'
 'Limb Down - House' 'Tree Split' 'Trunk Damaged' 'Clear Street Light'
 'Clear Traffic - Other Sign' 'Tree Down - Driveway'
 'Trunk Damaged Bicycle Chained' 'Cracked Foundation'
 'Clear Utilities - Telephone Lines']
 feature: SRCategoryCleaned unique val

 feature: frac_single unique values are :[0.1314433  0.11494253 0.         ... 0.59826087 0.2739726  0.29657795]
 feature: frac_grad_degree unique values are :[0.07674944 0.21552322 0.05683564 ... 0.10115082 0.54155251 0.05174129]
 feature: frac_college_degree unique values are :[0.34537246 0.55093555 0.48387097 ... 0.49897541 0.44676617 0.17621777]
 feature: frac_no_hs_degree unique values are :[0.1241535  0.07623008 0.30568356 ... 0.14427861 0.75625    0.35530086]
 feature: frac_public_assistance unique values are :[0.07474227 0.15070243 0.04225352 ... 0.0973913  0.82705479 0.39353612]
 feature: frac_rented unique values are :[0.30927835 0.29757344 1.         ... 0.97904192 0.89192708 0.49391304]
 feature: logmedianhouseholdincome unique values are :[11.3682349          nan 11.0948917  ... 11.04188129  9.30291955
 10.03548026]
 feature: logdensity unique values are :[0.0071613  0.01406125 0.00755423 ... 0.03524371 0.00535158 0.01750418]


In [23]:
cat_feats=['SRtypeCleaned',
            'SRCategoryCleaned',
            'SRSourceCleaned',
            'BoroughCode',
            'SRCommunityBoardCleaned',
            'SRPriority',
            'Descriptor1Cleaned', 
            'hanging_blocking_dummy', #this is a dummy variables over a few SRtypes/SRcategories that seemed especially worrying
            'year_month_reports',
            'year_month']
numerical_feats=list(set(feats_to_use) - set(cat_feats)) 

In [24]:
print(data_cleaned.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 310984 entries, 0 to 310983
Columns: 114 entries, index to SRCommunityBoardCleaned
dtypes: float64(50), int64(7), object(57)
memory usage: 270.5+ MB
None


In [25]:
#check the values of lat/long for rows with census tract nan
print(f"number of rows with nan census tract:{pd.isna(data_cleaned['census_tract']).sum()}")
print(f" number of rows with nan:{pd.isna(data_cleaned[pd.isna(data_cleaned['census_tract'])][['Latitude','Longitude']]).sum()}")
print(data_cleaned[pd.isna(data_cleaned['census_tract'])][['Latitude','Longitude','census_tract']].head())

number of rows with nan census tract:24086
 number of rows with nan:Latitude     24078
Longitude    24078
dtype: int64
    Latitude  Longitude  census_tract
16       NaN        NaN           NaN
20       NaN        NaN           NaN
39       NaN        NaN           NaN
46       NaN        NaN           NaN
70       NaN        NaN           NaN


In [26]:
len_before=len(data_cleaned)
print(f"length before dropping:{len_before}")
data_cleaned=data_cleaned[~pd.isna(data_cleaned["census_tract"])].reset_index(drop=True)
print(f"length after dropping census tract:{len(data_cleaned)}")
print(f"% dropped with nan census tract:{(len_before-len(data_cleaned))/len_before}")
print(data_cleaned.info())

length before dropping:310984
length after dropping census tract:286898
% dropped with nan census tract:0.07745092995138013
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286898 entries, 0 to 286897
Columns: 114 entries, index to SRCommunityBoardCleaned
dtypes: float64(50), int64(7), object(57)
memory usage: 249.5+ MB
None


In [27]:
data_cleaned=data_cleaned[feats_to_use+["census_tract", "INSPcreated", "high_risk_label"]]
data_cleaned['T']=data_cleaned["INSPcreated"]
data_cleaned['D']=data_cleaned["high_risk_label"]
data_cleaned['D_and_T']=(data_cleaned['INSPcreated']==1) & (data_cleaned['high_risk_label']==1)
assert data_cleaned['D_and_T'].values.sum()==data_cleaned.loc[~pd.isna(data_cleaned['D']),'D'].values.sum()
print(data_cleaned.groupby(['T', 'D', 'D_and_T'], dropna=False)['T'].describe())
print(data_cleaned.info())

                  count  mean  std  min  25%  50%  75%  max
T D   D_and_T                                              
0 NaN False    145456.0   0.0  0.0  0.0  0.0  0.0  0.0  0.0
1 0.0 False     88328.0   1.0  0.0  1.0  1.0  1.0  1.0  1.0
  1.0 True      53114.0   1.0  0.0  1.0  1.0  1.0  1.0  1.0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286898 entries, 0 to 286897
Data columns (total 36 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   SRtypeCleaned                        286898 non-null  object 
 1   SRCategoryCleaned                    286898 non-null  object 
 2   SRSourceCleaned                      286898 non-null  object 
 3   BoroughCode                          286889 non-null  object 
 4   SRCommunityBoardCleaned              286898 non-null  object 
 5   SRPriority                           286898 non-null  object 
 6   Descriptor1Cleaned                   286898 non-

In [28]:
df_T_D_grouped = data_cleaned.groupby(["INSPcreated", "high_risk_label"], dropna=False)['T'].describe().reset_index()
df_T_D_grouped

Unnamed: 0,INSPcreated,high_risk_label,count,mean,std,min,25%,50%,75%,max
0,0,,145456.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,88328.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
2,1,1.0,53114.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0


In [29]:
data_cleaned['D_and_T'].values.sum(), data_cleaned.loc[~pd.isna(data_cleaned['D']),'D'].values.sum()

(53114, 53114.0)

In [30]:
# drop rows with numerical feats nan
def dropNa(df, feats_to_drop):
    """
    drop rows that contain Nan for specific features
    """
    for feat in feats_to_drop:
        print(f" number of rows with {feat} value as nan :{pd.isna(df[feat]).sum()}")
        df=df.loc[~pd.isna(df[feat])]
        assert df[feat].isna().sum()==0
        assert pd.isna(df[feat]).sum()==0
    return df
data_cleaned = dropNa(data_cleaned, numerical_feats)
data_cleaned=data_cleaned.reset_index(drop=True)

 number of rows with median_rent_as_pct_household_income value as nan :33533
 number of rows with frac_public_assistance value as nan :0
 number of rows with median_value_usd value as nan :32641
 number of rows with frac_native value as nan :0
 number of rows with frac_family value as nan :0
 number of rows with frac_white value as nan :0
 number of rows with frac_grad_degree value as nan :0
 number of rows with frac_black value as nan :0
 number of rows with frac_asian value as nan :0
 number of rows with frac_no_hs_degree value as nan :0
 number of rows with logdensity value as nan :0
 number of rows with logmedianhouseholdincome value as nan :10960
 number of rows with frac_hawaiian_islander value as nan :0
 number of rows with population value as nan :0
 number of rows with frac_other value as nan :0
 number of rows with frac_hispanic value as nan :0
 number of rows with income_per_capita value as nan :0
 number of rows with frac_college_degree value as nan :0
 number of rows with 

In [31]:
data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209764 entries, 0 to 209763
Data columns (total 36 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   SRtypeCleaned                        209764 non-null  object 
 1   SRCategoryCleaned                    209764 non-null  object 
 2   SRSourceCleaned                      209764 non-null  object 
 3   BoroughCode                          209755 non-null  object 
 4   SRCommunityBoardCleaned              209764 non-null  object 
 5   SRPriority                           209764 non-null  object 
 6   Descriptor1Cleaned                   209764 non-null  object 
 7   hanging_blocking_dummy               209764 non-null  int64  
 8   year_month_reports                   209764 non-null  int64  
 9   year_month                           209764 non-null  object 
 10  population                           209764 non-null  float64
 11  income_per_ca

In [32]:
print(f" percent of T=1:{data_cleaned.loc[data_cleaned['INSPcreated']==1]['INSPcreated'].sum()*100/len(data_cleaned):.3f} %")
print(f" percent of D=1|T=1:{data_cleaned.loc[data_cleaned['INSPcreated']==1]['high_risk_label'].sum()*100/data_cleaned.loc[data_cleaned['INSPcreated']==1]['INSPcreated'].sum():.3f} %")

 percent of T=1:49.408 %
 percent of D=1|T=1:38.125 %


In [33]:
# non nans in census tract field
assert pd.isna(data_cleaned['census_tract']).sum()==0

In [35]:
def splitFunc(df, groupName, test_size=0.5):
    """
    stratified split with groupName
    """
    df.reset_index(inplace=True)
    df_grouped=df[['index', groupName]].groupby([groupName], dropna=False).agg(['count']).reset_index()
    df_grouped.columns=[groupName, 'prop']
    df_grouped['prop']=df_grouped['prop'].apply(lambda x:x/len(df))
    df_grouped = df_grouped.sample(frac=1).reset_index(drop=True)
    
    cumProp=0
    i=0
    train_groups=[]
    
    while (cumProp<=(1-test_size)):
        cumProp+=df_grouped.iloc[i]['prop']
        train_groups.append(df_grouped.iloc[i][groupName])
        i+=1
    test_groups = set(df_grouped[groupName])-set(train_groups)
    
    assert len(np.intersect1d(np.array(train_groups), np.array(list(test_groups))))==0
    print(cumProp, i)
    
    train_idxs=np.array([])
    train_idxs=np.concatenate((train_idxs, np.array(df.loc[df[groupName].isin(train_groups)].index)))
    train_idxs = train_idxs.astype(int)
    
    idxs=np.arange(len(df))
    test_idxs=np.setdiff1d(idxs, train_idxs)
    
    print(len(train_idxs), len(test_idxs), len(idxs))
    assert len(np.intersect1d(train_idxs, test_idxs))==0
    return train_idxs, test_idxs

In [36]:
data_cleaned.head()

Unnamed: 0,SRtypeCleaned,SRCategoryCleaned,SRSourceCleaned,BoroughCode,SRCommunityBoardCleaned,SRPriority,Descriptor1Cleaned,hanging_blocking_dummy,year_month_reports,year_month,...,frac_public_assistance,frac_rented,logmedianhouseholdincome,logdensity,census_tract,INSPcreated,high_risk_label,T,D,D_and_T
0,Limb Down - Sidewalk,Hazard,3-1-1 Call Center,Brooklyn,318.0,Routine,Branch or Limb Has Fallen Down,0,7919,2020-06,...,0.074742,0.309278,11.368235,0.007161,360470700000.0,1,0.0,1,0.0,False
1,Street Tree,Plant Tree,Department of Parks and Recreation - Public We...,Queens,406.0,Routine,For One Address,0,4161,2015-04,...,0.155738,0.35041,11.098349,0.008272,360810700000.0,0,,0,,False
2,Dead Tree Older Than 2 Years,Remove Tree,3-1-1 Call Center,Brooklyn,306.0,Routine,Planted More Than 2 Years Ago,0,7183,2017-05,...,0.055556,0.628968,12.025354,0.006728,360470100000.0,1,0.0,1,0.0,False
3,Street Tree,Plant Tree,Department of Parks and Recreation - Public We...,Brooklyn,302.0,Routine,For One Address,0,6710,2018-10,...,0.0,0.471074,11.738179,0.007144,360470000000.0,0,,0,,False
4,Dead Tree Older Than 2 Years,Remove Tree,3-1-1 Call Center,Staten Island,501.0,Routine,Planted More Than 2 Years Ago,0,7607,2019-06,...,0.0,0.195592,12.047915,0.007697,360850100000.0,0,,0,,False


In [37]:
data_cleaned.iloc[:10].head()

Unnamed: 0,SRtypeCleaned,SRCategoryCleaned,SRSourceCleaned,BoroughCode,SRCommunityBoardCleaned,SRPriority,Descriptor1Cleaned,hanging_blocking_dummy,year_month_reports,year_month,...,frac_public_assistance,frac_rented,logmedianhouseholdincome,logdensity,census_tract,INSPcreated,high_risk_label,T,D,D_and_T
0,Limb Down - Sidewalk,Hazard,3-1-1 Call Center,Brooklyn,318.0,Routine,Branch or Limb Has Fallen Down,0,7919,2020-06,...,0.074742,0.309278,11.368235,0.007161,360470700000.0,1,0.0,1,0.0,False
1,Street Tree,Plant Tree,Department of Parks and Recreation - Public We...,Queens,406.0,Routine,For One Address,0,4161,2015-04,...,0.155738,0.35041,11.098349,0.008272,360810700000.0,0,,0,,False
2,Dead Tree Older Than 2 Years,Remove Tree,3-1-1 Call Center,Brooklyn,306.0,Routine,Planted More Than 2 Years Ago,0,7183,2017-05,...,0.055556,0.628968,12.025354,0.006728,360470100000.0,1,0.0,1,0.0,False
3,Street Tree,Plant Tree,Department of Parks and Recreation - Public We...,Brooklyn,302.0,Routine,For One Address,0,6710,2018-10,...,0.0,0.471074,11.738179,0.007144,360470000000.0,0,,0,,False
4,Dead Tree Older Than 2 Years,Remove Tree,3-1-1 Call Center,Staten Island,501.0,Routine,Planted More Than 2 Years Ago,0,7607,2019-06,...,0.0,0.195592,12.047915,0.007697,360850100000.0,0,,0,,False


In [38]:
train_idxs_original, test_idxs = splitFunc(data_cleaned, groupName='census_tract', test_size=0.2)
data_cleaned_copy = data_cleaned.copy()
train_idxs, val_idxs = splitFunc(data_cleaned_copy.iloc[train_idxs_original], groupName='census_tract', test_size=0.2)
train_idxs = data_cleaned_copy.iloc[train_idxs_original].index[train_idxs]
val_idxs = data_cleaned_copy.iloc[train_idxs_original].index[val_idxs]

0.8000562536946271 3215
167823 41941 209764
0.8001048723953196 2573
134276 33547 167823


In [39]:
train_groups=data_cleaned.iloc[train_idxs]['census_tract'].unique()
test_groups=data_cleaned.iloc[test_idxs]['census_tract'].unique()
val_groups=data_cleaned.iloc[val_idxs]['census_tract'].unique()
assert len(np.intersect1d(np.array(train_groups), np.array(list(test_groups))))==0
assert len(np.intersect1d(np.array(val_groups), np.array(list(test_groups))))==0
assert len(np.intersect1d(np.array(train_groups), np.array(list(val_groups))))==0

In [41]:


def normalize_data(df,  train_idxs, columns_to_norm, epsilon=1e-3):
    """"Normalizes the data as Goel et al do - continuous features only"""
    df_unnormed_train = df.iloc[train_idxs].copy()
    for feature_name in columns_to_norm:
        print(feature_name, np.std(df_unnormed_train[feature_name]))
        df[feature_name] = df[feature_name] - np.mean(df_unnormed_train[feature_name])
        df[feature_name]  = df[feature_name]  / np.std(df_unnormed_train[feature_name])
    df.reset_index(drop=True, inplace=True)
    return df

In [42]:
df_parks =normalize_data(data_cleaned.copy(),  train_idxs, numerical_feats)

median_rent_as_pct_household_income 10.316888028597292
frac_public_assistance 0.11297273794765782
median_value_usd 369853.65219405334
frac_native 0.015369078847561682
frac_family 0.16657864178393902
frac_white 0.3047917525087153
frac_grad_degree 0.12537346851484654
frac_black 0.29297078218747974
frac_asian 0.1912279566577609
frac_no_hs_degree 0.09904866430516707
logdensity 0.0072910984258922566
logmedianhouseholdincome 0.39922405050045834
frac_hawaiian_islander 0.00480495270714713
population 575.3269853057124
frac_other 0.11662029214911321
frac_hispanic 0.19024356406873158
income_per_capita 28984.97476081545
frac_college_degree 0.1914634745547241
frac_rented 0.22884129001626496
frac_single 0.1353489200515478


In [43]:

assert len(np.intersect1d(train_idxs, test_idxs))==0
assert len(np.intersect1d(train_idxs, val_idxs))==0
assert len(np.intersect1d(val_idxs, test_idxs))==0
assert len(np.intersect1d(val_idxs, train_idxs))==0

In [44]:
def get_ohe(df, cat_feats):
    for f in cat_feats:
        ohe_df=pd.get_dummies(df[f], prefix=f, drop_first=True).astype('bool')
        df=df.drop(f, axis=1)
        df=df.join(ohe_df)
        df.reset_index(drop=True, inplace=True)
    return df
df_parks=get_ohe(df_parks, cat_feats)

In [45]:
features=[]
for c in df_parks.columns:
    if c not in ['T', 'D', 'D_and_T', "census_tract", "INSPcreated", "high_risk_label", "index"]:
        features.append(c)
        assert df_parks[c].isna().sum()==0

In [None]:
#how to check on the features ?
pd.set_option('display.max_columns', None)
print(features, len(features))

In [47]:
os.chdir('/share/pierson/selective_labels_data/hirid_data_analysis/richras_dir/learning_from_doctor_and_patient/')
from AnalysisFuncs import saveFile, loadFile

In [48]:
%reload_ext autoreload
%autoreload 2

In [49]:
processed_data_path=osp.join(os.environ.get('OUT_PATH'), 'ParksInspection', 'New_Dataset')
if not os.path.exists(processed_data_path):
    os.makedirs(processed_data_path)
df_parks.to_csv(processed_data_path+'/processed_df.csv', index=False)
data_cleaned.to_csv(processed_data_path+'/df_cleaned.csv', index=False)
saveFile(processed_data_path, features, 'features.pkl')

## Initialiaze wandb run for storing data artifacts

In [50]:
run = wandb.init(project='Inspection', job_type='preprocess-data-12-30-2022', config={'wandb_nb':'wandb_Inspection'})

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mrr568[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [51]:
os.environ['WANDB_SILENT']="true"

In [52]:
# # Create a new artifact for the processed data, including the function that created it, to Artifacts
# processed_ds_art = wandb.Artifact(name='Inspection_processed', 
#                                     type='processed_dataset',
#                                     description='One-hot encoded + numerical features normalized dataset',
#                                     metadata={
#                                               'dropNa_fn': function_to_string(dropNa),
#                                               'getTrainTestIdx_fn': function_to_string(getTrainTestIdx),
#                                              'normalize_data_fn': function_to_string(normalize_data),
#                                               'get_ohe_fn': function_to_string(get_ohe),
#                                               'feats_to_use':feats_to_use,
#                                               'ohe_cols':cat_feats,
#                                               'numerical_feats':numerical_feats,
#                                               'SEED': SEED,
#                                                'train_idxs': train_idxs,
#                                                'val_idxs': val_idxs,
#                                                'test_idxs': test_idxs,}
#                                  )

# # Attach our processed data to the Artifact 
# processed_ds_art.add_file(processed_data_path+'/processed_df.csv')

# # Create a W&B Table and log 1000 random rows of the dataset to explore
# table_T_D = wandb.Table(dataframe=df_T_D_grouped)

# # Log the Table to your W&B workspace
# wandb.log({'grouped_dataset_T_D': table_T_D,
#           })

# # Log this Artifact to the current wandb run
# run.log_artifact(processed_ds_art)

# run.finish()

In [53]:
len(df_parks)

209764

In [54]:
train_X = np.array(df_parks.iloc[train_idxs][features].reset_index(drop=True))
assert ~pd.isna(df_parks.iloc[train_idxs][features].reset_index(drop=True)).isnull().values.any()
train_X=train_X.astype(float)
train_y_T = df_parks.iloc[train_idxs]['T'].values
assert np.isnan(train_y_T).sum()==0
train_y_T=train_y_T.astype(int)

val_X = np.array(df_parks.iloc[val_idxs][features].reset_index(drop=True).astype(float))
val_y_T = df_parks.iloc[val_idxs]['T'].values.astype(int)

train_cross_val_X = np.concatenate((train_X, val_X))
train_cross_val_y_T = np.concatenate((train_y_T, val_y_T))

test_X = np.array(df_parks.iloc[test_idxs][features].reset_index(drop=True).astype(float))
test_y_T = df_parks.iloc[test_idxs]['T'].values.astype(int)

saveFile(osp.join(processed_data_path,'predict_T'), train_X, '/train_X.npy')
saveFile(osp.join(processed_data_path,'predict_T'), train_y_T, '/train_y_T.npy')
saveFile(osp.join(processed_data_path,'predict_T'), val_X, '/val_X.npy')
saveFile(osp.join(processed_data_path,'predict_T'), val_y_T, '/val_y_T.npy')
saveFile(osp.join(processed_data_path,'predict_T'), train_cross_val_X, '/train_cross_val_X.npy')
saveFile(osp.join(processed_data_path,'predict_T'), train_cross_val_y_T, '/train_cross_val_y_T.npy')
saveFile(osp.join(processed_data_path,'predict_T'), test_X, '/test_X.npy')
saveFile(osp.join(processed_data_path,'predict_T'), test_y_T, '/test_y_T.npy')

train_X_D_given_T = np.array(df_parks.iloc[train_idxs].loc[df_parks['T']==1][features].reset_index(drop=True).astype(float))
train_y_D_given_T = df_parks.iloc[train_idxs].loc[df_parks['T']==1]['D'].values.astype(int)

val_X_D_given_T = np.array(df_parks.iloc[val_idxs].loc[df_parks['T']==1][features].reset_index(drop=True).astype(float))
val_y_D_given_T = df_parks.iloc[val_idxs].loc[df_parks['T']==1]['D'].values.astype(int)

test_X_D_given_T = np.array(df_parks.iloc[test_idxs].loc[df_parks['T']==1][features].reset_index(drop=True).astype(float))
test_y_D_given_T = df_parks.iloc[test_idxs].loc[df_parks['T']==1]['D'].values.astype(int)

train_cross_val_X_D_given_T = np.concatenate((train_X_D_given_T, val_X_D_given_T))
train_cross_val_y_D_given_T = np.concatenate((train_y_D_given_T, val_y_D_given_T))

saveFile(osp.join(processed_data_path,'predict_D_given_T'), train_X_D_given_T, '/train_X_D_given_T.npy')
saveFile(osp.join(processed_data_path,'predict_D_given_T'), train_y_D_given_T, '/train_y_D_given_T.npy')
saveFile(osp.join(processed_data_path,'predict_D_given_T'), val_X_D_given_T, '/val_X_D_given_T.npy')
saveFile(osp.join(processed_data_path,'predict_D_given_T'), val_y_D_given_T, '/val_y_D_given_T.npy')
saveFile(osp.join(processed_data_path,'predict_D_given_T'), train_cross_val_X_D_given_T,
                                                                 '/train_cross_val_X_D_given_T.npy')
saveFile(osp.join(processed_data_path,'predict_D_given_T'), train_cross_val_y_D_given_T, 
                                                                 '/train_cross_val_y_D_given_T.npy')
saveFile(osp.join(processed_data_path,'predict_D_given_T'), test_X_D_given_T, '/test_X_D_given_T.npy')
saveFile(osp.join(processed_data_path,'predict_D_given_T'), test_y_D_given_T, '/test_y_D_given_T.npy')

train_y_D_and_T = df_parks.iloc[train_idxs]['D_and_T'].values.astype(int)
val_y_D_and_T = df_parks.iloc[val_idxs]['D_and_T'].values.astype(int)
train_cross_val_y_D_and_T = np.concatenate((train_y_D_and_T, val_y_D_and_T))
test_y_D_and_T = df_parks.iloc[test_idxs]['D_and_T'].values.astype(int)

saveFile(osp.join(processed_data_path,'predict_D_and_T'), train_X, '/train_X.npy')
saveFile(osp.join(processed_data_path,'predict_D_and_T'), train_y_D_and_T, '/train_y_D_and_T.npy')
saveFile(osp.join(processed_data_path,'predict_D_and_T'), val_X, '/val_X.npy')
saveFile(osp.join(processed_data_path,'predict_D_and_T'), val_y_D_and_T, '/val_y_D_and_T.npy')
saveFile(osp.join(processed_data_path,'predict_D_and_T'), train_cross_val_X, '/train_cross_val_X.npy')
saveFile(osp.join(processed_data_path,'predict_D_and_T'), train_cross_val_y_D_and_T, '/train_cross_val_y_D_and_T.npy')
saveFile(osp.join(processed_data_path,'predict_D_and_T'), test_X, '/test_X.npy')
saveFile(osp.join(processed_data_path,'predict_D_and_T'), test_y_D_and_T, '/test_y_D_and_T.npy')

In [55]:
# assert on the dtype of train_X, train_y etc. -- should be all float and no nan -- going into fitting of the
# estimator

In [56]:
assert len(np.intersect1d(train_idxs, val_idxs))==0
assert len(np.intersect1d(test_idxs, val_idxs))==0
assert len(np.intersect1d(train_idxs, test_idxs))==0

In [57]:
tr=len(train_idxs)
v=len(val_idxs)
t=len(test_idxs)
print(f"train %:{tr*100/(tr+v+t):.2f}, val %:{v*100/(tr+v+t):.2f}, test %:{t*100/(tr+v+t):.2f}")

train %:64.01, val %:15.99, test %:19.99


In [58]:
saveFile(processed_data_path, test_idxs, '/test_idxs.npy')
saveFile(processed_data_path, train_idxs, '/train_idxs.npy')
saveFile(processed_data_path, val_idxs, '/val_idxs.npy')