In [28]:
import arcpy
from arcpy import env
import os
from arcgis import GIS
from arcgis.features import GeoAccessor
import pandas as pd
import numpy as np
from zipfile import ZipFile

# Geoprocessing environment: allow outputs to be overwritten and set the
# parallel processing factor to 90%.
env.overwriteOutput = True
env.parallelProcessingFactor = "90%"

# Never truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Reference snippets for round-tripping feature classes through pandas:
# pd.DataFrame.spatial.from_featureclass(???)
# df.spatial.to_featureclass(location=???,sanitize_columns=False)

## Create empty hdf

In [29]:
# Name (without extension) and path of the HDF5 store built by this notebook.
hdf_name = 'remm_data_2019_base_year_20220526'
new_hdf = os.path.join('.\\Outputs', hdf_name + '.h5')
print(new_hdf)

# Make sure the output folder exists so the store can be created on a fresh checkout.
os.makedirs(os.path.dirname(new_hdf), exist_ok=True)

# When this cell is re-run in a live kernel, the store opened by the previous
# run may still hold the file open. Close that handle first; `new_hdf` itself
# is only a path string and has nothing to close.
previous_store = globals().get('hdf')
if isinstance(previous_store, pd.HDFStore) and previous_store.is_open:
    previous_store.close()

# pd.HDFStore opens an existing file rather than replacing it, so a stale
# file from an earlier run has to be deleted explicitly.
if os.path.exists(new_hdf):
    os.remove(new_hdf)

# Create empty h5
hdf = pd.HDFStore(new_hdf)

.\Outputs\remm_data_2019_base_year_20220526.h5


## Add parcels

In [30]:
# Load the edited parcel table and derive the id columns that are written to the store.
parcels_csv = r'E:\REMM\Base_Year_Data_2019\v1\edits\parcels_20220513.csv'
# hafb_tazids = pd.read_csv(r'.\Inputs\Hill_Air_Force_Base_TAZID832.csv')

# Columns kept in the output, in output order.
parcel_columns = [
    'parcel_id_REMM', 'zone_id', 'CO_NAME', 'county_id', 'land_value',
    'x', 'y', 'Split', 'parcel_acres',
    'TAZID_832', 'TAZID_900', 'Old_PID', 'parent_parcel', 'elevation',
    'fwy_exit', 'airport', 'rail_depot', 'stream', 'trail', 'university',
    'shape_area', 'volume_one_way', 'volume_two_way', 'airport_distance',
    'fwy_exit_dist', 'raildepot_dist', 'university_dist', 'trail_dist',
    'stream_dist', 'train_station', 'rail_stn_dist', 'bus_rte_dist',
    'bus_stop', 'bus_stop_dist', 'volume_two_way_nofwy', 'distsml_id',
    'distmed_id', 'distlrg_id', 'zonal_ppa', 'parcel_id',
]

parcels = pd.read_csv(parcels_csv)
parcels = parcels.rename(columns={'COUNTY_ID': 'county_id'})
parcels = parcels.assign(
    parcel_id=parcels['parcel_id_REMM'],
    # crosswalked using old parcel id - parent parcel id relationship
    parent_parcel=parcels['parcel_id_REMM'],
    zone_id=parcels['TAZID_832'],
)

# shape_area is passed through exactly as stored in the CSV. The conversion
# below is currently disabled; the original note says the model is coded to use
# square feet and that parcel_acres is derived from shape_area.
# parcels['shape_area'] = parcels['shape_area'] * 10.7639
# parcels['shape_area'] = parcels['shape_area'].round(0).astype(int)

parcels = parcels[parcel_columns]

In [31]:
# Spot-check one parcel record by its REMM parcel id.
parcels.loc[parcels['parcel_id_REMM'] == 35823]

Unnamed: 0,parcel_id_REMM,zone_id,CO_NAME,county_id,land_value,x,y,Split,parcel_acres,TAZID_832,TAZID_900,Old_PID,parent_parcel,elevation,fwy_exit,airport,rail_depot,stream,trail,university,shape_area,volume_one_way,volume_two_way,airport_distance,fwy_exit_dist,raildepot_dist,university_dist,trail_dist,stream_dist,train_station,rail_stn_dist,bus_rte_dist,bus_stop,bus_stop_dist,volume_two_way_nofwy,distsml_id,distmed_id,distlrg_id,zonal_ppa,parcel_id
33496,35823,1001,Salt Lake,35,22651200.0,1531671.508,7444871.339,0.0,9.999832,1001,1134,43559.0,35823,1293.464774,0.0,0.0,0.0,0.0,0.0,0.0,435592.6886,0.0,656.0,4.80067,0.94616,3.16089,2.85429,1.02749,1.32079,0.0,0.124219,51.493291,0.0,0.104867,656.0,235,23,8,1284250.279,35823


In [32]:
# Write parcels in table format, indexed by parcel_id, with every column stored as a data column.
hdf.put(key='parcels', value=parcels.set_index('parcel_id'), format='table', data_columns=True)

## Add buildings

In [33]:
buildings = pd.read_csv(r'E:\REMM\Base_Year_Data_2019\v1\edits\buildings_20220513.csv')

# Crosswalk from the building_type_id codes in the CSV to the codes written to
# the store. Codes mapped to None become NaN and are dropped by the final filter.
buildings_lu = {0:None,
                1:1,
                2:2,
                3:3,
                4:4,
                5:5,
                6:6,
                7:7,
                8:8,
                9:6,
                10:8,
                11:8,
                12:None,
                13:5,
                14:None,
                15:None,
                16:None,
                99:None}

# remap building types
buildings['building_type_id'] = buildings['building_type_id'].map(buildings_lu)

######################################
# fill in some fake square footage and years (comment out later)
######################################

# Seeded generator so the placeholder values are identical on every run.
FAKE_VALUE_SEED = 2019
rng = np.random.default_rng(FAKE_VALUE_SEED)


def _missing_or_zero(series):
    """Return a boolean mask that is True where `series` is NaN or equal to 0."""
    return series.isna() | (series == 0)


# Masks on the remapped building type, reused by the fills below.
is_type3 = buildings['building_type_id'] == 3
is_residential = buildings['building_type_id'].isin([1, 2])

# fill in some fake numbers: random year in [1900, 2019) where the year is missing
needs_year = _missing_or_zero(buildings['year_built'])
buildings.loc[needs_year, 'year_built'] = rng.integers(1900, 2019, size=int(needs_year.sum()))

# Missing square footage: type 3 draws from a larger range than every other type.
needs_sqft = _missing_or_zero(buildings['building_sqft'])
type3_fill = needs_sqft & is_type3
other_fill = needs_sqft & ~is_type3
buildings.loc[type3_fill, 'building_sqft'] = rng.integers(10000, 200000, size=int(type3_fill.sum()))
buildings.loc[other_fill, 'building_sqft'] = rng.integers(1400, 14000, size=int(other_fill.sum()))

# Missing non-residential sqft: the whole building for types other than 1 and 2,
# zero for types 1 and 2.
needs_nonres = _missing_or_zero(buildings['non_residential_sqft'])
buildings.loc[needs_nonres & ~is_residential, 'non_residential_sqft'] = buildings['building_sqft']
buildings.loc[needs_nonres & is_residential, 'non_residential_sqft'] = 0

# Missing unit counts and non-residential prices default to zero.
buildings['residential_units'] = buildings['residential_units'].fillna(0)
buildings['unit_price_non_residential'] = buildings['unit_price_non_residential'].fillna(0)

# Fill value for types 1 and 2 with no price. The mean is taken over finite
# values only: a single +/-inf would otherwise make the mean inf (or NaN), and
# the inf cleanup below would then turn every filled price into 0.
mean = buildings['res_price_per_sqft'].replace([np.inf, -np.inf], np.nan).mean()
needs_price = _missing_or_zero(buildings['res_price_per_sqft'])
buildings.loc[needs_price & is_residential, 'res_price_per_sqft'] = mean

# Replace any remaining +/-inf, in every column, with 0.
buildings = buildings.replace([np.inf, -np.inf], 0)

# Building types other than 1 and 2 carry no residential price.
needs_price = _missing_or_zero(buildings['res_price_per_sqft'])
buildings.loc[needs_price & ~is_residential, 'res_price_per_sqft'] = 0
####################################################

# subset to buildings with a building type (NaN types fail the comparison and are dropped)
buildings = buildings[buildings['building_type_id'] >= 1]
buildings.columns

Index(['building_id', 'building_sqft', 'building_type_id',
       'non_residential_sqft', 'note', 'parcel_id', 'residential_units',
       'stories', 'unit_price_non_residential', 'year_built',
       'res_price_per_sqft', 'job_spaces'],
      dtype='object')

In [34]:
# Write buildings in table format, indexed by building_id, with every column stored as a data column.
hdf.put(key='buildings', value=buildings.set_index('building_id'), format='table', data_columns=True)

## Add jobs

In [35]:
# Load the edited jobs table.
jobs = pd.read_csv(r'E:\REMM\Base_Year_Data_2019\v1\edits\jobs_20220513.csv')
# Show the column names as a quick schema check.
jobs.columns

Index(['jobs_id', 'building_id', 'cid', 'sector_id'], dtype='object')

In [36]:
# Write jobs in table format, indexed by jobs_id, with every column stored as a data column.
hdf.put(key='jobs', value=jobs.set_index('jobs_id'), format='table', data_columns=True)

## Add households

In [37]:
# Load the edited households table.
households = pd.read_csv(r'E:\REMM\Base_Year_Data_2019\v1\edits\households_20220513.csv')
# Show the column names as a quick schema check.
households.columns

Index(['household_id', 'cars', 'household_type_id', 'persons', 'income',
       'workers', 'children', 'age_of_head', 'race_id', 'familyhh', 'block_id',
       'cid', 'building_id'],
      dtype='object')

In [38]:
# Write households in table format with every column stored as a data column.
# NOTE(review): unlike the other tables written in this notebook, this frame is
# stored with its default index, so household_id stays an ordinary column
# rather than becoming the index. Confirm this is what the consumer of the
# store expects.
hdf.put(key='households', value=households, format='table', data_columns=True)

## Add travel data

In [39]:
# Load the zone-to-zone travel data.
# NOTE(review): the file name says 2015 while the store is named for the 2019
# base year — confirm this is the intended input.
travel_data = pd.read_csv(r'E:\REMM\Base_Year_Data_2019\v1\travel_data_2015.csv')
# Show the column names as a quick schema check.
travel_data.columns

Index(['from_zone_id', 'to_zone_id', 'travel_time', 'travel_time_transit',
       'log0', 'log1', 'log2'],
      dtype='object')

In [40]:
# Write travel data in table format, indexed by the (from_zone_id, to_zone_id) pair,
# with every column stored as a data column.
zone_pair_index = ['from_zone_id', 'to_zone_id']
hdf.put(key='travel_data', value=travel_data.set_index(zone_pair_index), format='table', data_columns=True)

## Add zoning baseline

In [41]:
# Load the zoning baseline from the edited CSV; the commented-out line below
# reads a differently named file from the parent folder and is not used.
# zoning_baseline = pd.read_csv(r'E:\REMM\Base_Year_Data_2019\v1\zoning_baseline2019.csv')
zoning_baseline = pd.read_csv(r'E:\REMM\Base_Year_Data_2019\v1\edits\zoning_baseline_20220513.csv')
# Show the column names as a quick schema check.
zoning_baseline.columns

Index(['parcel_id', 'max_dua', 'max_far', 'max_height', 'type1', 'type2',
       'type3', 'type4', 'type5', 'type6', 'type7', 'type8'],
      dtype='object')

In [42]:
# Write the zoning baseline in table format, indexed by parcel_id, with every column stored as a data column.
hdf.put(key='zoning_baseline', value=zoning_baseline.set_index('parcel_id'), format='table', data_columns=True)

In [43]:
# close the h5 so the file handle is released before the file is zipped
hdf.close()

In [44]:
# zip it up for distribution
# The context manager closes the archive deterministically, even if write()
# raises, instead of leaving finalisation of the zip to garbage collection.
zip_path = os.path.join('.\\Outputs', hdf_name + '.zip')
with ZipFile(zip_path, mode='w') as archive:
    archive.write(new_hdf, arcname=hdf_name + '.h5')

In [45]:
# # Read in hdf5 
# store1 = pd.HDFStore('.\\Data\\remm_data_2015_base_year_02082019.h5')
# tables = list(store1.keys())
# tables

In [46]:
# store1.info()

In [47]:
# store1['buildings'].head(5)
# store1['buildings_for_estimation']
# store1['buildings_for_estimation_grouped'].head(5)
# store1['employment_controls'].head(5)
# store1['household_controls'].head(5)
# store1['household_for_estimation'].head(5)
# store1['households'].head(5)
# store1['households_for_estimation'].head(5)
# store1['households_for_estimation1'].head(5)
# store1['jobs'].head(5)
# store1['parcels'].head(5)
# store1['travel_data'].head(5)
# store1['valid_parcels'].head(5)
# store1['zoning'].head(5)
# store1['zoning_base_line'].head(5)
# store1['zoning_baseline'].head(5)
# store1['zoning_for_parcels'].head(5)

In [48]:
# # Read in hdf5 
# store2 = pd.HDFStore('.\\Results\\remm_data_2015_base_year_09102020.h5')
# tables = list(store2.keys())
# tables

In [49]:
# store2['buildings'].head(5)
# store2['buildings_for_estimation']
# store2['buildings_for_estimation_grouped'].head(5)
# store2['employment_controls'].head(5)
# store2['household_controls'].head(5)
# store2['household_for_estimation'].head(5)
# store2['households'].head(5)
# store2['households_for_estimation'].head(5)
# store2['households_for_estimation1'].head(5)
# store2['jobs'].head(5)
# store2['parcels'].head(5)
# store2['travel_data'].head(5)
# store2['valid_parcels'].head(5)
# store2['zoning'].head(5)
# store2['zoning_base_line'].head(5)
# store2['zoning_baseline'].head(5)
# store2['zoning_for_parcels'].head(5)