## Data structure diagnostics

Sam Maurer, August 2015

In [1]:
# Goal is to put together a flowchart of what data is used for 
# each model, and where it comes from

In [2]:
%load_ext autoreload
%autoreload 2
# Move to the parent directory only once per kernel: `sim` is bound by the last
# import below, so when this cell is re-run the name already exists and the
# chdir is skipped (otherwise every re-run would climb one directory higher).
if 'sim' not in globals():
    import os; os.chdir('..')
# NOTE(review): `models` and `datasources` are never referenced by name in the
# visible cells -- presumably they live in the parent directory and are imported
# for their registration side effects on `sim`; confirm.
import models
import datasources
import urbansim.sim.simulation as sim

In [None]:
# Generate the network vars -- there may also be other columns that don't show up
# until particular models are run...

In [3]:
%%capture
# Run only the "neighborhood_vars" model. The trailing semicolon keeps the
# call's return value from being echoed as cell output.
sim.run(["neighborhood_vars"]);

## List tables and columns

In [4]:
# Show the names of all tables known to `sim`.
sim.list_tables()

['zones',
 'buildings',
 'residential_units',
 'jobs',
 'zoning_test',
 'household_controls',
 'craigslist',
 'parcels_geography',
 'households',
 'costar',
 'household_extras',
 'homesales',
 'employment_controls',
 'logsums',
 'nodes',
 'zoning_baseline',
 'parcels']

In [5]:
# For every table known to `sim`, print two column lists:
#   1. the columns coded in the table definition (`local_columns`), and
#   2. the columns registered separately against that table name.
#
# Written so the cell runs unchanged on Python 2.6+ and Python 3:
# `except ... as` replaces the Python-2-only `except Exception, e`, and every
# print is a single-argument call, which behaves identically in both versions.

for table_name in sim.list_tables():
    print(table_name.upper())
    print("\nfrom table definition:")
    try:
        # Fetching the table can trigger its computation and may fail; show
        # the error message and carry on with the remaining tables.
        print(sim.get_table(table_name).local_columns)
    except Exception as e:
        print(e)
    print("\nregistered separately:")
    # list_columns() entries are indexed as (table name, column name).
    # It is queried after get_table() on purpose, in case loading the table
    # changes what is registered.
    print([entry[1] for entry in sim.list_columns() if entry[0] == table_name])
    print("")

ZONES

from table definition:
['gid', 'tract', 'area', 'acres']

registered separately:
[]

BUILDINGS

from table definition:
['parcel_id', 'residential_units', 'residential_sqft', 'non_residential_sqft', 'building_sqft', 'stories', 'year_built', 'redfin_sale_year', 'building_type_id', 'residential_price', 'non_residential_price']

registered separately:
['sqft_per_unit', 'node_id', 'general_type', 'residential_price', 'vacant_residential_units', 'residential_rent', 'job_spaces', 'vacant_job_spaces', 'sqft_per_job', 'lot_size_per_unit', 'zone_id']

RESIDENTIAL_UNITS

from table definition:
Initial unit tenure assignment: 56% owner occupied, 4% unfilled
['building_id', 'deed_restricted', 'num_units', 'unit_num', 'unit_residential_price', 'unit_residential_rent', 'unit_tenure']

registered separately:
['submarket_id', 'vacant_units']

JOBS

from table definition:
['sector_id', 'taz', 'building_id']

registered separately:
['empsix_id', 'node_id', 'zone_id', 'naics', 'empsix']

ZONING_TES

  data = self._reader.read(nrows)


## List models, injectables, broadcasts

In [4]:
# Show the names of all models known to `sim`.
sim.list_models()

['households_transition',
 'pusher',
 'diagnostic_output',
 'calc_prop_taxes',
 'feasibility',
 'hlcm_estimate',
 'jobs_relocation',
 'clear_cache',
 'price_vars',
 'simple_jobs_transition',
 'residential_developer',
 'rsh_simulate',
 'jobs_transition',
 'nrh_estimate',
 'nrh_simulate',
 'simple_households_transition',
 'neighborhood_vars',
 'hlcm_simulate',
 'non_residential_developer',
 'travel_model_output',
 'subsidized_residential_developer',
 'elcm_simulate',
 'elcm_estimate',
 'rsh_estimate',
 'households_relocation']

In [5]:
# Show the names of all injectables known to `sim`.
sim.list_injectables()

['building_type_map',
 'run_number',
 'uuid',
 'scenario',
 'settings',
 'parcel_is_allowed_func',
 'aggregations',
 'summary',
 'parcel_average_price',
 'scenario_inputs',
 'parcel_sales_price_sqft_func',
 'form_to_btype_func',
 'supply_and_demand_multiplier_func',
 'add_extra_columns_func',
 'building_sqft_per_job',
 'net',
 'acct_settings',
 'store',
 'coffer']

In [6]:
# Show the broadcasts known to `sim` as pairs of table names.
# NOTE(review): presumably each pair is (cast table, onto table) -- confirm
# against the urbansim documentation.
sim.list_broadcasts()

[('logsums', 'parcels'),
 ('parcels_geography', 'buildings'),
 ('buildings', 'households'),
 ('nodes', 'parcels'),
 ('nodes', 'homesales'),
 ('nodes', 'buildings'),
 ('parcels', 'buildings'),
 ('logsums', 'costar'),
 ('logsums', 'buildings'),
 ('nodes', 'costar'),
 ('logsums', 'homesales'),
 ('buildings', 'jobs')]

## Dig into particular data series as needed

In [8]:
# Materialize the 'households' table as a frame for ad-hoc inspection below.
hh = sim.get_table('households').to_frame()

In [15]:
# Total number of rows in the households frame.
len(hh)

2732722

In [11]:
# Summary statistics for the 'white', 'black', 'asian' and 'hisp' columns
# (they appear to be 0/1 flags, judging by the saved min/max values).
# A single-argument print(...) behaves the same on Python 2 and Python 3.
# NOTE(review): the saved output reports count = 2608019, while len(hh) shows
# 2732722. describe() counts only non-null values, so either roughly 125k rows
# lack these fields or the saved output is stale (this cell's execution number
# is lower than the len(hh) cell's) -- confirm on a fresh Restart & Run All.
print(hh[['white','black','asian','hisp']].describe())

                white           black           asian            hisp
count  2608019.000000  2608019.000000  2608019.000000  2608019.000000
mean         0.686555        0.080798        0.174240        0.129447
std          0.463894        0.272524        0.379315        0.335694
min          0.000000        0.000000        0.000000        0.000000
25%          0.000000        0.000000        0.000000        0.000000
50%          1.000000        0.000000        0.000000        0.000000
75%          1.000000        0.000000        0.000000        0.000000
max          1.000000        1.000000        1.000000        1.000000
