In [None]:
# Sam Maurer, June 2015

In [14]:
%load_ext autoreload
%autoreload 2
import os

import numpy as np
import pandas as pd
import urbansim.sim.simulation as sim
from urbansim.utils import misc

import models

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### 1. Figure out how the hedonics are currently estimated

In [2]:
# Fetch the object registered under the 'store' injectable (an HDFStore, per
# the output below) and echo it to list the tables it contains.
s = sim.get_injectable("store")
s

<class 'pandas.io.pytables.HDFStore'>
File path: ./data/bayarea_v3.h5
/apartments                    frame        (shape->[4327,6])    
/buildings                     frame        (shape->[1841288,18])
/costar                        frame        (shape->[71520,23])  
/homesales                     frame        (shape->[220037,14]) 
/households                    frame        (shape->[2732722,47])
/jobs                          frame        (shape->[3386491,3]) 
/nets                          frame        (shape->[633973,7])  
/parcels                       frame        (shape->[1951911,15])
/zones                         frame        (shape->[1454,4])    
/zoning                        frame        (shape->[42311,19])  
/zoning_for_parcels            frame        (shape->[1951911,1]) 

In [None]:
# Where does the data for hedonic estimation come from?
# In rsh.yaml, the model expression is: 
'''
np.log(price_per_sqft) ~ I(year_built < 1940) + I(year_built > 2000)
    + np.log1p(sqft_per_unit) + ave_income + stories + poor + renters + sfdu + autoPeakTotal
    + transitPeakTotal + autoOffPeakRetail + ave_lot_size_per_unit + sum_nonresidential_units
    + sum_residential_units
'''

In [27]:
# Column names of the 'costar' table in the store.
s['costar'].columns.values

array(['X', 'Y', 'PropertyType', 'Res', 'City', 'county',
       'averageweightedrent', 'rentable_building_area', 'building_class',
       'parking_ratio', 'percent_leased', 'number_of_stories',
       'year_built', 'zoning', 'fname', 'same', 'two', 'ok', 'parcel_id',
       '_node_id0', '_node_id1', '_node_id2', '_node_id'], dtype=object)

In [5]:
# Column names of the 'homesales' table in the store.
s['homesales'].columns.values

array(['X', 'Y', 'City', 'Lot_size', 'SQft', 'Year_built', 'Sale_price',
       'parcel_id', 'sales', 'Sale_price_flt', '_node_id0', '_node_id1',
       '_node_id2', '_node_id'], dtype=object)

In [6]:
# Column names of the 'parcels' table in the store.
s['parcels'].columns.values

array(['development_type_id', 'land_value', 'acres', 'county_id',
       'zone_id', 'zoning_id', 'proportion_undevelopable',
       'tax_exempt_status', 'apn', 'parcel_id_local', 'geom_id',
       'imputation_flag', 'x', 'y', 'shape_area'], dtype=object)

In [7]:
# Column names of the 'buildings' table in the store.
s['buildings'].columns.values

array(['parcel_id', 'development_type_id', 'improvement_value',
       'residential_units', 'residential_sqft', 'sqft_per_unit',
       'non_residential_sqft', 'building_sqft', 'nonres_rent_per_sqft',
       'res_price_per_sqft', 'stories', 'year_built', 'redfin_sale_price',
       'redfin_sale_year', 'redfin_home_type', 'costar_property_type',
       'costar_rent', 'building_type_id'], dtype=object)

In [None]:
# Many of the inputs come from the neighborhood_vars model, which does network aggregation
# and stores its results in the 'nodes' table - and others are defined in variables.py
'''
price_per_sqft:              homesales -- not the /homesales frame stored in the h5 file;
                                 this table is constructed on the fly from the buildings
                                 table (buildings > redfin_sale_price and sqft_per_unit)
year_built:                  buildings
sqft_per_unit:               buildings dynamic column
ave_income:                  nodes, from households > income
stories:                     buildings
poor:                        nodes, from households > persons
renters:                     nodes, from households > tenure
sfdu:                        nodes, from buildings > building_type_id
autoPeakTotal:               logsums
transitPeakTotal:            logsums
autoOffPeakRetail:           logsums
ave_lot_size_per_unit:       nodes, from buildings dynamic column
sum_nonresidential_units:    nodes, from buildings dynamic column
sum_residential_units:       nodes, from buildings > residential_units
'''

### 2. Bring in Craigslist data as a separate table, and link

In [None]:
# Craigslist gives us x,y coordinates, but they're not accurate enough to link
# a listing to a specific parcel. Probably the best approach is to set up a new
# table for the Craigslist (CL) data, and then use a broadcast to link it to the
# nodes and logsums tables.

In [15]:
# Load the Craigslist listings CSV from the project data directory.
# `os` comes from the import cell at the top of the notebook; without that
# import this cell raises NameError on a fresh kernel (Restart & Run All).
craigslist_csv = os.path.join(misc.data_dir(), "sfbay_craigslist.csv")
df = pd.read_csv(craigslist_csv)

In [16]:
# Summary statistics for the raw Craigslist listings.
# NOTE(review): in the saved output below the extremes look like bad records
# (price max ~26.7M, sqft max ~8.4M, latitude from -85.6 to 79.9, longitude
# from -159.6 to 146.4), so the data presumably needs filtering before it is
# used for estimation -- TODO confirm.
df.describe()

Unnamed: 0,price,bedrooms,sqft,longitude,latitude,price_sqft
count,73644.0,68812.0,73644.0,73644.0,73644.0,73644.0
mean,3806.642944,1.918575,1196.62111,-122.108903,37.639416,4.697802
std,146802.05571,0.935159,31192.63512,1.35975,0.863417,183.768413
min,1.0,1.0,1.0,-159.609375,-85.561269,0.000154
25%,1831.0,1.0,720.0,-122.286807,37.383128,2.055256
50%,2330.5,2.0,904.0,-122.050473,37.601374,2.587519
75%,2986.0,2.0,1160.0,-121.95227,37.844037,3.135714
max,26722744.0,8.0,8388607.0,146.425781,79.858833,31145.389277


In [17]:
# Attach a network node id to every listing from its longitude/latitude,
# following the same approach datasources.py uses to link x,y to nodes.
net = sim.get_injectable("net")
df['_node_id'] = net.get_node_ids(df.longitude, df.latitude)

In [19]:
# Sanity-check the node assignment. In the saved output below the count is
# 73644, the same as the price count reported by df.describe().
# NOTE(review): the dtype is shown as float64 although the ids look integral
# -- confirm whether an integer dtype is expected downstream.
df['_node_id'].describe()

count     73644.000000
mean     156857.443906
std      100783.186320
min          62.000000
25%       80038.000000
50%      149462.000000
75%      190924.000000
max      354659.000000
Name: _node_id, dtype: float64

In [25]:
# Peek at the first five listings, now including the '_node_id' column.
df.head()

Unnamed: 0,neighborhood,price,bedrooms,date,sqft,longitude,latitude,price_sqft,_node_id
0,twin peaks / diamond hts,4500,2,2014-05-11,1200,-122.4383,37.745,3.75,300155
1,sunnyvale,2650,2,2014-05-11,1040,-122.008131,37.353699,2.548077,143129
2,glen park,3100,2,2014-05-11,1000,-122.439743,37.731584,3.1,125905
3,redwood city,1850,1,2014-05-11,792,-122.234294,37.491715,2.335859,143879
4,walnut creek,1325,1,2014-05-11,642,-122.087751,37.923448,2.063863,70923
