In [500]:
from bps import base
import pandas as pd
import numpy as np
from us import states
import functools

import altair as alt

In [12]:
# Raise pandas' display limits so long and wide frames are shown in full.
pd.options.display.max_rows = 200
pd.options.display.max_columns = 50

In [467]:
# Re-import bps.base so edits to the module take effect without restarting
# the kernel. The `imp` module was deprecated in Python 3.4 and removed in
# Python 3.12; importlib.reload is the supported replacement.
import importlib
importlib.reload(base)

<module 'bps.base' from '/Users/sidharthkapur/personal-workspace/building-permits-survey/bps/base.py'>

# Download data

In [468]:
# West region: one place-level annual load per survey year, 1980 through 2018.
west_dfs = list(map(
    lambda survey_year: base.load_data(
        scale='place', time_scale='annual', region='west', year=survey_year,
    ),
    range(1980, 2019),
))

In [469]:
# South region: one place-level annual load per survey year, 1980 through 2018.
south_dfs = list(map(
    lambda survey_year: base.load_data(
        scale='place', time_scale='annual', region='south', year=survey_year,
    ),
    range(1980, 2019),
))

In [470]:
# Midwest region: one place-level annual load per survey year, 1980 through 2018.
midwest_dfs = list(map(
    lambda survey_year: base.load_data(
        scale='place', time_scale='annual', region='midwest', year=survey_year,
    ),
    range(1980, 2019),
))

In [471]:
# Northeast region: one place-level annual load per survey year, 1980 through 2018.
northeast_dfs = list(map(
    lambda survey_year: base.load_data(
        scale='place', time_scale='annual', region='northeast', year=survey_year,
    ),
    range(1980, 2019),
))

In [507]:
# Stack every region's yearly loads into a single table. The index is not
# reset, so row labels from the individual loads are carried over as-is.
data = pd.concat([*west_dfs, *south_dfs, *midwest_dfs, *northeast_dfs])

# Clean up/prepare data

In [508]:
# Inspect the distinct raw 'Survey Date' values before any clean-up.
pd.unique(data['Survey Date'])

array(['8099', '\x1a', '8199', '8299', '8399', '8499', '8599', '8699',
       '8799', '8899', '8999', '9099', '9199', '9299', '9399', '9499',
       '9599', '9699', '9799', '9899', 1999, 2000, 2001, 2002, 2003, 2004,
       2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015,
       2016, 2017, 2018], dtype=object)

In [509]:
# Lookup from the 'YY99' strings seen for the 1980-1998 surveys
# (e.g. '8099') to the matching four-digit year string (e.g. '1980').
broken_dates_map = dict(
    ('{:2d}99'.format(yy), '19{:2d}'.format(yy))
    for yy in range(80, 99)
)


def fix_date(date):
    """Return the four-digit year string for a 'YY99' survey date.

    Any value that is not a key of ``broken_dates_map`` is returned unchanged.
    """
    return broken_dates_map.get(date, date)
# Rewrite the 'Survey Date' column in place, value by value, through fix_date.
data['Survey Date'] = data['Survey Date'].apply(fix_date)

In [510]:
def get_state(state_code):
    """Translate a numeric state code into a state abbreviation.

    Parameters
    ----------
    state_code : number or None
        A value from the 'State Code' column; may be missing (NaN or None).

    Returns
    -------
    str or None
        The ``abbr`` attribute of the object returned by ``states.lookup``,
        or None when the code is missing or the lookup finds nothing.
    """
    # pd.isna accepts None / pd.NA as well as NaN; np.isnan raises TypeError
    # on those, which would abort the whole column-wide apply.
    if pd.isna(state_code):
        return None
    # The lookup key is the zero-padded two-digit form of the code (6 -> '06').
    state = states.lookup('{:02d}'.format(int(state_code)))
    return state.abbr if state else None
    
# Derive the 'State' column by passing each 'State Code' through get_state.
data['State'] = data['State Code'].map(get_state)

In [511]:
# Distinct (place, state, CBSA code) combinations among the rows whose
# 'Survey Date' equals the integer 2018.
cbsa_2018 = data.loc[
    data['Survey Date'] == 2018,
    ['Place Name', 'State', 'CBSA Code'],
].drop_duplicates()

In [512]:
# Mapping keyed by (place name, state) giving that pair's 2018 CBSA code.
cbsa_2018_dict = (
    cbsa_2018
    .set_index(['Place Name', 'State'])
    ['CBSA Code']
    .to_dict()
)

In [513]:
# Tag every row with the CBSA code its (place, state) pair maps to in
# cbsa_2018_dict; pairs with no entry have no match (dict.get yields None).
# Only the two key columns are handed to apply, so each per-row Series is
# built from two values instead of from every column of the frame.
data['2018 CBSA'] = data[['Place Name', 'State']].apply(
    lambda row: cbsa_2018_dict.get((row['Place Name'], row['State'])),
    axis=1,
)

In [514]:
# Re-inspect the distinct 'Survey Date' values at this point in the clean-up.
pd.unique(data['Survey Date'])

array(['1980', '\x1a', '1981', '1982', '1983', '1984', '1985', '1986',
       '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994',
       '1995', '1996', '1997', '1998', 1999, 2000, 2001, 2002, 2003, 2004,
       2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015,
       2016, 2017, 2018], dtype=object)

In [515]:
# Parse 'Survey Date' as a four-digit year; errors='coerce' turns any value
# that does not parse under '%Y' into NaT instead of raising.
data['Year'] = pd.to_datetime(
    data['Survey Date'],
    errors='coerce',
    format='%Y',
)

In [516]:
# Numeric population: strip whitespace from the raw text column, turn
# empty strings into None, then cast the whole column to float.
data['Population'] = (
    data['Pop   ']
    .str.strip()
    .map(lambda text: text or None)
    .astype(float)
)

In [517]:
# Mapping keyed by (place name, state) giving the 'Population' value from
# the rows selected by the query 'Year == 2018'.
population_by_place = (
    data
    .query('Year == 2018')
    .set_index(['Place Name', 'State'])
    ['Population']
    .to_dict()
)

In [518]:
# Attach to every row the population_by_place entry for its (place, state)
# pair; pairs with no entry have no match (dict.get yields None).
data['2018 Population'] = data[['Place Name', 'State']].apply(
    lambda place_state: population_by_place.get(
        (place_state['Place Name'], place_state['State'])
    ),
    axis=1,
)

In [519]:
# Show the five distinct (place name, population) rows with the largest population.
(
    data[['Place Name', 'Population']]
    .sort_values(by='Population', ascending=False)
    .drop_duplicates()
    .head(5)
)

Unnamed: 0,Place Name,Population
1819,Bronx borough,8008278.0
1820,Brooklyn borough,8008278.0
2841,Staten Island borough,8008278.0
2685,Queens borough,8008278.0
2393,Manhattan borough,8008278.0


In [520]:
# '5+ units Units' divided by the 2018 population, scaled to a per-1000 rate.
data['5-unit per 1000 population'] = (
    data['5+ units Units']
    .div(data['2018 Population'])
    .mul(1000)
)

# Make plots

In [539]:
# Places to plot, keyed by state abbreviation. Each entry is matched against
# the 'Place Name' column by exact equality, so spelling must match the data.
# Every list item needs its own trailing comma: adjacent string literals
# without one are silently concatenated into a single (wrong) place name.
cities = {
    'CA': [
        'San Francisco', 'Los Angeles', 'Irvine',
        'Long Beach', 'Oakland', 'Sacramento', 'Glendale',
        'Berkeley', 'San Jose', 'Mountain View',
        'Palo Alto', 'Davis', 'Fremont', 'Dublin',
        'Emeryville',
        'Contra Costa Centre',
        'Walnut Creek',
        'San Mateo',
        'Redwood City',
        'Milpitas', 'Campbell', 'Santa Clara',
        'Cupertino',
        'Culver City',
        'Pasadena',
        'West Hollywood',
        'Santa Monica',
        'Santa Ana',
        'Newport Beach',
        'Inglewood',
        'Burbank',
        'Azusa',
    ],
    'TX': [
        'Dallas', 'Plano', 'Frisco', 'Richardson', 'Fort Worth', 'The Colony',
        'Allen', 'McKinney', 'Irving', 'Garland', 'Addison', 'Highland Park',
        'University Park',

        # The comma after 'Sugarland' was missing, which fused it with
        # 'Austin' into 'SugarlandAustin' and dropped both from the selection.
        'Houston', 'The Woodlands', 'Woodlands', 'Katy', 'Sugarland',

        'Austin',
    ],
    'WA': [
        'Seattle', 'Tacoma',
        'Kent', 'Seatac',
        'Bellevue', 'Redmond',
    ],
    # 'UT': ['Salt Lake City'],
    # 'NV': ['Las Vegas', 'Reno'],
    'CO': ['Denver', 'Boulder'],
    'OR': ['Portland'],
#     'NY': ['Hicksville'],
#     'CT': ['Stamford'],
    'IL': ['Chicago'],
}

In [540]:
# One boolean mask per (state, place) entry in `cities`, true where both the
# place name and the state match exactly.
row_indices = [
    data['Place Name'].eq(place) & data['State'].eq(state)
    for state, places in cities.items()
    for place in places
]
# OR the masks together and keep every row that matches any listed place.
to_plot = data.loc[
    functools.reduce(lambda combined, mask: combined | mask, row_indices)
]

In [544]:
# Spot-check: display the selected rows whose place name is 'Highland Park'.
to_plot.loc[to_plot['Place Name'].eq('Highland Park')]

Unnamed: 0,Survey Date,State Code,6-Digit ID,County Code,MSA/CMSA,PMSA Code,Region Code,Division Code,Number of Months Rep,Unnamed: 10,Place Name,Bldgs,1-unit Units,1-unit Value,1-unit Bldgs,2-units Units,2-units Value,2-units Bldgs,3-4 units Units,3-4 units Value,3-4 units Bldgs,5+ units Units,5+ units Value,5+ units Bldgs,1-unit rep Units,...,2-units rep Units,2-units rep Value,2-units rep Bldgs,3-4 units rep Units,3-4 units rep Value,3-4 units rep Bldgs,5+ units rep Units,5+ units rep Value,Place Code,Central City,Zip Code,CSA CSA,CBSA Code,CSA Code,Footnote Code,Census Place Code,FIPS Place Code,FIPS MCD Code,Pop,State,2018 CBSA,Year,Population,2018 Population,5-unit per 1000 population


In [541]:
# Selection configured for mouseover, nearest mark, empty='none'.
# NOTE(review): `hover` is not attached to the chart anywhere in this cell.
hover = alt.selection_single(empty='none', nearest=True, on='mouseover')

# One line per place: '5-unit per 1000 population' against 'Year'.
# (Use y=alt.Y('5+ units Units') instead to plot the raw unit counts.)
plot = (
    alt.Chart(to_plot)
    .mark_line()
    .encode(
        x=alt.X('Year'),
        y=alt.Y('5-unit per 1000 population'),
        color='Place Name',
    )
)

# Facet by '2018 CBSA', three panels per row, with the legend, colour scale
# and x scale resolved independently for each panel.
(
    plot
    .facet('2018 CBSA', columns=3)
    .resolve_legend(color='independent')
    .resolve_scale(color='independent', x='independent')
)

In [217]:
# Single chart: '1-unit Units' against 'Year', one coloured line per place.
(
    alt.Chart(to_plot)
    .mark_line()
    .encode(
        x=alt.X('Year'),
        y='1-unit Units',
        color='Place Name',
    )
)

In [131]:
# The 50 rows with the highest '5+ units Units', shown with a fresh 0..49 index.
(
    data
    .loc[:, [
        'Survey Date',
        'Place Name',
        'State',
        '1-unit Units',
        '2-units Units',
        '3-4 units Units',
        '5+ units Units',
        '5+ units Bldgs',
    ]]
    .sort_values(by='5+ units Units', ascending=False)
    .iloc[:50]
    .reset_index(drop=True)
)

Unnamed: 0,Survey Date,Place Name,State,1-unit Units,2-units Units,3-4 units Units,5+ units Units,5+ units Bldgs
0,2015,Los Angeles,California,1834,508,0,13671,1834
1,2018,Los Angeles,California,2636,1112,0,12551,2636
2,2017,Los Angeles,California,2360,918,0,11568,2360
3,2006,Los Angeles,California,2421,382,109,11536,2421
4,2016,Los Angeles,California,1796,666,89,11339,1796
5,2015,Seattle,Washington,810,286,422,9822,810
6,2014,Los Angeles,California,1668,410,9,9177,1668
7,2007,Clark County Unincorporated Area,Nevada,5859,4,447,8952,5859
8,2017,Seattle,Washington,593,342,354,8598,593
9,2016,Seattle,Washington,797,342,280,8580,797
