In [30]:
import glob

In [34]:
import numpy as np
import pandas as pd

In [150]:
import ipywidgets

In [152]:
import bokeh.plotting
import bokeh.layouts
import bokeh.models
import bokeh.io
from bokeh.palettes import Category10_10 as palette

In [118]:
# Configure Bokeh to display its plots inline in the notebook's output cells.
bokeh.plotting.output_notebook()

In [69]:
import logging
# Root logger: print records as "LEVEL: message" and show everything from
# DEBUG upward.
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.DEBUG)

# Housing Affordability Data System
Data provided by:<br />
Office of Policy Development and Research (PD&R)<br />
U.S. Department of Housing and Urban Development<br />
Secretary Ben Carson<br />
https://www.huduser.gov/portal/datasets/hads/hads.html

## Load the data

In [8]:
! ls -lh data/

total 651M
-rw-rw-r-- 1 schowell schowell  37M Feb 28  2007 hads1985.txt
-rw-rw-r-- 1 schowell schowell  37M Feb 28  2007 hads1987.txt
-rw-rw-r-- 1 schowell schowell  39M Feb 28  2007 hads1989.txt
-rw-rw-r-- 1 schowell schowell  38M Feb 28  2007 hads1991.txt
-rw-rw-r-- 1 schowell schowell  41M Oct  3  2007 hads1993.txt
-rw-rw-r-- 1 schowell schowell  38M Feb 28  2007 hads1995.txt
-rw-rw-r-- 1 schowell schowell  36M Feb 28  2007 hads1997.txt
-rw-rw-r-- 1 schowell schowell  40M Feb 28  2007 hads1999.txt
-rw-rw-r-- 1 schowell schowell  36M Feb 28  2007 hads2001.txt
-rw-rw-r-- 1 schowell schowell  41M Feb 28  2007 hads2003.txt
-rw-rw-r-- 1 schowell schowell  37M Jan 19  2011 hads2005.txt
-rw-rw-r-- 1 schowell schowell  34M Jan 19  2011 hads2007.txt
-rw-rw-r-- 1 schowell schowell  39M Jun 25  2010 hads2009.txt
-rw-r----- 1 schowell schowell 115M Apr 12  2013 hads2011.txt
-rw-r----- 1 schowell schowell  51M Jun 24  2015 hads2013.txt


The entire data set is 651M, so let's only grab some of these files.

In [35]:
# Collect the raw data files in lexicographic order, which for these names is
# also chronological order.
fnames = sorted(glob.glob('data/*txt'))

In [71]:
fnames

['data/hads1985.txt',
 'data/hads1987.txt',
 'data/hads1989.txt',
 'data/hads1991.txt',
 'data/hads1993.txt',
 'data/hads1995.txt',
 'data/hads1997.txt',
 'data/hads1999.txt',
 'data/hads2001.txt',
 'data/hads2003.txt',
 'data/hads2005.txt',
 'data/hads2007.txt',
 'data/hads2009.txt',
 'data/hads2011.txt',
 'data/hads2013.txt']

In [92]:
# Parse the survey year out of each path by slicing off the known prefix and
# suffix.  (`str.strip('data/hads.txt')` removes a *set of characters* from
# both ends, not a prefix/suffix; it only worked because digits are not in
# that set.)
prefix, suffix = 'data/hads', '.txt'
all_years = np.array([int(fname[len(prefix):-len(suffix)]) for fname in fnames])

# Loading every file is too much data to start with, so keep every third year.
years = all_years[::3]
print(years)

data = {}     # year -> DataFrame holding that year's survey
columns = []  # per-year column labels, in the same order as `years`
for year, fname in zip(years, fnames[::3]):
    frame = pd.read_csv(fname)
    # Make column labels lowercase so they can be compared across years; use a
    # concrete list rather than assigning a lazy `map` iterator.
    frame.columns = [label.lower() for label in frame.columns]
    frame['year'] = year  # add the year as a feature
    data[year] = frame
    columns.append(frame.columns)

[1985 1991 1997 2003 2009]


In [93]:
# Spot check: is the second column label of the first loaded year also present
# among the column labels of the last loaded year?
columns[0][1] in columns[-1]

True

In [94]:
# how similar are the columns?
# Report every column label, from any loaded year, that is absent from the most
# recent loaded year.  The reference year is taken from `years` instead of
# being hard-coded, so the message stays correct if the selection changes.
reference_year = years[-1]
reference_columns = set(columns[-1])
for year, year_columns in zip(years, columns):
    for column in year_columns:
        if column not in reference_columns:
            print('`{}` values, from {} data, is not in the {} data'.format(column, year, reference_year))

`metro` values, from 1985 data, is not in the 2009 data
`istatus` values, from 1985 data, is not in the 2009 data
`age` values, from 1985 data, is not in the 2009 data
`fmtmetro` values, from 1985 data, is not in the 2009 data
`metro` values, from 1991 data, is not in the 2009 data
`istatus` values, from 1991 data, is not in the 2009 data
`age` values, from 1991 data, is not in the 2009 data
`fmtmetro` values, from 1991 data, is not in the 2009 data


The column labels did not share the same case across years.  After converting them all to lowercase, only a few columns from the earlier years are missing from the 2009 data: `metro`, `istatus`, `age`, and `fmtmetro`.  Let's combine everything into a single DataFrame.

In [105]:
# Stack the per-year frames, in the order they were loaded, into one DataFrame.
yearly_frames = list(data.values())
df_raw = pd.concat(yearly_frames)

In [106]:
df_raw.head()

Unnamed: 0,abl30,abl50,abl80,ablmed,age,age1,aplmed,assisted,bedrms,built,...,type,utility,vacancy,value,vchrmov,weight,year,zadeq,zinc2,zsmhc
0,,11957.72448,19127.18912,23424.2944,39.0,,22523.36,0,3,81,...,1,40.0,-9,-9,,2883.27,1985,1,18000,476
1,,12700.46,20322.648,25404.1704,40.0,,25404.1704,0,2,81,...,1,8.0,-9,-9,,2785.89,1985,1,14200,383
2,,9346.32383,14956.772553,18249.084,-9.0,,-9.0,0,2,84,...,1,0.0,1,-9,,2450.53,1985,1,-9,425
3,,10346.31,16550.8695,20271.024,19.0,,18018.688,0,2,85,...,1,15.75,-9,-9,,2306.52,1985,1,14000,371
4,,17066.681956,25737.559467,33612.27168,28.0,,29087.5428,0,3,84,...,1,34.166667,-9,160000,,2504.35,1985,1,35000,809


For now, let's drop the columns that contain `NaN` values.

In [121]:
# Keep only the columns that have no missing (NaN) entry in any row.
df = df_raw.dropna(axis='columns', how='any')

In [122]:
df.head()

Unnamed: 0,abl50,abl80,ablmed,aplmed,assisted,bedrms,built,burden,control,cost06,...,totsal,type,utility,vacancy,value,weight,year,zadeq,zinc2,zsmhc
0,11957.72448,19127.18912,23424.2944,22523.36,0,3,81,0.317333,'100006110249',476.0,...,18000,1,40.0,-9,-9,2883.27,1985,1,18000,476
1,12700.46,20322.648,25404.1704,25404.1704,0,2,81,0.323662,'100007130148',383.0,...,13000,1,8.0,-9,-9,2785.89,1985,1,14200,383
2,9346.32383,14956.772553,18249.084,-9.0,0,2,84,-9.0,'100008700141',425.0,...,-9,1,0.0,1,-9,2450.53,1985,1,-9,425
3,10346.31,16550.8695,20271.024,18018.688,0,2,85,0.318,'100014110140',371.0,...,14000,1,15.75,-9,-9,2306.52,1985,1,14000,371
4,17066.681956,25737.559467,33612.27168,29087.5428,0,3,84,0.277371,'100014350142',1122.519423,...,35000,1,34.166667,-9,160000,2504.35,1985,1,35000,809


### What is the distribution of different features and how do these change over time?
Let's explore the data visually.

In [130]:
# Position of 1997 within `years`.  Pick the first matching index explicitly:
# calling int() on the whole index array relies on the size-1-array-to-scalar
# conversion that NumPy has deprecated.
int(np.flatnonzero(years == 1997)[0])

2

In [134]:
# Give each survey year its own palette color.  One dictionary lookup mapped
# over the column replaces an `np.where` search per row, and avoids the
# deprecated int() conversion of a 1-d array.
year_to_color = {int(year): palette[i] for i, year in enumerate(years)}
colors = df.year.map(year_to_color)

In [166]:
def update(x='bedrms', y='cost06'):
    """Re-plot the scatter with the chosen columns of `df`.

    Parameters
    ----------
    x : str
        Label of the `df` column to use for the horizontal coordinates.
    y : str
        Label of the `df` column to use for the vertical coordinates.

    Replaces the 'x' and 'y' entries of the data source behind the
    module-level renderer `r`, then pushes the change to the notebook.
    """
    source = r.data_source
    source.data['x'] = df[x]
    source.data['y'] = df[y]
    bokeh.io.push_notebook()

In [168]:
# Scatter of bedrooms vs. cost, colored by survey year.
p = bokeh.plotting.figure()
x = df.bedrms
y = df.cost06
# One palette color per survey year, applied with a single mapped lookup
# rather than an `np.where` search (and deprecated int() of a 1-d array) per row.
year_to_color = {int(year): palette[i] for i, year in enumerate(years)}
colors = df.year.map(year_to_color)
# Keep the renderer in `r`: the `update` callback swaps its data source columns.
r = p.circle(x, y, color=colors)
# notebook_handle=True lets later push_notebook() calls refresh this plot.
bokeh.plotting.show(p, notebook_handle=True)

In [169]:
# Offer every column of `df` as a choice for both plot axes.
column_choices = list(df.columns)
ipywidgets.interact(update, x=column_choices, y=column_choices)

<function __main__.update>

Maybe better to use a boxplot: http://bokeh.pydata.org/en/latest/docs/gallery/boxplot.html

There are obviously some strange values used for missing data. To handle this appropriately, we need to better understand how the data was measured.