# Data prep for estimating models

Sam Maurer, June 2017

Python 3.6

In [4]:
%matplotlib inline

import matplotlib
import pandas as pd
import zipfile

## Load raw CHTS tables

This requires the file named caltrans_full_survey.zip, which the code below expects at `../data/caltrans_full_survey.zip`. You can download it by following the instructions in the "data" directory.

In [5]:
# Open the CHTS survey archive (read-only); the tables below are read directly from it
_survey_zip_path = '../data/caltrans_full_survey.zip'
z = zipfile.ZipFile(_survey_zip_path, mode='r')

In [6]:
# Read the households table out of the survey archive, then show its row count
_households_member = 'caltrans_full_survey/survey_households.csv'
households = pd.read_csv(z.open(_households_member), low_memory=False)
households.shape[0]

42426

In [7]:
# Limit to the Bay Area

# home_county_id values of the nine counties to keep
# NOTE(review): presumably county FIPS codes - confirm against the survey codebook
_bay_area_county_ids = [1, 13, 41, 55, 75, 81, 85, 95, 97]

# Boolean mask of households whose home county is in the list above
_in_bay_area = households['home_county_id'].isin(_bay_area_county_ids)
households_ba = households.loc[_in_bay_area]
households_ba.shape[0]

9715

In [8]:
# Read the persons table out of the survey archive, then show its row count
_persons_member = 'caltrans_full_survey/survey_person.csv'
persons = pd.read_csv(z.open(_persons_member), low_memory=False)
persons.shape[0]

109113

In [9]:
# Read the places table out of the survey archive, then show its row count
_places_member = 'caltrans_full_survey/survey_place.csv'
places = pd.read_csv(z.open(_places_member), low_memory=False)
places.shape[0]

460524

In [10]:
# Read the activities table out of the survey archive, then show its row count
_activities_member = 'caltrans_full_survey/survey_activity.csv'
activities = pd.read_csv(z.open(_activities_member), low_memory=False)
activities.shape[0]

604711

## Build table of census tracts

Generate a table of census tracts in the 9-county Bay Area, for use in destination choice models.

Calculate some covariates: residential density, school/employment density.

In [33]:
# PROBLEM - this isn't going to work because all the tracts are about the same size

# Residential density (unweighted): sum of persons_count over the CHTS households
# in each home census tract. No sampling-weight correction is applied in this cell.

# Per-tract totals, indexed by home_tract_id
_count = households.groupby('home_tract_id')['persons_count'].sum()

# One-column frame named 'res_density'
res_density = _count.to_frame(name='res_density')

In [70]:
# Scale each household's persons_count by its sampling correction (hhwgt)
# NOTE(review): this adds a helper column to `households` in place, and it covers every
# household in the table rather than only `households_ba` - confirm that is intended
households['_weighted_persons_count'] = households['persons_count'].mul(households['hhwgt'])

# Total the weighted counts within each home census tract
_sum = households.groupby('home_tract_id')['_weighted_persons_count'].sum()
res_density = _sum.to_frame(name='res_density')

In [64]:
# Summary statistics of the household weight column
households['hhwgt'].describe()

count    42421.000000
mean         0.999955
std          0.704667
min          0.003498
25%          0.447392
50%          0.915924
75%          1.376790
max          5.400840
Name: hhwgt, dtype: float64

In [59]:
# Lookup of home_city by home_tract_id: one row per distinct (tract, city) pair,
# so a tract id may appear more than once in the index
city_key = (
    households
    .loc[:, ['home_tract_id', 'home_city']]
    .drop_duplicates()
    .set_index('home_tract_id')
)

In [71]:
# What does the ranking look like?

# Attach city names to each tract's density by matching on the tract-id index
_t = res_density.merge(city_key, left_index=True, right_index=True)

# Print the 50 rows with the highest res_density
_ranked = _t.sort_values(by='res_density', ascending=False)
print(_ranked.head(50))

               res_density         home_city
home_tract_id                               
6.065046e+09     92.550705           THERMAL
6.065046e+09     92.550705             MECCA
6.073017e+09     85.857617         SAN DIEGO
6.073015e+09     85.855156           LA MESA
6.019006e+09     82.998200         MIRAMONTE
6.019006e+09     82.998200           REEDLEY
6.019006e+09     82.998200      SQUAW VALLEY
6.019006e+09     82.998200            SANGER
6.019006e+09     82.998200            DUNLAP
6.019006e+09     82.998200            FRESNO
6.073010e+09     78.033191         SAN DIEGO
6.073019e+09     77.991605             VISTA
6.073020e+09     75.716906          CARLSBAD
6.073012e+09     75.047587     NATIONAL CITY
6.073020e+09     73.330748        SAN MARCOS
6.073020e+09     73.330748          CARLSBAD
6.037431e+09     69.550365      SIERRA MADRE
6.037460e+09     67.705812          PASADENA
6.037460e+09     67.705812          ALTADENA
6.073020e+09     67.665465        SAN MARCOS
6.099004e+

In [None]:
# Employment density: number of CHTS respondents with primary work location in each tract

work_density = persons.