# Data prep for estimating models

Sam Maurer, June 2017

Python 3.6

In [77]:
%matplotlib inline

import matplotlib
import numpy as np
import pandas as pd
import zipfile

## Load raw CHTS tables

This requires the file named caltrans_full_survey.zip. You can download it by following the instructions in the "data" directory.

In [5]:
z = zipfile.ZipFile('../data/caltrans_full_survey.zip')

In [6]:
households = pd.read_csv(z.open('caltrans_full_survey/survey_households.csv'), low_memory=False)
len(households)

42426

In [7]:
# Limit to the Bay Area

households_ba = households[households.home_county_id.isin([1, 13, 41, 55, 75, 81, 85, 95, 97])]
len(households_ba)

9715

In [8]:
persons = pd.read_csv(z.open('caltrans_full_survey/survey_person.csv'), low_memory=False)
len(persons)

109113

In [9]:
places = pd.read_csv(z.open('caltrans_full_survey/survey_place.csv'), low_memory=False)
len(places)

460524

In [10]:
activities = pd.read_csv(z.open('caltrans_full_survey/survey_activity.csv'), low_memory=False)
len(activities)

604711

## Build table of census tracts

Generate a table of census tracts in the 9-county Bay Area, for use in destination choice models.

Calculate some covariates: residential density, school/employment density.

In [76]:
def full_tract_id(state_id, county_id, tract_id):
    """
    Generates a full tract ID from its components
    """
    return state_id * 1e9 + county_id * 1e6 + tract_id

In [79]:
# Suppress scientific notation in the display output

pd.set_option('display.float_format', lambda x: '%.0f' % x)

In [78]:
# Generate full tract identifiers for the `places` table

places['full_tract_id'] = full_tract_id(places.state_id, places.county_id, places.tract_id)

# Replace missing identifiers with NaN's

places.ix[(places.tract_id == 999999) |
          (places.county_id == 999) |
          (places.state_id == 99), 'full_tract_id'] = np.nan

In [84]:
# Generate a master list of census tracts from the `places` table, keeping the
# city name most commonly associated with each tract

tracts = places[['full_tract_id', 'city']].groupby('full_tract_id').\
        agg(lambda x:x.value_counts().index[0])

print(tracts.shape[0])
print(tracts.head())

9097
                     city
full_tract_id            
1015000800       ANNISTON
1101001500     MONTGOMERY
1161400100        SEVILLA
2020001000      ANCHORAGE
2020001100      ANCHORAGE


In [70]:
# Calculate the weighted persons count
households['_weighted_persons_count'] = households.persons_count * households.hhwgt

# Sum by census tract
_sum = households.groupby('home_tract_id')._weighted_persons_count.sum()
res_density = _sum.rename('res_density').to_frame()

In [59]:
# Map tracts to city names
city_key = households[['home_tract_id', 'home_city']].drop_duplicates().set_index('home_tract_id')

In [72]:
# What does the ranking look like? -- it's pretty arbitrary, because the tracts
# themselves have a similar population to each other... is this a problem?

_t = pd.merge(res_density, city_key, left_index=True, right_index=True)
print(_t.sort_values('res_density', ascending=False).head(10))

               res_density     home_city
home_tract_id                           
6.065046e+09     92.550705       THERMAL
6.065046e+09     92.550705         MECCA
6.073017e+09     85.857617     SAN DIEGO
6.073015e+09     85.855156       LA MESA
6.019006e+09     82.998200     MIRAMONTE
6.019006e+09     82.998200       REEDLEY
6.019006e+09     82.998200  SQUAW VALLEY
6.019006e+09     82.998200        SANGER
6.019006e+09     82.998200        DUNLAP
6.019006e+09     82.998200        FRESNO


In [73]:
# Employment density = sum of person weights by census tract of work location

_sum = persons.groupby('empl_tract_id').perwgt.sum()
work_density = _sum.rename('work_density').to_frame()