In [2]:
%reload_ext autoreload
%autoreload 2

from IPython.core.display import Markdown
from tqdm.auto import tqdm
import config
import docs
import pandas as pd
import requests
import acs

# Never truncate long cell contents when DataFrames are rendered.
pd.options.display.max_colwidth = None

# The indicator this notebook updates, and its settings from the project config.
INDICATOR = 'commute_mode'
CONFIG = config.get_config(INDICATOR, '../config.toml')
CENSUS_API_KEY = CONFIG['acs_api_key']

# Render the raw-data directory as a Markdown link, with backslashes swapped
# for forward slashes.
raw_dir_path = str(CONFIG['raw_dir']).replace('\\', '/')
display(Markdown('## Raw data path'))
display(Markdown(f"[{raw_dir_path}]({raw_dir_path})"))


  from .autonotebook import tqdm as notebook_tqdm


## Raw data path

[C:/Users/tan/src/regional-pm-2023/data/raw/transportation/commute/commute_mode](C:/Users/tan/src/regional-pm-2023/data/raw/transportation/commute/commute_mode)

# Transportation Planning: Commute

## Commute Mode

In [3]:
# Show the description recorded for this indicator in the indicators workbook.
docs.describe_indicator(indicators_xlsx_path=CONFIG['indicators_xlsx_path'], indicator=INDICATOR)

Typical resident commute mode.

nan

In [4]:
# Show the output schema (column name, description, type) for this indicator.
docs.list_schema(indicators_xlsx_path=CONFIG['indicators_xlsx_path'], indicator=INDICATOR)

Unnamed: 0_level_0,name,description,type
column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
year,Year,Year of record.,datetime
geography,Geography,"Geography of record (""United States"", ""California"", or ""San Diego"").",string
mode,Mode,"Transportation mode of record (""Drive Alone"", ""Car or Vanpool"", ""Transit"" ,""Walk"" ,""Bike"", ""Work at Place of Residence"", or""Other"").",string
mode_share,Mode Share,Percentage of residents in a given year and geography commuting with a given transportation mode.,float


In [5]:
# Show the data sources registered for this indicator.
docs.list_sources(indicators_xlsx_path=CONFIG['indicators_xlsx_path'], indicator=INDICATOR)

Unnamed: 0_level_0,name,organization,active,notes
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
acs_B08301,American Community Survey API (B08301),US Census Bureau,1.0,


In [6]:
# Keep the update steps in `steps`; later cells display them one row at a time.
steps = docs.list_update_steps(indicators_xlsx_path=CONFIG['indicators_xlsx_path'], indicator=INDICATOR)
steps

Unnamed: 0,step
0,Download raw ACS data.
1,"Transform raw ACS data (calculate percentages, summarize extra columns as Other)"
2,Extract legacy PM data and combine it with new data.


In [7]:
# Show the authors' remarks recorded for this indicator.
docs.list_remarks(indicators_xlsx_path=CONFIG['indicators_xlsx_path'], indicator=INDICATOR)

Unnamed: 0,author,note
0,TAN,"Public ACS data only goes back to 2010, so had to use legacy PM data 2005-2010."
1,TAN,"US and California modes were never archived for more than the most recent year, so only had San Diego County data all the way back to 2005."


### Step 0: Download ACS data

Note that finalized 2020 data is not available for the 1-year ACS (and it may never be available), and 2022 data isn't expected until sometime in September 2023.

In [8]:
display(steps.loc[0])

# ACS B08301 variable codes mapped to the mode labels used in this notebook.
# Per https://api.census.gov/data/2021/acs/acs1/variables.html
columns = dict(
    B08301_001E='Total',
    B08301_003E='Drive Alone',
    B08301_004E='Car or Vanpool',
    B08301_010E='Transit',
    B08301_019E='Walk',
    B08301_018E='Bike',
    B08301_021E='Work at Place of Residence',
)

# The download is cached as a CSV in the raw data directory, so the Census API
# is only called when that file does not exist yet.
raw_csv_path = CONFIG['raw_dir'] / 'B08301.csv'
if raw_csv_path.exists():
    raw_data = pd.read_csv(raw_csv_path)
else:
    # 2010 through 2019, plus 2021; 2020 is deliberately left out.
    acs_years = [*range(2010, 2020), 2021]
    raw_data = acs.download_detail_table_acs_data(
        CENSUS_API_KEY,
        years=acs_years,
        columns=list(columns),
    )
    raw_data.to_csv(raw_csv_path, index=False)

raw_data.tail(3)

step    Download raw ACS data.
Name: 0, dtype: object

Unnamed: 0,NAME,B08301_001E,B08301_003E,B08301_004E,B08301_010E,B08301_019E,B08301_018E,B08301_021E,state,county,us,year
19,United States,156941346.0,119153349.0,13900979.0,7778444.0,4153050.0,805722.0,8970800.0,,,1.0,2019-01-01
20,"San Diego County, California",1591072.0,1011792.0,115574.0,27527.0,50559.0,6077.0,351947.0,6.0,73.0,,2021-01-01
21,United States,154314179.0,104650121.0,12018354.0,3793329.0,3399405.0,616153.0,27568098.0,,,1.0,2021-01-01


### Step 1: Transform ACS data

In [9]:
# Show the row of `steps` labelled 1, i.e. the update step this section carries out.
display(steps.loc[1])

step    Transform raw ACS data (calculate percentages, summarize extra columns as Other)
Name: 1, dtype: object

In [10]:
# Reshape the raw ACS table to long form: one count per (year, geography, mode).
# The values are still raw counts here, even though the column is already
# named 'mode_share'.
acs_counts = (
    raw_data
    .drop(columns=['us', 'state', 'county'])
    .rename(columns={'NAME': 'geography', **columns})
    .melt(
        id_vars=['year', 'geography'],
        var_name='mode',
        value_name='mode_share',
    )
    .assign(year=lambda long: pd.to_datetime(long['year'], format='%Y-%m-%d'))
    .set_index(['year', 'geography', 'mode'])
    .rename({'San Diego County, California': 'San Diego County'})
)

# Divide every count by the 'Total' row for the same year and geography, then
# discard the 'Total' rows themselves.
total = acs_counts.xs('Total', level='mode')
commute_mode = (acs_counts / total).drop('Total', level='mode')

# 'Other' is whatever share is left once the listed modes are summed per
# year and geography.
listed_share = commute_mode.groupby(['year', 'geography']).sum()
other = (
    (1.00 - listed_share)
    .assign(mode='Other')
    .set_index('mode', append=True)
)

### Step 2: Extract and combine legacy data.

In [11]:
# Show the row of `steps` labelled 2, i.e. the update step this section carries out.
display(steps.loc[2])

step    Extract legacy PM data and combine it with new data.
Name: 2, dtype: object

In [12]:
# Read in legacy data (only available for San Diego County).
# The sheet is read from columns A:H, skipping the first 3 rows and keeping
# 5 data rows; the unnamed first column holds the year.
legacy_sheet = pd.read_excel(
    CONFIG['legacy_xlsx_path'],
    sheet_name=CONFIG['legacy_sheet'],
    usecols='A:H',
    skiprows=3,
    nrows=5,
)

legacy_data = (
    legacy_sheet
    .rename(columns={'Unnamed: 0': 'year'})
    .melt(id_vars='year', var_name='mode', value_name='mode_share')
    .assign(
        geography='San Diego County',
        year=lambda long: pd.to_datetime(long['year'], format='%Y'),
    )
    .set_index(['year', 'geography', 'mode'])
    # NOTE(review): geography is already set to 'San Diego County' above, so
    # this rename looks like a no-op; confirm before removing it.
    .rename({'San Diego': 'San Diego County'})
)
legacy_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mode_share
year,geography,mode,Unnamed: 3_level_1
2007-01-01,San Diego County,Other,0.010656
2008-01-01,San Diego County,Other,0.01257
2009-01-01,San Diego County,Other,0.012188


In [13]:
# Stack the ACS mode shares, the derived 'Other' share, and the legacy
# San Diego County rows into one table.
combined = pd.concat([commute_mode, other, legacy_data])

# This cell rebinds `commute_mode`, so running it a second time would append
# `other` and `legacy_data` again and the duplicated rows would end up in the
# saved CSV. Keeping only the first row for each (year, geography, mode) key
# makes a re-run produce the same table. On any key collision the earlier
# frame in the list above wins (ACS before legacy).
commute_mode = combined[~combined.index.duplicated(keep='first')].sort_index()
commute_mode.tail(9)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mode_share
year,geography,mode,Unnamed: 3_level_1
2021-01-01,San Diego County,Walk,0.031777
2021-01-01,San Diego County,Work at Place of Residence,0.221201
2021-01-01,United States,Bike,0.003993
2021-01-01,United States,Car or Vanpool,0.077882
2021-01-01,United States,Drive Alone,0.678163
2021-01-01,United States,Other,0.014702
2021-01-01,United States,Transit,0.024582
2021-01-01,United States,Walk,0.022029
2021-01-01,United States,Work at Place of Residence,0.178649


### Save Data

In [14]:
# Render the clean-data directory as a Markdown link, with backslashes swapped
# for forward slashes.
clean_dir_path = str(CONFIG['clean_dir']).replace('\\', '/')
display(Markdown('#### Clean data path'))
display(Markdown(f"[{clean_dir_path}]({clean_dir_path})"))

#### Clean data path

[C:/Users/tan/src/regional-pm-2023/data/clean/transportation/commute/commute_mode](C:/Users/tan/src/regional-pm-2023/data/clean/transportation/commute/commute_mode)

In [15]:
# Write the combined table to <clean_dir>/<INDICATOR>_odp.csv; the
# (year, geography, mode) index is written as the leading columns.
odp_csv_path = CONFIG['clean_dir'] / f'{INDICATOR}_odp.csv'
commute_mode.to_csv(odp_csv_path)