In [60]:
%reload_ext autoreload
%autoreload 2

from IPython.core.display import Markdown
from tqdm.auto import tqdm
import config
import docs
import pandas as pd
import requests
import acs
from datetime import datetime

pd.set_option('display.max_colwidth', None)

INDICATOR = 'commute_time'
CONFIG = config.get_config(INDICATOR, '../config.toml')
CENSUS_API_KEY = CONFIG['acs_api_key']

display(Markdown('## Raw data path'))
raw_dir_path = str(CONFIG['raw_dir']).replace('\\', '/')
display(Markdown(f"[{raw_dir_path}]({raw_dir_path})"))

## Raw data path

[C:/Users/tan/src/regional-pm-2023/data/raw/transportation/commute/commute_time](C:/Users/tan/src/regional-pm-2023/data/raw/transportation/commute/commute_time)

# Transportation Planning: Commute

## Commute Time

In [61]:
docs.describe_indicator(
    indicator=INDICATOR,
    indicators_xlsx_path=CONFIG['indicators_xlsx_path'],
)

Average resident commute time.

nan

In [62]:
docs.list_schema(
    indicator=INDICATOR,
    indicators_xlsx_path=CONFIG['indicators_xlsx_path'],
)

Unnamed: 0_level_0,name,description,type
column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
year,Year,Year of record.,datetime
geography,Geography,"Geography of record (""United States"", ""California"", or ""San Diego"").",string
commute_time,Commute Time,Average resident commute time (in minutes).,float


In [63]:
docs.list_sources(
    indicator=INDICATOR,
    indicators_xlsx_path=CONFIG['indicators_xlsx_path'],
)

Unnamed: 0_level_0,name,organization,active,notes
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
acs_S0802,American Community Survey API (S0802),US Census Bureau,1.0,


In [64]:
steps = docs.list_update_steps(
    indicator=INDICATOR,
    indicators_xlsx_path=CONFIG['indicators_xlsx_path'],
)
steps

Unnamed: 0,step
0,Download raw ACS data.
1,Transform raw ACS data.
2,Extract legacy PM data and combine it with new data.


In [65]:
docs.list_remarks(
    indicator=INDICATOR,
    indicators_xlsx_path=CONFIG['indicators_xlsx_path'],
)

Unnamed: 0,author,note
0,TAN,"Public ACS data only goes back to 2010, so had to use legacy PM data 2005-2010."
1,TAN,"US and California modes were never archived for more than the most recent year, so only had San Diego County data all the way back to 2005."


### Step 0: Download ACS data

Note that finalized 2021 is not availible for 1 year ACS (and it may never be availible), and 2022 data isn't expected until sometime September 2023.

In [66]:
display(steps.loc[0])

# Per https://api.census.gov/data/2021/acs/acs1/subject/variables.html
columns = {
    'S0802_C01_090E': 'commute_time',
}

if not (CONFIG['raw_dir']/'S0802.csv').exists():
    raw_data = acs.download_subject_table_acs_data(
        CENSUS_API_KEY,
        years=[2021],
        columns=list(columns.keys()),
    )
    raw_data.to_csv(CONFIG['raw_dir']/'S0802.csv', index=False)
else:
    raw_data = pd.read_csv(CONFIG['raw_dir']/'S0802.csv')
raw_data.tail(3)

step    Download raw ACS data.
Name: 0, dtype: object

Unnamed: 0,NAME,S0802_C01_090E,state,county,us,year
0,"San Diego County, California",24.5,6.0,73.0,,2021-01-01
1,California,27.6,6.0,,,2021-01-01
2,United States,25.6,,,1.0,2021-01-01


### Step 1: Transform ACS data

In [67]:
display(steps.loc[1])

# Clean raw data
new_data = (
    raw_data
    .drop(columns=['us', 'state', 'county'])
    .rename(columns={'NAME': 'geography'} | columns)
    .assign(year=lambda df: pd.to_datetime(df.year, format='%Y-%m-%d'))
    .set_index(['year', 'geography'])
    .rename({'San Diego County, California': 'San Diego County'})
)

step    Transform raw ACS data.
Name: 1, dtype: object

### Step 2: Extract legacy data and combine

In [68]:
display(steps.loc[2])

step    Extract legacy PM data and combine it with new data.
Name: 2, dtype: object

In [69]:
legacy_data = (
    pd.read_excel(
        CONFIG['legacy_xlsx_path'],
        sheet_name=CONFIG['legacy_sheet'],
        usecols='A:D',
        skiprows=4,
        nrows=16,
    )
    .rename(columns={'Unnamed: 0': 'year'})
    .melt('year', var_name='geography', value_name='commute_time')
    .assign(year=lambda df: pd.to_datetime(df.year, format='%Y'))
    .set_index(['year', 'geography'])
)

commute_time = pd.concat([new_data, legacy_data]).sort_index()
commute_time.tail(9)

Unnamed: 0_level_0,Unnamed: 1_level_0,commute_time
year,geography,Unnamed: 2_level_1
2019-01-01,California,30.7
2019-01-01,San Diego,27.2
2019-01-01,United States,27.6
2020-01-01,California,
2020-01-01,San Diego,
2020-01-01,United States,
2021-01-01,California,27.6
2021-01-01,San Diego County,24.5
2021-01-01,United States,25.6


### Save Data

In [70]:
display(Markdown('#### Clean data path'))
clean_dir_path = str(CONFIG['clean_dir']).replace('\\', '/')
display(Markdown(f"[{clean_dir_path}]({clean_dir_path})"))

#### Clean data path

[C:/Users/tan/src/regional-pm-2023/data/clean/transportation/commute/commute_time](C:/Users/tan/src/regional-pm-2023/data/clean/transportation/commute/commute_time)

In [71]:
commute_time.to_csv(
    CONFIG['clean_dir']
    / f'{INDICATOR}_odp.csv'
)