In [32]:
%reload_ext autoreload
%autoreload 2

from IPython.core.display import Markdown
from tqdm.auto import tqdm
import config
import docs
import pandas as pd
import requests
import acs
from datetime import datetime

# Do not truncate long cell values when rendering DataFrames.
pd.set_option('display.max_colwidth', None)

# Indicator handled by this notebook, plus its settings from the shared config file.
INDICATOR = 'unemployment_rate'
CONFIG = config.get_config(INDICATOR, '../config.toml')
CENSUS_API_KEY = CONFIG['acs_api_key']

# Link to the raw data directory; backslashes are swapped for forward slashes
# before the path is used as a Markdown link target.
raw_dir_path = str(CONFIG['raw_dir']).replace('\\', '/')
display(
    Markdown('## Raw data path'),
    Markdown(f"[{raw_dir_path}]({raw_dir_path})"),
)


## Raw data path

[C:/Users/tan/src/regional-pm-2023/data/raw/qol/unemployment/unemployment_rate](C:/Users/tan/src/regional-pm-2023/data/raw/qol/unemployment/unemployment_rate)

# Quality of Life: Unemployment

## Unemployment Rate

In [33]:
# Render the description recorded for this indicator.
docs.describe_indicator(indicators_xlsx_path=CONFIG['indicators_xlsx_path'], indicator=INDICATOR)

Percentage of residents unemployed.

nan

In [34]:
# Show the column schema of the clean output for this indicator.
docs.list_schema(indicators_xlsx_path=CONFIG['indicators_xlsx_path'], indicator=INDICATOR)

Unnamed: 0_level_0,name,description,type
column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
year,Year,Year of record.,datetime
geography,Geography,"Geography of record (""United States"", ""California"", or ""San Diego"").",string
unemployment_rate,Unemployment Rate,"Percentage of residents unemployed, in a given year for a given geography.",float


In [35]:
# Show the data sources registered for this indicator.
docs.list_sources(indicators_xlsx_path=CONFIG['indicators_xlsx_path'], indicator=INDICATOR)

Unnamed: 0_level_0,name,organization,active,notes
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
acs_S0102,American Community Survey API (S0102),US Census Bureau,1.0,


In [36]:
# Keep the update steps in `steps`; later cells display individual rows via steps.loc[...].
steps = docs.list_update_steps(indicators_xlsx_path=CONFIG['indicators_xlsx_path'], indicator=INDICATOR)
steps

Unnamed: 0,step
0,Download raw ACS data.
1,Transform raw ACS data.


In [37]:
# Show the author remarks recorded for this indicator.
docs.list_remarks(indicators_xlsx_path=CONFIG['indicators_xlsx_path'], indicator=INDICATOR)

Unnamed: 0,author,note
0,TAN,Public ACS data only goes back to 2010.


### Step 0: Download ACS data

Note that finalized 2020 data is not available for the 1-year ACS (and it may never be available), and 2022 data isn't expected until sometime in September 2023.

In [38]:
# Show the update step with index label 0 from the `steps` table built earlier in the notebook.
display(steps.loc[0])

step    Download raw ACS data.
Name: 0, dtype: object

In [39]:
# Variable codes per https://api.census.gov/data/2021/acs/acs1/subject/variables.html,
# mapped to the column names used in the clean output.
columns = {
    'S0102_C01_071E': 'unemployment_rate',
}

# The raw download is cached as a CSV: reuse the file when present,
# otherwise query the API and write the result for later runs.
s0102_csv_path = CONFIG['raw_dir'] / 'S0102.csv'
if s0102_csv_path.exists():
    raw_data = pd.read_csv(s0102_csv_path)
else:
    raw_data = acs.download_subject_table_acs_data(
        CENSUS_API_KEY,
        # 2010-2019 plus 2021; 2020 is not requested and is added as empty
        # rows in the transform step.
        years=[*range(2010, 2020), 2021],
        columns=list(columns),
    )
    raw_data.to_csv(s0102_csv_path, index=False)
raw_data.tail(3)

Unnamed: 0,NAME,S0102_C01_071E,state,county,us,year
30,"San Diego County, California",8.5,6.0,73.0,,2021-01-01
31,California,8.3,6.0,,,2021-01-01
32,United States,6.3,,,1.0,2021-01-01


### Step 1: Transform ACS data

In [43]:
display(steps.loc[1])

# Clean raw data: drop the 'us'/'state'/'county' columns, rename columns to
# their schema names, parse the year, and index by (year, geography).
observed_rates = (
    raw_data
    .drop(columns=['us', 'state', 'county'])
    .rename(columns={'NAME': 'geography'} | columns)
    .assign(year=lambda df: pd.to_datetime(df.year, format='%Y-%m-%d'))
    .set_index(['year', 'geography'])
    .rename({'San Diego County, California': 'San Diego County'})
)

# Placeholder rows for 2020, which is not part of the download.
# NaN (not pd.NA) is used so the frame is float64: an object-dtype pd.NA
# column would turn `unemployment_rate` into object dtype after the concat,
# contradicting the documented float schema.
missing_2020 = pd.DataFrame(
    data={'unemployment_rate': float('nan')},
    index=pd.MultiIndex.from_tuples(
        [
            (datetime(2020, 1, 1), 'San Diego County'),
            (datetime(2020, 1, 1), 'California'),
            (datetime(2020, 1, 1), 'United States'),
        ],
        names=['year', 'geography'],
    ),
)

unemployment_rate = pd.concat([observed_rates, missing_2020]).sort_index()
# A MultiIndex carries one name per level, so set `names` (plural); assigning
# a tuple to `index.name` does not name the levels.
unemployment_rate.index.names = ['year', 'geography']
unemployment_rate.tail(9)

step    Transform raw ACS data.
Name: 1, dtype: object

Unnamed: 0_level_0,Unnamed: 1_level_0,unemployment_rate
year,geography,Unnamed: 2_level_1
2019-01-01,California,5.1
2019-01-01,San Diego County,5.6
2019-01-01,United States,4.5
2020-01-01,California,
2020-01-01,San Diego County,
2020-01-01,United States,
2021-01-01,California,8.3
2021-01-01,San Diego County,8.5
2021-01-01,United States,6.3


### Save Data

In [41]:
# Link to the clean data directory; backslashes are swapped for forward slashes
# before the path is used as a Markdown link target.
clean_dir_path = str(CONFIG['clean_dir']).replace('\\', '/')
display(
    Markdown('#### Clean data path'),
    Markdown(f"[{clean_dir_path}]({clean_dir_path})"),
)

#### Clean data path

[C:/Users/tan/src/regional-pm-2023/data/clean/qol/unemployment/unemployment_rate](C:/Users/tan/src/regional-pm-2023/data/clean/qol/unemployment/unemployment_rate)

In [42]:
# Write the clean table (index levels included) to the clean data directory.
unemployment_rate.to_csv(CONFIG['clean_dir'] / f'{INDICATOR}_odp.csv')