In [108]:
%reload_ext autoreload
%autoreload 2

from IPython.core.display import Markdown
from tqdm.auto import tqdm
import config
import docs
import pandas as pd
import requests
import acs

# Do not truncate long cell values when DataFrames are rendered.
pd.set_option('display.max_colwidth', None)

# Indicator key; used for the config lookup here and by the docs.* calls below.
INDICATOR = 'poverty_rate'
CONFIG = config.get_config(INDICATOR, '../config.toml')
CENSUS_API_KEY = CONFIG['acs_api_key']

# Show the raw-data directory as a Markdown link; backslashes are swapped for
# forward slashes before the path is embedded in the link.
display(Markdown('## Raw data path'))
raw_dir_path = str(CONFIG['raw_dir']).replace('\\', '/')
raw_dir_link = f"[{raw_dir_path}]({raw_dir_path})"
display(Markdown(raw_dir_link))


## Raw data path

[C:/Users/tan/src/regional-pm-2023/data/raw/qol/poverty/poverty_rate](C:/Users/tan/src/regional-pm-2023/data/raw/qol/poverty/poverty_rate)

# Quality of Life: Poverty

## Poverty Rate

In [109]:
# Show the description recorded for this indicator (looked up via the
# indicators workbook path from CONFIG).
docs.describe_indicator(
    indicator=INDICATOR, indicators_xlsx_path=CONFIG['indicators_xlsx_path'],
)

Percentage of residents living in poverty.

nan

In [110]:
# Show the output schema (column name, description, type) for this indicator.
docs.list_schema(
    indicator=INDICATOR, indicators_xlsx_path=CONFIG['indicators_xlsx_path'],
)

Unnamed: 0_level_0,name,description,type
column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
year,Year,Year of record.,datetime
geography,Geography,"Geography of record (""United States"", ""California"", or ""San Diego""), in a given year for a given geography.",string
poverty_rate,Povery Rate,Percentage of residents living in poverty,float


In [111]:
# Show the data sources registered for this indicator.
docs.list_sources(
    indicator=INDICATOR, indicators_xlsx_path=CONFIG['indicators_xlsx_path'],
)

Unnamed: 0_level_0,name,organization,active,notes
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
acs_S1701,American Community Survey API (S1701),US Census Bureau,1.0,


In [112]:
# Keep the update-step table in `steps`; later cells display individual rows
# of it (steps.loc[n]) under each step heading.
steps = docs.list_update_steps(
    indicator=INDICATOR, indicators_xlsx_path=CONFIG['indicators_xlsx_path'],
)
steps

Unnamed: 0,step
0,Download raw ACS data.
1,Transform raw ACS data (STEPS).


In [113]:
# Show the free-form remarks recorded for this indicator.
docs.list_remarks(
    indicator=INDICATOR, indicators_xlsx_path=CONFIG['indicators_xlsx_path'],
)

Unnamed: 0,author,note
0,,"Public ACS data only goes back to 2010, so had to use legacy PM data 2005-2010."


### Step 0: Download ACS data

Note that finalized 2021 data is not available for the 1-year ACS (and it may never be available), and 2022 data isn't expected until sometime in September 2023.

In [114]:
# Echo the row labelled 0 of the `steps` table (the description of update step 0).
display(steps.loc[0])

step    Download raw ACS data.
Name: 0, dtype: object

In [115]:
# Per https://api.census.gov/data/2021/acs/acs1/subject/variables.html
# Maps the requested ACS S1701 variable codes to the column names used by the
# transform cells further down.
columns = {
    'S1701_C01_001E': 'population',
    'S1701_C01_042E': 'below_200',
}

raw_csv_path = CONFIG['raw_dir'] / 'S1701.csv'

# Download only when no cached copy exists.
# NOTE: the cache is keyed on the file name alone, so delete S1701.csv after
# changing `years` or `columns`, otherwise the stale file is silently reused.
if not raw_csv_path.exists():
    downloaded = acs.download_subject_table_acs_data(
        CENSUS_API_KEY,
        years=[2021],
        columns=list(columns.keys()),
    )
    downloaded.to_csv(raw_csv_path, index=False)

# Always load from the CSV, even right after downloading, so a first run and a
# cached re-run hand identical, CSV-inferred dtypes to the cells below.
raw_data = pd.read_csv(raw_csv_path)
raw_data.tail(10)

Unnamed: 0,NAME,S1701_C01_001E,S1701_C01_042E,state,county,us,year
0,"San Diego County, California",3201030,762473,6.0,73.0,,2021-01-01
1,California,38481790,10612491,6.0,,,2021-01-01
2,United States,324173084,92740902,,,1.0,2021-01-01


### Step 1: Transform ACS data

In [116]:
# Echo the row labelled 1 of the `steps` table (the description of update step 1).
display(steps.loc[1])

step    Transform raw ACS data (STEPS).
Name: 1, dtype: object

In [117]:
# Clean raw data: drop the geography-code columns, switch to readable column
# names, index by (year, geography), and shorten the San Diego label.
new_data = (
    raw_data
    .drop(columns=['us', 'state', 'county'])
    .rename(columns={'NAME': 'geography', **columns})
    .set_index(['year', 'geography'])
    .rename(index={'San Diego County, California': 'San Diego County'})
)
new_data

Unnamed: 0_level_0,Unnamed: 1_level_0,population,below_200
year,geography,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-01-01,San Diego County,3201030,762473
2021-01-01,California,38481790,10612491
2021-01-01,United States,324173084,92740902


In [118]:
# Calculate new poverty rates: below_200 divided by population, kept as the
# only column, with `year` parsed to datetime before re-indexing.
new_poverty_rate = (
    new_data
    .assign(poverty_rate=lambda df: df.below_200 / df.population)
    [['poverty_rate']]
    .reset_index()
    .assign(year=lambda df: pd.to_datetime(df.year, format='%Y-%m-%d'))
    .set_index(['year', 'geography'])
)
new_poverty_rate

Unnamed: 0_level_0,Unnamed: 1_level_0,poverty_rate
year,geography,Unnamed: 2_level_1
2021-01-01,San Diego County,0.238196
2021-01-01,California,0.27578
2021-01-01,United States,0.286085


### Step 2: Extract and combine legacy data

In [None]:
# The `steps` table from docs.list_update_steps currently lists only steps 0
# and 1, so an unguarded steps.loc[2] raises KeyError and stops a
# Restart & Run All. Show the step when it is documented; otherwise flag the gap.
if 2 in steps.index:
    display(steps.loc[2])
else:
    display(Markdown('*Step 2 is not yet listed in the update steps for this indicator.*'))

In [119]:
# Read the legacy block from the workbook: columns A:D, 16 data rows after
# skipping the first 4 sheet rows (presumably one row per year - confirm
# against the sheet if its layout changes).
legacy_wide = pd.read_excel(
    CONFIG['legacy_xlsx_path'],
    sheet_name=CONFIG['legacy_sheet'],
    usecols='A:D',
    skiprows=4,
    nrows=16,
)

# Reshape wide (one column per geography) to long, parse the year, and align
# the index and San Diego label with the ACS-derived frame.
legacy_poverty_rate = (
    legacy_wide
    .rename(columns={'Unnamed: 0': 'year'})
    .melt(id_vars='year', var_name='geography', value_name='poverty_rate')
    .assign(year=lambda df: pd.to_datetime(df.year, format='%Y'))
    .set_index(['year', 'geography'])
    .rename(index={'San Diego': 'San Diego County'})
)
legacy_poverty_rate.tail(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,poverty_rate
year,geography,Unnamed: 2_level_1
2018-01-01,United States,0.30471
2019-01-01,United States,0.289442
2020-01-01,United States,


In [120]:
# Stack the ACS-derived and legacy rates into one frame ordered by (year, geography).
# verify_integrity=True makes pandas raise ValueError if the same
# (year, geography) key appears in both inputs, instead of silently writing
# duplicate rows to the published CSV (e.g. after widening the ACS `years` list).
poverty_rate = pd.concat(
    [new_poverty_rate, legacy_poverty_rate],
    verify_integrity=True,
).sort_index()

poverty_rate.tail(9)

Unnamed: 0_level_0,Unnamed: 1_level_0,poverty_rate
year,geography,Unnamed: 2_level_1
2019-01-01,California,0.279966
2019-01-01,San Diego County,0.252132
2019-01-01,United States,0.289442
2020-01-01,California,
2020-01-01,San Diego County,
2020-01-01,United States,
2021-01-01,California,0.27578
2021-01-01,San Diego County,0.238196
2021-01-01,United States,0.286085


### Save Data

In [121]:
# Show the clean-data directory as a Markdown link; backslashes are swapped
# for forward slashes before the path is embedded in the link.
display(Markdown('#### Clean data path'))
clean_dir_path = str(CONFIG['clean_dir']).replace('\\', '/')
clean_dir_link = f"[{clean_dir_path}]({clean_dir_path})"
display(Markdown(clean_dir_link))

#### Clean data path

[C:/Users/tan/src/regional-pm-2023/data/clean/qol/poverty/poverty_rate](C:/Users/tan/src/regional-pm-2023/data/clean/qol/poverty/poverty_rate)

In [122]:
# Write the combined table to <clean_dir>/<INDICATOR>_odp.csv; the
# (year, geography) index is written as the leading columns.
clean_csv_path = CONFIG['clean_dir'] / f'{INDICATOR}_odp.csv'
poverty_rate.to_csv(clean_csv_path)