In [26]:
%reload_ext autoreload
%autoreload 2

from IPython.core.display import Markdown
from tqdm.auto import tqdm
import config
import docs
import pandas as pd
import requests
from pathlib import Path

# Never truncate long text columns when rendering DataFrames.
pd.set_option('display.max_colwidth', None)

# Indicator handled by this notebook and the configuration resolved for it.
INDICATOR = 'real_income'
CONFIG = config.get_config(INDICATOR, '../config.toml')

# Render the raw data directory as a link, using forward slashes in the path.
raw_dir_path = str(CONFIG['raw_dir']).replace('\\', '/')
display(
    Markdown('## Raw data path'),
    Markdown(f"[{raw_dir_path}]({raw_dir_path})"),
)


## Raw data path

[C:/Users/tan/src/regional-pm-2023/data/raw/qol/income/real_income](C:/Users/tan/src/regional-pm-2023/data/raw/qol/income/real_income)

# Quality of Life: Income

## Real Income

In [27]:
# Show the description recorded for this indicator in the indicators workbook.
docs.describe_indicator(indicator=INDICATOR, indicators_xlsx_path=CONFIG['indicators_xlsx_path'])

Real per capita income (adjusted for 2022 CPI).

nan

In [28]:
# Show the output columns (name, description, type) documented for this indicator.
docs.list_schema(indicator=INDICATOR, indicators_xlsx_path=CONFIG['indicators_xlsx_path'])

Unnamed: 0_level_0,name,description,type
column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
year,Year,Year of record.,datetime
geography,Geography,"Geography of record (""United States"", ""California"", or ""San Diego"").",string
income,Income,Unajusted per capita income in a given year for a given geography.,
cpi,Consumer Price Index (CPI),CPI value in a given year for a given geography.,
real_income,Real Income,"Real per capita income (adjusted for 2022 CPI), in a given year for a given geography.",float


In [29]:
# Show the data sources documented for this indicator.
docs.list_sources(indicator=INDICATOR, indicators_xlsx_path=CONFIG['indicators_xlsx_path'])

Unnamed: 0_level_0,name,organization,active,notes
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
income_fred,San Diego Personal Income (PCPI06073),St. Louis Federal Reserve,True,
income_fred,California Personal Income (CAPCPI),St. Louis Federal Reserve,True,
income_fred,US Personal Income (A792RC0A052NBEA),St. Louis Federal Reserve,True,
cpi_fred,San Diego CPI (CUUSA424SA0),St. Louis Federal Reserve,True,
cpi_bls,US CPI (CUUR0000SA0),Bureau of Labor Statistics,True,A FRED source for this was not found.
cpi_dir,California CPI,California Department of Industrial Relations,True,"This PDF sheet was the best source we found, so values were manually extracted."


In [30]:
# Load the update steps for this indicator; `steps` is kept so that individual
# steps can be looked up by label with `steps.loc[n]`.
steps = docs.list_update_steps(indicator=INDICATOR, indicators_xlsx_path=CONFIG['indicators_xlsx_path'])
steps

Unnamed: 0,step
0,Download new Per Capita Income and CPI data.
1,Extract Per Capita Income data.
2,Extract CPI data.
3,Combine Per Capita Income and CPI and calculate personal income.


In [31]:
# Show any author remarks recorded for this indicator.
docs.list_remarks(indicator=INDICATOR, indicators_xlsx_path=CONFIG['indicators_xlsx_path'])

Unnamed: 0,author,note
0,,


### Step 0: Download new data

In [32]:
# Show the row labelled 0 of the `steps` table (the download step).
display(steps.loc[0])

step    Download new Per Capita Income and CPI data.
Name: 0, dtype: object

* New years: 2021-2022 (though San Diego income is only available through 2021).
* Data comes from the following:
  * Per Capita Income: all tables from St. Louis Federal Reserve Economic Data (FRED)
    * United States: [A792RC0A052NBEA](https://fred.stlouisfed.org/series/A792RC0A052NBEA) 
    * San Diego: [PCPI06073](https://fred.stlouisfed.org/series/PCPI06073)
    * California: [CAPCPI](https://fred.stlouisfed.org/series/CAPCPI)
  * CPI (for adjusting to real income)
    * United States: [CUUR0000SA0 from Bureau of Labor Statistics (BLS)](https://data.bls.gov/timeseries/CUUR0000SA0)
    * San Diego: [CUUSA424SA0 from FRED](https://fred.stlouisfed.org/series/CUUSA424SA0)
    * California: [Table from CA Department of Industrial Relations](https://www.dir.ca.gov/OPRL/CPI/EntireCCPI.PDF) 
      * (Neither FRED nor BLS reports it; this PDF is the best alternative we found. There's a manual extract in the raw data folder.)
* The email in `raw/` mentions that past data was revised. We recalculated based on the sources suggested in the email.


### Step 1: Extract Per Capita Income

In [33]:
# Show the row labelled 1 of the `steps` table (the income extraction step).
display(steps.loc[1])

step    Extract Per Capita Income data.
Name: 1, dtype: object

In [34]:
def parse_fred_income(
    csv_path: Path,
    geography: str,
    start_year: int = 2005,
) -> pd.DataFrame:
    """Load one per-capita income series from a FRED CSV export.

    The file is expected to hold a single header row followed by
    ``date,value`` rows with ISO ``YYYY-MM-DD`` dates. The header row is
    skipped and the columns are renamed to ``year`` and ``income``.

    Args:
        csv_path: Location of the FRED CSV file.
        geography: Label stored in the ``geography`` index level for every row.
        start_year: First calendar year to keep (inclusive). Defaults to 2005.

    Returns:
        A DataFrame indexed by (``year``, ``geography``), where ``year`` is a
        datetime, with a single numeric ``income`` column. Observations marked
        as missing in the file are kept as NaN.
    """
    return (
        pd.read_csv(
            csv_path,
            names=['year', 'income'],
            header=None,
            skiprows=1,
            # A "." placeholder for a missing observation would otherwise turn
            # the whole column into strings and break later arithmetic.
            na_values=['.'],
        )
        .assign(
            year=lambda df: pd.to_datetime(df.year, format='%Y-%m-%d'),
            geography=geography,
        )
        # Filter on the parsed calendar year rather than comparing raw strings.
        .loc[lambda df: df.year.dt.year >= start_year]
        .set_index(['year', 'geography'])
    )

In [35]:
# Parse each FRED income file with its geography label and stack the results.
income = pd.concat(
    [
        parse_fred_income(csv_path=CONFIG['raw_dir'] / rel_path, geography=place)
        for rel_path, place in (
            ('income/A792RC0A052NBEA.csv', 'United States'),
            ('income/PCPI06073.csv', 'San Diego'),
            ('income/CAPCPI.csv', 'California'),
        )
    ]
)
income.sort_index().tail(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,income
year,geography,Unnamed: 2_level_1
2021-01-01,California,76800.0
2021-01-01,San Diego,72637.0
2021-01-01,United States,64073.0
2022-01-01,California,77339.0
2022-01-01,United States,65280.0


### Step 2: Extract CPI

In [36]:
# Show the row labelled 2 of the `steps` table (the CPI extraction step).
display(steps.loc[2])

step    Extract CPI data.
Name: 2, dtype: object

In [37]:
# US CPI from the BLS series report: one row per year with monthly values in
# separate columns, reduced here to the mean of the monthly values per year.
us_cpi = (
    pd.read_excel(
        CONFIG['raw_dir']/'cpi/SeriesReport-20230816142508_0050ae.xlsx',
        skiprows=11,    # presumably the report preamble above the table header -- confirm against the file
        usecols='A:M',  # presumably the year column plus twelve monthly columns -- confirm against the file
    )
    .rename(columns=str.lower)
    .assign(year=lambda df: pd.to_datetime(df.year, format='%Y'))
    # Keep 2005 through 2022; compare calendar years explicitly instead of
    # relying on a datetime-vs-integer comparison inside a query string.
    .loc[lambda df: df.year.dt.year.between(2005, 2022)]
    .melt(id_vars='year', var_name='month', value_name='cpi')
    # Average the monthly values per year; the month labels are not needed.
    .groupby('year')[['cpi']]
    .mean()
    .assign(geography='United States')
    .set_index('geography', append=True)
)

# San Diego CPI from a FRED CSV export: skip the header row, keep rows dated
# 2005-01-01 or later, then index by (year, geography).
sd_cpi = (
    pd.read_csv(
        CONFIG['raw_dir'] / 'cpi/CUUSA424SA0.csv',
        skiprows=1,
        header=None,
        names=['year', 'cpi'],
    )
    # Dates are still ISO strings here, so this is a string comparison.
    .query('`year` >= "2005-01-01"')
    .assign(
        year=lambda frame: pd.to_datetime(frame['year'], format='%Y-%m-%d'),
        geography='San Diego',
    )
    .set_index(['year', 'geography'])
)

# California CPI from the spreadsheet in the raw data folder; the `year`
# column is parsed as a four-digit year and rows are indexed by (year, geography).
ca_cpi = (
    pd.read_excel(CONFIG['raw_dir'] / 'cpi/EntireCCPI.xlsx')
    .assign(
        year=lambda frame: pd.to_datetime(frame['year'], format='%Y'),
        geography='California',
    )
    .set_index(['year', 'geography'])
)

In [38]:
# Stack the three CPI tables and sort the rows by their (year, geography) index.
cpi = pd.concat([us_cpi, sd_cpi, ca_cpi]).sort_index()
cpi.tail(6)

Unnamed: 0_level_0,Unnamed: 1_level_0,cpi
year,geography,Unnamed: 2_level_1
2021-01-01,California,297.371
2021-01-01,San Diego,319.761
2021-01-01,United States,270.96975
2022-01-01,California,319.224
2022-01-01,San Diego,344.416
2022-01-01,United States,292.654917


### Step 3: Calculate real income

In [39]:
# Show the row labelled 3 of the `steps` table (the combine-and-calculate step).
display(steps.loc[3])

step    Combine Per Capita Income and CPI and calculate personal income.
Name: 3, dtype: object

Use the most recent CPI (2022) to convert every year's income into 2022 dollars.

In [40]:
# Base-year CPI per geography: select the 2022 rows of `cpi`, then drop the
# first index level so the result is indexed by the remaining level only.
current_cpi = cpi.loc['2022'].droplevel(0)
current_cpi

Unnamed: 0_level_0,cpi
geography,Unnamed: 1_level_1
California,319.224
San Diego,344.416
United States,292.654917


In [41]:
# Right-join income onto CPI so every CPI row is kept even when income is
# missing, attach `current_cpi` as `cpi_current`, and scale income by the
# ratio of current CPI to that row's CPI.
real_income = (
    income
    .join(cpi, how='right')
    .join(current_cpi, how='left', rsuffix='_current')
    .assign(
        real_income=lambda frame: frame['income'].mul(
            frame['cpi_current'].div(frame['cpi'])
        )
    )
    # The helper column is only needed for the calculation above.
    .drop(columns=['cpi_current'])
)

real_income.tail(6)

Unnamed: 0_level_0,Unnamed: 1_level_0,income,cpi,real_income
year,geography,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-01-01,California,76800.0,297.371,82443.826735
2021-01-01,San Diego,72637.0,319.761,78237.636835
2021-01-01,United States,64073.0,270.96975,69200.633929
2022-01-01,California,77339.0,319.224,77339.0
2022-01-01,San Diego,,344.416,
2022-01-01,United States,65280.0,292.654917,65280.0


### Save Data

In [42]:
# Render the clean data directory as a link, using forward slashes in the path.
clean_dir_path = str(CONFIG['clean_dir']).replace('\\', '/')
display(
    Markdown('#### Clean data path'),
    Markdown(f"[{clean_dir_path}]({clean_dir_path})"),
)

#### Clean data path

[C:/Users/tan/src/regional-pm-2023/data/clean/qol/income/real_income](C:/Users/tan/src/regional-pm-2023/data/clean/qol/income/real_income)

In [43]:
# Write the combined table, including its index levels, to the clean data directory.
real_income.to_csv(CONFIG['clean_dir'] / f'{INDICATOR}_odp.csv')