In [None]:
#!/usr/bin/env python3
#
# This notebook reads a full year of the
#      Bureau of Labor Statistics'
#      Consumer Expenditure Survey
#      using the Family Interviews (FMLI) of
#      its Public Use Micro Data (PUMD)
#
# It applies the business rules needed to weight and allocate rolling 3 month interviews
# and generates the fmli.pkl file used to train models.
#
#   https://www.bls.gov/cex/cecomparison/acs_profile.htm
#   https://www.bls.gov/cex/cecomparison.htm#cedc
#   https://www.bls.gov/cex/pumd/ce_pumd_interview_diary_dictionary.xlsx
#   https://www.bls.gov/cex/pumd/stubs.zip
#   https://www.bls.gov/cex/csxgloss.htm
#   https://webapps.ilo.org/surveyLib/index.php/catalog/1193/download/17782
#
# Global settings
#
# this_year     - the survey year to process
# source_folder - root folder that is searched for the intrvwYY directories
this_year = 2023
source_folder = "."

#
# Import libraries, set working variables
#
import pandas as pd
pd.set_option('display.float_format',  '{:,.2f}'.format)
import numpy as np
import glob
import warnings
warnings.filterwarnings('ignore')

# Expenditure column stems. Each one is combined with a 'CQ' or 'PQ' suffix
# further down to form the actual column names in the survey files.
aggvars = [
    'TOTEXP', 'FOOD', 'ALCBEV', 'HOUS', 'APPAR', 'TRANS', 'HEALTH', 'ENTERT',
    'PERSCA', 'READ', 'EDUCA', 'TOBACC', 'LIFINS', 'MISC', 'CASHCO', 'RETPEN',
]

# Neighbouring years, plus the last two characters of each year as used in
# the folder and file names.
last_year, next_year = this_year - 1, this_year + 1
last_year2, this_year2, next_year2 = (
    str(year)[-2:] for year in (last_year, this_year, next_year)
)


#
# Load every fmli source file downloaded from the BLS into one DataFrame
#

# Everything matching fmli* in this year's folder, plus the this-year Q1
# file that is stored in last year's folder.
# NOTE(review): any file in this year's folder matching fmli* is loaded, so
# a second Q1 file there would be read in addition to the appended one -
# confirm the folder contents.
filelist = glob.glob(f"{source_folder}/intrvw{this_year2}/intrvw{this_year2}/fmli*")
filelist.append(f"{source_folder}/intrvw{last_year2}/intrvw{last_year2}/fmli{this_year2}1.sas7bdat")

# The file list contains paths like these (order may vary):
#['./intrvw22/intrvw22/fmli231.sas7bdat',
# './intrvw23/intrvw23/fmli232.sas7bdat',
# './intrvw23/intrvw23/fmli233.sas7bdat',
# './intrvw23/intrvw23/fmli234.sas7bdat',
# './intrvw23/intrvw23/fmli241.sas7bdat',]

# One frame per file; FILENAME records the path each row was read from so
# rows can be grouped by source file later.
Fquarters = [
    pd.read_sas(path, format='sas7bdat', encoding="utf-8").assign(FILENAME=path)
    for path in filelist
]

fmli = pd.concat(Fquarters)


#
# Rule 1: Enforce numeric data types
#

# Columns that are compared against whole numbers or used as codes are
# converted to int.
for col in ("QINTRVMO", "QINTRVYR", "REF_RACE", "HISP_REF"):
    fmli[col] = fmli[col].astype(int)

# FINLWT21 is the weight that multiplies every expenditure and serves as the
# denominator of the averages. It is kept as float: an int cast would
# truncate any fractional part of the weight before it is applied.
fmli["FINLWT21"] = fmli["FINLWT21"].astype(float)

# Every <stem>CQ / <stem>PQ expenditure column is converted to float.
for col in [c+q for q in ['CQ','PQ'] for c in aggvars]:
    fmli[col] = fmli[col].astype(float)
#
# Rule 2: calculate the MO_SCOPE, number of months each interview included from this_year
#

in_this_year = (fmli['QINTRVYR'] == this_year).to_numpy()
in_next_year = (fmli['QINTRVYR'] == next_year).to_numpy()
month = fmli['QINTRVMO'].to_numpy()

# The first matching condition wins, so the bare `in_this_year` entry only
# catches this-year interviews held after March.
fmli['MO_SCOPE'] = np.select(
    [
        in_this_year & (month == 1),   # this year Q1: 0, 1, 2 months
        in_this_year & (month == 2),
        in_this_year & (month == 3),
        in_this_year,                  # rest of this year: 3 months
        in_next_year & (month == 1),   # next year Q1: 3, 2, 1 months
        in_next_year & (month == 2),
        in_next_year & (month == 3),
    ],
    [0, 1, 2, 3, 3, 2, 1],
    default=9999,                      # 9999 is an error condition
)


#
# Rule 3. Apply weights to the aggregate variables
#

# Each <stem><suffix> column gets a "WTD_"-prefixed companion holding the
# column multiplied by FINLWT21. All CQ columns are created before the PQ ones.
for quarter in ('CQ', 'PQ'):
    for category in aggvars:
        source_col = category + quarter
        fmli[f"WTD_{source_col}"] = fmli["FINLWT21"].mul(fmli[source_col])

   
#
# Rule 4: Combine PQ and CQ variables conditionally for annual aggregating
#

# The row masks do not depend on the expenditure category, so they are built
# once outside the loop.
first_quarter = fmli["QINTRVMO"] < 4
q1_this_year = ((fmli["QINTRVYR"] == this_year) & first_quarter).to_numpy()
q1_next_year = ((fmli["QINTRVYR"] == next_year) & first_quarter).to_numpy()

for category in aggvars:
    weighted_cq = fmli[f"WTD_{category}CQ"]
    weighted_pq = fmli[f"WTD_{category}PQ"]
    fmli[f"TQ_{category}"] = np.select(
        [q1_this_year, q1_next_year],
        [weighted_cq,                      # This year Q1, only CQ
         weighted_pq],                     # Next year Q1, only PQ
        default=weighted_cq + weighted_pq, # Otherwise PQ + CQ
    )


#
# Rule 5: Combine the PQ & CQ variables to a total on each row for modelling
#

# The unweighted sum is stored under the bare stem name (e.g. FOOD).
for category in aggvars:
    cq_values, pq_values = fmli[category + "CQ"], fmli[category + "PQ"]
    fmli[category] = cq_values.add(pq_values)


#
# Rule 6: Average the aggregate columns using the quarterly CU counts
#

# One row per source file: summed weights and summed weighted expenditures.
tq_columns = ["TQ_" + name for name in aggvars]
report = (
    fmli.groupby("FILENAME")[["FINLWT21"] + tq_columns]
    .sum()
    .reset_index()
)

# Per-file average = summed weighted expenditure / summed weight.
for name in aggvars:
    report[name] = report["TQ_" + name].div(report["FINLWT21"])


#
# Check these results against the reported totals:  https://www.bls.gov/cex/tables.htm
#

# MO_SCOPE == 9999 is the error marker written by Rule 2. Such rows would
# silently inflate the CU count below, so report them instead of hiding them.
unclassified = int((fmli.MO_SCOPE == 9999).sum())
if unclassified:
    print(f"WARNING: {unclassified:,} rows have MO_SCOPE == 9999 (error condition); "
          "the CU count below includes them.")

# Vectorised Series.sum() instead of the builtin sum(), which would iterate
# over the rows one by one in Python.
cu_count = ((fmli.FINLWT21 * fmli.MO_SCOPE / 3) / 4).sum()

print()
print("CU count:", f"{cu_count:,.0f}")
# Per-file averages added up across all source files, one value per category.
print(report[aggvars].sum())

# Expecting these results from the bls:
#Number of consumer units              134,556
#Average annual expenditures            77,280
#Food                                    9,985
#Alcoholic beverages                       637
#Housing                                25,436
#Apparel and services                    2,041
#Transportation                         13,174
#Healthcare                              6,159
#Entertainment                           3,635
#Personal care products and services       950
#Reading                                   117
#Education                               1,656
#Tobacco products                          370
#Miscellaneous                           1,184
#Cash contributions                      2,378
#Personal insurance and pensions         9,556

#
# Last step: persist the prepared DataFrame as fmli.pkl to serve modelling.
#

pd.to_pickle(fmli, "fmli.pkl")
