In [1]:
import pandas as pd, numpy as np
from datetime import date

#### Link to Online Codebook
https://www.census.gov/data-tools/demo/uccb/sippdict?sortby=topic

#### Load Data Dictionary Files & Format df Column Names

In [2]:
# Read the two data dictionary csv's and stack them into a single data dictionary df
sipp_dict_1 = pd.read_csv('../data/raw/sipp_2018/sippdict_1_of_2.csv')
sipp_dict_2 = pd.read_csv('../data/raw/sipp_2018/sippdict_2_of_2.csv')
# ignore_index=True renumbers the rows 0..n-1; without it each half keeps its own
# 0-based labels and the combined df carries duplicate index values
sipp_dict = pd.concat([sipp_dict_1, sipp_dict_2], ignore_index=True)
# Normalize column names: lower case, spaces replaced by underscores
sipp_dict.columns = [name.lower().replace(' ', '_') for name in sipp_dict.columns]

In [3]:
# Preview the first rows of the combined data dictionary
sipp_dict.head()

Unnamed: 0,variable,topic,subtopic,survey_years,response_code,description,question,data_type,universe,universe_description,user_notes,record_level
0,EAWBCRACK,Adult and Child Well Being,Adult Well-Being,2018,1. Yes||2. No,Are there cracks in the ceiling or walls?,Are there cracks in the ceiling or walls?,Numeric,"THHLDSTATUS in (1,2,3,4)",All interviewed households (asked of reference...,"Descriptions, response codes, and universes ma...",Household
1,EAWBCRACK,Adult and Child Well Being,Adult Well-Being,"2014 Wave 4, 2014 Wave 3, 2014 Wave 2, 2014 Wa...",1. Yes||2. No,Are there cracks in the ceiling or walls?,Are there cracks in the ceiling or walls?,Numeric,All interviewed households.,All interviewed households (asked of reference...,"Descriptions, response codes, and universes ma...",Household
2,EAWBGAS,Adult and Child Well Being,Adult Well-Being,2018,1. Yes||2. No,Was ... unable to pay the utility bills?,Was ... unable to pay the utility bills?,Numeric,"THHLDSTATUS in (1,2,3,4)",All interviewed households (asked of reference...,"Descriptions, response codes, and universes ma...",Household
3,EAWBGAS,Adult and Child Well Being,Adult Well-Being,"2014 Wave 4, 2014 Wave 3, 2014 Wave 2, 2014 Wa...",1. Yes||2. No,Was ... unable to pay the utility bills?,Was ... unable to pay the utility bills?,Numeric,All interviewed households.,All interviewed households (asked of reference...,"Descriptions, response codes, and universes ma...",Household
4,EAWBHOLES,Adult and Child Well Being,Adult Well-Being,2018,1. Yes||2. No,Are there holes in the floor?,Are there holes in the floor?,Numeric,"THHLDSTATUS in (1,2,3,4)",All interviewed households (asked of reference...,"Descriptions, response codes, and universes ma...",Household


#### Identify Variable Names for Data Schema

In [11]:
# Harrison's Columns, kept as separate named groups and then combined

collateralized_debt = [
    'EBOATDEBT', 'EMCYCDEBT', 'EMHLOAN1SITE', 'EPRDEBT',
    'EPRLOAN1FXVR', 'EPRLOAN1RATE', 'EPRLOAN1TYPE', 'ERVDEBT',
    'TBOATDEBTVAL', 'TBSI1DEBTVAL', 'TMCYCDEBTVAL',
    'TDEBT_BUS', 'TDEBT_HOME', 'TDEBT_RE', 'TDEBT_SEC', 'TDEBT_VEH',
    'TPRLOANAMT', 'TRVDEBTVAL',
]

# NOTE: 'TDEBT_BUS' is listed here as well as in collateralized_debt,
# so it occurs twice in harrison_cols
unsecured_debt = [
    'EDEBT_CC', 'EDEBT_ED', 'EDEBT_MED',
    'EJSCCDEBT', 'EJSEDDEBT', 'EJSOTDEBT',
    'TDEBT_CC', 'TDEBT_ED', 'TDEBT_OT', 'TDEBT_BUS',
    'TDEBT_AST', 'TDEBT_RENT', 'TDEBT_USEC',
]

arthur_boat_bonus = ['EREC_BOAT', 'TBOATVAL']

# Order matters only for readability: collateralized, then unsecured, then boat columns
harrison_cols = [*collateralized_debt, *unsecured_debt, *arthur_boat_bonus]

In [12]:
# Standard columns suggested by Census
std_cols = [
    # Common case-identification variables
    'SSUID', 'PNUM', 'MONTHCODE', 'ERESIDENCEID', 'ERELRPE', 'SPANEL', 'SWAVE',
    # The base weight and monthly in-survey-universe indicator
    'WPFINWGT', 'RIN_UNIV',
    # Additional variables for analysis
    'TPTOTINC', 'RTANF_MNYN',
]

# Core demographics columns
demo_cols = [
    'ESEX',         # sex
    'TAGE',         # age @interview
    'TRACE',        # detailed race
    'EORIGIN',      # hispanic origin
    'TLANG1',       # lang spoken @home
    'EHOWWELL',     # how well eng spoken
    'EBORNUS',      # US born
    'ECITIZEN',     # US citizen
    'ESPEAK',       # lang other than eng at home
    'RFAMKIND',     # family type
    'EMS',          # marital status
    'EEDUC',        # highest school/degree completed
    'RLNGISOL',     # ltd eng household
    'TLIVQTR',      # type livng qtr
    'ETENURE',      # own/rent/other livng qtr
    'TEHC_REGION',  # region
    'TEHC_STATE',   # state
]

# Stock and 401k ownership columns
st_own_cols = [
    'EOWN_ST',      # own stocks
    'EOWN_THR401',  # own 401k/tax-sheltered investment
]

# Team topic columns: one list per team member (some are currently empty)
jesus_cols = ['TPTRNINC']
ivan_cols = ['TTHR401VAL', 'TIRAKEOVAL', 'TOSAVVAL']
stanley_cols = []
arthur_cols = []

# All team members' columns in one flat list
team_cols = [*ivan_cols, *jesus_cols, *stanley_cols, *harrison_cols, *arthur_cols]

# Full list of variables to import: standard, demographic, ownership, then team columns
data_schema_variables = [*std_cols, *demo_cols, *st_own_cols, *team_cols]

#### Filter Data Dictionary to Data Schema Variables

In [14]:
# Boolean masks used to narrow the data dictionary down to the schema variables

# True for dictionary rows whose variable name is in data_schema_variables
variable_mask = sipp_dict['variable'].isin(data_schema_variables)

# Final mask applied to the dictionary (currently just the variable mask)
mask_final = variable_mask

#### Create & Display Data Schema

In [16]:
# Build the data schema: dictionary rows selected by mask_final, reduced to the
# variable name, data type and description columns
pd.set_option('display.max_colwidth', 0)   # display all column text
filtered_data_dict = (
    sipp_dict
    # One .loc call selects rows and columns together; the previous chained
    # [mask][[cols]] selection followed by a column assignment wrote into a
    # possibly-temporary slice and triggered pandas' SettingWithCopyWarning
    .loc[mask_final, ['variable', 'data_type', 'description']]
    # .assign returns a new df, so sipp_dict itself is never modified;
    # variable names become lower case with spaces replaced by underscores
    .assign(variable=lambda df: df['variable'].str.lower().str.replace(' ', '_'))
)

#### Save Data Schema to csv

In [17]:
# Write the data schema to a csv whose file name carries today's date
today = date.today()
filepath = f'../docs/data_schema/data_schema_{today}.csv'
# index=False: the df's row labels are not written to the file
filtered_data_dict.to_csv(path_or_buf=filepath, index=False)

In [21]:
# Count the non-null values in each column of the data schema df
filtered_data_dict.count()

variable       113
data_type      113
description    113
dtype: int64