In [2]:
import pandas as pd, numpy as np

#### Link to Online Codebook
https://www.census.gov/data-tools/demo/uccb/sippdict?sortby=topic

#### Load Data Dictionary Files & Format df Column Names

In [6]:
# Read data dictionary csv's, create and display data dictionary df
sipp_dict_1 = pd.read_csv('../data/raw/sipp_2018/sippdict_1_of_2.csv')
sipp_dict_2 = pd.read_csv('../data/raw/sipp_2018/sippdict_2_of_2.csv')
sipp_dict = pd.concat([sipp_dict_1, sipp_dict_2])
#sipp_dict.columns = [name.lower().replace(' ', '_') for name in sipp_dict.columns]

In [7]:
sipp_dict.columns


Index(['Variable', 'Topic', 'Subtopic', 'Survey Years', 'Response Code',
       'Description', 'Question', 'Data Type', 'Universe',
       'Universe Description', 'User Notes', 'Record Level'],
      dtype='object')

#### Filtering Cell

In [16]:
# Create boolean masks to drill down to dictionary variables

# Survey year filter
mask_2018 = sipp_dict.survey_years.str.contains('2018')

# Topic filter
topic = 'Assets'                              # << Type topic you want to filter by here 
list_topic = sipp_dict.topic.unique()         # Print this variable to view list of topics
mask_topic = sipp_dict.topic == topic

# Topic filter 2 (optiona)
topic2 = 'Demographics'                              
mask_topic2 = sipp_dict.topic == topic2

# Subtopic filter
subtopic = 'Commuting and Work Schedule'                              # << Type subtopic you want to filter by here 
list_subtopic = sipp_dict.subtopic.unique()   # Print this variable to view list of all subtopics
mask_subtopic = sipp_dict.subtopic == subtopic

# Exclusion masks
mask_flag = ~sipp_dict.description.str.contains('Status Flag')
mask_suppressed =  ~sipp_dict.description.str.contains('Suppressed')
mask_topcoded_stats = ~sipp_dict.description.str.contains('Median|standard deviation')
mask_exclusions = mask_flag & mask_suppressed & mask_topcoded_stats

# Combines all filters groups. Exclude a filter group by commenting out line
mask_final = (mask_2018 
              #& (mask_topic | mask_topic2)
              & mask_exclusions
             )

#### Create & Display Filtered Data Dictionary

In [17]:
# Display filtered data dictionary
pd.set_option('display.max_colwidth', None)   # display all column text
filtered_data_dict = sipp_dict[mask_final][['variable', 'data_type', 'description', 'topic', 'subtopic', 'record_level']]
filtered_data_dict.head()
filtered_data_dict.shape

(2191, 6)

#### Save Filtered Data Dictionary to csv

In [18]:
# Optional - save to csv. I find it easier to browse large tables on excel than jupyter.
filepath = '../data/interim/feature_import_meta.csv'  # this filepath saves to your desktop on mac
filtered_data_dict.to_csv(filepath, index=False)

## Create Metadata csv's - 1 for Each Topic

In [28]:
def create_data_dict(df, topic):
    '''
    Create a data dictionary and save to csv that is limited to the topic that is provided in the input.
    Input: df and topic as string. Topic corresponds to values in the topic column of the inputted csv.
    Output: csv saved to the interim data directory.
    '''
    
    # Masks
    mask_topic = df.topic == topic
    mask_2018 = df.survey_years.str.contains('2018')
    mask_flag = ~sipp_dict.description.str.contains('Status Flag')
    mask_suppressed =  ~sipp_dict.description.str.contains('Suppressed')
    mask_topcoded_stats = ~sipp_dict.description.str.contains('Median|standard deviation')
    all_masks = mask_topic & mask_2018 & mask_flag & mask_suppressed & mask_topcoded_stats
    
    filtered_data_dict = df[all_masks][['variable', 'data_type', 'description', 'topic', 'subtopic', 'record_level']]
    filepath = '../data/interim/feature_import_meta_' + topic.replace(' ', '_').lower() + '.csv'
    filtered_data_dict.to_csv(filepath, index=False)
    
    return print(f'csv has {filtered_data_dict.shape[0]} rows and was saved to {filepath}')

In [30]:
for topic in sipp_dict.topic.unique():
    create_data_dict(sipp_dict, topic)

csv has 46 rows and was saved to ../data/interim/feature_import_meta_adult_and_child_well_being.csv
csv has 447 rows and was saved to ../data/interim/feature_import_meta_assets.csv
csv has 42 rows and was saved to ../data/interim/feature_import_meta_child_and_dependent_care.csv
csv has 4 rows and was saved to ../data/interim/feature_import_meta_coverage.csv
csv has 116 rows and was saved to ../data/interim/feature_import_meta_demographics.csv
csv has 13 rows and was saved to ../data/interim/feature_import_meta_disability.csv
csv has 20 rows and was saved to ../data/interim/feature_import_meta_education_enrollment.csv
csv has 15 rows and was saved to ../data/interim/feature_import_meta_fertility_history.csv
csv has 45 rows and was saved to ../data/interim/feature_import_meta_health_care.csv
csv has 91 rows and was saved to ../data/interim/feature_import_meta_health_insurance.csv
csv has 16 rows and was saved to ../data/interim/feature_import_meta_id_variables.csv
csv has 18 rows and was