In [1]:
import pandas as pd, numpy as np, json

#### Define code extraction function

In [2]:
def code_string_to_dict(string):
    """Parse a '||'-delimited response-code string into a {code: label} dict.

    Each entry is expected to look like ``'<code>. <label>'``, for example
    ``'31. Less than 1st grade||32. 1st, 2nd, 3rd or 4th grade'``.

    Parameters
    ----------
    string : str
        Raw response-code text. Any non-string value (e.g. a NaN coming from
        a pandas column) yields an empty dict instead of raising.

    Returns
    -------
    dict
        Maps each code to its label, both as strings. Entries that do not
        contain the '. ' separator are skipped, so a string with no parseable
        entry yields an empty dict. If a code repeats, the last label wins.
    """
    if not isinstance(string, str):
        return {}

    code_dict = {}
    for pair in string.split('||'):
        # Split on the FIRST '. ' only, so a label that itself contains '. '
        # (e.g. "1. Yes. Always") is kept whole instead of being truncated.
        code, separator, label = pair.partition('. ')
        if not separator:
            # No "code. label" structure in this entry; skip it rather than
            # storing the raw text as both key and value.
            continue
        code_dict[code] = label

    return code_dict

#### Load metadata CSVs

In [3]:
# Read the two halves of the SIPP 2018 data dictionary.
sipp_meta_1 = pd.read_csv('../data/raw/sipp_2018/sippdict_1_of_2.csv')
sipp_meta_2 = pd.read_csv('../data/raw/sipp_2018/sippdict_2_of_2.csv')

# Stack both halves and normalize the headers (lowercase, spaces -> underscores)
# so the columns can be accessed as attributes further down.
sipp_meta = (
    pd.concat([sipp_meta_1, sipp_meta_2])
    .rename(columns=lambda name: name.lower().replace(' ', '_'))
)

#### Create Metadata DF and Assign Dictionary Column

In [13]:
# Select the 2018 variables whose response codes are an enumerated
# "code. label||code. label" list.

# `str.contains` interprets its pattern as a regex by default, and the regex
# `||` (empty alternatives) matches every string; regex=False searches for the
# literal delimiter instead. Missing response codes never qualify (na=False).
has_code_list = sipp_meta.response_code.str.contains('||', regex=False, na=False)

# Entries matching `$<digits>:` or `<digits>:` are excluded (presumably numeric
# ranges rather than enumerations -- TODO confirm against the SIPP dictionary).
# na=True makes missing response codes count as matches so they are dropped too.
# Raw string: `\$` and `\d` are regex escapes, not Python string escapes.
is_range_code = sipp_meta.response_code.str.contains(r'\$\d+:|\d+:', regex=True, na=True)

# na=False: rows with no survey year listed are not treated as 2018 variables.
in_2018_survey = sipp_meta.survey_years.str.contains('2018', na=False)

mask = has_code_list & ~is_range_code & in_2018_survey

# Parse each response-code string into a dict and lowercase the variable names.
response_code_df = (
    sipp_meta.loc[mask, ['variable', 'response_code']]
    .assign(
        response_code_dict=lambda df: df.response_code.map(code_string_to_dict),
        variable=lambda df: df.variable.str.lower(),
    )
)

# Spot-check one known variable.
response_code_df[response_code_df.variable == 'eeduc']

Unnamed: 0,variable,response_code,response_code_dict
1997,eeduc,"31. Less than 1st grade||32. 1st, 2nd, 3rd or ...","{'31': 'Less than 1st grade', '32': '1st, 2nd,..."


#### Initialize Dictionary and Save to JSON

In [14]:
# Map every variable name to its parsed code dictionary. If a variable name
# occurs more than once, the last row's dictionary is the one that is kept.
response_code_dict = dict(
    zip(response_code_df['variable'], response_code_df['response_code_dict'])
)

# Discard variables whose parsed dictionary has no entries
response_code_dict = {
    name: codes
    for name, codes in response_code_dict.items()
    if len(codes) > 0
}

filepath = '../data/interim/response_code_dict.json'

# Persist the lookup as pretty-printed JSON for downstream notebooks.
with open(filepath, 'w') as f:
    json.dump(response_code_dict, f, indent=4)