In [None]:
import time
import numpy as np
import pandas as pd

In [None]:
# Load the pu2018 schema file into a DataFrame. Later cells read its `name` and
# `dtype` columns to supply column names and dtypes for the CSV import.
rd_schema = pd.read_json('../data/raw/pu2018_schema.json')

In [None]:
# Tally the type labels as they appear in the schema file, before any remapping.
rd_schema['dtype'].value_counts()

In [None]:
# Translate the schema's type labels into pandas dtype names so the column can be
# handed straight to `pd.read_csv(dtype=...)`.
# Labels that are already pandas dtype names pass through unchanged, so re-running
# this cell no longer turns every entry into 'ERROR'. Anything unrecognised is
# still tagged 'ERROR' so it stands out in the value counts of the next cell.
schema_to_pandas = {'integer': 'Int64', 'string': 'object', 'float': 'Float64'}
pandas_dtype_names = set(schema_to_pandas.values())

rd_schema['dtype'] = [
    schema_to_pandas.get(label, label if label in pandas_dtype_names else 'ERROR')
    for label in rd_schema['dtype']
]

In [None]:
# Re-tally after remapping; any 'ERROR' entry marks a schema label that was not recognised.
rd_schema['dtype'].value_counts()

In [None]:
# Columns suggested by the SIPP documentation, grouped by purpose.

# Common case-identification variables
_id_cols = ['SSUID', 'PNUM', 'MONTHCODE', 'ERESIDENCEID', 'ERELRPE', 'SPANEL', 'SWAVE']

# Base weight and the monthly in-survey-universe indicator
_weight_cols = ['WPFINWGT', 'RIN_UNIV']

# Common demographics, including age at time of interview (TAGE) and
# monthly age during the reference period (TAGE_EHC)
_demographic_cols = ['ESEX', 'TAGE', 'TAGE_EHC', 'ERACE', 'EORIGIN', 'EEDUC']

# Additional variables for analysis
_analysis_cols = ['TPTOTINC', 'RTANF_MNYN']

base_cols = [*_id_cols, *_weight_cols, *_demographic_cols, *_analysis_cols]

# Stock and 401k ownership columns
st_own_cols = ['EOWN_ST', 'EOWN_THR401']

# Full list of columns to import
use_cols = [*base_cols, *st_own_cols]


In [None]:
# Read only the columns in `use_cols` from the pipe-delimited pu2018 file, taking
# column names and dtypes from the schema. `header=0` combined with `names=` means
# the file's own header row is replaced by the schema names.
# The original author reported roughly 66 seconds for this read on their machine.
schema_dtypes = dict(zip(rd_schema['name'], rd_schema['dtype']))

# perf_counter is monotonic, so the elapsed time cannot be skewed by wall-clock adjustments.
start = time.perf_counter()
df = pd.read_csv(
    "../data/raw/pu2018.csv",
    sep='|',
    header=0,
    names=rd_schema['name'],
    dtype=schema_dtypes,
    usecols=use_cols,
)
end = time.perf_counter()
print(f'Read with Pandas: {end - start} seconds')

In [None]:
# Make the headers usable with attribute-style access: lower-case them and
# swap spaces for underscores.
df.columns = df.columns.map(lambda label: label.lower().replace(' ', '_'))
df.head()

In [None]:
# Relative frequency of each `eown_st` value within every `monthcode` group.
eown_st_by_month = df.groupby(['monthcode'])['eown_st']
eown_st_by_month.value_counts(normalize=True).to_frame()

In [None]:
# Summary statistics for the loaded columns (pandas defaults decide which dtypes are included).
df.describe()

In [None]:
# Read the two SIPP data-dictionary files, stack them into one frame, and convert
# the column names to a Python-friendly format (lower-case, underscores for spaces).
sipp_dict_1 = pd.read_csv('../data/raw/sippdict_1_of_2.csv')
sipp_dict_2 = pd.read_csv('../data/raw/sippdict_2_of_2.csv')
# NOTE(review): concat keeps each part's own row index, so labels may repeat across
# the two halves — confirm nothing downstream relies on a unique index.
sipp_dict = pd.concat([sipp_dict_1, sipp_dict_2])
# Iterate over `sipp_dict` itself; the earlier `SIPP_dict` spelling is undefined
# (Python names are case-sensitive) and raised NameError on a fresh kernel.
sipp_dict.columns = [name.lower().replace(' ', '_') for name in sipp_dict.columns]
sipp_dict.head()