In [1]:
import time
import numpy as np
import pandas as pd

#### Load Feature Schema from JSON

In [6]:
# Translate the dtype labels used in the schema file into pandas dtype names.
# 'Int64' and 'Float64' (capitalised) are pandas' nullable extension dtypes,
# so numeric columns can hold missing values without changing type.
SCHEMA_TO_PANDAS_DTYPE = {
    'integer': 'Int64',
    'string': 'object',
    'float': 'Float64',
}

rd_schema = pd.read_json('../data/raw/sipp_2018/pu2018_schema.json')

# Labels missing from the lookup table come back as NaN from .map().
pandas_dtypes = rd_schema['dtype'].map(SCHEMA_TO_PANDAS_DTYPE)

# Fail here, naming the offending labels, instead of storing a placeholder
# dtype that would only blow up later inside pd.read_csv.
unmapped = rd_schema.loc[pandas_dtypes.isna(), 'dtype']
if not unmapped.empty:
    raise ValueError(
        'Unrecognised dtype label(s) in schema: '
        f'{sorted(unmapped.astype(str).unique())}'
    )

rd_schema['dtype'] = pandas_dtypes

#### Choose Features for Import

In [5]:
# This cell selects which of the ~5,000 features to import.
# Each list below holds column names to read from the csv into df.

# Standard columns suggested by the SIPP documentation
std_cols = [
    # Common case-identification variables
    'SSUID',
    'PNUM',
    'MONTHCODE',
    'ERESIDENCEID',
    'ERELRPE',
    'SPANEL',
    'SWAVE',
    # The base weight and monthly in-survey-universe indicator
    'WPFINWGT',
    'RIN_UNIV',
    # Additional variables for analysis
    'TPTOTINC',
    'RTANF_MNYN',
]

# Demographics columns
demo_cols = [
    'ESEX',       # sex
    'TAGE',       # age @interview
    'TRACE',      # detailed race
    'EORIGIN',    # hispanic origin
    'TLANG1',     # lang spoken @home
    'EHOWWELL',   # how well eng spoken
    'EBORNUS',    # US born
    'ECITIZEN',   # US citizen
    'ESPEAK',     # lang other than eng at home
    'RFAMKIND',   # family type
    'EMS',        # marital status
    'EEDUC',      # highest school/degree completed
    'RLNGISOL',   # ltd eng household
    'TLIVQTR',    # type livng qtr
    'ETENURE',    # own/rent/other livng qtr
]

# Stock and 401k ownership columns
st_own_cols = [
    'EOWN_ST',      # own stocks
    'EOWN_THR401',  # own 401k/tax-sheltered investment
]

# Full import list: the three groups joined in the order defined above
use_cols = [*std_cols, *demo_cols, *st_own_cols]

print(f'You are importing {len(use_cols)} columns')

You are importing 28 columns


#### Load Dataset to df

In [4]:
# Load only the selected columns of the pipe-delimited dataset into df.
# This is slow: the recorded run on the author's machine took about 99 seconds.
# header=0 combined with names= discards the file's own header row and labels
# the columns from the schema instead, so the schema rows are assumed to be in
# the same order as the file's columns -- TODO confirm.
start = time.perf_counter()  # monotonic clock, suited to measuring elapsed time
df = pd.read_csv("../data/raw/sipp_2018/pu2018.csv",
                 names=rd_schema['name'],
                 # column name -> pandas dtype name, taken from the schema
                 dtype=dict(zip(rd_schema['name'], rd_schema['dtype'])),
                 sep='|',
                 header=0,
                 usecols=use_cols,
                 )
end = time.perf_counter()
print(f'Read with Pandas: {end - start} seconds')

Read with Pandas: 99.26237273216248 seconds


#### Reformat df Column Names

In [5]:
# Make the column labels Python-friendly in place: lower-case every label and
# swap literal spaces for underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_', regex=False)

# Preview the first rows of the relabelled frame.
df.head()

Unnamed: 0,ssuid,spanel,swave,pnum,ems,erelrpe,esex,eorigin,erace,eeduc,...,wpfinwgt,eresidenceid,rfamkind,rtanf_mnyn,rin_univ,tlang1,trace,tage,tage_ehc,tptotinc
0,11413607018,2018,1,101,6,2,1,1,1,38,...,5971.747954,100002,,2,1,,1,33,32,1738
1,11413607018,2018,1,101,6,2,1,1,1,38,...,5896.534769,100002,,2,1,,1,33,32,1738
2,11413607018,2018,1,101,6,2,1,1,1,38,...,5845.211127,100002,,2,1,,1,33,32,1738
3,11413607018,2018,1,101,6,2,1,1,1,38,...,5794.719891,100002,,2,1,,1,33,32,1738
4,11413607018,2018,1,101,6,2,1,1,1,38,...,5790.817291,100002,,2,1,,1,33,32,1738


In [8]:
# Relative frequency of each eown_st value among the rows left after removing
# exact duplicate rows, displayed as a one-column frame.
# NOTE(review): drop_duplicates() compares every column, so a row is dropped
# only when it matches another in all of them -- confirm that is the intended
# unit of analysis.
df.drop_duplicates()['eown_st'].value_counts(normalize=True).to_frame()

Unnamed: 0,eown_st
2,0.888283
1,0.111717


In [None]:
# Display pandas' default summary statistics for df.
df.describe()