In [1]:
import time
import numpy as np
import pandas as pd

### Load Feature Schema & Response Code Dictionary from json

In [2]:
# Schema for import
rd_schema = pd.read_json('../data/raw/sipp_2018/pu2018_schema.json')
rd_schema['dtype'] = (['Int64' if x == 'integer'
                       else 'object' if x == 'string'
                       else 'Float64' if x == 'float'
                       else 'ERROR'
                       for x in rd_schema['dtype']]
                     )

# Response Code Dict for recoding factors to strings
def load_response_code_dict():
    import json
    with open('../data/interim/response_code_dict.json') as jf:
        response_code_dict = json.load(jf)
    return response_code_dict

rcode_dict = load_response_code_dict()

### Choose Features for Import
#### Core Features

In [3]:
# This cell is for seleting which of the ~5,000 features to import

# Initialize lists with column names to read from csv into df

# Set of Standard columns suggested by SIPP documentation
ref_cols = ['SSUID', 'PNUM', 'ERESIDENCEID', 
            'ERELRPE', 'RFAMNUM', 'RFAMREF', 
            'MONTHCODE', 'RIN_UNIV', 'WPFINWGT',
           ]

stock_core = ['EOWN_ST', 'TOSTVAL',
              'EJOOWNST', 'TJOSTVAL',
              'EJSOWNST', 'TJSSTVAL',
             ]

res_core = ['TEHC_ST', 'TEHC_METRO']

demo_core = ['TAGE', 'EEDUC',
             'EORIGIN', 'ERACE', 'TRACE',
             'ESEX', 'TBORNPLACE', 'EBORNUS',
            ]

# Concatenate columns to import
core_cols = (ref_cols
            + res_core
            + demo_core
            + stock_core
           )

print(f'You are importing {len(core_cols)} core columns')

You are importing 25 core columns


#### Optional Features

In [4]:
demo_explore = ['ESPEAK', 'RAFEVER', 'RANY5',
                'RFAMKIND', 'RFPERSONS', 'RFPERSONSWT2',
                'RFRELU18', 'RFRELU18WT2', 'RLNGISOL',
                'EMS', 'ECITIZEN', 'ENATCIT',
                'TIMSTAT', 'TYRENTRY', 'EHOWWELL',
                'TLANG1', 'EDOB_BMONTH', 'RHNUM65OVER',
                'RHNUM65OVRT2', 
               ]

opt_cols = (demo_explore
           )

print(f'You are importing {len(opt_cols)} optional columns')

You are importing 19 optional columns


#### Combine Core and Optional Features

In [5]:
use_cols = core_cols + opt_cols
print(f'You are importing {len(use_cols)} total columns')

You are importing 44 total columns


### Load & Format

In [6]:
# Load dataset. This works on my computer (takes about 99 secs to load 26 columns).
start = time.time()
sipp_2018 = pd.read_csv("../data/raw/sipp_2018/pu2018.csv",
                      names=rd_schema['name'],
                      dtype=dict([(i,v) for i,v in zip(rd_schema.name, rd_schema.dtype)]),
                      sep='|',
                      header=0,
                      usecols=use_cols,
                     )
end = time.time()
print(f'Read with Pandas: {end - start} seconds.')
print(f'Imported dataframe has {sipp_2018.shape[0]} rows and {sipp_2018.shape[1]} columns/s.')

# Set df column order & reformat column names
sipp_2018 = sipp_2018[use_cols]
sipp_2018.columns = [name.lower().replace(' ', '_') for name in sipp_2018.columns]

Read with Pandas: 117.09956192970276 seconds.
Imported dataframe has 763186 rows and 44 columns/s.


### Recode Response Factors to Strings

In [None]:
sipp_2018_recode = sipp_2018.copy()
for col in sipp_2018_recode.columns:
    if col in rcode_dict:
        sipp_2018_recode[col] = (sipp_2018_recode
                                 [col]
                                 .astype(str)
                                 .map(rcode_dict.get(col), na_action='ignore')
                                )    

### Collapse Observations to the Person Level
* Keep all respondent observations in your sample universe
    * Determine whether you need 1 obs per person or all 12

In [11]:
sipp_2018_person = (sipp_2018
                    .sort_values(by=['ssuid', 'pnum', 'monthcode'], 
                                 ascending=[True, True, False])
                    .drop_duplicates(['ssuid', 'pnum'])
                    .query('rin_univ > 0')
                    .drop(['monthcode', 'rin_univ'], axis='columns')
                   )

print(f'There are {sipp_2018_person.shape[0]} people in the sample.')

There are 63915 people in the sample.


### Collapse Observations to the Family Level
* Keep 1 observation per family
* *rfamref*: pnum of family reference person
* *rfamkind*: kind of family
* *rfamnum*: family number

**Family**: a set of people related by blood, marriage, or adoption.<br/>
**Reference person**: the owner of renter of the housing unit (the first, if more than one)

Household and family variables are recorded in each sample member's observation.

In [12]:
'''
Multi-condition query instead of drop_duplicates because different persons may be the family ref person 
on different months. Take the person who is the family ref person for the final month of reference period.
'''

sipp_2018_family = (sipp_2018
                    .sort_values(by=['ssuid', 'eresidenceid', 'rfamnum', 'rfamref', 'monthcode'], 
                                 ascending=[True, True, True, True, False])
                    .query('pnum == rfamref and monthcode == 12')
                    .drop(['monthcode', 'rfamref', 'rin_univ'], axis='columns')
                   )

print(f'There are {sipp_2018_family.shape[0]} families in the sample.')

There are 29689 families in the sample.


### Collapse Observations to the Household Level
+ Keep 1 observation per household
+ Household heads are the 'owner or renter of note
    + User *erelrpe* in (1,2)
+ Can identify unique households with sample unit identifier (*ssuid*) + the household residence ID (*eresidenceid*)

**Household**: a set of people living together<br/>
**Reference person**: the owner of renter of the housing unit (the first, if more than one)

Household and family variables are recorded in each sample member's observation.

In [13]:
sipp_2018_household = (sipp_2018
                       .sort_values(by=['ssuid', 'eresidenceid', 'erelrpe', 'monthcode'], ascending=[True, True, True, False])
                       .drop_duplicates(['ssuid', 'eresidenceid'])
                       .query('monthcode == 12 and rin_univ > 0')
                       #.drop(['pnum', 'monthcode', 'rfamref', 'rfamnum', 'erelrpe'], axis='columns')
                      )

print(f'There are {sipp_2018_household.shape[0]} households in the sample.')

There are 26566 households in the sample.


### Weights *(wpfinwft)*
+ Weights provided at the person level

#### Can create family/household weights by using:
+ Person weight of family reference person
    + *wpfinwgt* where *rfamref* = *pnum*
+ Person weight of householder
    + *wpfinwgt* where *erelrpe* in (1,2)
+ Average person weights of family/household members

#### And at multiple timeframes
+ Monthly
+ Calendar year (use the December monthly weight)
+ Panel (*finpnl14* variable; waves 2+ only)