# Dataset Preprocessing
## **Preprocess and decode the dataset**

In [1]:
import pandas as pd
import numpy as np

In [2]:
us_personal_a = pd.read_csv('psam_pusa.csv')
us_personal_b = pd.read_csv('psam_pusb.csv')
us_personal = pd.concat([us_personal_a, us_personal_b])
us_personal

Unnamed: 0,RT,SERIALNO,DIVISION,SPORDER,PUMA,REGION,ST,ADJINC,PWGTP,AGEP,...,PWGTP71,PWGTP72,PWGTP73,PWGTP74,PWGTP75,PWGTP76,PWGTP77,PWGTP78,PWGTP79,PWGTP80
0,P,2018GQ0000049,6,1,1600,3,1,1013097,75,19,...,140,74,73,7,76,75,80,74,7,72
1,P,2018GQ0000058,6,1,1900,3,1,1013097,75,18,...,76,78,7,76,80,78,7,147,150,75
2,P,2018GQ0000219,6,1,2000,3,1,1013097,118,53,...,117,121,123,205,208,218,120,19,123,18
3,P,2018GQ0000246,6,1,2400,3,1,1013097,43,28,...,43,76,79,77,80,44,46,82,81,8
4,P,2018GQ0000251,6,1,2701,3,1,1013097,16,25,...,4,2,29,17,15,28,17,30,15,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1566022,P,2018HU1400326,8,4,400,4,56,1013097,87,9,...,96,28,98,111,88,27,30,92,73,112
1566023,P,2018HU1400326,8,5,400,4,56,1013097,87,7,...,96,28,97,111,88,28,30,92,72,110
1566024,P,2018HU1400502,8,1,100,4,56,1013097,49,49,...,44,18,34,85,18,15,85,47,31,22
1566025,P,2018HU1400502,8,2,100,4,56,1013097,46,19,...,64,17,19,68,19,15,117,43,39,19


In [3]:
us_personal.to_pickle('us_personal.zip')

In [4]:
us_personal = pd.read_pickle('us_personal.zip')

In [5]:
# Define all decoding/recoding methods for the data

def DIVISION_decode(code):
    if code == 0: return 'Puerto Rico'
    elif code == 1: return 'New England'
    elif code == 2: return 'Middle Atlantic'
    elif code == 3: return 'East North Central'
    elif code == 4: return 'West North Central'
    elif code == 5: return 'South Atlantic'
    elif code == 6: return 'East South Central'
    elif code == 7: return 'West South Central'
    elif code == 8: return 'Mountain'
    elif code == 9: return 'Pacific'
    else: raise ValueError('invalid DIVISION code encountered')

def ST_decode(code):
    if code == 1: return 'AL'
    elif code == 2: return 'AK'
    elif code == 4: return 'AZ'
    elif code == 5: return 'AR'
    elif code == 6: return 'CA'
    elif code == 8: return 'CO'
    elif code == 9: return 'CT'
    elif code == 10: return 'DE'
    elif code == 11: return 'DC'
    elif code == 12: return 'FL'
    elif code == 13: return 'GA'
    elif code == 15: return 'HI'
    elif code == 16: return 'ID'
    elif code == 17: return 'IL'
    elif code == 18: return 'IN'
    elif code == 19: return 'IA'
    elif code == 20: return 'KS'
    elif code == 21: return 'KY'
    elif code == 22: return 'LA'
    elif code == 23: return 'ME'
    elif code == 24: return 'MD'
    elif code == 25: return 'MA'
    elif code == 26: return 'MI'
    elif code == 27: return 'MN'
    elif code == 28: return 'MS'
    elif code == 29: return 'MO'
    elif code == 30: return 'MT'
    elif code == 31: return 'NE'
    elif code == 32: return 'NV'
    elif code == 33: return 'NH'
    elif code == 34: return 'NJ'
    elif code == 35: return 'NM'
    elif code == 36: return 'NY'
    elif code == 37: return 'NC'
    elif code == 38: return 'ND'
    elif code == 39: return 'OH'
    elif code == 40: return 'OK'
    elif code == 41: return 'OR'
    elif code == 42: return 'PA'
    elif code == 44: return 'RI'
    elif code == 45: return 'SC'
    elif code == 46: return 'SD'
    elif code == 47: return 'TN'
    elif code == 48: return 'TX'
    elif code == 49: return 'UT'
    elif code == 50: return 'VT'
    elif code == 51: return 'VA'
    elif code == 53: return 'WA'
    elif code == 54: return 'WV'
    elif code == 55: return 'WI'
    elif code == 56: return 'WY'
    elif code == 72: return 'PR'
    elif code == 16: return 'ID'
    else: raise ValueError('invalid ST code encountered')

def AGEP_recode_numerical(code):
    if code < 100: return code
    else: raise ValueError('invalid AGEP code encountered')

def AGEP_recode_categorical(code):
    if code < 18: return 'Under 18'
    elif code < 25: return '18 to 24'
    elif code < 35: return '25 to 34'
    elif code < 45: return '35 to 44'
    elif code < 55: return '45 to 54'
    elif code < 65: return '55 to 64'
    elif code < 75: return '65 to 74'
    elif code < 85: return '75 to 84'
    elif code < 100: return 'Over 84'
    else: raise ValueError('invalid AGEP code encountered')

def CIT_decode(code):
    if code == 1: return 'US Born'
    elif code == 2: return 'US Territory Born'
    elif code == 3: return 'US Parents Born Abroad'
    elif code == 4: return 'Naturalized Citizen'
    elif code == 5: return 'Not Citizen'
    else: raise ValueError('invalid CIT code encountered')

def COW_decode(code):
    if np.isnan(code): return 'N/A'
    elif code == 1: return 'Private For-Profit'
    elif code == 2: return 'Private Not-For-Profit'
    elif code == 3: return 'Local Government'
    elif code == 4: return 'State Government'
    elif code == 5: return 'Federal Government'
    elif code == 6: return 'Self-Employed'
    elif code == 7: return 'Self-Employed in own Inc'
    elif code == 8: return 'Unpaid Family Employee'
    elif code == 9: return 'Unemployed'
    else: raise ValueError('invalid COW code encountered')

def DEAR_decode(code):
    if code == 1: return 'Hearing Difficulty'
    elif code == 2: return 'No Hearing Difficulty'
    else: raise ValueError('invalid DEAR code encountered')

def DEYE_decode(code):
    if code == 1: return 'Vision Difficulty'
    elif code == 2: return 'No Vision Difficulty'
    else: raise ValueError('invalid DEYE code encountered')

def FER_decode(code):
    if np.isnan(code): return 'N/A'
    elif code == 1: return 'Gave Birth in Past 12 Months'
    elif code == 2: return 'No Birth in Past 12 Months'
    else: raise ValueError('invalid FER code encountered')

def JWMNP_recode_numerical(code):
    if np.isnan(code): return 0
    else: return code

def JWMNP_recode_categorical(code):
    if np.isnan(code): return 'N/A'
    elif code <= 30: return 'Up to 30 Minutes'
    elif code <= 60: return '31 to 60 Minutes'
    elif code <= 90: return '61 to 90 Minutes'
    elif code <= 120: return '91 to 120 Minutes'
    elif code <= 150: return '121 to 150 Minutes'
    elif code <= 180: return '151 to 180 Minutes'
    elif code <= 200: return 'Over 180 Minutes'
    else: raise ValueError('invalid JWMNP code encountered')

def MAR_decode(code):
    if code == 1: return 'Married'
    elif code == 2: return 'Widowed'
    elif code == 3: return 'Divorced'
    elif code == 4: return 'Separated'
    elif code == 5: return 'Never Married/Under 15'
    else: raise ValueError('invalid MAR code encountered')

def SCHL_recode(code):
    if np.isnan(code): return 'Under 3 Years'
    elif code < 15: return 'No HS Diploma'
    elif code < 18: return 'HS Diploma/Alternative'
    elif code < 20: return 'Some College'
    elif code == 20: return 'Associate\'s Degree'
    elif code == 21: return 'Bachelor\'s Degree'
    elif code == 22: return 'Master\'s Degree'
    elif code == 23: return 'Professional Degree'
    elif code == 24: return 'Doctorate Degree'
    else: raise ValueError('invalid SCHL code encountered')

def SEX_decode(code):
    if code == 1: return 'male'
    elif code == 2: return 'female'
    else: raise ValueError('invalid SEX code encountered')

def WAGP_recode_numerical(code):
    if np.isnan(code): return 0
    else: return code

def WKHP_recode_numerical(code):
    if np.isnan(code): return 0
    else: return code

def WKW_decode(code):
    if np.isnan(code): return 'No Work/Under 16 Years'
    elif code == 1: return '50 to 52 Weeks'
    elif code == 2: return '48 to 49 Weeks'
    elif code == 3: return '40 to 47 Weeks'
    elif code == 4: return '27 to 39 Weeks'
    elif code == 5: return '14 to 26 Weeks'
    elif code == 6: return 'Under 14 Weeks'
    else: raise ValueError('invalid WKW code encountered')

def DIS_decode(code):
    if code == 1: return 'Disability'
    elif code == 2: return 'No Disability'
    else: raise ValueError('invalid DIS code encountered')

def INDP_recode(code):
    if np.isnan(code): return 'N/A'
    elif code <= 290: return 'AGR'
    elif code <= 490: return 'EXT'
    elif code <= 690: return 'UTL'
    elif code <= 770: return 'CON'
    elif code <= 3990: return 'MFG'
    elif code <= 4590: return 'WHL'
    elif code <= 5790: return 'RET'
    elif code <= 6390: return 'TRN'
    elif code <= 6780: return 'INF'
    elif code <= 7190: return 'FIN'
    elif code <= 7790: return 'PRF'
    elif code <= 7890: return 'EDU'
    elif code <= 8290: return 'MED'
    elif code <= 8470: return 'SCA'
    elif code <= 8690: return 'ENT'
    elif code <= 9290: return 'SRV'
    elif code <= 9590: return 'ADM'
    elif code <= 9870: return 'MIL'
    elif code <= 9920: return 'Unemployed'
    else: raise ValueError('invalid INDP code encountered')

def NATIVITY_decode(code):
    if code == 1: return 'Native'
    elif code == 2: return 'Foreign Born'
    else: raise ValueError('invalid NATIVITY code encountered')

def OC_decode(code):
    if np.isnan(code): return 'N/A'
    elif code == 0: return 'No Children'
    elif code == 1: return 'Has Children'
    else: raise ValueError('invalid OC code encountered')

def PINCP_recode_numerical(code):
    if np.isnan(code): return 0
    else: return code

def RAC1P_decode(code):
    if code == 1: return 'White'
    elif code == 2: return 'Black'
    elif code == 3: return 'American Indian'
    elif code == 4: return 'Alaska Native'
    elif code == 5: return 'American Indian'
    elif code == 6: return 'Asian'
    elif code == 7: return 'Pacific Islander'
    elif code == 8: return 'Other'
    elif code == 9: return 'Multiple Races'
    else: raise ValueError('invalid RAC1P code encountered')

def WAOB_decode(code):
    if code == 1: return 'US State'
    elif code == 2: return 'PR and US Island'
    elif code == 3: return 'Latin America'
    elif code == 4: return 'Asia'
    elif code == 5: return 'Europe'
    elif code == 6: return 'Africa'
    elif code == 7: return 'North America'
    elif code == 8: return 'Oceania and at Sea'
    else: raise ValueError('invalid WAOB code encountered')


In [8]:
preprocessing_funcs = {
    'DIVISION':DIVISION_decode,
    'ST':ST_decode,
    'AGEP':AGEP_recode_numerical,
    'CIT':CIT_decode,
    'COW':COW_decode,
    'DEAR':DEAR_decode,
    'DEYE':DEYE_decode,
    'FER':FER_decode,
    'JWMNP':JWMNP_recode_numerical,
    'MAR':MAR_decode,
    'SCHL':SCHL_recode,
    'SEX':SEX_decode,
    'WAGP':WAGP_recode_numerical,
    'WKHP':WKHP_recode_numerical,
    'WKW':WKW_decode,
    'DIS':DIS_decode,
    'INDP':INDP_recode,
    'NATIVITY':NATIVITY_decode,
    'OC':OC_decode,
    'PINCP':PINCP_recode_numerical,
    'RAC1P':RAC1P_decode,
    'WAOB':WAOB_decode
}

def df_from_funcs(source_df, functions):
    output_df = pd.DataFrame()
    for column,function in functions.items():
        output_df[column] = source_df[column].map(function)
    return output_df

In [9]:
us_personal_processed = df_from_funcs(us_personal, preprocessing_funcs)

In [10]:
us_personal_processed.to_pickle('us_personal_processed.zip')

In [11]:
us_personal_processed = pd.read_pickle('us_personal_processed.zip')

In [12]:
us_personal_processed

Unnamed: 0,DIVISION,ST,AGEP,CIT,COW,DEAR,DEYE,FER,JWMNP,MAR,...,WAGP,WKHP,WKW,DIS,INDP,NATIVITY,OC,PINCP,RAC1P,WAOB
0,East South Central,AL,19,US Born,Private For-Profit,No Hearing Difficulty,No Vision Difficulty,No Birth in Past 12 Months,0.0,Never Married/Under 15,...,0.0,0.0,No Work/Under 16 Years,No Disability,RET,Native,,-1500.0,White,US State
1,East South Central,AL,18,US Born,Private For-Profit,No Hearing Difficulty,No Vision Difficulty,No Birth in Past 12 Months,0.0,Never Married/Under 15,...,1600.0,21.0,Under 14 Weeks,No Disability,ENT,Native,,1600.0,Black,US State
2,East South Central,AL,53,US Born,Federal Government,No Hearing Difficulty,No Vision Difficulty,,0.0,Never Married/Under 15,...,10000.0,40.0,50 to 52 Weeks,Disability,ADM,Native,,10000.0,White,US State
3,East South Central,AL,28,US Born,,No Hearing Difficulty,No Vision Difficulty,,0.0,Never Married/Under 15,...,0.0,0.0,No Work/Under 16 Years,No Disability,,Native,,0.0,White,US State
4,East South Central,AL,25,US Born,Private For-Profit,No Hearing Difficulty,No Vision Difficulty,No Birth in Past 12 Months,0.0,Never Married/Under 15,...,0.0,0.0,No Work/Under 16 Years,Disability,MFG,Native,,0.0,White,US State
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1566022,Mountain,WY,9,US Born,,No Hearing Difficulty,No Vision Difficulty,,0.0,Never Married/Under 15,...,0.0,0.0,No Work/Under 16 Years,No Disability,,Native,No Children,0.0,White,US State
1566023,Mountain,WY,7,US Born,,No Hearing Difficulty,No Vision Difficulty,,0.0,Never Married/Under 15,...,0.0,0.0,No Work/Under 16 Years,No Disability,,Native,No Children,0.0,White,US State
1566024,Mountain,WY,49,US Born,Private For-Profit,No Hearing Difficulty,No Vision Difficulty,No Birth in Past 12 Months,5.0,Divorced,...,18500.0,40.0,50 to 52 Weeks,No Disability,MED,Native,No Children,18500.0,White,US State
1566025,Mountain,WY,19,US Born,Local Government,No Hearing Difficulty,No Vision Difficulty,No Birth in Past 12 Months,45.0,Never Married/Under 15,...,11500.0,40.0,40 to 47 Weeks,No Disability,AGR,Native,No Children,11500.0,White,US State
