In [None]:
# Convert 2015 SEMCOG Urbansim base year demographic data to ABM test inputs

##   ABM needs several additional variables "HHT", "pemploy", "pstudent", "ptype" and defines "worker" differently than REMI. Therefore, additional PUMS attributes, "HHT", "ESR", "WKHP", "WKW", "SCHG" need to be extracted from original PUMS records.
##   SEMCOG 2015 model base year demographic data doesn't contain the original PUMS record number ("SERIALNO") but has valid household_id and member_id values that can help to link back to the synthesized data. However, both model HHs and persons were post-processed to better match REMI base year controls. Therefore, they contain additional records not directly available in the synthesized results.
##   This notebook first uses the available IDs to link synthesized records. When IDs are not available, it looks up matching HH or person attributes plus geographic information such as PUMA or county to further identify/sample the original PUMS serialno. 
##   After all households and persons have a valid PUMS serialno, it attaches the needed PUMS variables and computes the ABM inputs using the conversion logic from RSG (for details see the bottom of this notebook).     


In [None]:
import pandas as pd
import numpy as np
import math
import time
from pandas_profiling import ProfileReport


## Extract HHT from PUMS and add to model HHs

In [None]:
# Load the Census 2010 tract-to-PUMA relationship file and keep only the rows
# with STATEFP == 26 (presumably the Michigan state FIPS code -- confirm).
tract_puma = 'https://www2.census.gov/geo/docs/maps-data/data/rel/2010_Census_Tract_to_2010_PUMA.txt'
all_states = pd.read_csv(tract_puma)
in_state = all_states['STATEFP'] == 26
df_tctpuma = all_states.loc[in_state]

In [None]:
# Open both HDF5 stores read-only; later cells read tables from them by key.
stm = pd.HDFStore('all_semcog_data_02-02-18.h5', mode='r')  # model base year, final version
sts = pd.HDFStore('starter6_20171019-1526.h5', mode='r')  # synthesized results

In [None]:
# Build the model household table: fill missing values with 0, then attach
# parcel_id (via building), census_bg_id and zone_id (via parcel), and finally
# county, tract and PUMA.
modhh = stm['households'].fillna(0)

bldg_parcel = stm['buildings'][['parcel_id']]
parcel_geo = stm['parcels'][['census_bg_id', 'zone_id']]
modhh = modhh.merge(bldg_parcel, how='left', left_on='building_id', right_index=True)
modhh = modhh.merge(parcel_geo, how='left', left_on='parcel_id', right_index=True)

# county is taken from the leading digits of household_id, tract from the
# block group id with its last digit dropped
modhh['county'] = modhh.index.values // 10000000
modhh['tract'] = modhh['census_bg_id'] // 10

puma_lookup = df_tctpuma[['COUNTYFP', 'TRACTCE', 'PUMA5CE']]
modhh = (
    modhh.reset_index()
    .merge(puma_lookup, how='left',
           left_on=['county', 'tract'], right_on=['COUNTYFP', 'TRACTCE'])
    .set_index('household_id')
)
print('model households', modhh.head(2))

In [None]:
# Build the synthesized household table: fill missing values with 0, cast
# tract to int, derive county from household_id, then attach PUMA.
synhh = sts['sim_households'].fillna(0)

synhh['tract'] = synhh['tract'].astype(int)
synhh['county'] = synhh.index.values // 10000000

puma_lookup = df_tctpuma[['COUNTYFP', 'TRACTCE', 'PUMA5CE']]
synhh = (
    synhh.reset_index()
    .merge(puma_lookup, how='left',
           left_on=['county', 'tract'], right_on=['COUNTYFP', 'TRACTCE'])
    .set_index('household_id')
)
print('synthpop household', synhh.head(2))

In [None]:
# Split model households by whether their household_id also exists in the
# synthesized households: "added" ones exist only in the model inputs (they
# were added in post-processing), "match" ones exist in both.
in_synthesis = modhh.index.isin(synhh.index)
modhh_added = modhh.loc[~in_synthesis]
modhh_match = modhh.loc[in_synthesis]
# the two numbers printed should be equal
print(len(modhh), len(modhh_match) + len(modhh_added))

In [None]:
# For added HHs, sample original HHs with matching attributes in the same
# geography (PUMA first, then county). The purpose is to add HHT info based on
# household attributes.

# Round income on both tables; the lookup below compares income with "==".
synhh['income'] = synhh['income'].round(0)
# modhh_added is a filtered selection of modhh, so assigning a column into it
# raises SettingWithCopyWarning; assign() returns a new, independent frame.
modhh_added = modhh_added.assign(income=modhh_added['income'].round(0))
def querylst(alst, r, dfh):
    """Return the rows of ``dfh`` whose ``alst`` columns equal the values in ``r``.

    Parameters
    ----------
    alst : list of str
        Column names to match on.
    r : mapping or pandas.Series
        Record supplying the target value for each name in ``alst``.
    dfh : pandas.DataFrame
        Table to filter.

    Returns
    -------
    pandas.DataFrame
        Result of ``dfh.query`` on the ``col==value`` clauses joined by ``and``.
    """
    clauses = []
    for col in alst:
        # values are embedded through str(), so they must be valid query literals
        clauses.append(col + "==" + str(r[col]))
    return dfh.query(' and '.join(clauses))

t0 = time.time()

# Seeded generator so the sampled PUMS serial numbers are reproducible.
rng = np.random.RandomState(42)

# Household attributes used for matching; a geography column is appended per attempt.
hh_attrs = ['income', 'race_id', 'age_of_head', 'cars']

indlst = []
for ind, r in modhh_added.iterrows():
    # Attribute lists to try, most specific first: same PUMA (only when the
    # household has one), same county, no geography, then the household
    # attributes dropped one at a time from the end.
    attempts = []
    if pd.notna(r.PUMA5CE):
        attempts.append(hh_attrs + ['PUMA5CE'])
    attempts.append(hh_attrs + ['county'])
    attempts.extend(hh_attrs[:k] for k in range(len(hh_attrs), 0, -1))

    for alst in attempts:
        dfq = querylst(alst, r, synhh)
        if len(dfq) > 0:
            break
    else:
        # No attempt matched: fail with the offending household instead of
        # letting sample() raise an opaque error on an empty frame.
        raise ValueError(
            'no synthesized household matches added household %s, even on %s alone'
            % (ind, hh_attrs[0])
        )

    indlst.append(dfq.sample(1, random_state=rng).serialno.values[0])

    if len(indlst) % 1000 == 0:
        print('r', len(indlst), round (time.time()-t0, 0), '|', end =" " )
print('r', len(indlst), round (time.time()-t0, 0), 'done' )

In [None]:
# Households present in both tables take serialno straight from the synthesis;
# added households take the serial numbers sampled into indlst.
match = synhh[synhh.index.isin(modhh.index)]
modhh.loc[match.index, 'serialno'] = match.serialno
modhh.loc[modhh_added.index, 'serialno'] = indlst
# verify: this should display no rows if every household now has a serialno
modhh[modhh.serialno.isna()]

In [None]:
# Read SERIALNO and HHT from the 2015 PUMS household file, join HHT onto the
# model households by serial number, and save the raw result.
pumshhs = pd.read_csv('2015_synthpop_inputs/ss15hmi.csv', usecols=['SERIALNO', 'HHT'])
modhh = (
    modhh.reset_index()
    .merge(pumshhs, how='left', left_on='serialno', right_on='SERIALNO')
    .set_index('household_id')
)
modhh.to_csv('abm_hhs_01282020_raw.csv', index_label='household_id')

In [None]:
# Preview the first rows of the household table.
modhh.head()

## Extract work and edu related info from PUMS and attach to model persons

In [None]:
# Add extra attributes from PUMS data to model persons.
# Step 1: carry each household's serialno and county onto its persons.
modpp = (
    stm['persons'].reset_index()
    .merge(modhh[['serialno', 'county']], how='left',
           left_on='household_id', right_index=True)
    .set_index('person_id')
)

# Step 2: join the PUMS person record on (serial number, person order).
ext_vars = ['SCHG', 'ESR', 'WKHP','WKW']
pumspps = pd.read_csv('2015_synthpop_inputs/ss15pmi.csv', usecols=['SERIALNO', 'SPORDER'] + ext_vars)

modpp = (
    modpp.reset_index()
    .merge(pumspps, how='left',
           left_on=['serialno', 'member_id'], right_on=['SERIALNO', 'SPORDER'])
    .set_index('person_id')
)


In [None]:
# A person has a non-null SERIALNO only if the PUMS join found a record.
has_pums = modpp.SERIALNO.notnull()
modpp_added = modpp.loc[~has_pums]  # no PUMS record after joining (new persons only in model inputs)
modpp_match = modpp.loc[has_pums]   # persons in original synthesis



In [None]:
# For added persons, sample original person records with matching attributes.
t0 = time.time()

# Seeded generator so the sampled donor records are reproducible.
rng = np.random.RandomState(42)

# Matching attributes; they are relaxed one at a time from the end of the list.
match_attrs = ['county', 'worker', 'sex', 'race_id', 'age']

dfsample = []
for ind, r in modpp_added.iterrows():
    qlst = list(match_attrs)
    dfq = None
    while qlst:
        dfq = querylst(qlst, r, modpp_match)
        if len(dfq) > 0:
            break
        # Drop exactly one attribute per step. The previous qlst[:-(i + 1)]
        # dropped 1, then 2, then 3, and ended up querying with an empty list.
        qlst = qlst[:-1]

    if dfq is None or len(dfq) == 0:
        # Every added person needs a donor record; otherwise the assignment
        # below would not line up with the added rows.
        raise ValueError(
            'no synthesized person matches added person %s, even on %s alone'
            % (ind, match_attrs[0])
        )

    dfsample.append(dfq.sample(1, random_state=rng))
    if len(dfsample) % 1000 == 0:
        print (len(dfsample), round(time.time()-t0, 0), '|', end =" ")

# Update added model persons with the extracted attributes. The guard skips
# the update when there are no added persons (pd.concat rejects an empty list).
if dfsample:
    dfs = pd.concat(dfsample, axis=0)  # combine all sampled records
    added_mask = modpp.SERIALNO.isnull()
    for v in ext_vars:
        # positional assignment: one sampled row per added person, in the same order
        modpp.loc[added_mask, v] = dfs[v].values

In [None]:
# Save the person table with the attached PUMS variables (raw, before recoding).
modpp.to_csv('abm_persons_01282020_raw.csv')

In [None]:
# Reload the raw household and person tables from the CSV files written by
# earlier cells, indexed by household_id and person_id respectively.
modhh = pd.read_csv('abm_hhs_01282020_raw.csv', index_col = "household_id")
modpp = pd.read_csv('abm_persons_01282020_raw.csv', index_col = "person_id")

In [None]:
# Update person and household worker counts with the new definition:
# a person is a worker if and only if ESR is in [1, 2, 4, 5].
# Recompute the flag for every person. Only setting it to 1 would leave the
# previous worker value on persons whose ESR is outside that set or missing.
modpp['worker'] = modpp.ESR.isin([1, 2, 4, 5]).astype(int)

# Household workers = sum of member flags. Reindex on the household index so
# households with no person rows get 0 instead of NaN.
modhh['workers'] = (
    modpp.groupby('household_id').worker.sum()
    .reindex(modhh.index, fill_value=0)
    .astype(int)
)

In [None]:
# Employment status (pemploy): 1 full time, 2 part time, 3 not employed, 4 under 16.
age_16_plus = modpp.age >= 16
under_16 = modpp.age < 16
esr_3_or_6 = modpp.ESR.isin([3, 6])
hours_35_plus = modpp.WKHP >= 35
wkw_1_to_4 = modpp.WKW.isin([1, 2, 3, 4])

modpp['pemploy'] = 2  # default: part time
modpp.loc[under_16, 'pemploy'] = 4  # under16
modpp.loc[age_16_plus & esr_3_or_6, 'pemploy'] = 3  # not employed
modpp.loc[age_16_plus & ~esr_3_or_6 & hours_35_plus & wkw_1_to_4, 'pemploy'] = 1  # full time

In [None]:
# Student status (pstudent): 1 high school or lower, 2 college or higher,
# 3 not attending school.
age_16_plus = modpp.age >= 16
under_16 = modpp.age < 16
full_time = modpp.pemploy == 1
not_full_time = modpp.pemploy != 1
no_grade = modpp.SCHG.isnull()
grade_15_plus = modpp.SCHG >= 15
grade_1_to_14 = modpp.SCHG.isin(range(1, 15))

# full-time workers
modpp.loc[age_16_plus & full_time, 'pstudent'] = 3  # not attending school
modpp.loc[under_16 & full_time, 'pstudent'] = 1  # high school or lower
# no grade level recorded
modpp.loc[no_grade & age_16_plus, 'pstudent'] = 3
modpp.loc[no_grade & under_16, 'pstudent'] = 1
# not full time, SCHG 15 or above
modpp.loc[not_full_time & grade_15_plus & age_16_plus, 'pstudent'] = 2
modpp.loc[not_full_time & grade_15_plus & under_16, 'pstudent'] = 1
# not full time, SCHG 1 through 14: split at age 19
modpp.loc[not_full_time & grade_1_to_14 & (modpp.age <= 19), 'pstudent'] = 1
modpp.loc[not_full_time & grade_1_to_14 & (modpp.age > 19), 'pstudent'] = 2  # college or higher


In [None]:
# Person type (ptype) from age, pemploy and pstudent. The conditions below do
# not overlap, so each person receives at most one code.
pemploy_3_4 = modpp.pemploy.isin([3, 4])
pemploy_2_3_4 = modpp.pemploy.isin([2, 3, 4])
pstudent_1 = modpp.pstudent == 1
pstudent_2 = modpp.pstudent == 2
pstudent_3 = modpp.pstudent == 3
under_6 = modpp.age < 6

# pemploy 1, or pemploy 2 with pstudent 3
modpp.loc[modpp.pemploy == 1, 'ptype'] = 1
modpp.loc[(modpp.pemploy == 2) & pstudent_3, 'ptype'] = 2

# pemploy 3/4 with pstudent 3, split by age
modpp.loc[(modpp.age >= 65) & pemploy_3_4 & pstudent_3, 'ptype'] = 5
modpp.loc[under_6 & pemploy_3_4 & pstudent_3, 'ptype'] = 8
modpp.loc[(modpp.age >= 6) & (modpp.age <= 64) & pemploy_3_4 & pstudent_3, 'ptype'] = 4

# pemploy 2/3/4 with pstudent 2
modpp.loc[pemploy_2_3_4 & pstudent_2, 'ptype'] = 3

# pemploy 2/3/4 with pstudent 1, split by age
modpp.loc[under_6 & pemploy_2_3_4 & pstudent_1, 'ptype'] = 8
modpp.loc[(modpp.age >= 16) & pemploy_2_3_4 & pstudent_1, 'ptype'] = 6
modpp.loc[(modpp.age >= 6) & (modpp.age < 16) & pemploy_2_3_4 & pstudent_1, 'ptype'] = 7



In [None]:
# Write the final ABM input files, keeping only the listed columns.
hh_out_cols = [
    'building_id', 'cars', 'workers', 'persons', 'race_id', 'income',
    'age_of_head', 'children', 'parcel_id', 'census_bg_id', 'zone_id',
    'county', 'tract', 'HHT',
]
modhh[hh_out_cols].to_csv('abm_hhs_01282020.csv')

pp_out_cols = [
    'relate', 'age', 'worker', 'sex', 'race_id', 'member_id',
    'household_id', 'SCHG',
    'WKHP', 'WKW', 'ESR', 'pemploy', 'pstudent', 'ptype',
]
modpp[pp_out_cols].to_csv('abm_persons_01282020.csv')

In [None]:
# Display the persons whose SERIALNO is null.
modpp.loc[modpp.SERIALNO.isnull()]

In [None]:
# RSG notes on ABM data inputs

# Household attributes:
# 1.	Hworkers: Yes, this is number of workers in the household based on each member’s Employment Status Recode (ESR). ESR is defined in PUMS as follows:
# Members with ESR as 1, 2, 4 and 5 are counted as workers
# 2.	HHT: Yes, this is the original PUMS field – Household/family type


# Person Type (ptype):
# 1.	The ptype code is defined using the following PUMS person-level variables:
# a.	ESR: Employment Status Recode (ESR)
# b.	WKHP: Usual hours worked per week past 12 months
# c.	WKW: Weeks worked during past 12 months
# d.	SCHG: Grade level attending
# e.	AGEP: Age

# 2.	The person type is derived from person’s age, employment status (pemploy) and student status (pstudent).
# 3.	The employment status is derived from ESR, WKHP, WKW and Age
# 4.	The student status is derived from SCHG, Age and employment status

# As long as we have ESR, WKHP, WKW, SCHG and AGEP in the person file, employment status, student status and person type can be derived.

# We have documented the person type coding process for ODOT. Please follow this link for a  detailed description of person type coding logic: https://github.com/RSGInc/SOABM/wiki/Person-Type-Coding-in-SOABM

# https://github.com/RSGInc/SOABM/wiki/Person-Type-Coding-in-SOABM
# PUMS variable definitions (year unknown; these differ from the 2015 definitions, see below)
# Employment status recode ESR
#     b .N/A (less than 16 years old)
#     1 .Civilian employed, at work
#     2 .Civilian employed, with a job but not at work
#     3 .Unemployed
#     4 .Armed forces, at work
#     5 .Armed forces, with a job but not at work
#     6 .Not in labor force

# WKHP
# Usual hours worked per week past 12 months
#     bb .N/A (less than 16 years old/did not work during the past 12 months)
#     01..98 .1 to 98 usual hours
#     99 .99 or more usual hours

# WKW
# Weeks worked during past 12 months
#     b .N/A (less than 16 years old/did not work during the past 12 months)
#     1 .50 to 52 weeks
#     2 .48 to 49 weeks
#     3 .40 to 47 weeks
#     4 .27 to 39 weeks
#     5 .14 to 26 weeks
#     6 .13 weeks or less

# ========================================================
# SCHG https://github.com/RSGInc/SOABM/wiki/Person-Type-Coding-in-SOABM
# Grade level attending
#     b .N/A (not attending school)
#     1 .Nursery school/preschool
#     2 .Kindergarten
#     3 .Grade 1 to grade 4
#     4 .Grade 5 to grade 8
#     5 .Grade 9 to grade 12
#     6 .College undergraduate
#     7 .Graduate or professional school
# AGEP
# Age
#     00 .Under 1 year
#     01..99 .1 to 99 years (Top-coded***)




# SCHG (2015 PUMS variable codes, SEMCOG model base year data)
# Grade level attending
#     bb .N/A (not attending school)
#     01 .Nursery school/preschool
#     02 .Kindergarten
#     03 .Grade 1
#     04 .Grade 2
#     05 .Grade 3
#     06 .Grade 4
#     07 .Grade 5
#     08 .Grade 6
#     09 .Grade 7
#     10 .Grade 8
#     11 .Grade 9
#     12 .Grade 10
#     13 .Grade 11
#     14 .Grade 12
#     15 .College undergraduate years (freshman to senior)
#     16 .Graduate or professional school beyond a bachelor's degree