### This script develops the 2040 household control totals from the REMI population forecast
It needs three input data tables:    
1. REMI populations  
2. Census HH populations  
3. Households table (the final version that will be used for the forecast)  

In [1]:
import os
import pandas as pd
from numpy import *
from time import time

In [2]:
# ---- input locations --------------------------------------------------------
inputdir='./input/'  # folder that holds every input table
census_hhpop_file=inputdir+'census_reg_hh_pop.txt'  # base-year Census household population
syn_hh_file=inputdir+'households.txt'  # synthesized households

start=time()  # timestamp taken when this cell is run
baseyear='2010'
years=list(map(str, range(2010,2041)))  # year labels '2010' .. '2040', kept as strings

# ---- bin edges handed to digitize() in the later steps ------------------------
# age in years -> Census age-group number (24 edges, i.e. 23 groups)
# NOTE(review): the usual Census breaks are 60-61, 62-64, 65-66, 67-69, while the
# edges 60,61,65,66 below give 60, 61-64, 65, 66-69 -- confirm against
# census_reg_hh_pop.txt
bin_census_agegrp=array([0,5,10,15,18,20,21,22,25,30,35,40,45,50,55,60,61,65,66,70,75,80,85,101])
# age of head in years -> age_of_head class
bin_ageofhead=array([0,5,18,35,65,999])
# Census age-group number -> age_of_head class (first age group of each class)
bin_agp2aoh=array([1,2,5,11,18,24])
# household size -> persons class (sizes of 10 and above share class 10)
bin_persons=array([1,2,3,4,5,6,7,8,9,10,99])

# ---- attribute lists --------------------------------------------------------
# every household attribute (large area included) carried by the control totals
control_attrs=['large_area_id','race_id','age_of_head','persons','children','cars','workers','income']
##control_attrs_values=[[3,5,93,99,115,125,147,161],[1,2,3,4],[1,2,3,4,5],[1,2,3,4,5,6,7,8,9,10],[0,1,2,3],[0,1,2,3],[0,1,2,3],[1,2,3,4]] #value ranges corresponding to lst_at8
##lst_4at=['large_area_id','gender','race_id','age_group']
# the three attributes shared by the REMI forecast and the synthesized households
lst_3at=['large_area_id','race_id','age_of_head']

# ---- category -> [min, max] lookup for the final control table ----------------
# Values such as the income boundaries or the largest household size should be
# reviewed whenever the inputs change.
# A max of -1 is used for the top category of every attribute (presumably
# meaning "no upper bound" -- confirm with the consumer of the control table).
dic_format={
    'age_of_head':{1:[0,4], 2:[5,17], 3:[18,34], 4:[35,64], 5:[65,-1]},
    'persons':{1:[1,1], 2:[2,2], 3:[3,3], 4:[4,4], 5:[5,5],
               6:[6,6], 7:[7,7], 8:[8,8], 9:[9,9], 10:[10,-1]},
    'children':{0:[0,0], 1:[1,1], 2:[2,2], 3:[3,-1]},
    'cars':{0:[0,0], 1:[1,1], 2:[2,2], 3:[3,-1]},
    'workers':{0:[0,0], 1:[1,1], 2:[2,2], 3:[3,-1]},
    'income':{0:[0,26985], 1:[26986,53525], 2:[53526,91337], 3:[91338,-1]},
}

In [3]:

def extend_ratios(arydata, fld_num, fld_denom, fldlst):
    """Scale the fields listed in ``fldlst`` by ``arydata[fld_num] / arydata[fld_denom]``.

    The element-wise quotient is computed once, before any field is changed.
    Entries where the quotient is infinite or NaN (for example a zero
    denominator) are replaced by 0. Every field named in ``fldlst`` is then
    overwritten in ``arydata`` with quotient * field.

    Returns the same ``arydata`` object that was passed in (it is modified
    in place).
    """
    scale=arydata[fld_num]/arydata[fld_denom]
    # undefined quotients contribute nothing instead of spreading inf/nan
    scale[~isfinite(scale)]=0
    for name in fldlst:
        arydata[name]=multiply(scale,arydata[name])
    return arydata

# def quartile_adj(ary_data):
#     count=0
#     vdiff,hdiff,tdiff=100,100,1000 
#     ary_qrtl=ary_data.reshape(-1,4)
#     #quart=array([round(ary_data.sum()/4,0)]*4)
#     quart=array([ary_data.sum()/4.0]*4)
#     hsum0=(ary_qrtl.sum(axis=1))[:,newaxis].astype(float)
#     while not(abs(vdiff)<50 and abs(hdiff)<50) and count<=100:
#         vsum=ary_qrtl.sum(axis=0)
#         vratio=(quart/vsum).astype(float)
#         ary_qrtl=around(ary_qrtl*vratio)
#         vdiff=abs((ary_qrtl.sum(axis=0)-quart).sum())
#         hsum=(ary_qrtl.sum(axis=1))[:,newaxis]
#         hratio=(hsum0/hsum).astype(float)
#         hratio[isinf(hratio)]=0
#         hratio[isnan(hratio)]=0
#         ary_qrtl=around(ary_qrtl*hratio)
#         hdiff=abs((ary_qrtl.sum(axis=1)[:,newaxis]-hsum0).sum())
#        # print vdiff, hdiff
#         if (vdiff+hdiff)<tdiff:
#             ary_min=ary_qrtl
#             tdiff=vdiff+hdiff
#         count=count+1
#    # print vdiff, hdiff
#     return ary_min.reshape(1,-1)   


def quartile_adj(ary_data):
    """Rescale a table of counts so that its four columns carry equal totals.

    ``ary_data`` is viewed as rows of four values. Each pass
      1. scales every column towards one quarter of the grand total and rounds,
      2. scales every row back towards its original row total and rounds.
    Passes stop once both the column and the row mismatch are below 50, or
    after 101 passes.

    Returns the table from the pass with the smallest combined mismatch,
    flattened to shape ``(1, ary_data.size)``. If no pass produces a comparable
    (non-NaN) mismatch, the input values are returned unchanged.
    """
    count=0
    vdiff,hdiff=100,100   # start above the tolerance so the loop runs at least once
    tdiff=float('inf')    # smallest combined mismatch seen so far
    ary_qrtl=ary_data.reshape(-1,4)
    #quart=array([round(ary_data.sum()/4,0)]*4)
    quart=array([ary_data.sum()/4.0]*4)
    hsum0=(ary_qrtl.sum(axis=1))[:,newaxis].astype(float)
    # fallback result: there is always something to return, even when the
    # mismatch never becomes comparable
    ary_min=ary_qrtl
    while not(abs(vdiff)<50 and abs(hdiff)<50) and count<=100:
        # column step: pull every column total towards a quarter of the grand total
        vsum=ary_qrtl.sum(axis=0)
        vratio=(quart/vsum).astype(float)
        # an all-zero column gives x/0; keep it at zero instead of injecting inf/nan
        vratio[~isfinite(vratio)]=0
        ary_qrtl=around(ary_qrtl*vratio)
        # NOTE(review): the signed column deviations are summed before abs(), so
        # deviations of opposite sign cancel -- confirm this is the intended measure
        vdiff=abs((ary_qrtl.sum(axis=0)-quart).sum())
        # row step: restore every row to its original total
        hsum=(ary_qrtl.sum(axis=1))[:,newaxis]
        hratio=(hsum0/hsum).astype(float)
        hratio[~isfinite(hratio)]=0
        ary_qrtl=around(ary_qrtl*hratio)
        hdiff=abs((ary_qrtl.sum(axis=1)[:,newaxis]-hsum0).sum())
        # remember the best pass so far
        if (vdiff+hdiff)<tdiff:
            ary_min=ary_qrtl
            tdiff=vdiff+hdiff
        count=count+1
    return ary_min.reshape(1,-1)


## Step 1. Compute REMI household population
#### using the ratios between base year REMI pop and Census HH pop to produce REMI HH pop for all forecast years. 

In [4]:
# 1.1 Read the 64 REMI text files (8 large areas x 2 genders x 4 races), tag each
#     table with its large_area_id / gender / race_id codes and stack them into
#     remi_pop_input. Recoding age and aggregating happen in the next cells.

dic_la={"detroit":5,"wayne balance":3, "macomb":99,"livingston":93, "monroe":115,"oakland":125,"st clair":147, "washtenaw":161}
dic_race={"white":1,"black":2, "hispanic":3,"other":4}
dic_gend={"males":1, "females":2}

remi_frames=[]  # one table per input file, concatenated once after the loops
for area in dic_la.keys():
    for gender in dic_gend.keys():
        for race in dic_race.keys():
            filname=inputdir+"pop "+area+" "+race+" "+gender+".xls.txt"
            try:
                dfp=pd.read_csv(filname,sep='\t',header=0, index_col=0)
            except IOError:
                # Only a missing/unreadable file is reported here, with its name;
                # any other failure (e.g. a parse error) keeps its own traceback.
                print("file not exist ! "+filname)
                raise
            # scale the values by 1000 (presumably REMI reports thousands -- confirm)
            dfp=dfp.multiply(1000).reset_index()
            dfp['large_area_id']=dic_la[area]
            dfp['gender']=dic_gend[gender]
            dfp['race_id']=dic_race[race]
            remi_frames.append(dfp)
remi_pop_input=pd.concat(remi_frames)
print(remi_pop_input.head(2))
#remi_pop_input.to_csv("test1.csv")            

         2010        2011        2012        2013        2014        2015  \
0  136.331946  137.431845  138.527334  140.117079  142.127216  144.819796   
1  135.149688  136.178851  137.226284  138.416901  139.982447  141.935021   

         2016        2017        2018        2019        2020        2021  \
0  147.270903  149.809822  153.085068  156.254351  159.236953  163.173243   
1  144.598246  147.080451  149.632975  152.926818  156.125575  159.129158   

         2022        2023        2024        2025        2026        2027  \
0  167.426929  171.965227  176.654860  181.339592  186.929032  192.454487   
1  163.092375  167.371884  171.934456  176.631600  181.332693  186.946601   

         2028        2029      
0  197.364420  202.357665 ...  
1  192.470312  197.394192 ...  

[2 rows x 35 columns]


In [5]:
# 1.2 recode age to age group: digitize() replaces each value of the 'age_group'
#     column with the 1-based number of the bin_census_agegrp interval it falls in
#     (assumes the column holds age in years as read from the REMI files -- TODO confirm)
remi_pop_input['age_group'] = digitize(remi_pop_input['age_group'], bin_census_agegrp)

In [6]:
# 1.3 Aggregate total population: sum all remaining columns within each
#     (large area, gender, race, age group) combination
remi_pop_sum=remi_pop_input.groupby(['large_area_id', 'gender','race_id','age_group']).sum()

In [7]:
#1.4 read Census household population data (already aggregated by large area, gender, race, age group)
#'census_reg_hh_pop.txt' field names should be 'large_area','gender','race','age_group','hh_pop'
# the first four columns become the row index ...
census_hhpop_sum=pd.read_csv(census_hhpop_file,sep='\t',header=0,index_col=[0,1,2,3] )
# ... and are renamed to the level names used by remi_pop_sum so that both tables can be joined on their index
census_hhpop_sum.index.names=['large_area_id','gender','race_id','age_group']


In [8]:
#census_hhpop_sum.rename(columns={'race': 'race_id'}, inplace=True)

In [9]:
census_hhpop_sum.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,hh_pop
large_area_id,gender,race_id,age_group,Unnamed: 4_level_1
93,1,1,1,5588
93,1,1,2,6357
93,1,1,3,6565
93,1,1,4,3643
93,1,1,5,1793


In [10]:
# Ratio between the base-year Census HH pop and the base-year REMI total pop for
# every (large area, gender, race, age group); applying it to each year column
# turns the REMI total pop into REMI HH pop.
remi_census_join=pd.merge(remi_pop_sum,census_hhpop_sum, left_index=True,right_index=True,how='outer')
# kept in a plain variable: assigning it as an attribute of the DataFrame neither
# creates a column nor is a supported way to attach data to the frame
base_ratio = remi_census_join['hh_pop']/remi_census_join[baseyear]
# every column is scaled row by row, including 'hh_pop' itself (that column is
# not meaningful afterwards)
remi_hhpop = remi_census_join.multiply(base_ratio, axis='index')
#print remi_hhpop.head(3)

In [11]:
remi_pop_sum.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025,2026,2027,2028,2029,Unnamed: 24_level_0
large_area_id,gender,race_id,age_group,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
3,1,1,1,15172.702074,14496.556282,13867.927312,13228.573561,12750.403404,12279.563666,11885.001182,11516.827345,11181.05507,10893.440247,10650.288345,10448.298454,10286.997795,10162.072659,10063.264012,9985.727072,9925.562025,9876.107335,9829.379917,9783.464907,
3,1,1,2,17482.806683,16775.567292,15906.956433,15165.783881,14332.956314,13591.409922,12923.287153,12353.365421,11815.196991,11435.646058,11065.150261,10770.888328,10505.10502,10271.898269,10075.012208,9913.689852,9789.521933,9699.961424,9638.794065,9598.407626,
3,1,1,3,17097.141742,17066.866875,17079.55432,16929.739953,16628.143311,16315.936565,15608.680248,14786.119938,14129.477263,13379.099847,12719.324112,12132.883072,11646.413564,11193.272353,10888.272523,10584.797382,10353.973985,10146.772266,9966.24136,9818.004846,


In [12]:
remi_hhpop.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025,2026,2027,2028,2029,Unnamed: 24_level_0
large_area_id,gender,race_id,age_group,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
3,1,1,1,29678,28355.450152,27125.843819,25875.259675,24939.952711,24018.98414,23247.214857,22527.062107,21870.287227,21307.708942,20832.1007,20437.005881,20121.499722,19877.144552,19683.873571,19532.210321,19414.526717,19317.792708,19226.393279,19136.58293,
3,1,1,2,32602,31283.137472,29663.348857,28281.207649,26728.147844,25345.309498,24099.391785,23036.599715,22033.021315,21325.233387,20634.331509,20085.590811,19589.957155,19155.072377,18787.918551,18487.084049,18255.535272,18088.522517,17974.457409,17899.144634,
3,1,1,3,32123,32066.118004,32089.955836,31808.476804,31241.821331,30655.231043,29326.401054,27780.93192,26547.197477,25137.349323,23897.728323,22795.892366,21881.888129,21030.50283,20457.453271,19887.268377,19453.585362,19064.283985,18725.09312,18446.57864,


In [13]:
remi_hhpop.columns

Index([u'2010', u'2011', u'2012', u'2013', u'2014', u'2015', u'2016', u'2017', u'2018', u'2019', u'2020', u'2021', u'2022', u'2023', u'2024', u'2025', u'2026', u'2027', u'2028', u'2029', u'2030', u'2031', u'2032', u'2033', u'2034', u'2035', u'2036', u'2037', u'2038', u'2039', u'2040', u'hh_pop'], dtype='object')

## Step 2. Convert REMI HH population to REMI households

In [14]:
# recode age_group to age_of_head: bin_agp2aoh lists the age-group numbers at
# which each age_of_head class starts
remi_hhpop=remi_hhpop.reset_index()
remi_hhpop.rename(columns={'age_group': 'age_of_head'}, inplace=True)
remi_hhpop['age_of_head'] = digitize(remi_hhpop['age_of_head'], bin_agp2aoh)
# aggregate REMI HH pop by 3 attributes: large area, race and age_of_head.
# 'gender' is not a grouping key, so its codes are summed too and remain as a
# meaningless 'gender' column in the result.
remi_hhpop_sum3=remi_hhpop.groupby(['large_area_id','race_id','age_of_head']).sum()
# print() with a single argument gives the same output on Python 2 and Python 3
print(remi_hhpop_sum3.columns)

Index([u'gender', u'2010', u'2011', u'2012', u'2013', u'2014', u'2015', u'2016', u'2017', u'2018', u'2019', u'2020', u'2021', u'2022', u'2023', u'2024', u'2025', u'2026', u'2027', u'2028', u'2029', u'2030', u'2031', u'2032', u'2033', u'2034', u'2035', u'2036', u'2037', u'2038', u'2039', u'2040', u'hh_pop'], dtype='object')


In [15]:
###Compute synthesized HHs by 3 attributes 'large_area_id','race_id','age_of_head'
#read synthesized HHs
syn_hhs=pd.read_csv(syn_hh_file,sep=',',header=0)

# recode 'age_of_head' (age in years -> class number, edges in bin_ageofhead)
syn_hhs['age_of_head'] = digitize(syn_hhs['age_of_head'], bin_ageofhead)

# recode 'persons' (household size -> class number, edges in bin_persons)
syn_hhs['persons'] = digitize(syn_hhs['persons'], bin_persons)

# recode 'income' into quartile numbers 0..3; labels=False makes qcut return the
# integer bin numbers directly instead of going through the categorical's codes
syn_hhs['income']=pd.qcut(syn_hhs['income'], 4, labels=False)
print(syn_hhs[syn_hhs['income']==0].shape) #check income==0 distribution

# recode 'workers','cars','children': cap at 3.
# .loc assigns into syn_hhs itself; chained indexing (df[col][mask]=...) may write
# to a temporary copy instead
for attr in ['workers','cars','children']:
    syn_hhs.loc[syn_hhs[attr] >3, attr] = 3

#aggregate synthesized HHs by 3 attributes 'large_area_id','race_id','age_of_head'
sum3=syn_hhs.groupby(['large_area_id','race_id','age_of_head']).size()
syn_hhs_sum3=pd.DataFrame(sum3,columns=['HHs_lra'])

(461768, 11)


In [17]:
syn_hhs.head(2)

Unnamed: 0,household_id,building_id,zone_id,large_area_id,cars,workers,income,persons,race_id,age_of_head,children
0,1000001,1484185,551,3,2,1,1,2,1,4,0
1,1000002,1872605,557,3,1,0,0,1,1,5,0


In [19]:
syn_hhs[(syn_hhs['workers']==0) & (syn_hhs['persons']== 8) & (syn_hhs['large_area_id']==3) & (syn_hhs['cars']==0) & (syn_hhs['race_id']==1)]

Unnamed: 0,household_id,building_id,zone_id,large_area_id,cars,workers,income,persons,race_id,age_of_head,children
265905,1268805,1565836,420,3,0,0,0,8,1,4,3
281307,1284936,1566219,422,3,0,0,0,8,1,4,3
367053,1376460,1563463,423,3,0,0,0,8,1,4,3
378804,1389797,1839927,688,3,0,0,0,8,1,4,3


In [20]:
# Sanity check: both tables are indexed by (large area, race, age_of_head), so a
# different row count means one of them has combinations the other lacks.
# Only the number of rows is compared, not the combinations themselves.
if syn_hhs_sum3.shape[0]!=remi_hhpop_sum3.shape[0]:
    print("  * Warning, Syn HHs and REMI HH pop have different sub-categories %s %s" % (syn_hhs_sum3.shape[0],remi_hhpop_sum3.shape[0]))




In [21]:

 # join synthesized HH and census HH pop and calculate forecast HHs
# join the synthesized HH counts with the REMI HH pop on (large area, race, age_of_head)
hh_hhpop_join=pd.merge(syn_hhs_sum3, remi_hhpop_sum3,left_index=True,right_index=True,how='outer')
#print hh_hhpop_join.columns

#calculate ratio between baseline HHs and HH pop
# 'gender' and 'hh_pop' are leftovers of the earlier aggregation and are not year columns
hh_hhpop_join.drop(['gender','hh_pop'],axis=1,inplace=True)
# combinations present in only one of the two tables count as 0
hh_hhpop_join.fillna(0, inplace=True)
# kept in a plain variable instead of being attached to the DataFrame as an attribute
hh_ratio = hh_hhpop_join['HHs_lra']/hh_hhpop_join[baseyear]

#apply ratio to forecast years to get future households by lra
# (every column is scaled, so the base-year column becomes the HH count and
#  'HHs_lra' itself is no longer meaningful)
remi_hhs = hh_hhpop_join.multiply(hh_ratio, axis='index')


In [22]:
remi_hhs.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,HHs_lra,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025,2026,2027,2028,Unnamed: 23_level_0
large_area_id,race_id,age_of_head,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
3,1,1,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
3,1,2,0.018538,55,53.902763,52.566936,51.117782,49.572556,48.138645,46.654393,45.089937,43.557923,41.735523,40.01546,38.545317,37.107127,35.781264,34.669457,33.744004,32.92611,32.324762,31.781138,
3,1,3,15030.172511,54558,53360.17726,52258.905728,50838.07474,49010.354306,47055.083894,45465.419861,44386.902691,42981.694326,42833.407524,42376.763429,41931.541354,41523.207141,41220.243337,40969.583586,40363.452801,39824.65213,39304.520163,38841.932844,


## Step 3. Extend REMI HHs to all household attributes

In [23]:
 #aggregate synthesized HHs by all HH attributes
# number of synthesized households in every combination of the control_attrs values
sum8=syn_hhs.groupby(control_attrs).size()
syn_hhs_sum8=pd.DataFrame(sum8,columns=['HHs8'])

#reindex synthetic HHs by large, race and age
# (the other attributes become ordinary columns, so the table can be joined with
#  tables indexed by lst_3at)
syn_hhs_sum8_nonind=syn_hhs_sum8.reset_index()
syn_hhs_sum8_index3=syn_hhs_sum8_nonind.set_index(lst_3at)

In [24]:
#reset index for merge: flat copy of syn_hhs_sum3 with the three keys as columns
# NOTE(review): the merge in the following cell uses syn_hhs_sum3 (still indexed),
# not this copy -- confirm whether syn_hhs_sum3_nonind is still needed
syn_hhs_sum3_nonind=syn_hhs_sum3.reset_index()
syn_hhs_sum3_nonind.head(3)

Unnamed: 0,large_area_id,race_id,age_of_head,HHs_lra
0,3,1,2,55
1,3,1,3,54558
2,3,1,4,220398


In [25]:
#merge synthetic HHs aggregated by all attributes to synthetic HHs aggregated by lra
# (joined on the shared index: large area, race, age_of_head)
syn_hhs_83join=pd.merge(syn_hhs_sum8_index3,syn_hhs_sum3, left_index=True, right_index=True, how='outer')
# share of each full attribute combination within its (large area, race, age_of_head) group
syn_hhs_83join['ratio'] = syn_hhs_83join['HHs8']/syn_hhs_83join['HHs_lra']
syn_hhs_83join.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,persons,children,cars,workers,income,HHs8,HHs_lra,ratio
large_area_id,race_id,age_of_head,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
3,1,2,3,2,1,3,0,18,55,0.327273
3,1,2,3,2,2,1,0,32,55,0.581818
3,1,2,4,2,1,3,0,1,55,0.018182


In [26]:
#merge syn hhs with ratio to REMI hhs
# both tables are indexed by (large area, race, age_of_head); the outer join keeps
# groups that exist in only one of them (their missing side is NaN)
remi_hhs_83ratio=pd.merge(syn_hhs_83join,remi_hhs, left_index=True, right_index=True, how='outer')
# switch the index from the three shared attributes to the full control_attrs list
remi_hhs_83ratio.reset_index(inplace=True)
remi_hhs_83ratio.set_index(control_attrs,inplace=True)
remi_hhs_83ratio.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,HHs8,HHs_lra_x,ratio,HHs_lra_y,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025,Unnamed: 28_level_0
large_area_id,race_id,age_of_head,persons,children,cars,workers,income,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
3,1,1,,,,,,,,,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
3,1,2,3.0,2.0,1.0,3.0,0.0,18.0,55.0,0.327273,0.018538,55,53.902763,52.566936,51.117782,49.572556,48.138645,46.654393,45.089937,43.557923,41.735523,40.01546,38.545317,37.107127,35.781264,34.669457,33.744004,
3,1,2,3.0,2.0,2.0,1.0,0.0,32.0,55.0,0.581818,0.018538,55,53.902763,52.566936,51.117782,49.572556,48.138645,46.654393,45.089937,43.557923,41.735523,40.01546,38.545317,37.107127,35.781264,34.669457,33.744004,


In [27]:

#remi_hhs_83ratio.ratio = remi_hhs_83ratio['HHs8']/remi_hhs_83ratio['HHs3']
# distribute the households of each (large area, race, age_of_head) group over the
# full attribute combinations: every column is multiplied row by row by 'ratio'
remi_hhs_83ratio=remi_hhs_83ratio.multiply(remi_hhs_83ratio['ratio'], axis='index')
remi_hhs_83ratio.fillna(0, inplace=True)
#remi_hhs_83ratio.head()

#drop unwanted columns: keep only the year columns.
# The list is built first and dropped in one call, so the frame is not modified
# while its columns are being looped over.
unwanted=[col for col in remi_hhs_83ratio.columns if col not in years]
remi_hhs_83ratio.drop(unwanted,axis=1,inplace=True)

In [28]:
# households summed over all year columns, per attribute combination
# NOTE(review): T is not referenced in any later cell shown here -- confirm it is still needed
T=pd.DataFrame(remi_hhs_83ratio.sum(axis=1))

In [29]:
remi_hhs_83ratio.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025,2026,2027,2028,2029,Unnamed: 28_level_0
large_area_id,race_id,age_of_head,persons,children,cars,workers,income,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
3,1,1,,,,,,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
3,1,2,3.0,2.0,1.0,3.0,0.0,18,17.640904,17.203724,16.729456,16.223746,15.754466,15.268711,14.756707,14.25532,13.658898,13.095969,12.614831,12.144151,11.710232,11.346368,11.043492,10.775818,10.579013,10.4011,10.269445,
3,1,2,3.0,2.0,2.0,1.0,0.0,32,31.361607,30.584399,29.741255,28.842215,28.007939,27.144374,26.234145,25.342791,24.282486,23.281722,22.426366,21.589601,20.81819,20.171321,19.632875,19.157009,18.807134,18.490844,18.256791,


## Step 4. Adjust income to 4 quartiles

In [30]:
#pd.set_option('display.max_columns', 500)
#pd.set_option('display.width', 1000)

In [31]:
#filter out rows with 0s from 2010 to 2040, this could reduce array size significantly
# 'total' is the row sum over all columns; it is added as an extra column
remi_hhs_83ratio['total']=remi_hhs_83ratio.sum(axis=1)
# keep only the attribute combinations that have households in at least one year
hhs_filtered=remi_hhs_83ratio[remi_hhs_83ratio['total']>0]

In [32]:
#hhs_filtered.to_csv('test2.csv')

In [33]:
hhs_inc_adj=hhs_filtered.reset_index()
hhs_inc_adj.head(2)

Unnamed: 0,large_area_id,race_id,age_of_head,persons,children,cars,workers,income,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,Unnamed: 21
0,3,1,2,3,2,1,3,0,18,17.640904,17.203724,16.729456,16.223746,15.754466,15.268711,14.756707,14.25532,13.658898,13.095969,12.614831,...
1,3,1,2,3,2,2,1,0,32,31.361607,30.584399,29.741255,28.842215,28.007939,27.144374,26.234145,25.342791,24.282486,23.281722,22.426366,...


In [34]:
#hhs_inc_adj.reset_index().groupby('income').sum().to_csv('income_quartile_changes.csv') #number of HHs in inc quartile changes

In [35]:
# For every forecast year, rescale the households so that each of the four income
# classes holds one quarter of that year's total. The base year is left as is.
hhs_inc_adj=hhs_filtered.reset_index()
inc_years=years[:]
inc_years.remove(baseyear)

for year in inc_years:
    const_quarter=hhs_inc_adj[year].sum()/4   # target size of every income class
    diff=100
    count=0
    # Scaling each class to a quarter of the total normally meets the target in a
    # single pass; the loop (at most 101 passes) is kept as a guard.
    while not(diff<50) and count<=100:
        # adjustment rate per income class = target / current class total
        adj_rates=(hhs_inc_adj[year].sum()/4)/hhs_inc_adj.groupby('income')[year].sum()
        # look the rate up by each row's income class; this replaces a
        # merge + drop of a helper column on the whole table in every pass
        hhs_inc_adj[year]=hhs_inc_adj[year]*hhs_inc_adj['income'].map(adj_rates)
        result=hhs_inc_adj.groupby('income')[year].sum()
        diff=abs(result-const_quarter).sum()
        count+=1

In [36]:
#hhs_inc_adj.to_csv('test3.csv')

In [37]:
# put the household attributes back into the index so that only value columns remain
hhs_inc_adj.set_index(control_attrs,inplace=True)
hhs_inc_adj.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025,2026,2027,2028,2029,Unnamed: 28_level_0
large_area_id,race_id,age_of_head,persons,children,cars,workers,income,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
3,1,2,3,2,1,3,0,18,17.620205,17.066283,16.521335,15.932075,15.356432,14.770702,14.162603,13.5818,12.901797,12.273186,11.738099,11.21726,10.747127,10.350109,10.010323,9.714686,9.494001,9.293657,9.141329,
3,1,2,3,2,2,1,0,32,31.32481,30.340059,29.371262,28.323688,27.300323,26.259026,25.177961,24.145422,22.936528,21.818997,20.867731,19.941796,19.106004,18.400194,17.79613,17.270553,16.878223,16.522056,16.251251,
3,1,2,4,2,1,3,0,1,0.9789,0.948127,0.917852,0.885115,0.853135,0.820595,0.786811,0.754544,0.716767,0.681844,0.652117,0.623181,0.597063,0.575006,0.556129,0.539705,0.527444,0.516314,0.507852,
3,1,2,4,2,2,2,0,4,3.915601,3.792507,3.671408,3.540461,3.41254,3.282378,3.147245,3.018178,2.867066,2.727375,2.608466,2.492725,2.38825,2.300024,2.224516,2.158819,2.109778,2.065257,2.031406,
3,1,3,1,0,0,0,0,416,406.389312,395.286221,382.813544,366.981298,349.726124,335.362783,324.820446,312.247364,308.497784,302.818471,297.503464,292.445829,288.451191,284.960399,278.974732,273.757104,268.956157,264.632445,260.06257,


In [38]:
hhs_inc_adj.columns

Index([u'2010', u'2011', u'2012', u'2013', u'2014', u'2015', u'2016', u'2017', u'2018', u'2019', u'2020', u'2021', u'2022', u'2023', u'2024', u'2025', u'2026', u'2027', u'2028', u'2029', u'2030', u'2031', u'2032', u'2033', u'2034', u'2035', u'2036', u'2037', u'2038', u'2039', u'2040', u'total'], dtype='object')

In [39]:
#hhs_sel.to_csv("hhs_sel.csv")

In [40]:
# 'total' is a helper column, not a year; remove it before the table is stacked by year
hhs_inc_adj.drop(['total'],axis=1,inplace=True)

## Step 5. format control total table

In [41]:
#stack household control table and add year to index
# stack() moves the column labels into a new innermost index level, giving one
# row per (attribute combination, column label)
dfhh=pd.DataFrame(hhs_inc_adj.stack(),columns=['total_number_of_households'])
# the new level has no name yet; call it 'year'
indn=dfhh.index.names[:-1]+['year']
dfhh.index.names=indn
dfhh.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,total_number_of_households
large_area_id,race_id,age_of_head,persons,children,cars,workers,income,year,Unnamed: 9_level_1
3,1,2,3,2,1,3,0,2010,18.0
3,1,2,3,2,1,3,0,2011,17.620205
3,1,2,3,2,1,3,0,2012,17.066283
3,1,2,3,2,1,3,0,2013,16.521335
3,1,2,3,2,1,3,0,2014,15.932075


In [42]:
# flat copy of the stacked table: all index levels (attributes and year) become columns
dfcc=dfhh.reset_index()
dfcc.head()

Unnamed: 0,large_area_id,race_id,age_of_head,persons,children,cars,workers,income,year,total_number_of_households
0,3,1,2,3,2,1,3,0,2010,18.0
1,3,1,2,3,2,1,3,0,2011,17.620205
2,3,1,2,3,2,1,3,0,2012,17.066283
3,3,1,2,3,2,1,3,0,2013,16.521335
4,3,1,2,3,2,1,3,0,2014,15.932075


In [43]:
# household counts are rounded to whole numbers (0 decimals)
hh_counts=dfcc['total_number_of_households']
dfcc['total_number_of_households']=hh_counts.round(0)
dfcc.head()

Unnamed: 0,large_area_id,race_id,age_of_head,persons,children,cars,workers,income,year,total_number_of_households
0,3,1,2,3,2,1,3,0,2010,18
1,3,1,2,3,2,1,3,0,2011,18
2,3,1,2,3,2,1,3,0,2012,17
3,3,1,2,3,2,1,3,0,2013,17
4,3,1,2,3,2,1,3,0,2014,16


In [44]:
unique(dfcc['income'])

array([ 0.,  1.,  2.,  3.])

In [45]:
# check: households summed over all years, per category of every formatted attribute
for col in dic_format.keys():
    # print() with a single argument gives the same output on Python 2 and Python 3
    print(dfcc.groupby([col])['total_number_of_households'].sum())

cars
0        4778731
1       21271922
2       22938855
3       10318027
Name: total_number_of_households, dtype: float64
workers
0          16849288
1          23921112
2          14665396
3           3871739
Name: total_number_of_households, dtype: float64
persons
1          17710792
2          19415095
3           8981526
4           7420367
5           3493966
6           1363748
7            524647
8            220306
9             88465
10            88623
Name: total_number_of_households, dtype: float64
income
0         14828022
1         14826465
2         14826835
3         14826213
Name: total_number_of_households, dtype: float64
age_of_head
2                 10179
3              10167186
4              30911976
5              18218194
Name: total_number_of_households, dtype: float64
children
0           41599584
1            7655614
2            6315261
3            3737076
Name: total_number_of_households, dtype: float64


In [46]:
# For every attribute in dic_format add '<attr>_min' and '<attr>_max' columns
# holding the [min, max] pair of the row's category. Categories that are not
# listed in dic_format keep 0 in both columns.
for hhattr in dic_format.keys():
    colmin,colmax=hhattr+'_min',hhattr+'_max'
    dfcc[colmin]=0
    dfcc[colmax]=0
    for value in dic_format[hhattr].keys():
        rows=dfcc[hhattr] == value
        # .loc writes into dfcc itself; chained indexing (dfcc[col][mask]=...) may
        # write to a temporary copy and leave dfcc unchanged
        dfcc.loc[rows,colmin]=dic_format[hhattr][value][0]
        dfcc.loc[rows,colmax]=dic_format[hhattr][value][1]
        
        
        

In [47]:
# same check as before the min/max columns were added: the totals per category
# must be unchanged
for col in dic_format.keys():
    # print() with a single argument gives the same output on Python 2 and Python 3
    print(dfcc.groupby([col])['total_number_of_households'].sum())

cars
0        4778731
1       21271922
2       22938855
3       10318027
Name: total_number_of_households, dtype: float64
workers
0          16849288
1          23921112
2          14665396
3           3871739
Name: total_number_of_households, dtype: float64
persons
1          17710792
2          19415095
3           8981526
4           7420367
5           3493966
6           1363748
7            524647
8            220306
9             88465
10            88623
Name: total_number_of_households, dtype: float64
income
0         14828022
1         14826465
2         14826835
3         14826213
Name: total_number_of_households, dtype: float64
age_of_head
2                 10179
3              10167186
4              30911976
5              18218194
Name: total_number_of_households, dtype: float64
children
0           41599584
1            7655614
2            6315261
3            3737076
Name: total_number_of_households, dtype: float64


In [48]:
dfcc.groupby(['persons_min','persons_max'])['total_number_of_households'].sum()

persons_min  persons_max
1             1             17710792
2             2             19415095
3             3              8981526
4             4              7420367
5             5              3493966
6             6              1363748
7             7               524647
8             8               220306
9             9                88465
10           -1                88623
Name: total_number_of_households, dtype: float64

In [None]:
dfcc.head()

In [50]:
#dfcc[(dfcc['workers']==0) & (dfcc['persons']== 1) & (dfcc['large_area_id']==3) & (dfcc['cars']==0) & (dfcc['race_id']==4)].to_csv('test1.csv')

In [59]:
syn_hhs[(syn_hhs['large_area_id']==3) & (syn_hhs['cars']==0) &(syn_hhs['workers']==0) & (syn_hhs['persons']== 1) &   (syn_hhs['race_id']==4) &(syn_hhs['income']==3)& (syn_hhs['age_of_head']==5) &(syn_hhs['children']==0)]

Unnamed: 0,household_id,building_id,zone_id,large_area_id,cars,workers,income,persons,race_id,age_of_head,children
5661,1005705,1781626,945,3,0,0,3,1,4,5,0
35564,1035696,1784977,936,3,0,0,3,1,4,5,0
162427,1162909,1514047,869,3,0,0,3,1,4,5,0
242258,1244285,1529081,874,3,0,0,3,1,4,5,0
252055,1254410,1533025,875,3,0,0,3,1,4,5,0
279049,1282567,1533405,861,3,0,0,3,1,4,5,0
292334,1296482,2067475,881,3,0,0,3,1,4,5,0
311505,1316574,1902672,693,3,0,0,3,1,4,5,0
345967,1353361,1781899,946,3,0,0,3,1,4,5,0
365875,1375126,1781947,946,3,0,0,3,1,4,5,0


In [58]:
dfcc[(dfcc['large_area_id']==3) & (dfcc['cars']==0) &(dfcc['workers']==0) & (dfcc['persons']== 1) &   (dfcc['race_id']==4) &(dfcc['income']==3)& (dfcc['age_of_head']==5) &(dfcc['children']==0)]

Unnamed: 0,large_area_id,race_id,age_of_head,persons,children,cars,workers,income,year,total_number_of_households,cars_min,cars_max,workers_min,workers_max,persons_min,persons_max,income_min,income_max,age_of_head_min,age_of_head_max,Unnamed: 21
122698,3,4,5,1,0,0,0,3,2010,13,0,0,0,0,1,1,91338,-1,65,-1,...
122699,3,4,5,1,0,0,0,3,2011,14,0,0,0,0,1,1,91338,-1,65,-1,...
122700,3,4,5,1,0,0,0,3,2012,16,0,0,0,0,1,1,91338,-1,65,-1,...
122701,3,4,5,1,0,0,0,3,2013,17,0,0,0,0,1,1,91338,-1,65,-1,...
122702,3,4,5,1,0,0,0,3,2014,19,0,0,0,0,1,1,91338,-1,65,-1,...
122703,3,4,5,1,0,0,0,3,2015,20,0,0,0,0,1,1,91338,-1,65,-1,...
122704,3,4,5,1,0,0,0,3,2016,22,0,0,0,0,1,1,91338,-1,65,-1,...
122705,3,4,5,1,0,0,0,3,2017,23,0,0,0,0,1,1,91338,-1,65,-1,...
122706,3,4,5,1,0,0,0,3,2018,24,0,0,0,0,1,1,91338,-1,65,-1,...
122707,3,4,5,1,0,0,0,3,2019,26,0,0,0,0,1,1,91338,-1,65,-1,...


In [None]:
# columns to remove from the final table: every control attribute except the two
# that are kept as plain id columns
removefields=list(control_attrs)
for kept in ('large_area_id','race_id'):
    removefields.remove(kept)

In [None]:
removefields

In [None]:
# the category columns are replaced by their *_min / *_max columns in the output
dfcc.drop(removefields,axis=1,inplace=True)

In [None]:
# use 'year' as the row index of the final table
dfcc=dfcc.set_index('year')

In [None]:
dfcc.head(5)

In [None]:
# cast every remaining column to 64-bit integers
dfcc=dfcc.astype(int64)

In [None]:
dfcc.dtypes

In [None]:
#pd.unique(dfcc['age_of_head_max'])

In [None]:
# write the final control totals (indexed by year) to a CSV file in the working directory
dfcc.to_csv("annual_household_control_totals_pandas.csv")

In [None]:
dfcc.shape

In [None]:
dfcc.reset_index().groupby('year')['total_number_of_households'].sum()

###end of program

In [None]:
df_test=pd.read_csv("household_control_totals.csv")

In [None]:
df_test.shape

In [None]:
df_test.head()

In [None]:
df_test.columns

In [None]:
dfcc.columns