In [ ]:
import base64
import os
from io import BytesIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

## Preprocess DP and SF1 data

In [ ]:
# Process DP data: read the fixed-width geography file, left-join every data
# segment onto it by LOGRECNO, and save the combined table to an HDF5 store.
dp_folder = '2010_dp/'

# Widths and names of the first 22 fields of the fixed-width geography file.
geowidth = [6, 2, 3, 2, 3, 2, 7, 1, 1, 2, 3, 2, 2, 5, 2, 2, 5, 2, 2, 6, 1, 4]
geoflds = ['FILEID', 'STUSAB', 'SUMLEV', 'GEOCOMP', 'CHARITER', 'CIFSN', 'LOGRECNO',
           'REGION', 'DIVISION', 'STATE', 'COUNTY', 'COUNTYCC', 'COUNTYSC', 'COUSUB',
           'COUSUBCC', 'COUSUBSC', 'PLACE', 'PLACECC', 'PLACESC', 'TRACT', 'BLKGRP',
           'BLOCK']
dfdata = pd.read_fwf(dp_folder + 'migeo2010.dhc', widths=geowidth, names=geoflds,
                     header=None, usecols=range(22))

# Maps each CSV that supplies the column names to its headerless data file.
dp_files = {'Segment1_demographicAndHousehold.csv': 'mi000012010.dhc',
            'Segment2_householdAndGQ.csv': 'mi000022010.dhc',
            'Segment14_housing.csv': 'mi000142010.dhc'}
for header_file, data_file in dp_files.items():
    # Only the column names of the header CSV are used; the first four columns
    # are skipped in both the header list and the data file.
    headers = list(pd.read_csv(dp_folder + header_file).columns)[4:]
    print(len(headers))
    data = pd.read_csv(dp_folder + data_file, header=None,
                       usecols=range(4, len(headers) + 4))
    print(data.shape)
    data.columns = headers

    # Left join keeps every geography row even when a segment has no match.
    dfdata = pd.merge(dfdata, data, on='LOGRECNO', how='left')

# Open the store only once the table is complete; the context manager closes
# the file even if the write fails.
with pd.HDFStore('2010dp.h5', complib='zlib', complevel=5) as st:
    st['dp_2010'] = dfdata

In [ ]:
# Process SF1 data: read the geography header, left-join every selected SF1
# segment onto it by LOGRECNO, and save the combined table to an HDF5 store.
sf1_folder = '2010_sf1_sel/'
dfdata = pd.read_csv(sf1_folder + 'GEO_HEADER_SF1.txt', usecols=range(22))

sf1_files = ['SF1_00001.txt', 'SF1_00002.txt', 'SF1_00003.txt',
             'SF1_00004.txt', 'SF1_00005.txt', 'SF1_00044.txt']
for f in sf1_files:
    print(f)
    data = pd.read_csv(sf1_folder + f)
    # These identifier columns are dropped from each segment before the join
    # so that only LOGRECNO and the segment's data columns are merged in.
    data.drop(['FILEID', 'STUSAB', 'CHARITER', 'CIFSN'], axis=1, inplace=True)
    dfdata = pd.merge(dfdata, data, on='LOGRECNO', how='left')

# This file is merged as-is, without dropping the identifier columns.
# NOTE(review): presumably the "mod" file was already trimmed by hand - confirm.
data = pd.read_csv(sf1_folder + 'SF1_00006mod.csv')
dfdata = pd.merge(dfdata, data, on='LOGRECNO', how='left')

# Open the store only once the table is complete; the context manager closes
# the file even if the write fails.
with pd.HDFStore('2010sf1.h5', complib='zlib', complevel=5) as st:
    st['sf1_2010'] = dfdata

## Extract tables by name and summary level (SUMLEV)

In [ ]:
# Build a table -> field lookup so that data can be extracted by table id (currently disabled)
# vlist = [col for col in stdp['/dp_2010'].columns if '0' in col ]
# tlist = [[col[:-3].replace('0',''), col] for col in vlist]
# tlookup = pd.DataFrame(tlist, columns=['table', 'code'])
# tlookup = tlookup.set_index('table')

In [ ]:
def pct_dif(df1, df2):
    """Return the cell-wise percentage change from df1 to df2.

    df1 and df2 are expected to share the same index and columns.

    * Each cell is (df2 - df1) / df1 * 100.
    * Cells where df2 - df1 is exactly 0 are reported as 0, including the
      0 / 0 case that would otherwise be NaN.
    * Any row still containing NaN or +/-inf (a non-zero change from a zero
      base) is removed from the result.
    """
    delta = df2 - df1
    pct = (delta / df1) * 100.0

    # Keep the computed percentage wherever there was a change; where the
    # change is zero take the zero from `delta` itself.
    pct = pct.where(delta.ne(0), delta)

    finite_or_nan = pct.replace([np.inf, -np.inf], np.nan)
    return finite_or_nan.dropna(axis=0)

In [ ]:
def sub_error_plot(df, ax=None, tle_text=""):
    """Draw a horizontal error-bar summary, one row of `df` per y position.

    df       -- summary table with 'mean', 'std', 'min' and 'max' columns
                (e.g. the transpose of pct_dif(...).describe()); its index
                supplies the y tick labels.
    ax       -- axes to draw on; the current pyplot axes when omitted.
    tle_text -- prefix for the title, which ends in ' PCT ERRORS'.

    Returns the axes that were drawn on.
    """
    axes = plt.gca() if ax is None else ax

    rows = range(len(df.index))
    means = df['mean']
    below = means - df['min']
    above = df['max'] - means

    # Vertical reference line at zero error.
    axes.vlines(0, 0, len(df))
    # Thin grey bar with caps: min..max range around the mean.
    axes.errorbar(means, rows, xerr=[below, above],
                  fmt='ok', ecolor='gray', lw=1, capsize=8)
    # Thick bar: one standard deviation either side of the mean.
    axes.errorbar(means, rows, xerr=df['std'], fmt='ok', lw=4)

    # Print the rounded mean for every row in a fixed column at x = 50.
    for y, label in enumerate(means.round(3)):
        axes.text(50, y + 0.1, label, size=20)

    axes.set_title(tle_text + ' PCT ERRORS')
    axes.set_yticks(rows)
    axes.set_yticklabels(df.index)

    return axes

In [ ]:
def compare_plots(lst_dfs):
    """Plot several error summaries side by side in a single row.

    lst_dfs -- non-empty list of summary DataFrames accepted by
               sub_error_plot. Each must carry a `name` attribute; its
               underscores are turned into spaces to form the panel title.
               The figure height is taken from the length of the first one
               (minimum 5).

    Returns the matplotlib figure, with one panel per DataFrame sharing both
    axes.

    Note: this calls sns.set(font_scale=2), which changes the global
    seaborn/matplotlib style for subsequent plots as well.
    """
    sns.set(font_scale=2)
    # squeeze=False always yields a 2-D array of axes, so a single DataFrame
    # works too (by default subplots returns a bare Axes for a 1x1 grid, which
    # cannot be indexed).
    fig, axs = plt.subplots(1, len(lst_dfs), sharex=True, sharey=True,
                            squeeze=False,
                            figsize=(30, max(5, len(lst_dfs[0]))),
                            gridspec_kw={'hspace': 0, 'wspace': 0.02})
    for df, ax in zip(lst_dfs, axs[0]):
        sub_error_plot(df, ax, df.name.replace("_", " "))

    return fig

In [ ]:
# Reopen both preprocessed HDF5 stores read-only for the comparison steps.
stdp = pd.HDFStore('2010dp.h5', mode='r')
stsf1 = pd.HDFStore('2010sf1.h5', mode='r')

In [ ]:
# Build lookup tables from the variable-selection workbook so that plots can
# show field names instead of field codes. Rows with any missing cell are
# discarded.
sf1_folder = '2010_sf1_sel/'
df_lookup = pd.read_excel(sf1_folder + 'SelectVariablesToCompare.xlsx').dropna(axis=0)
# table number -> field code(s)
tbl_lookup = df_lookup.set_index('TABLE NUMBER')[['FIELD CODE']]
# field code -> field name
name_lookup = df_lookup.set_index('FIELD CODE')[['FIELD NAME']]


In [ ]:
# Columns that together identify one geography at each level, and the SUMLEV
# value used to select that level's rows.
geo_levels = {"PLACE": ['STATE', 'COUNTY', 'COUSUB', 'PLACE'],
              "TRACT": ['STATE', 'COUNTY', 'TRACT'],
              "BLKGRP": ['STATE', 'COUNTY', 'TRACT', 'BLKGRP']}
sum_levels = {"PLACE": 70, "TRACT": 140, "BLKGRP": 150}


def _load_levels(store, key, levels):
    """Read `key` from `store` once and keep the rows whose SUMLEV is in `levels`."""
    table = store[key]
    return table.loc[table.SUMLEV.isin(levels)]


def _split_by_level(table):
    """Return {level name: rows of `table` with that level's SUMLEV value}."""
    return {lev: table.loc[table.SUMLEV == code] for lev, code in sum_levels.items()}


# Each store[...] access loads the whole table, so read each table only once.
dp_2010 = _load_levels(stdp, '/dp_2010', list(sum_levels.values()))
sf1_2010 = _load_levels(stsf1, '/sf1_2010', list(sum_levels.values()))

# The per-level row filter does not depend on the table being compared, so it
# is done once here rather than inside the loop below.
dp_by_level = _split_by_level(dp_2010)
sf1_by_level = _split_by_level(sf1_2010)

dfdifs = {}  # (table, level) -> per-geography percentage differences
dfps = {}    # (table, level) -> describe() summary, one row per field name
for t in df_lookup['TABLE NUMBER'].unique():
    print('table:', t)
    # The list-style .loc keeps a DataFrame even when the table has one field.
    vcodes = list(tbl_lookup.loc[[t]]['FIELD CODE'])

    for lev, geoids in geo_levels.items():
        dpdata = dp_by_level[lev].loc[:, geoids + vcodes].set_index(geoids)
        sf1data = sf1_by_level[lev].loc[:, geoids + vcodes].set_index(geoids)

        # Percentage difference of DP relative to SF1; kept for other uses.
        dfdifs[(t, lev)] = pct_dif(sf1data, dpdata)
        # One row per field code, in descending code order, relabelled with
        # the field name.
        dfp = dfdifs[(t, lev)].describe().T.sort_index(axis=0, ascending=False)
        dfp.index = name_lookup.loc[dfp.index, 'FIELD NAME']
        dfps[(t, lev)] = dfp



In [ ]:
# Save one comparison figure per table, with one panel per geography level.
# savefig does not create missing directories, so make sure the target exists.
os.makedirs('plots', exist_ok=True)
for t in df_lookup['TABLE NUMBER'].unique():
    print('table ', t)
    lst_dfs = []
    for geo in geo_levels:
        df = dfps[(t, geo)]
        # compare_plots reads this attribute to build each panel title.
        df.name = t + "_" + geo
        lst_dfs.append(df)
    fig = compare_plots(lst_dfs)
    fig.savefig('plots/' + t + '_error_plot.png', bbox_inches="tight")
