In [1]:
import pandas as pd
import re
from collections import OrderedDict 
import numpy as np

In [2]:
#read geo info
#geo reader
def geo_reader(geo_folder, geo_file):
        geowidth= [6,2,3,2,3,2,7,1,1,2,3,2,2,5,2,2,5,2,2,6,1,4]
        geoflds= ['FILEID','STUSAB','SUMLEV','GEOCOMP','CHARITER', 'CIFSN','LOGRECNO','REGION','DIVISION' ,'STATE' ,'COUNTY' ,
                'COUNTYCC' ,'COUNTYSC' ,'COUSUB' ,'COUSUBCC' ,'COUSUBSC' , 'PLACE' ,'PLACECC' ,'PLACESC' ,'TRACT' ,'BLKGRP' ,'BLOCK']
                
        dfgeo = pd.read_fwf(geo_folder + geo_file, widths =geowidth, names = geoflds, header = None, usecols = range(22))
        dfgeo['geoid'] = dfgeo['STATE'] * 10000000000 + dfgeo['COUNTY'] * 10000000 + dfgeo['TRACT'] * 10 + dfgeo['BLKGRP']
        return dfgeo[['SUMLEV', 'LOGRECNO','geoid']]

# # summary level (https://www.census.gov/prod/cen2010/doc/sf1.pdf)
# 050, State-County
# 060, State-County-County Subdivision
# 067, State-County-County Subdivision-Subminor Civil Division
# 140, State-County-Census Tract
# 150, State-County-Census Tract-Block Group

In [3]:
dp_folder = '2010_dp/' # dp geo and state files should be in this folder
sf1_folder = '2010_sf1/'# sf1 geo and state files should be in this folder

In [4]:
#get lookup file for DP tables and all variables
header_files = {'Segment1_demographicAndHousehold.csv':'mi000012010.dhc',
                'Segment2_householdAndGQ.csv':'mi000022010.dhc',
                'Segment14_housing.csv':'mi000142010.dhc' }
dp_lookup = []
for hf in header_files:
    segh = pd.read_csv(dp_folder + hf)
    segh = segh.columns[5:].to_frame(name = 'FIELD CODE')
    segh["TABLE"] = segh['FIELD CODE'].str[:-4]
    segh["SEGMENT FILE"] = header_files[hf]
    segh["SEGMENT"] = segh["SEGMENT FILE"].str[5:7].astype(int)
    segh["FIELD NAME"] = segh["FIELD CODE"]
    for seg in segh.SEGMENT:
        lseg = len(segh.loc[segh.SEGMENT == seg])
        segh.loc[segh.SEGMENT == seg, 'FIELD POS'] =  range(5, lseg + 5)
    dp_lookup.append(segh)

dp_lookup = pd.concat(dp_lookup, axis=0)


In [ ]:
dp_lookup['TABLE'].unique()

In [ ]:
# #get lookup file for SF1 tables and all variables
# sf1_lookup = pd.read_excel(sf1_folder + 'DATA_FIELD_DESCRIPTORS.xlsx')
# sf1_lookup = sf1_lookup.loc[~sf1_lookup.DECIMAL.isnull()]
# sf1_lookup["TABLE"] = sf1_lookup['FIELD CODE'].str[:-4]
# sf1_lookup["SEGMENT FILE"] = "mi000" + sf1_lookup.SEGMENT.astype(str).str.zfill(2) + "2010.sf1"
# sf1_lookup['FIELD POS'] = 0
# for seg in sf1_lookup.SEGMENT:
#     lseg = len(sf1_lookup.loc[sf1_lookup.SEGMENT == seg])
#     sf1_lookup.loc[sf1_lookup.SEGMENT == seg, 'FIELD POS'] =  range(5, lseg + 5)

In [5]:
#get lookup file for sf1 tables and all variables
sf1_folder = '2010_sf1_sel/'
sf1_files = ['SF1_00001.txt','SF1_00002.txt','SF1_00003.txt','SF1_00004.txt','SF1_00005.txt','SF1_00044.txt']

sf1_lookup = []
for hf in sf1_files:
    segh = pd.read_csv(sf1_folder + hf, index_col=0, nrows=0).columns[4:]
    segh = segh.to_frame(name = 'FIELD CODE')
    segh["TABLE"] = segh['FIELD CODE'].str[:-4]
    segh["SEGMENT FILE"] = hf
    segh["SEGMENT"] = segh["SEGMENT FILE"].str[7:8].astype(int)

    for seg in segh.SEGMENT:
        lseg = len(segh.loc[segh.SEGMENT == seg])
        segh.loc[segh.SEGMENT == seg, 'FIELD POS'] =  range(5, lseg + 5)

    sf1_lookup.append(segh)
sf1_lookup = pd.concat(sf1_lookup)

#     segh = segh.columns[5:].to_frame(name = 'FIELD CODE')
#     segh["TABLE"] = segh['FIELD CODE'].str[:-4]
#     segh["SEGMENT FILE"] = header_files[hf]
#     segh["SEGMENT"] = segh["SEGMENT FILE"].str[5:7].astype(int)
#     segh["FIELD NAME"] = segh["FIELD CODE"]
#     for seg in segh.SEGMENT:
#         lseg = len(segh.loc[segh.SEGMENT == seg])
#         segh.loc[segh.SEGMENT == seg, 'FIELD POS'] =  range(5, lseg + 5)
#     dp_lookup.append(segh)

# dp_lookup = pd.concat(dp_lookup, axis=0)


name_lookup = pd.read_excel(sf1_folder + 'SelectVariablesToCompare.xlsx')
name_lookup = name_lookup.dropna(axis=0)
name_lookup = name_lookup[['FIELD CODE', 'FIELD NAME']].set_index('FIELD CODE')

In [6]:
sf1_lookup.head(2)

Unnamed: 0,FIELD CODE,TABLE,SEGMENT FILE,SEGMENT,FIELD POS
P0010001,P0010001,P001,SF1_00001.txt,0,5
P0020001,P0020001,P002,SF1_00002.txt,0,5


In [7]:
#look up segment number(1), table(P001), or field code(P0020004) and determine segment file, column position and column names(use either fld codes or fld names)
def extract_pos_names(lookupf, segment = None, table = None, fields = None):

    if (segment is not None) and (segment in list(lookupf.SEGMENT)):
        sublk = lookupf.loc[lookupf.SEGMENT	== segment]

    if (table is not None) and (table in list(lookupf.TABLE)):
        sublk = lookupf.loc[lookupf.TABLE == table]
        #print(sublk)

    if (fields is not None) and (fields in list(lookupf['FIELD CODE'])) :
        sublk = lookupf.loc[lookupf['FIELD CODE'].isin(fields)]

    segfile = sublk['SEGMENT FILE'].unique()[0]
    fldpos = list(sublk['FIELD POS'])
    fldcodes = list(sublk['FIELD CODE'])
    fldnames = list(sublk['FIELD NAME'])
    return (segfile, fldpos, fldcodes, fldnames) 


In [8]:
#read Census DP or SF1 segment by variable position 
def state_file_reader(fpath, fname, vpos = None, vnames = None):
    return pd.read_csv(fpath + fname, header = None, usecols = [4] + vpos, names = ['LOGRECNO'] + vnames)


In [ ]:
def get_table_data(source, lookupf, table_folder, table_name, sumlevel):
    segfile, fldpos, fldcodes, fldnames = extract_pos_names(lookupf, table = table_name )
    dpnames = pd.DataFrame(zip(fldcodes, fldnames), columns = ['code','name'])
    #fldcodes = [ x + '_' + source for x in fldcodes]
    df = state_file_reader(table_folder, segfile, fldpos, fldcodes)
    if source == 'dp':
        geo_suf = 'dhc'
    else:
        geo_suf = 'sf1'
    geo = geo_reader(table_folder, "migeo2010." + geo_suf)
    df = pd.merge(geo, df, left_on = "LOGRECNO",right_on = "LOGRECNO",  how = 'left', suffixes=("", "_y"))
    df = df.loc[df.geoid>0]
    df = df.loc[df.SUMLEV == sumlevel]
    return df, dpnames

In [ ]:
name_lookup = sf1_lookup[['FIELD CODE', 'FIELD NAME']].set_index('FIELD CODE')

In [ ]:
sf1_lookup.loc[sf1_lookup.TABLE=='P012']

In [ ]:
#for tb in dp_lookup.TABLE.unique():
tlist = ['P001', 'P003', 'P004', 'P005', 'P006', 'P007','P013',
       'P014', 'P015', 'P016', 'P018', 'P019', 'P020', 'P022', 'P023',
       'P024', 'P025', 'P026', 'P028', 'P038', 'P043', 'H0001', 'H003',
       'H006', 'H007', 'H010', 'H013']
tlist = ['P038', 'P043', 'H0001', 'H003',
       'H006', 'H007', 'H010', 'H013']
for tb in tlist:
    print(tb)
    df_dp, dpnames = get_table_data('dp', dp_lookup, dp_folder, tb, 150)
    df_sf1, sf1names = get_table_data('sf1', sf1_lookup, sf1_folder, tb, 150)
    df_dp.set_index('geoid',inplace=True)
    df_sf1.set_index('geoid',inplace=True)

    df_diff = df_dp.subtract(df_sf1, axis =1)
    df_pct_diff = df_diff.div(df_sf1, axis=1) * 100.0
    df_pct_diff.update(df_diff[df_diff==0])
    df_pct_diff = df_pct_diff.replace([np.inf, -np.inf],np.nan).dropna(axis=0)
    df_pct_diff.drop(columns =['SUMLEV', 'LOGRECNO'], inplace=True)

    dfp = dif_summary(df_pct_diff)
    dfp.index=name_lookup.loc[dfp.index, 'FIELD NAME']
    print('making plot...', tb)
    errorbars(dfp, 'plots/', tb)

In [ ]:
len(dfp)

In [ ]:
def dif_summary(df_dif):
    dfp = pd.DataFrame(data=[df_dif.mean(), df_dif.std(), df_dif.min(), df_dif.max()]).T
    dfp.columns = ['mean', 'std', 'min', 'max']
        #plt.gca().invert_yaxis()
    dfp = dfp.sort_index(axis = 0, ascending = False)
    return dfp
    

In [ ]:
def errorbars(df, plot_folder, plot_name):
    import matplotlib.pyplot as plt
    import numpy as np
    import seaborn as sns
    sns.set(font_scale=1)
    #plt.figure(figsize = (10, max(5, len(df)/1.5))
    plt.figure(figsize = (10, len(df))
    plt.rcParams.update({'font.size': 24})

    plt.vlines(0, 0, len(df))
    plt.errorbar(df['mean'], range(len(df.index)), xerr= [df['mean'] - df['min'], df['max'] - df['mean']],
                fmt='ok', ecolor='gray', lw=1, capsize=8)
    plt.errorbar(df['mean'], range(len(df.index)), xerr=df['std'], fmt='ok', lw=4)
    for (x,y,l) in zip(df['mean'],  range(len(df.index)), round(df['mean'],3)):
        plt.text(3, y + 0.1, l, size=12)
    plt.xticks()
    plt.yticks(range(len(df.index)), list(df.index))
    plt.title(plot_name + ' PCT Block Group Errors')

    plt.tight_layout()

    plt.savefig(plot_folder + plot_name + '_error.png')
    print('save plot to ' +plot_folder + plot_name + '_error.png')

In [ ]:
df_dp.head(2)

In [ ]:
df_sf1.head(2)

In [ ]:
len(df_pct_diff)

In [ ]:
df_pct_diff

In [ ]:
df_pct_diff

In [ ]:
df_pct_diff.head()

In [ ]:
df_pct_diff.loc[df_diff.P0030006==0,'P0030006' ] =0

In [ ]:
df_pct_diff

In [ ]:
sf1names.set_index('code', inplace=True) 

In [ ]:
df_comp = pd.merge(df_sf1, df_dp, left_on="geoid", right_on="geoid", how="left", suffixes=("", "_y"))

In [ ]:
sf1names

In [ ]:
tbln = "P003"
segfile, fldpos, fldcodes, fldnames = extract_pos_names(dp_lookup, table = tbln )
dpnamelookup = pd.DataFrame(zip(fldcodes, fldnames), columns = ['code','name'])
fldcodes = [ x + '_dp' for x in fldcodes]
df_dp = state_file_reader(dp_folder, segfile, fldpos, fldcodes )
geo_dp = geo_reader(dp_folder, "migeo2010.dhc")
df_dp = pd.merge(geo_dp, df_dp, left_on = "LOGRECNO",right_on = "LOGRECNO",  how = 'left', suffixes=("", "_y"))
df_dp = df_dp.loc[df_dp.geoid>0]
df_dp = df_dp.loc[df_dp.SUMLEV == 150]

segfile, fldpos, fldcodes, fldnames = extract_pos_names(sf1_lookup, table = tbln )
sf1namelookup = pd.DataFrame(zip(fldcodes, fldnames), columns = ['code','name'])
fldcodes = [ x + '_sf1' for x in fldcodes]
df_sf1 = state_file_reader(sf1_folder, segfile, fldpos, fldcodes )
geo_sf1 = geo_reader(sf1_folder, "migeo2010.sf1" )
df_sf1 = pd.merge(geo_sf1, df_sf1, left_on = "LOGRECNO",right_on = "LOGRECNO",  how = 'left', suffixes=("", "_y"))
df_sf1 = df_sf1.loc[df_sf1.geoid>0]
df_sf1 = df_sf1.loc[df_sf1.SUMLEV == 150]

In [ ]:
len(df_sf1), len(df_dp)

In [ ]:
df_comp = pd.merge(df_sf1, df_dp, left_on="geoid", right_on="geoid", how="left", suffixes=("", "_y"))

In [ ]:
df_comp.head()

In [ ]:
flds_sf1 = [x.replace('_sf1', '') for x in df_comp.columns.values if x.find('_sf1') >=0 ]
flds_dp = [x.replace('_dp', '') for x in df_comp.columns.values if x.find('_dp') >=0 ]

In [ ]:
pct_diffs = [] 
for fld in set(flds_sf1 + flds_dp):
    pct_diffs.append(fld + '_pct_diff')
    df_comp[fld + '_pct_diff'] = (df_comp[fld + '_dp'] - df_comp[fld + '_sf1'])/df_comp[fld + '_sf1'] * 100.0
    df_comp.loc[df_comp[fld + '_sf1'] == 0, fld + '_pct_diff'] = 300
    df_comp.loc[(df_comp[fld + '_dp'] - df_comp[fld + '_sf1']) == 0, fld + '_pct_diff'] = 0


In [ ]:
pct_diffs

In [ ]:
dfs = df_comp.copy()
dfs = dfs.replace([np.inf, -np.inf], np.nan)
dfs.fillna(0, inplace=True)

In [ ]:
dfs.T

In [ ]:
dfp = dfp.sort_index(axis = 0, ascending = False)

In [ ]:
dfp

In [ ]:
def errorbars(df, chartname):
    import matplotlib.pyplot as plt
    import numpy as np
    import seaborn as sns
    sns.set(font_scale=2)
    plt.figure(figsize = (20,30))
    plt.rcParams.update({'font.size': 24})

    plt.vlines(0, 0, len(df))
    plt.errorbar(df['mean'], range(len(df.index)), xerr= [df['mean'] - df['min'], df['max'] - df['mean']],
                fmt='ok', ecolor='gray', lw=2, capsize=8)
    plt.errorbar(df['mean'], range(len(df.index)), xerr=df['std'], fmt='ok', lw=4)
    for (x,y,l) in zip(df['mean'],  range(len(df.index)), round(df['mean'],3)):
        plt.text(3, y + 0.1, l, size=16)
    plt.xticks()
    plt.yticks(range(len(df.index)), list(df.name))
    plt.title(chartname + ' Block Group Errors')

    plt.tight_layout()

In [ ]:
errorbars(dfp,'P003')

In [ ]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)


In [ ]:
df_sf1.head(10)

In [ ]:
df_dp.head(10)

In [ ]:
df_comp.head()

In [ ]:
df_sf1.head(2)

In [ ]:
df_sf1.columns

In [ ]:
df_sf1.COUNTY

In [ ]:
df_sf1.COUNTY.unique()

In [ ]:
[1,2,3] .+ 5

In [ ]:
def dp_reader(folder, segfile, vars):
    print(folder, segfile, vars)
    return pd.read_csv(folder + segfile, header = 0, usecols=vars.values)

In [ ]:
for t in dp_tables.TABLE.unique():
    dp_seg = dp_tables.loc[dp_tables.TABLE == t, "SEGMENT"]
    dp_vars = dp_tables.loc[dp_tables.TABLE == t, "FIELD CODE"]
    #print(t, dp_seg, dp_vars)
    dp_reader('2010_dp/', dp_seg.unique()[0], dp_vars)

In [ ]:
dp_seg.unique()[0]

In [ ]:
pd.read_csv('2010_dp/'+dp_seg.unique()[0])

In [ ]:
# get header from DP data
for h in header_files:
    seg_header = pd.read_csv(folder + h)
    

#get unique table number from dp
df_dp = dfseg
df_table = df_dp.columns[5:].to_frame(name = 'field')
df_table['tablen'] = df_table['field'].str[:-4]
tn = df_table['tablen'].unique()

In [ ]:
df_dp.columns.values[5:]

In [ ]:
dfseg.columns

In [ ]:
def sf1_seg_reader(folder, segnum, dffld):
    geo_header = ['FILEID','STUSAB','CHARITER','CIFSN','LOGRECNO']
    fn = "mi000" + str(segnum).zfill(2) + "2010.sf1" 
    header = geo_header + list(dffld.loc[dffld['SEGMENT'] == segnum, 'FIELD CODE'].values)
    #print(header)
    dfdata = pd.read_csv(folder + fn, header = None, names = header )

    return dfdata  

In [ ]:
folder = '2010_sf1/'
df_sf1 = pd.read_fwf(folder + "migeo2010.sf1", widths =geowidth, names = geoflds , header = None, usecols = range(22))


In [ ]:
dffld = pd.read_excel('2010_sf1/DATA_FIELD_DESCRIPTORS.xlsx')
dffld = dffld.loc[~dffld.DECIMAL.isnull()]

In [ ]:
dp_sf1_seg = dffld.loc[dffld['FIELD CODE'].isin(dfseg.columns.values ),'SEGMENT'].unique()
print(dp_sf1_seg)

In [ ]:
geo_header = ['FILEID','STUSAB','CHARITER','CIFSN','LOGRECNO']

for x in dp_sf1_seg:
    dpseg = sf1_seg_reader(folder, x, dffld)
    df_sf1 = pd.merge(df_sf1, dpseg, left_on =geo_header, right_on = geo_header, how ='left' )

In [ ]:
df_sf1.columns.values

In [ ]:
df_dp.columns.values

In [ ]:
sumlevel = {'COUNTY': 50, 'TRACT': 140, 'Block Group': 150}


In [ ]:
df_sfbg = df_sf1.loc(df_sf1.SUMLEV = sumlevel['BLKGRP'])
df_sf1.loc(df_sf1.SUMLEV = sumlevel['BLKGRP'])

In [ ]:
def sf1_seg_reader(folder, segnum, dffld):
    geo_header = ['FILEID','STUSAB','CHARITER','CIFSN','LOGRECNO']
    fn = "mi000" + str(segnum).zfill(2) + "2010.sf1" 
    header = geo_header + list(dffld.loc[dffld['SEGMENT'] == segnum, 'FIELD CODE'].values)
    #print(header)
    dfdata = pd.read_csv(folder + fn, header = None, names = header )

    return dfdata   


In [ ]:
sf1_seg_reader('2010_sf1/', 1, dffld)

In [ ]:
str(2).zfill(2)

In [ ]:
list(dfseg.columns)[5]

In [ ]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns


In [ ]:
dfsec1.head(2)

In [ ]:
dfgeo.head(2)

In [ ]:
len(dfgeo), len(dfsec1)


In [ ]:
dfm  = pd.merge(dfgeo, dfsec1, left_on ='LOGRECNO', right_on = 'LOGRECNO', how ='left', suffixes = ("", "_y") )

In [ ]:
dfm.head()

In [ ]:
f = open("wide_160/wide_dp1_codebook.txt", "r")
dict_v = OrderedDict()
dict_v = {'FILEID':'FILEID','STUSAB':'STUSAB','CHARITER':'CHARITER','CIFSN':'CIFSN','LOGRECNO':'LOGRECNO'}
geo_remove = list(dict_v.keys())[:4]
for x in f:
    #print(x)
    if bool(re.search("H.....:", x)):
        v = x.split(":")[0].strip()
        vname = v[:3] + "_" + x.replace(v + ":", '').strip()
        dict_v[v] = vname
        #print (v, ',', vname)

In [ ]:
#read differential privacy 2010 demo data
dfsec1 = pd.read_csv("mi000012010.dhc", header = None, names = list(dict_v.keys()) )
dfsec1.drop(columns=geo_remove, axis=1, inplace=True)

In [ ]:
# geoflds =  ['FILEID','STUSAB','SUMLEV','GEOCOMP','CHARITER','CIFSN','LOGRECNO']
# wlst = [6,2,3,2,3,2,7]
# wlst2= [6,2,3,2,3,2,7,1,1,2,3,2,2,5,2,2,5,2,2,6,1,4]
# geoflds2 = ['LOGRECNO','REGION','DIVISION' ,'STATE' ,'COUNTY' ,'COUNTYCC' ,'COUNTYSC' ,'COUSUB' ,'COUSUBCC' ,'COUSUBSC' ,'PLACE' ,'PLACECC' ,'PLACESC' ,'TRACT' ,'BLKGRP' ,'BLOCK']
# colss = [(18, 25), (25, 26), (26, 27), (27, 29), (29, 32), (32, 34), (34, 36), (36, 41), (41, 43), (43, 45), (45, 50), (50, 52), (52, 54), (54, 60), (60, 61), (61, 65)]

geowidth= [6,2,3,2,3,2,7,1,1,2,3,2,2,5,2,2,5,2,2,6,1,4]
geoflds= ['FILEID','STUSAB','SUMLEV','GEOCOMP','CHARITER','CIFSN','LOGRECNO','REGION','DIVISION' ,'STATE' ,'COUNTY' ,'COUNTYCC' ,'COUNTYSC' ,'COUSUB' ,'COUSUBCC' ,'COUSUBSC' ,'PLACE' ,'PLACECC' ,'PLACESC' ,'TRACT' ,'BLKGRP' ,'BLOCK']
keepflds = [ 'SUMLEV', 'GEOCOMP',
       'LOGRECNO', 'REGION', 'DIVISION', 'STATE', 'COUNTY', 'COUNTYCC',
       'COUNTYSC', 'COUSUB', 'COUSUBCC', 'COUSUBSC', 'PLACE', 'PLACECC',
       'PLACESC', 'TRACT', 'BLKGRP', 'BLOCK']

In [ ]:
dfgeo = pd.read_fwf("migeo2010.dhc", widths =geowidth, names = geoflds, header = None, usecols = range(22))


In [ ]:
dfgeo.columns.values

In [ ]:
len(dfgeo), len(dfsec1)

In [ ]:
dfm  = pd.merge(dfgeo, dfsec1, left_on ='LOGRECNO', right_on = 'LOGRECNO', how ='left' )

In [ ]:
list(dict_v.keys())[4:] + 

In [ ]:
#dfm.columns.values

In [ ]:
newcols = list(dfgeo.columns.values) + list(dict_v.values())[5:]

In [ ]:
dfm.loc[dfm.SUMLEV == 91].head()

In [ ]:
dfm.columns = newcols

In [ ]:
dfm.head()

# * process 2010 SF1

In [ ]:
dfgeosf = pd.read_fwf("2010_sf1/migeo2010.sf1", widths =geowidth, names = geoflds , header = None, usecols = range(22))

In [ ]:
dfgeosf.BLKGRP.unique()

In [ ]:
dffld = pd.read_excel('2010_sf1/DATA_FIELD_DESCRIPTORS.xlsx')
dffld = dffld.loc[~dffld.DECIMAL.isnull()]

In [ ]:
dffld

In [ ]:
geo_header = ['FILEID','STUSAB','CHARITER','CIFSN','LOGRECNO']


In [ ]:
dffld.head()

In [ ]:
# {'FILEID':'FILEID','STUSAB':'STUSAB','CHARITER':'CHARITER','CIFSN':'CIFSN','LOGRECNO':'LOGRECNO'}

seg_tables = list(dffld.loc[dffld.SEGMENT == 2]['TABLE NUMBER'].values)
seg_header = list(dffld.loc[dffld.SEGMENT == 2]['FIELD NAME'].values)
seg_header = [ x + y for x, y in zip(seg_tables, seg_header)]

In [ ]:
geo_header + seg_header

In [ ]:
seglist = [1,2,3]
df_segs = []
for seg in seglist:
    print('read segment', seg)
    seg_tables = list(dffld.loc[dffld.SEGMENT == seg]['FIELD CODE'].values)
    seg_header = list(dffld.loc[dffld.SEGMENT == seg]['FIELD NAME'].values)
    seg_header = [ x + y for x, y in zip(seg_tables, seg_header)]

    df_seg = pd.read_csv("2010_sf1/" + "mi" + str(seg).zfill(5) + '2010.sf1', header = None, names = geo_header + seg_header)
    df_segs.append(df_seg) 

In [ ]:
pd.concat(df_segs)

In [ ]:
seg_reader

In [ ]:
len(dfgeosf),

In [ ]:
lv = 
#DHCST MI420000000000000012326
#DHCST MI420000000000000022326

In [ ]:
dict_geo = OrderedDict()
dict_geo = {}
State/U.S. Abbreviation (USPS) . . . . . . . . . . 	STUSAB 
Summary Level . . . . . . . . . . . . . . . . . . . . . . 	SUMLEV 
Geographic Component . . . . . . . . . . . . . . . 	GEOCOMP 
Characteristic Iteration . . . . . . . . . . . . . . . . . 	CHARITER 
Characteristic Iteration File Sequence	
Number . . . . . . . . . . . . . . . . . . . . . . . . . . . 	CIFSN 
Logical Record Number . . . . . . . . . . . . . . . . 	LOGRECNO 


In [ ]:
df = pd.read_csv("wide_160/wide_dp1_160.csv", encoding='ISO-8859-1')


In [ ]:
diff = []
difp = []
for f in flist:
    df[f + '_dif'] = df[f + '_dp'] - df[f + '_sf']
    df[f + '_difp'] = df[f + '_dif']/df[f + '_sf']
    diff.append(f + '_dif')
    difp.append(f+'_difp')

In [ ]:
df[difp]

In [ ]:
df.gisjoin

In [ ]:
df.plot.scatter(x = 'H7X002_sf', y= 'H7X002_dp')


In [ ]:
df['H7V001_difp'](),.mean df['H7V001_difp'].std()

In [ ]:
a =df['H7V001_difp'].copy()

In [ ]:
a.fill