In [50]:
import pandas as pd
import geopandas as gpd
import ast,os,random
# Show floats in DataFrame output with no decimal places.
pd.set_option('display.float_format','{:.0f}'.format)
import warnings
# Suppress every warning for the rest of the session (deprecation notices included).
warnings.filterwarnings('ignore')
import cudf
# pd.set_option('display.max_colwidth', -1)

#### Load Data

In [73]:
# Locations of the block-level census CSV extracts, relative to the working directory.
full_2020_path = "census_2020_data/nhgis0007_csv/nhgis0007_ds248_2020_block.csv"  # 2020 file
full_2010_path = "census_2010_data/nhgis0001_ds172_2010_block.csv"  # 2010 file

#### Prepare 2010 data

In [131]:
# Read only the three columns needed from the 2010 block file, keep the rows whose
# STATEA code is 15, 10 or 11, and copy that subset into a cuDF DataFrame.
full = pd.read_csv(
    full_2010_path,
    usecols=['GISJOIN', 'H7V001', 'STATEA'],
    encoding='unicode_escape',
)
small = full[full.STATEA.isin([15, 10, 11])]
data10 = cudf.from_pandas(small)
del full  # the full-size frame is no longer needed

In [132]:
# Remove every 'G' from the GISJOIN strings and switch to the short column
# names (P10, STATE) used by the following cells.
data10['GISJOIN'] = data10['GISJOIN'].str.replace('G', '')
data10.rename(columns={'H7V001': 'P10', 'STATEA': 'STATE'}, inplace=True)
data10.head()

Unnamed: 0,GISJOIN,STATE,P10
1704220,10000100401001000,10,77
1704221,10000100401001001,10,294
1704222,10000100401001002,10,20
1704223,10000100401001003,10,91
1704224,10000100401001004,10,53


In [133]:
def GISJOIN_to_ID(gisjoin):
    return  int(gisjoin[:2] + gisjoin[3:6] +gisjoin[7:])

In [134]:
# GISJOIN_to_ID is a plain Python function, so it is applied to a pandas copy of
# the column; afterwards the GISJOIN strings themselves are cast to integers.
data10['ID10'] = data10.to_pandas()['GISJOIN'].apply(GISJOIN_to_ID)
data10['GISJOIN'] = data10['GISJOIN'].astype('int')

In [135]:
# Preview the first rows of the 2010 frame.
data10.head()

Unnamed: 0,GISJOIN,STATE,P10,ID10
1704220,10000100401001000,10,77,100010401001000
1704221,10000100401001001,10,294,100010401001001
1704222,10000100401001002,10,20,100010401001002
1704223,10000100401001003,10,91,100010401001003
1704224,10000100401001004,10,53,100010401001004


#### Prepare 2020 data

In [108]:
# Read the needed columns of the 2020 block file (file row 1 is skipped), keep
# the rows whose STATEA code is 15, 10 or 11, and copy them into a cuDF DataFrame.
full = pd.read_csv(
    full_2020_path,
    skiprows=[1],
    encoding='unicode_escape',
    usecols=['GEOCODE', 'GISJOIN', 'STATEA', 'COUNTY', 'U7B001'],
)
small = full[full.STATEA.isin([15, 10, 11])]
data20 = cudf.from_pandas(small)
del full  # the full-size frame is no longer needed

In [109]:
# Strip every 'G' from GISJOIN and store it as an integer, then switch to the
# short column names (P20, ID20, STATE) used by the following cells.
data20['GISJOIN'] = data20['GISJOIN'].str.replace('G', '').astype('int')
data20.rename(columns={'U7B001': 'P20', 'GEOCODE': 'ID20', 'STATEA': 'STATE'}, inplace=True)
data20.head()

Unnamed: 0,GISJOIN,ID20,STATE,COUNTY,P20
1216403,10000100401001000,100010401001000,10,Kent County,108
1216404,10000100401001001,100010401001001,10,Kent County,44
1216405,10000100401001002,100010401001002,10,Kent County,74
1216406,10000100401001003,100010401001003,10,Kent County,60
1216407,10000100401001004,100010401001004,10,Kent County,12


### Mapper 

#### Concat mapper files for states

In [138]:
# State codes processed by this notebook, mapped to the abbreviation that
# appears (lower-cased) in the mapper file names.
states = {
    15: "HI",
    10: "DE",
    11: "DC",
}

In [139]:
def concat_states_mapper(state_key_list, state_abbrevs=None, mapper_dir='census_full/mapper_files'):
    """Read the per-state ``tab2010_tab2020`` mapper files and stack them into one frame.

    Parameters
    ----------
    state_key_list : iterable of int
        State codes to load. Each code is zero-padded to two digits in the file name.
    state_abbrevs : mapping of int -> str, optional
        State code -> abbreviation (lower-cased for the file name). Defaults to
        the notebook-level ``states`` dictionary.
    mapper_dir : str, optional
        Directory that holds the pipe-delimited mapper files.

    Returns
    -------
    pandas.DataFrame
        Rows of every file that was found, in the order of ``state_key_list``.
        An empty DataFrame when no file was found.
    """
    if state_abbrevs is None:
        state_abbrevs = states

    frames = []
    for state_code in state_key_list:
        file_name = 'tab2010_tab2020_st%02d_%s.csv' % (state_code, state_abbrevs[state_code].lower())
        path = os.path.join(mapper_dir, file_name)
        if not os.path.isfile(path):
            # Best effort: report the gap and carry on with the remaining states.
            print("mapper file does not exist")
            continue
        frames.append(pd.read_csv(path, delimiter='|'))

    # Concatenate once at the end; growing a frame inside the loop re-copies
    # all previously loaded rows on every iteration.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [140]:
# Load and stack the mapper files of every state listed in `states`.
mapper_df = concat_states_mapper(list(states))
mapper_df.head(n=2)

Unnamed: 0,STATE_2010,COUNTY_2010,TRACT_2010,BLK_2010,BLKSF_2010,AREALAND_2010,AREAWATER_2010,BLOCK_PART_FLAG_O,STATE_2020,COUNTY_2020,TRACT_2020,BLK_2020,BLKSF_2020,AREALAND_2020,AREAWATER_2020,BLOCK_PART_FLAG_R,AREALAND_INT,AREAWATER_INT
0,15,1,20100,1000,,9903,0,,15,1,20100,3062,,211979,3464,p,9903,0
1,15,1,20100,1001,,174797,0,,15,1,20100,3062,,211979,3464,p,174797,0


In [141]:
# List the column names of the stacked mapper table.
mapper_df.columns

Index(['STATE_2010', 'COUNTY_2010', 'TRACT_2010', 'BLK_2010', 'BLKSF_2010',
       'AREALAND_2010', 'AREAWATER_2010', 'BLOCK_PART_FLAG_O', 'STATE_2020',
       'COUNTY_2020', 'TRACT_2020', 'BLK_2020', 'BLKSF_2020', 'AREALAND_2020',
       'AREAWATER_2020', 'BLOCK_PART_FLAG_R', 'AREALAND_INT', 'AREAWATER_INT'],
      dtype='object')

In [142]:
def create_id(row):
    """Build an integer block ID from the first four values of ``row``.

    The values are read by position, converted to text and left-padded with
    zeros to widths 2, 3, 6 and 4; the padded pieces are concatenated and
    parsed as ``int`` (so a leading zero of the first piece is not kept).

    Parameters
    ----------
    row : pandas.Series
        Row whose first four entries are, in order, the state, county, tract
        and block codes (as selected by the caller).

    Returns
    -------
    int
        The concatenated ID.
    """
    # Read strictly by position and leave `row` untouched: writing strings into
    # an integer row and reading `row[0]` on a label-indexed Series both rely on
    # pandas behaviour that is deprecated.
    widths = (2, 3, 6, 4)
    pieces = [str(row.iloc[pos]).rjust(width, '0') for pos, width in enumerate(widths)]
    return int(''.join(pieces))


In [143]:
# Build the integer block IDs for both vintages from their state/county/tract/block columns.
mapper_df['ID10']= mapper_df[['STATE_2010','COUNTY_2010','TRACT_2010','BLK_2010']].apply(create_id,axis=1)
mapper_df['ID20']= mapper_df[['STATE_2020','COUNTY_2020','TRACT_2020','BLK_2020']].apply(create_id,axis=1)
# Keep only the lookup columns; reset_index() also adds the old index as an 'index' column.
# NOTE(review): mapper_df is overwritten here, so re-running this cell fails because the
# *_2010 / *_2020 source columns are gone.
mapper_df = mapper_df[['ID10','ID20','STATE_2020']].reset_index()
mapper_df.head()

Unnamed: 0,index,ID10,ID20,STATE_2020
0,0,150010201001000,150010201003062,15
1,1,150010201001001,150010201003062,15
2,2,150010201001002,150010201003062,15
3,3,150010201001003,150010201003062,15
4,4,150010201001004,150010201003052,15


#### Out-of-state blocks

In [77]:
# Sample five mapper rows whose 2010 state code is not 5.
# NOTE(review): `mapper` is not defined at notebook level in the visible cells (it is only a
# local inside concat_states_mapper); this presumably relied on a variable from an earlier
# session and will raise NameError on a fresh kernel - confirm.
mapper[mapper.STATE_2010!=5].sample(5)

Unnamed: 0,STATE_2010,COUNTY_2010,TRACT_2010,BLK_2010,BLKSF_2010,AREALAND_2010,AREAWATER_2010,BLOCK_PART_FLAG_O,STATE_2020,COUNTY_2020,TRACT_2020,BLK_2020,BLKSF_2020,AREALAND_2020,AREAWATER_2020,BLOCK_PART_FLAG_R,AREALAND_INT,AREAWATER_INT
214265,28,143,950100,3127,,23110700,100151,p,5,77,470100,1144,,5488666,18355,p,2746974,15338
214326,29,181,870200,2093,,9816817,0,p,5,121,960202,2007,,1925458,0,p,128,0
214385,47,97,50100,1029,,0,136119,p,5,93,10700,3080,,0,81984,p,0,28814
214219,22,27,950200,2009,,801699,0,p,5,27,950100,3076,,2514951,0,p,179,0
214274,28,143,950100,3370,,0,5331500,p,5,77,470100,1148,,0,1958738,p,0,624419


In [72]:
# Count, per 2010 state code, the mapper rows whose 2010 state code is not 5.
# NOTE(review): `mapper` is not defined at notebook level in the visible cells - confirm where it comes from.
mapper[mapper.STATE_2010!=5].STATE_2010.value_counts()

47    85
28    63
29    60
40    27
48    8 
22    6 
Name: STATE_2010, dtype: int64

#### Create Mapped IDs

In [144]:
def map_to_ID10(x):
    """Describe which ID10 values the notebook-level ``mapper_df`` links to ID20 ``x``.

    Returns a dict whose keys are the ID10 values found on rows where
    ``ID20 == x`` and whose values are the number of rows in ``mapper_df``
    carrying that ID10 (i.e. into how many rows that ID10 is split).
    An unknown ``x`` gives an empty dict.
    """
    id10_col = mapper_df.ID10
    linked_ids = id10_col[mapper_df.ID20 == x].tolist()

    split_counts = {}
    for source_id in linked_ids:
        # Number of mapper rows that share this ID10.
        split_counts[source_id] = int((id10_col == source_id).sum())
    return split_counts

In [145]:
# Attach to every row the {ID10: row count} dict of its ID20.
# map_to_ID10 filters the whole of mapper_df on each call, so the cost grows
# roughly with the square of the number of rows.
mapper_df['ID10_mapped'] = mapper_df.ID20.apply(map_to_ID10)
mapper_df.head()

Unnamed: 0,index,ID10,ID20,STATE_2020,ID10_mapped
0,0,150010201001000,150010201003062,15,"{150010201001000: 1, 150010201001001: 1, 15001..."
1,1,150010201001001,150010201003062,15,"{150010201001000: 1, 150010201001001: 1, 15001..."
2,2,150010201001002,150010201003062,15,"{150010201001000: 1, 150010201001001: 1, 15001..."
3,3,150010201001003,150010201003062,15,"{150010201001000: 1, 150010201001001: 1, 15001..."
4,4,150010201001004,150010201003052,15,"{150010201001004: 1, 150010201001005: 1}"


In [209]:
# One row per ID20 is enough because ID10_mapped is identical for every row of the
# same ID20; keep the lookup columns only and order the table by ID20.
mapper_df = (
    mapper_df
    .drop_duplicates(subset='ID20')
    .loc[:, ['ID20', 'STATE_2020', 'ID10_mapped']]
    .sort_values(by='ID20')
)
mapper_df.head()

Unnamed: 0,ID20,STATE_2020,ID10_mapped
27450,100010401001000,10,"{100010401002000: 1, 100010401002001: 1}"
27452,100010401001001,10,{100010401002002: 1}
27453,100010401001002,10,"{100010401002003: 1, 100010401002004: 1}"
27455,100010401001003,10,{100010401002005: 1}
27456,100010401001004,10,{100010401002006: 2}


In [211]:
# Load mapper
# The lookup table is read back from disk; the line that writes it is kept
# commented out (presumably so the slow mapping step need not be repeated):
# mapper_df.to_csv('census_full/mapped_blocks.csv')
# After the CSV round-trip ID10_mapped holds the text form of each dict.
mapper_df = pd.read_csv('census_full/mapped_blocks.csv').drop(columns=['Unnamed: 0'])
mapper_df.head()

#### Calculate the P10 equivalent for each 2020 block

In [212]:
def calculate_P10_equivalent(id20):
    """Sum the P10 shares of all ID10 entries mapped to ``id20``.

    Looks up the ``ID10_mapped`` entry of ``id20`` in the notebook-level
    ``mapper_df`` and, for each ``{ID10: parts}`` pair, adds
    ``P10 / parts`` taken from the notebook-level ``data10``.

    Parameters
    ----------
    id20 : int
        ID20 value to look up in ``mapper_df``.

    Returns
    -------
    number
        The accumulated value; ``0`` when the mapping is empty.

    Raises
    ------
    IndexError
        If ``id20`` is missing from ``mapper_df`` or a mapped ID10 is missing
        from ``data10``.
    """
    mapping = mapper_df[mapper_df.ID20 == id20].ID10_mapped.iloc[0]
    # The mapping is a string after a CSV round-trip but still a dict when
    # mapper_df was built in this session; literal_eval only accepts the former.
    if isinstance(mapping, str):
        mapping = ast.literal_eval(mapping)

    P1 = 0
    for id10, parts in mapping.items():
        # Each ID10 contributes an equal share to every row it was split into.
        P1 += data10[data10.ID10 == id10]['P10'].iloc[0] / parts
    return P1

In [None]:
# pd.set_option('display.max_colwidth', -1)

In [215]:
# Compute the P10-equivalent value for every row with a non-null ID20.
# The apply runs on a pandas copy (one Python call per row); the resulting
# pandas Series is then assigned back as a column of the cuDF frame.
data20['P10_new'] = data20[data20['ID20'].notnull()].to_pandas().ID20.apply(calculate_P10_equivalent)
data20.head()

Unnamed: 0,GISJOIN,ID20,STATE,COUNTY,P20,P10_new
1216403,10000100401001000,100010401001000,10,Kent County,108,103
1216404,10000100401001001,100010401001001,10,Kent County,44,31
1216405,10000100401001002,100010401001002,10,Kent County,74,57
1216406,10000100401001003,100010401001003,10,Kent County,60,94
1216407,10000100401001004,100010401001004,10,Kent County,12,46


In [223]:
# Write the 2020 frame (including P10_new) to a CSV file in the working directory.
data20.to_csv('HI_DE_DC_mapped_data.csv')

### BYPASS OTHER STATES

In [186]:
def map_to_ID10_bypass(x, state_fips=5):
    """Like ``map_to_ID10`` but ignoring ID10 values that lie outside one state.

    Reads the notebook-level ``mapper_df``, which must have ``ID10`` and
    ``ID20`` columns.

    Parameters
    ----------
    x : int
        ID20 value to look up.
    state_fips : int, optional
        State code whose ID10 values are kept. IDs are the state code followed
        by 13 digits, so the kept range is
        ``[state_fips * 10**13, (state_fips + 1) * 10**13)``. Defaults to 5,
        the value that was previously hard-coded.

    Returns
    -------
    dict
        ``{ID10: number of mapper_df rows with that ID10}`` for the in-state
        ID10 values on rows where ``ID20 == x``.
    """
    lower = state_fips * 10**13
    # Exclusive upper bound: (state_fips + 1) * 10**13 already belongs to the next state code.
    upper = (state_fips + 1) * 10**13

    id10_col = mapper_df.ID10
    in_state = (id10_col >= lower) & (id10_col < upper)  # 2010 blocks of other states bypassed
    ids = id10_col[(mapper_df.ID20 == x) & in_state].tolist()
    return {id10: int((id10_col == id10).sum()) for id10 in ids}

In [187]:
# Attach the in-state-only {ID10: row count} dict to every row.
# map_to_ID10_bypass filters the whole of mapper_df on each call.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, i.e. the
# run recorded here was stopped before it finished.
mapper_df['ID10_mapped_bypass'] = mapper_df.ID20.apply(map_to_ID10_bypass)
mapper_df.head()

KeyboardInterrupt: 

In [155]:
# mapper_df.to_csv('mapped_blocks_bypass.csv')

In [188]:
# mapper_df = pd.read_csv('mapped_blocks_bypass.csv').drop(columns='Unnamed: 0',axis=1)
# mapper_df.head(2)

Unnamed: 0,index,ID10,ID20,STATE_2020,ID10_mapped,ID10_mapped_bypass
0,0,50014801001000,50014801001000,5,"{50014801001000: 1, 50014801001034: 3}","{50014801001000: 1, 50014801001034: 3}"
1,1,50014801001001,50014801001001,5,"{50014801001001: 1, 50014801001002: 1, 50014801001003: 1, 50014801001004: 1, 50014801001005: 1, 50014801001006: 1, 50014801001027: 1, 50014801001029: 1, 50014801001030: 1, 50014801001032: 1, 50014801001034: 3}","{50014801001001: 1, 50014801001002: 1, 50014801001003: 1, 50014801001004: 1, 50014801001005: 1, 50014801001006: 1, 50014801001027: 1, 50014801001029: 1, 50014801001030: 1, 50014801001032: 1, 50014801001034: 3}"


#### Debug mapper

In [205]:
# Debug: show the mapper rows for one specific ID20.
mapper_df[mapper_df.ID20==51499526003079]

Unnamed: 0,index,ID10,ID20,STATE_2020,ID10_mapped,ID10_mapped_bypass
214215,214215,51499526003114,51499526003079,5,{51499526003114: 1},{51499526003114: 1}


In [206]:
# Debug: show the row for the ID10 that the lookup above returned.
# NOTE(review): `block10` is not defined in the visible cells - confirm where it comes from.
block10[block10.GEOID10==51499526003114]

Unnamed: 0,GEOID10,H7V001
3005741,51499526003114,8


In [207]:
# Debug: show the same GEOID10 in the merged table.
# NOTE(review): `merged_df` is not defined in the visible cells - confirm where it comes from.
merged_df[merged_df.GEOID10==51499526003114]

Unnamed: 0,GEOID10,P1,GEOID20,P2,STATEA,INTPTLAT,INTPTLON,COUNTY
227418,51499526003114,8,,,,,,


#### Bypass (continued)

In [221]:
def calculate_P1_equivalent_bypass(id20):
    """Sum the P1 shares of all in-state GEOID10 entries mapped to ``id20``.

    Looks up the ``ID10_mapped_bypass`` entry of ``id20`` in the notebook-level
    ``mapper_df`` and, for each ``{GEOID10: parts}`` pair, adds ``P1 / parts``
    taken from the notebook-level ``merged_df``.

    Parameters
    ----------
    id20 : int or null
        ID20 value to look up.

    Returns
    -------
    number or None
        ``None`` when ``id20`` is null; otherwise the accumulated value
        (``0`` when the mapping is empty).

    Raises
    ------
    IndexError
        If ``id20`` is missing from ``mapper_df`` or a mapped GEOID10 is
        missing from ``merged_df``.
    """
    if pd.isnull(id20):
        # Rows without an ID20 have nothing to map.
        return None

    mapping = mapper_df[mapper_df.ID20 == id20].ID10_mapped_bypass.iloc[0]
    # The mapping is a string after a CSV round-trip but still a dict when it
    # was computed in this session; literal_eval only accepts the former.
    if isinstance(mapping, str):
        mapping = ast.literal_eval(mapping)

    P1 = 0
    for geoid10, parts in mapping.items():
        P1 += merged_df[merged_df.GEOID10 == geoid10]['P1'].iloc[0] / parts
    return P1

In [203]:
# mapper_df.to_csv('mapped_blocks_bypass.csv')

In [223]:
# Compute the P1-equivalent value for every GEOID20 (null GEOID20 values give None).
# NOTE(review): `merged_df` is not defined in the visible cells - confirm where it comes from.
merged_df['P1_new'] = merged_df['GEOID20'].apply(calculate_P1_equivalent_bypass)
merged_df.head()

Unnamed: 0,GEOID10,P1,GEOID20,P2,STATEA,INTPTLAT,INTPTLON,COUNTY,P1_new
0,50014801001000,0,50014801001000,0,5,35,-91,Arkansas County,0
1,50014801001001,101,50014801001001,103,5,35,-91,Arkansas County,172
2,50014801001002,0,50014801001002,0,5,35,-91,Arkansas County,0
3,50014801001003,0,50014801001003,10,5,35,-91,Arkansas County,17
4,50014801001004,0,50014801001004,66,5,35,-91,Arkansas County,74


In [224]:
# merged_df.to_csv('mapped_data_AR.csv')