In [2]:
import pandas as pd,numpy as np
import geopandas as gpd
import ast,os,random
# Render floats without decimal places in displayed frames.
pd.set_option('display.float_format','{:.0f}'.format)
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation and dtype warnings from pandas/cuDF.
warnings.filterwarnings('ignore')
import cudf
# pd.set_option('display.max_colwidth', -1) 

#### Load Data

In [3]:
# Input CSVs for the 2020 and 2010 block tables (NHGIS extracts, per the file names).
# Paths are relative to the notebook's working directory.
full_2020_path='census_2020_data/nhgis0007_csv/nhgis0007_ds248_2020_block.csv'
full_2010_path='census_2010_data/nhgis0001_ds172_2010_block.csv'

#### Prepare 2010 data

In [4]:
# Read only the three columns needed from the 2010 table, keep the rows whose
# STATEA is 10 or 11, and convert that subset to a cuDF DataFrame.
full = pd.read_csv(
    full_2010_path,
    usecols=['GISJOIN', 'H7V001', 'STATEA'],
    encoding='unicode_escape',
)
small = full[full.STATEA.isin([10, 11])]
data10 = cudf.from_pandas(small)
del full  # the unfiltered frame is no longer needed

In [5]:
# Remove the 'G' characters from GISJOIN and shorten the column names.
data10['GISJOIN'] = data10['GISJOIN'].str.replace('G', '')
data10 = data10.rename(columns={'H7V001': 'P10', 'STATEA': 'STATE'})
data10.head()

Unnamed: 0,GISJOIN,STATE,P10
1704220,10000100401001000,10,77
1704221,10000100401001001,10,294
1704222,10000100401001002,10,20
1704223,10000100401001003,10,91
1704224,10000100401001004,10,53


In [11]:
def GISJOIN_to_ID(gisjoin):
    """Convert a GISJOIN value to an integer block ID.

    The value is converted with ``str()``; the characters at positions 2 and 6
    are dropped, the remaining pieces (``[:2]``, ``[3:6]``, ``[7:]``) are
    concatenated, and the result is returned as an ``int``.

    Assumes any leading 'G' has already been removed by the caller.
    """
    text = str(gisjoin)
    pieces = (text[:2], text[3:6], text[7:])
    return int(''.join(pieces))

In [12]:
# Build ID10 with vectorised integer arithmetic instead of copying the column
# to pandas and calling a Python function per row.
# GISJOIN (without 'G') is laid out as SS 0 CCC 0 TTTTTTBBBB (17 digits), so:
#   state        = digits above 10**15
#   county       = the three digits above 10**11
#   tract+block  = the lowest ten digits
# Dropping the two filler digits gives the 15-digit SSCCCTTTTTTBBBB id, which is
# what GISJOIN_to_ID produces for a 17-character value.
# Casting first also makes the cell safe to re-run after GISJOIN is already int.
gisjoin_int = data10.GISJOIN.astype('int64')
state_part = gisjoin_int // 10**15
county_part = (gisjoin_int // 10**11) % 1000
tract_block_part = gisjoin_int % 10**10
data10['ID10'] = state_part * 10**13 + county_part * 10**10 + tract_block_part
data10.GISJOIN = gisjoin_int

In [13]:
data10.head()

Unnamed: 0,GISJOIN,STATE,P10,ID10
1704220,10000100401001000,10,77,100010401001000
1704221,10000100401001001,10,294,100010401001001
1704222,10000100401001002,10,20,100010401001002
1704223,10000100401001003,10,91,100010401001003
1704224,10000100401001004,10,53,100010401001004


#### Prepare 2020 data

In [19]:
# Read the needed columns of the 2020 table (file row 1 is skipped), keep the
# rows whose STATEA is 10 or 11, and convert that subset to a cuDF DataFrame.
full = pd.read_csv(
    full_2020_path,
    skiprows=[1],
    encoding='unicode_escape',
    usecols=['GEOCODE', 'GISJOIN', 'STATEA', 'COUNTY', 'U7B001'],
)
small = full[full.STATEA.isin([10, 11])]
data20 = cudf.from_pandas(small)
del full  # the unfiltered frame is no longer needed

In [20]:
# df = small.copy()
# Shorten the column names, then strip the 'G' characters from GISJOIN and
# store it as an integer.
data20 = data20.rename(columns={'U7B001': 'P20', 'GEOCODE': 'ID20', 'STATEA': 'STATE'})
data20['GISJOIN'] = data20['GISJOIN'].str.replace('G', '').astype('int')
data20.head()

Unnamed: 0,GISJOIN,ID20,STATE,COUNTY,P20
1216403,10000100401001000,100010401001000,10,Kent County,108
1216404,10000100401001001,100010401001001,10,Kent County,44
1216405,10000100401001002,100010401001002,10,Kent County,74
1216406,10000100401001003,100010401001003,10,Kent County,60
1216407,10000100401001004,100010401001004,10,Kent County,12


### Mapper 

#### Concat mapper files for states

In [21]:
# Numeric state code -> two-letter abbreviation. The keys select which mapper
# files are loaded; the lower-cased abbreviation is part of each file name.
states = { 10:"DE",11:"DC"}

In [22]:
def concat_states_mapper(state_key_list):
    """Load and stack the 2010->2020 block mapper files for the given states.

    Parameters
    ----------
    state_key_list : iterable of int
        State codes; each must be a key of the module-level ``states`` dict,
        whose value supplies the abbreviation used in the file name.

    Returns
    -------
    cudf.DataFrame
        Rows of every mapper file that was found, in input order. The per-file
        row index is kept as-is (not renumbered). An empty DataFrame is
        returned when no file exists.

    Raises
    ------
    KeyError
        If a state code is not present in ``states``.
    """
    frames = []
    for state_code in state_key_list:
        # File names use a two-digit, zero-padded state code.
        code_str = str(state_code).zfill(2)
        path = 'census_full/mapper_files/tab2010_tab2020_st%s_%s.csv' % (
            code_str, states[state_code].lower())
        if not os.path.isfile(path):
            # Name the missing file so the gap can be diagnosed.
            print("mapper file does not exist: %s" % path)
            continue
        frames.append(cudf.read_csv(path, delimiter='|'))

    if not frames:
        return cudf.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return cudf.concat(frames)

In [23]:
mapper_df = concat_states_mapper(states.keys())
# The first header read from the mapper files carries an invisible leading
# character (presumably a UTF-8 byte-order mark), so the column arrives as
# '<invisible>STATE_2010'. Match on the suffix rather than embedding the
# invisible character in a string literal, where it is easily lost in editing.
bom_renames = {
    col: 'STATE_2010'
    for col in mapper_df.columns
    if col != 'STATE_2010' and str(col).endswith('STATE_2010')
}
if bom_renames:
    mapper_df = mapper_df.rename(columns=bom_renames)
mapper_df.head(2)

Unnamed: 0,STATE_2010,COUNTY_2010,TRACT_2010,BLK_2010,BLKSF_2010,AREALAND_2010,AREAWATER_2010,BLOCK_PART_FLAG_O,STATE_2020,COUNTY_2020,TRACT_2020,BLK_2020,BLKSF_2020,AREALAND_2020,AREAWATER_2020,BLOCK_PART_FLAG_R,AREALAND_INT,AREAWATER_INT
0,10,1,40100,1000,,509097,0,,10,1,40100,3000,,509097,0,,509097,0
1,10,1,40100,1001,,3674392,0,,10,1,40100,3001,,3674392,0,,3674392,0


In [24]:
# UDF REQUIRED

def create_id(row):
    """Build a 15-digit integer block ID from (state, county, tract, block).

    Parameters
    ----------
    row : sequence-like (e.g. a pandas Series)
        The first four values, by position, are the state, county, tract and
        block codes. Extra values are ignored.

    Returns
    -------
    int
        The codes zero-padded to widths 2, 3, 6 and 4 and concatenated.

    Raises
    ------
    IndexError
        If fewer than four values are supplied.

    Notes
    -----
    The input row is not modified and nothing is printed. Values are read
    strictly by position, so the result does not depend on the row's labels.
    """
    widths = (2, 3, 6, 4)
    values = list(row)[:4]
    if len(values) < len(widths):
        raise IndexError("create_id needs four values: state, county, tract, block")
    # int() normalises numpy integers and whole-number floats before padding.
    padded = [str(int(value)).zfill(width) for value, width in zip(values, widths)]
    return int(''.join(padded))


In [25]:
# pandas copy of the mapper table, used here only for inspection.
test = mapper_df.to_pandas()
test.head(2)

Unnamed: 0,STATE_2010,COUNTY_2010,TRACT_2010,BLK_2010,BLKSF_2010,AREALAND_2010,AREAWATER_2010,BLOCK_PART_FLAG_O,STATE_2020,COUNTY_2020,TRACT_2020,BLK_2020,BLKSF_2020,AREALAND_2020,AREAWATER_2020,BLOCK_PART_FLAG_R,AREALAND_INT,AREAWATER_INT
0,10,1,40100,1000,,509097,0,,10,1,40100,3000,,509097,0,,509097,0
1,10,1,40100,1001,,3674392,0,,10,1,40100,3001,,3674392,0,,3674392,0


In [26]:
# Check which DataFrame implementation mapper_df currently is.
type(mapper_df)

cudf.core.dataframe.DataFrame

In [None]:
# LEARN UDF

# import cudf
# import numpy as np
# df = cudf.DataFrame()
# nelem = 3
# df['in1'] = mapper_df['STATE_2010']
# df['in2'] = mapper_df['COUNTY_2010']
# df['in3'] = mapper_df['TRACT_2010']
# df['in4'] = mapper_df['BLK_2010']

# in1 = df['in1']
# in2 = df['in2']
# in3 = df['in3']
# in4 = df['in4']
# def kernel(in1, in2, in3, out1, out2, kwarg1, kwarg2):
#     for i, (x, y, z,w) in enumerate(zip(in1, in2, in3,in4)):
#         out1[i] = kwarg2 * x - kwarg1 * y *w
#         out2[i] = y - kwarg1 * z
# df.apply_rows(kernel,
#               incols=['in1', 'in2', 'in3','in4'],
#               outcols=dict(out1=np.float64, out2=np.float64),
#               kwargs=dict(kwarg1=3, kwarg2=4))

In [None]:
# import cudf
# import numpy as np
# df = cudf.DataFrame()
# nelem = 3
# df['in1'] = np.arange(nelem)
# df['in2'] = np.arange(nelem)
# df['in3'] = np.arange(nelem)

In [None]:
# in1 = mapper_df['STATE_2010']
# in2 = mapper_df['COUNTY_2010']
# in3 = mapper_df['TRACT_2010']
# in4 = mapper_df['BLK_2010']
# def kernel(in1, in2, in3,in4, out1, out2, kwarg1, kwarg2):
#     for i, (x, y, z) in enumerate(zip(in1, in2, in3,in4)):
#         out1[i] = kwarg2 * x - kwarg1 * y
#         out2[i] = y - kwarg1 * z

In [None]:
# df.apply_rows(kernel,
#               incols=['in1', 'in2', 'in3','in4'],
#               outcols=dict(out1=np.float64, out2=np.float64),
#               kwargs=dict(kwarg1=3, kwarg2=4))

In [None]:
# def tp(in1,in2,out1,kwarg1):
#     for i, (x,y) in enumerate(zip(in1,in2)):
#         out1[i]= x + y

In [None]:
# mapper_df.apply_rows(tp,incols=['in1','in2'],outcols=dict(out1=np.float64),kwargs=dict(kwarg1=3))

In [None]:
# Build the 15-digit block ids (SS CCC TTTTTT BBBB) with vectorised integer
# arithmetic. This gives the same value as zero-padding and concatenating the
# four codes (widths 2, 3, 6, 4), but runs column-wise and needs no row-wise
# Python UDF.
mapper_df['ID10'] = (
    mapper_df['STATE_2010'].astype('int64') * 10**13
    + mapper_df['COUNTY_2010'].astype('int64') * 10**10
    + mapper_df['TRACT_2010'].astype('int64') * 10**4
    + mapper_df['BLK_2010'].astype('int64')
)
mapper_df['ID20'] = (
    mapper_df['STATE_2020'].astype('int64') * 10**13
    + mapper_df['COUNTY_2020'].astype('int64') * 10**10
    + mapper_df['TRACT_2020'].astype('int64') * 10**4
    + mapper_df['BLK_2020'].astype('int64')
)
# drop=True: the old per-file row numbers are not meaningful, so they are not
# kept as an extra 'index' column.
mapper_df = mapper_df[['ID10', 'ID20', 'STATE_2020']].reset_index(drop=True)
mapper_df.head()

#### Out of state blocks

In [None]:
# Mapper rows whose 2010 block lies in a different state than the 2020 block.
# The 2010 state code is the leading part of the 15-digit ID10 (ID10 // 10**13).
out_of_state = mapper_df[(mapper_df.ID10 // 10**13) != mapper_df.STATE_2020]
# Cap the sample size so the cell also works when fewer than 5 rows qualify.
out_of_state.sample(min(5, len(out_of_state)))

In [None]:
# Count, per 2010 state code, the mapper rows whose 2010 block lies in a
# different state than the 2020 block (state code = ID10 // 10**13).
state_2010 = mapper_df.ID10 // 10**13
state_2010[state_2010 != mapper_df.STATE_2020].value_counts()

#### Create Mapped IDs

In [None]:
# UDF required


def map_to_ID10(x):
    """Map one 2020 block id to the 2010 blocks it overlaps.

    Parameters
    ----------
    x : int
        An ID20 value to look up in the module-level ``mapper_df``.

    Returns
    -------
    dict
        ``{ID10: n}`` for every ID10 paired with ``x`` in ``mapper_df``, where
        ``n`` is the number of ``mapper_df`` rows carrying that ID10. Empty if
        ``x`` does not occur.
    """
    id_series = mapper_df[mapper_df.ID20 == x].ID10
    # A cuDF Series does not support tolist(); bring it to pandas first.
    if hasattr(id_series, 'to_pandas'):
        id_series = id_series.to_pandas()
    id_dict = {}
    for id10 in id_series.tolist():
        if id10 not in id_dict:  # count each distinct id only once
            id_dict[id10] = len(mapper_df[mapper_df.ID10 == id10])
    return id_dict

In [None]:
# Attach, to every row, the {ID10: row count} dict for that row's ID20.
# NOTE(review): map_to_ID10 filters mapper_df again on every call, so this
# scales quadratically with the number of rows.
# NOTE(review): presumably this needs mapper_df to be a pandas frame, since the
# callback is an arbitrary Python function returning a dict; confirm.
mapper_df['ID10_mapped'] = mapper_df.ID20.apply(map_to_ID10)
mapper_df.head()

In [None]:
# Keep one row per ID20, restricted to the three columns used later, ordered by ID20.
mapper_df = (
    mapper_df
    .drop_duplicates('ID20')
    [['ID20', 'STATE_2020', 'ID10_mapped']]
    .sort_values('ID20')
)
mapper_df.head()

In [None]:
# #Load mapper
# mapper_df.to_csv('census_full/mapped_blocks.csv')
# mapper_df = pd.read_csv('census_full/mapped_blocks.csv').drop(columns='Unnamed: 0',axis=1)
# mapper_df.head()

#### Calculate new P1

In [None]:
# UDF required

def calculate_P10_equivalent(id20):
    """Estimate the 2010 population attributable to one 2020 block.

    Parameters
    ----------
    id20 : int
        An ID20 value present in the module-level ``mapper_df``.

    Returns
    -------
    number
        Sum over the mapped 2010 blocks of ``P10 / parts``, where ``parts`` is
        the count stored for that block in ``ID10_mapped``. ``0`` when the
        mapping is empty.

    Raises
    ------
    IndexError
        If ``id20`` is not in ``mapper_df``, or a mapped ID10 is not in
        ``data10``.
    """
    # print(id20)
    mapped = mapper_df[mapper_df.ID20 == id20].ID10_mapped.iloc[0]
    # The mapping is a dict when computed in this session and a string after a
    # CSV round trip; ast.literal_eval only accepts the string form.
    id10_parts = ast.literal_eval(mapped) if isinstance(mapped, str) else mapped

    P1 = 0
    for id10, parts in id10_parts.items():
        P1 += data10[data10.ID10 == id10]['P10'].iloc[0] / parts
    return P1

In [None]:
# pd.set_option('display.max_colwidth', -1)

In [None]:
# Compute the P10 equivalent for every row with a non-null ID20. The filtered
# frame is converted to pandas so the Python function can be applied per value.
# NOTE(review): each call filters mapper_df and data10 again, so this is slow on large inputs.
data20['P10_new'] = data20[data20['ID20'].notnull()].to_pandas().ID20.apply(calculate_P10_equivalent)
data20.head()

In [None]:
# data20.to_csv('HI_DE_DC_mapped_data.csv')

### BYPASS OTHER STATES

In [None]:
def map_to_ID10_bypass(x, state_codes=None):
    """Like ``map_to_ID10`` but ignoring 2010 blocks from states outside the analysis.

    Parameters
    ----------
    x : int
        An ID20 value to look up in the module-level ``mapper_df``.
    state_codes : iterable of int, optional
        State codes whose 2010 blocks are kept. Defaults to the keys of the
        module-level ``states`` dict, so the filter follows the states this
        notebook loads rather than a hard-coded id range.

    Returns
    -------
    dict
        ``{ID10: n}`` for every kept ID10 paired with ``x``, where ``n`` is the
        number of ``mapper_df`` rows carrying that ID10 (counted over all rows,
        including rows for other 2020 blocks).
    """
    if state_codes is None:
        state_codes = list(states)
    # The state code is the leading part of the 15-digit id: ID10 // 10**13.
    in_scope = (mapper_df.ID10 // 10**13).isin(list(state_codes))
    id_series = mapper_df[(mapper_df.ID20 == x) & in_scope].ID10
    # A cuDF Series does not support tolist(); bring it to pandas first.
    if hasattr(id_series, 'to_pandas'):
        id_series = id_series.to_pandas()
    id_dict = {}
    for id10 in id_series.tolist():
        if id10 not in id_dict:  # count each distinct id only once
            id_dict[id10] = len(mapper_df[mapper_df.ID10 == id10])
    return id_dict

In [None]:
# Attach, to every row, the bypass mapping dict for that row's ID20.
# NOTE(review): the callback filters mapper_df on every call, so this scales
# quadratically with the number of rows.
mapper_df['ID10_mapped_bypass'] = mapper_df.ID20.apply(map_to_ID10_bypass)
mapper_df.head()

In [None]:
# mapper_df.to_csv('mapped_blocks_bypass.csv')

In [None]:
# mapper_df = pd.read_csv('mapped_blocks_bypass.csv').drop(columns='Unnamed: 0',axis=1)
# mapper_df.head(2)

#### Debug mapper

In [None]:
# Inspect the mapper row(s) for one specific ID20.
# NOTE(review): this id starts with state code 5, which is not among the codes
# in `states` (10, 11); the lookup is presumably left over from another run.
mapper_df[mapper_df.ID20==51499526003079]

In [None]:
# NOTE(review): `block10` is not defined anywhere in the visible notebook, so
# this cell fails on a fresh kernel unless it is created elsewhere.
block10[block10.GEOID10==51499526003114]

In [None]:
# NOTE(review): `merged_df` is not defined anywhere in the visible notebook, so
# this cell fails on a fresh kernel unless it is created elsewhere.
merged_df[merged_df.GEOID10==51499526003114]

#### Bypass (continued)

In [None]:
def calculate_P1_equivalent_bypass(id20):
    """Estimate the 2010 population for one 2020 block using the bypass mapping.

    Parameters
    ----------
    id20 : int or null
        An ID20 value present in the module-level ``mapper_df``.

    Returns
    -------
    number or None
        ``None`` when ``id20`` is null. Otherwise the sum over the blocks in
        ``ID10_mapped_bypass`` of ``P1 / parts`` (``0`` for an empty mapping).

    Raises
    ------
    IndexError
        If ``id20`` is not in ``mapper_df``, or a mapped id is not in
        ``merged_df``.

    Notes
    -----
    NOTE(review): reads the module-level ``merged_df`` (columns ``GEOID10`` and
    ``P1``), which is not created in the visible notebook. Confirm where it
    comes from, or whether ``data10`` / ``ID10`` / ``P10`` were intended.
    """
    if pd.isnull(id20):
        return None

    mapped = mapper_df[mapper_df.ID20 == id20].ID10_mapped_bypass.iloc[0]
    # The mapping is a dict when computed in this session and a string after a
    # CSV round trip; ast.literal_eval only accepts the string form.
    id10_parts = ast.literal_eval(mapped) if isinstance(mapped, str) else mapped

    P1 = 0
    for geoid10, parts in id10_parts.items():
        P1 += merged_df[merged_df.GEOID10 == geoid10]['P1'].iloc[0] / parts
    return P1

In [None]:
# mapper_df.to_csv('mapped_blocks_bypass.csv')

In [None]:
# Compute the bypass population estimate for every GEOID20 in merged_df.
# NOTE(review): `merged_df` is not defined anywhere in the visible notebook, so
# this cell fails on a fresh kernel unless it is created elsewhere.
merged_df['P1_new'] = merged_df['GEOID20'].apply(calculate_P1_equivalent_bypass)
merged_df.head()

In [None]:
# merged_df.to_csv('mapped_data_AR.csv')