#### Objective: map 2010 census blocks to 2020 census blocks and compute block-level population for both years

In [1]:
import pandas as pd
import geopandas as gpd
import ast,os,random
pd.set_option('display.float_format','{:.1f}'.format)
import warnings
warnings.filterwarnings('ignore')
import cudf, cupy as cp
import numpy as np
import time
# pd.set_option('display.max_colwidth', -1)

#### Load Data

In [3]:
# Block-level census CSV extracts for 2020 and 2010, relative to the notebook's working directory.
full_2020_path, full_2010_path = (
    'data/nhgis0007_csv/nhgis0007_ds248_2020_block.csv',
    'data/nhgis0001_csv/nhgis0001_ds172_2010_block.csv',
)

#### Prepare 2010 data

In [3]:
# Load only the three columns needed from the 2010 block file.
# H7V001 is read as text here; it is cast to float32 later, when the weights are computed.
cols_2010 = {'GISJOIN': 'str', 'H7V001': 'str', 'STATEA': 'int'}
full = cudf.read_csv(full_2010_path, usecols=list(cols_2010), dtype=cols_2010)

In [4]:
# To experiment on a subset, filter `full` on STATEA first, e.g.
#   full = full[(full.STATEA == 11) | (full.STATEA == 10)]
#
# Rebind rather than deep-copy: `full` is dropped immediately afterwards, so a
# copy of the ~11M-row frame would only double peak device memory for no benefit.
data10 = full
del full

In [5]:
# Drop the 'G' from GISJOIN and switch to the short column names used in the rest of the notebook.
data10['GISJOIN'] = data10['GISJOIN'].str.replace('G', '')
data10 = data10.rename(columns={'H7V001': 'P10', 'STATEA': 'STATE'})
data10.head()

Unnamed: 0,GISJOIN,STATE,P10
0,1000100201001000,1,61
1,1000100201001001,1,0
2,1000100201001002,1,0
3,1000100201001003,1,75
4,1000100201001004,1,0


In [6]:
# Number of 2010 block rows loaded.
data10.shape[0]

11155486

In [7]:
# Numeric 2010 block id: keep GISJOIN characters 0-1, 3-5 and 7 onward
# (i.e. drop the characters at positions 2 and 6), then cast to int64.
gisjoin_10 = data10['GISJOIN']
id10_text = (
    gisjoin_10.str.slice(start=0, stop=2)
    + gisjoin_10.str.slice(start=3, stop=6)
    + gisjoin_10.str.slice(start=7)
)
data10['ID10'] = id10_text.astype('int64')

In [8]:
# Preview the 2010 table with its new ID10 column.
data10.head(5)

Unnamed: 0,GISJOIN,STATE,P10,ID10
0,1000100201001000,1,61,10010201001000
1,1000100201001001,1,0,10010201001001
2,1000100201001002,1,0,10010201001002
3,1000100201001003,1,75,10010201001003
4,1000100201001004,1,0,10010201001004


#### Prepare 2020 data

In [4]:
# Load the needed columns of the 2020 block file straight into `data20`.
# The original read into a temporary `full`, deep-copied it and deleted the
# temporary; that copy only doubled peak device memory for the ~8M-row frame.
#
# To experiment on a subset, filter on STATEA right after loading, e.g.
#   data20 = data20[(data20.STATEA == 11) | (data20.STATEA == 10)]
data20 = cudf.read_csv(
    full_2020_path,
    usecols=['GEOCODE', 'GISJOIN', 'STATEA', 'COUNTY', 'U7B001'],
    dtype={
        'GEOCODE': 'int64',
        'COUNTY': 'str',
        'GISJOIN': 'str',
        'U7B001': 'int32',
        'STATEA': 'int32',
    },
)

In [16]:
# County names repeat across states: count 2020 blocks named 'Montgomery County' per state code.
in_montgomery = data20['COUNTY'] == 'Montgomery County'
data20[in_montgomery].STATEA.value_counts()

42    12463
39     9932
24     9513
48     6920
1      4793
20     3150
47     2954
17     2846
36     2562
51     1832
29     1787
18     1705
37     1460
19     1385
5      1041
28      796
13      740
21      645
Name: STATEA, dtype: int32

In [10]:
# Short column names for the 2020 table; GISJOIN loses its 'G' and becomes an int64.
data20 = data20.rename(columns={'U7B001': 'P20', 'GEOCODE': 'ID20', 'STATEA': 'STATE'})
data20['GISJOIN'] = data20['GISJOIN'].str.replace('G', '').astype('int64')
data20.head()

Unnamed: 0,GISJOIN,ID20,STATE,COUNTY,P20
0,1000100201001000,10010201001000,1,Autauga County,21
1,1000100201001001,10010201001001,1,Autauga County,34
2,1000100201001002,10010201001002,1,Autauga County,29
3,1000100201001003,10010201001003,1,Autauga County,17
4,1000100201001004,10010201001004,1,Autauga County,0


In [11]:
# Number of 2020 block rows loaded.
data20.shape[0]

8174955

### Mapper 

#### Concat mapper files for states

In [12]:
# Numeric state code -> two-letter abbreviation. Both parts are used to build the
# relationship-file names (tab2010_tab2020_st<code>_<abbr>.csv); insertion order is
# the order in which the files are loaded.
states = {
    1: "AL", 2: "AK", 4: "AZ", 5: "AR", 6: "CA", 8: "CO", 9: "CT", 10: "DE",
    11: "DC", 12: "FL", 13: "GA", 15: "HI", 16: "ID", 17: "IL", 18: "IN", 19: "IA",
    20: "KS", 21: "KY", 22: "LA", 23: "ME", 24: "MD", 25: "MA", 26: "MI", 27: "MN",
    28: "MS", 29: "MO", 30: "MT", 31: "NE", 32: "NV", 33: "NH", 34: "NJ", 35: "NM",
    36: "NY", 37: "NC", 38: "ND", 39: "OH", 40: "OK", 41: "OR", 42: "PA", 44: "RI",
    45: "SC", 46: "SD", 47: "TN", 48: "TX", 49: "UT", 50: "VT", 51: "VA", 53: "WA",
    54: "WV", 55: "WI", 56: "WY", 72: "PR",
}

# For a quick run on two small states only:
# states = {11:"DC",10:"DE"}


In [13]:
def concat_states_mapper(state_key_list):
    """Load and stack the 2010->2020 block relationship files for the given states.

    Parameters
    ----------
    state_key_list : iterable of int
        Numeric state codes. Each must be a key of the notebook-level ``states``
        dict, which supplies the abbreviation used in the file name.

    Returns
    -------
    cudf.DataFrame
        The rows of every file that was found, concatenated in iteration order
        with each file's own row index kept. An empty DataFrame if no file
        was found.

    Raises
    ------
    KeyError
        If a code in ``state_key_list`` is not present in ``states``.
    """
    frames = []
    for state_code in state_key_list:
        # %02d zero-pads single-digit codes, e.g. 1 -> "01".
        path = 'data/block_rel_files/tab2010_tab2020_st%02d_%s.csv' % (
            state_code, states[state_code].lower())
        if not os.path.isfile(path):
            # Name the missing file so a gap in the mapper can be traced.
            print("mapper file does not exist: %s" % path)
            continue
        frames.append(cudf.read_csv(path, delimiter='|'))

    # Concatenate once at the end instead of re-copying the growing frame per state.
    if not frames:
        return cudf.DataFrame()
    return cudf.concat(frames)

In [14]:
# The first column name carries an invisible leading byte-order mark (U+FEFF);
# it is spelled as an explicit escape here so the rename is readable.
mapper_df = concat_states_mapper(states.keys()).rename(
    columns={'\ufeffSTATE_2010': 'STATE_2010'}
)
mapper_df.head(2)

Unnamed: 0,STATE_2010,COUNTY_2010,TRACT_2010,BLK_2010,BLKSF_2010,AREALAND_2010,AREAWATER_2010,BLOCK_PART_FLAG_O,STATE_2020,COUNTY_2020,TRACT_2020,BLK_2020,BLKSF_2020,AREALAND_2020,AREAWATER_2020,BLOCK_PART_FLAG_R,AREALAND_INT,AREAWATER_INT
0,1,1,20100,1000,,482628,0,p,1,1,20100,1000,,288702,0,,288702,0
1,1,1,20100,1000,,482628,0,p,1,1,20100,1001,,194408,0,p,193926,0


In [15]:
# Build the block id for each vintage by zero-padding and concatenating
# state (2) + county (3) + tract (6) + block (4), then casting to int64.
id_parts = (('STATE', 2), ('COUNTY', 3), ('TRACT', 6), ('BLK', 4))
for year in ('2010', '2020'):
    pieces = [
        mapper_df['%s_%s' % (part, year)].astype('str').str.rjust(width, '0')
        for part, width in id_parts
    ]
    block_id = pieces[0]
    for piece in pieces[1:]:
        block_id = block_id + piece
    mapper_df['ID' + year[-2:]] = block_id.astype('int64')

# Keep only the ids and the 2020 state; reset_index() keeps the old row labels as an 'index' column.
mapper_df = mapper_df[['ID10', 'ID20', 'STATE_2020']].reset_index()
mapper_df.head()

Unnamed: 0,index,ID10,ID20,STATE_2020
0,0,10010201001000,10010201001000,1
1,1,10010201001000,10010201001001,1
2,2,10010201001001,10010201001001,1
3,3,10010201001002,10010201001002,1
4,4,10010201001003,10010201001002,1


#### Create Mapped IDs

#### Single Mapping

In [16]:
# How many mapper rows (i.e. 2020 blocks) each 2010 block id appears in.
# NOTE(review): this relies on value_counts().reset_index() naming its columns
# 'index' and 'ID10'; confirm against the installed cudf version.
id10_counts = mapper_df.ID10.value_counts().reset_index().sort_values('index')
freq_df = (
    id10_counts
    .rename(columns={'ID10': 'freq'})
    .rename(columns={'index': 'ID10'})
    .reset_index(drop=True)
)

# Population as float so it can be divided.
data10['P10'] = data10['P10'].astype('float32')

# Right join: one row per 2010 block in data10, with its mapping count where available.
freq_df = freq_df.merge(data10[['ID10', 'P10']], on='ID10', how='right').sort_values('ID10')

# Equal split of the 2010 population across the rows the block maps to.
freq_df['weights'] = freq_df['P10'] / freq_df['freq']
freq_df.head()

Unnamed: 0,ID10,P10,freq,weights
7392,10010201001000,61.0,2,30.5
7393,10010201001001,0.0,1,0.0
7394,10010201001002,0.0,1,0.0
7395,10010201001003,75.0,2,37.5
7396,10010201001004,0.0,1,0.0


In [17]:
# Attach each 2010 block's per-mapping weight to the mapper rows, ordered by 2020 block id.
weighted_mapper = (
    mapper_df.merge(freq_df[['ID10', 'weights']], on='ID10', how='left')
    .sort_values('ID20')
    .reset_index(drop=True)
)

In [18]:
# Total weight received by each 2020 block: one row per ID20, sorted by ID20.
eq_10 = (
    weighted_mapper[['ID20', 'weights']]
    .groupby('ID20')
    .sum()
    .reset_index()
    .sort_values('ID20')
    .reset_index(drop=True)
)
eq_10.head()

Unnamed: 0,ID20,weights
0,10010201001000,30.5
1,10010201001001,30.5
2,10010201001002,51.8
3,10010201001003,13.3
4,10010201001004,0.0


In [19]:
# NOTE(review): this assignment aligns on the row index, not on ID20. eq_10 has one
# row per distinct ID20 while weighted_mapper can have several rows per ID20, so row i
# receives the total of the i-th distinct ID20 (in the displayed head, the row with
# ID20 ...1001 at index 2 shows 51.8, which is the total for ...1002).
# Treat this column as positional only; to get the per-ID20 total on each mapper row,
# join eq_10 on ID20 instead.
weighted_mapper['eq_P10'] = eq_10['weights']
weighted_mapper.head()

Unnamed: 0,index,ID10,ID20,STATE_2020,weights,eq_P10
0,0,10010201001000,10010201001000,1,30.5,30.5
1,1,10010201001000,10010201001001,1,30.5,30.5
2,2,10010201001001,10010201001001,1,0.0,51.8
3,3,10010201001002,10010201001002,1,0.0,13.3
4,4,10010201001003,10010201001002,1,37.5,0.0


In [20]:
# Attach the 2010-equivalent population to each 2020 block by joining on ID20.
# The original copied the column by row position, which is only right if data20 is
# sorted by ID20 and holds exactly the same set of blocks as the mapper; a single
# missing mapper file or extra block would silently shift every later value.
eq_p10_by_block = eq_10.rename(columns={'weights': 'eq_P10'})
data20 = (
    data20
    .drop(columns=['eq_P10'], errors='ignore')  # keeps the cell re-runnable
    .merge(eq_p10_by_block, on='ID20', how='left')  # blocks absent from the mapper get null
    .sort_values('ID20')  # merge does not guarantee row order
    .reset_index(drop=True)
)
data20.head()

Unnamed: 0,GISJOIN,ID20,STATE,COUNTY,P20,eq_P10
0,1000100201001000,10010201001000,1,Autauga County,21,30.5
1,1000100201001001,10010201001001,1,Autauga County,34,30.5
2,1000100201001002,10010201001002,1,Autauga County,29,51.8
3,1000100201001003,10010201001003,1,Autauga County,17,13.3
4,1000100201001004,10010201001004,1,Autauga County,0,0.0


In [24]:
# data20.to_csv('data/mapped_blocks_full.csv')