In [1]:
import pandas as pd
import geopandas as gpd
import ast,os,random
pd.set_option('display.float_format','{:.1f}'.format)
import warnings
warnings.filterwarnings('ignore')
import cudf, cupy as cp
import numpy as np
import time
# pd.set_option('display.max_colwidth', -1)

#### Load Data

In [2]:
full_2020_path='data/nhgis0007_csv/nhgis0007_ds248_2020_block.csv'
full_2010_path='data/nhgis0001_csv/nhgis0001_ds172_2010_block.csv'

#### Prepare 2010 data

In [3]:
full = cudf.read_csv(full_2010_path, usecols=['GISJOIN', 'H7V001' ,'STATEA'],dtype={'GISJOIN':'str','H7V001':'str','STATEA':'int'})
# full.STATEA = full.STATEA.astype('int')

In [4]:
# small = full[ (full.STATEA ==11) | (full.STATEA ==10)] 
# data10 = small.copy()
data10 = full.copy()

del(full)

In [5]:
data10.GISJOIN = data10.GISJOIN.str.replace('G', '')
data10.rename(columns={'H7V001':'P10','STATEA':'STATE'},inplace=True)
data10.head()

Unnamed: 0,GISJOIN,STATE,P10
0,1000100201001000,1,61
1,1000100201001001,1,0
2,1000100201001002,1,0
3,1000100201001003,1,75
4,1000100201001004,1,0


In [6]:
len(data10)

11155486

In [7]:
data10['ID10'] = (data10.GISJOIN.str.slice(start=0,stop=2) + data10.GISJOIN.str.slice(start=3,stop=6) + data10.GISJOIN.str.slice(start=7)).astype('int64')

In [8]:
data10.head()

Unnamed: 0,GISJOIN,STATE,P10,ID10
0,1000100201001000,1,61,10010201001000
1,1000100201001001,1,0,10010201001001
2,1000100201001002,1,0,10010201001002
3,1000100201001003,1,75,10010201001003
4,1000100201001004,1,0,10010201001004


#### Prepare 2020 data

In [9]:
full = cudf.read_csv(full_2020_path,usecols=['GEOCODE','GISJOIN','STATEA','COUNTY','U7B001'],dtype={'GEOCODE':'int64','COUNTY':'str','GISJOIN':'str','U7B001':'int32','STATEA':'int32'})
# full.STATEA = full.STATEA.astype('int')
# small = full[ (full.STATEA ==11) | (full.STATEA ==10)] 
# data20 = cudf.from_pandas(small)
data20 = full.copy()
del(full)

In [10]:
# df = small.copy()
data20.rename(columns={'U7B001':'P20','GEOCODE':'ID20','STATEA':'STATE'},inplace=True)
data20.GISJOIN = data20.GISJOIN.str.replace('G', '').astype('int64')
data20.head()

Unnamed: 0,GISJOIN,ID20,STATE,COUNTY,P20
0,1000100201001000,10010201001000,1,Autauga County,21
1,1000100201001001,10010201001001,1,Autauga County,34
2,1000100201001002,10010201001002,1,Autauga County,29
3,1000100201001003,10010201001003,1,Autauga County,17
4,1000100201001004,10010201001004,1,Autauga County,0


In [11]:
len(data20)

8174955

### Mapper 

#### Concat mapper files for states

In [12]:
states = {1 :"AL",2 :"AK",4 :"AZ",5 :"AR",6 :"CA",8 :"CO",9 :"CT",10:"DE",11:"DC",12:"FL",13:"GA",15:"HI",
          16:"ID",17:"IL",18:"IN",19:"IA",20:"KS",21:"KY",22:"LA",23:"ME",24:"MD",25:"MA",26:"MI",27:"MN",
          28:"MS",29:"MO",30:"MT",31:"NE",32:"NV",33:"NH",34:"NJ",35:"NM",36:"NY",37:"NC",38:"ND",39:"OH",
          40:"OK",41:"OR",42:"PA",44:"RI",45:"SC",46:"SD",47:"TN",48:"TX",49:"UT",50:"VT",51:"VA",53:"WA",
          54:"WV",55:"WI",56:"WY",72:"PR"}

# states = {11:"DC",10:"DE"}


In [13]:
def concat_states_mapper(state_key_list): 
    
    concat_mapper = cudf.DataFrame()
    for i in state_key_list:
        if i< 10:
            i_str = '0'+str(i)
        else:
            i_str = str(i)
        path = 'data/block_rel_files/tab2010_tab2020_st%s_%s.csv'%(i_str,states[i].lower())
        if os.path.isfile(path):    
            mapper = cudf.read_csv(path,delimiter='|')
            concat_mapper = cudf.concat([concat_mapper,mapper])
        else:
            print("mapper file does not exist")
            continue
    return concat_mapper

In [14]:
mapper_df = concat_states_mapper(states.keys())
mapper_df.rename(columns={'﻿STATE_2010':'STATE_2010'},inplace=True)
mapper_df.head(2)

Unnamed: 0,STATE_2010,COUNTY_2010,TRACT_2010,BLK_2010,BLKSF_2010,AREALAND_2010,AREAWATER_2010,BLOCK_PART_FLAG_O,STATE_2020,COUNTY_2020,TRACT_2020,BLK_2020,BLKSF_2020,AREALAND_2020,AREAWATER_2020,BLOCK_PART_FLAG_R,AREALAND_INT,AREAWATER_INT
0,1,1,20100,1000,,482628,0,p,1,1,20100,1000,,288702,0,,288702,0
1,1,1,20100,1000,,482628,0,p,1,1,20100,1001,,194408,0,p,193926,0


In [15]:
mapper_df['ID10'] = (mapper_df.STATE_2010.astype('str').str.rjust(2,'0') + mapper_df.COUNTY_2010.astype('str').str.rjust(3,'0') + mapper_df.TRACT_2010.astype('str').str.rjust(6,'0') + mapper_df.BLK_2010.astype('str').str.rjust(4,'0')).astype('int64')
mapper_df['ID20'] = (mapper_df.STATE_2020.astype('str').str.rjust(2,'0') + mapper_df.COUNTY_2020.astype('str').str.rjust(3,'0') + mapper_df.TRACT_2020.astype('str').str.rjust(6,'0') + mapper_df.BLK_2020.astype('str').str.rjust(4,'0')).astype('int64')
mapper_df = mapper_df[['ID10','ID20','STATE_2020']].reset_index()
mapper_df.head()

Unnamed: 0,index,ID10,ID20,STATE_2020
0,0,10010201001000,10010201001000,1
1,1,10010201001000,10010201001001,1
2,2,10010201001001,10010201001001,1
3,3,10010201001002,10010201001002,1
4,4,10010201001003,10010201001002,1


#### Create Mapped IDs

In [16]:
# CPU function
# def map_to_ID10(x):
#     filtered_df = mapper_df[mapper_df.ID20 ==x]
#     ids = filtered_df.ID10.tolist() # get id10s
#     id_dict = {id10:len(mapper_df[mapper_df.ID10 == id10]) for id10 in ids} # Use id10 to get freq
#     return id_dict

In [17]:
mapped_df = cudf.merge(mapper_df[['ID10','ID20','STATE_2020']],mapper_df.groupby('ID20')['ID10'].unique().reset_index(),on='ID20',how='left').sort_values('ID10_x')
mapped_df = mapped_df.drop_duplicates('ID20')
mapped_df.head()

Unnamed: 0,ID10_x,ID20,STATE_2020,ID10_y
2160,10010201001000,10010201001000,1,[10010201001000]
2161,10010201001000,10010201001001,1,"[10010201001000, 10010201001001]"
2163,10010201001002,10010201001002,1,"[10010201001002, 10010201001003, 1001020100100..."
2170,10010201001007,10010201001003,1,"[10010201001007, 10010201001010, 1001020100101..."
2172,10010201001008,10010201001004,1,[10010201001008]


In [18]:
len(mapped_df)

8174955

In [20]:
lengths = mapper_df.groupby('ID20')['ID10'].nunique().to_cupy()

In [18]:
flattened_ID10_arr = mapped_df.ID10_y.explode('ID10_y').to_cupy()
len(flattened_ID10_arr)

12689707

In [19]:
flattened_ID10_arr

array([ 10010201001000,  10010201001000,  10010201001001, ...,
       721537506022020, 721537506022022, 721537506022023])

In [114]:
freq_df = mapped_df.ID10_y.explode('ID10_y').value_counts().reset_index().sort_values('index')
freq_df.rename(columns={'index':'ID10','ID10_y':'freq'},inplace=True)
freq_df = freq_df.reset_index(drop=True)
freq_df.head(2)

Unnamed: 0,ID10,freq
0,10010201001000,2
1,10010201001001,1


In [118]:
f1 = cudf.DataFrame(mapped_df.ID10_y.explode('ID10_y')).reset_index()
f1.rename(columns={'ID10_y':'ID10','index':'order'},inplace=True)
f1

Unnamed: 0,order,ID10
0,0,10010201001000
1,1,10010201001000
2,2,10010201001001
3,3,10010201001002
4,4,10010201001003
...,...,...
12689702,12689702,721537506022018
12689703,12689703,721537506022019
12689704,12689704,721537506022020
12689705,12689705,721537506022022


In [120]:
df1 = cudf.merge(f1,freq_df,on='ID10',how='left' )
# df1.drop(columns=['ID10_y'],inplace=True)
df2 = cudf.merge(df1,data10[['ID10','P10']],on='ID10')
df2.P10 = df2.P10.astype('float32')
df2['weights'] = df2.P10.divide(df2.freq)
df2 = df2.sort_values('order')
df2.head(15)

Unnamed: 0,order,ID10,freq,P10,weights
992,0,10010201001000,2,61.0,30.5
993,1,10010201001000,2,61.0,30.5
994,2,10010201001001,1,0.0,0.0
995,3,10010201001002,1,0.0,0.0
996,4,10010201001003,2,75.0,37.5
997,5,10010201001005,1,1.0,1.0
998,6,10010201001007,3,23.0,7.7
999,7,10010201001018,3,17.0,5.7
1000,8,10010201001007,3,23.0,7.7
1001,9,10010201001010,1,0.0,0.0


In [48]:
import time

In [123]:
print(data10[data10.ID10==10010201001008]['P10'])
print(data10[data10.ID10==10010201001012]['P10'])
print(data10[data10.ID10==10010201001013]['P10'])

8    0
Name: P10, dtype: object
12    2
Name: P10, dtype: object
13    2
Name: P10, dtype: object


In [124]:
freq_df.head(15)

Unnamed: 0,ID10,freq
0,10010201001000,2
1,10010201001001,1
2,10010201001002,1
3,10010201001003,2
4,10010201001004,1
5,10010201001005,1
6,10010201001006,1
7,10010201001007,3
8,10010201001008,2
9,10010201001009,1


In [122]:
mapped_df.reset_index(drop=True).head(17)

Unnamed: 0,ID10_x,ID20,STATE_2020,ID10_y
0,10010201001000,10010201001000,1,[10010201001000]
1,10010201001000,10010201001001,1,"[10010201001000, 10010201001001]"
2,10010201001002,10010201001002,1,"[10010201001002, 10010201001003, 10010201001005, 10010201001007, 10010201001018]"
3,10010201001007,10010201001003,1,"[10010201001007, 10010201001010, 10010201001011, 10010201001018]"
4,10010201001008,10010201001004,1,[10010201001008]
5,10010201001007,10010201001005,1,"[10010201001007, 10010201001011]"
6,10010201001009,10010201001006,1,[10010201001009]
7,10010201001017,10010201001007,1,[10010201001017]
8,10010201001008,10010201001008,1,"[10010201001008, 10010201001012, 10010201001013]"
9,10010201001013,10010201001009,1,[10010201001013]


In [98]:
df2.reset_index(drop=True).head(30)

Unnamed: 0,ID10,freq,P10,weights
0,10010201001000,2,61.0,30.5
1,10010201001000,2,61.0,30.5
2,10010201001001,1,0.0,0.0
3,10010201001002,1,0.0,0.0
4,10010201001003,2,75.0,37.5
5,10010201001003,2,75.0,37.5
6,10010201001004,1,0.0,0.0
7,10010201001005,1,1.0,1.0
8,10010201001006,1,0.0,0.0
9,10010201001007,3,23.0,7.7


In [125]:
print(lengths[:30])

[1 2 5 4 1 2 1 1 3 1 1 1 1 1 1 2 1 1 2 1 1 1 1 1 2 1 1 2 1 1]


#### Calculate new P10

In [131]:
sum_list = cp.array([])
c=0
end =0
start_time = time.time()
for f in lengths:
    c+=1
    # print(f)
    start = end
    end += int(f)
    arr =cp.arange(start,end)   
    # print(arr)
    total = df2['weights'].take(arr).sum()
    sum_list = cp.append(sum_list,total)
    # print(total)
    if c%1000000==0:
        # print(start,end)
        # print(df2['weights'].take(arr))
        # print(total)
        # print(arr)
        # print(sum_list)
        end_time = time.time()
        print(end_time-start_time)
        start_time = end_time
        print(c)
        print(c/12689707*100)

1176.3128459453583
1000000
7.880402597160045
1234.8324863910675
2000000
15.76080519432009
1305.8084828853607
3000000
23.641207791480134
1385.7390730381012
4000000
31.52161038864018
1434.0461013317108
5000000
39.40201298580022
1480.6101310253143
6000000
47.28241558296027
1558.3004684448242
7000000
55.16281818012031
1742.213505744934
8000000
63.04322077728036


In [159]:
# cp.save(open('sum_list_copy',"wb"),sum_list,allow_pickle=True)

In [150]:
eq_P10 = cp.load(open('sum_list',"rb"))
len(eq_P10)

8174955

In [157]:
data20['eq_P10'] = cudf.Series(eq_P10)
data20.head()

Unnamed: 0,GISJOIN,ID20,STATE,COUNTY,P20,eq_P10
0,1000100201001000,10010201001000,1,Autauga County,21,30.5
1,1000100201001001,10010201001001,1,Autauga County,34,30.5
2,1000100201001002,10010201001002,1,Autauga County,29,51.8
3,1000100201001003,10010201001003,1,Autauga County,17,13.3
4,1000100201001004,10010201001004,1,Autauga County,0,0.0


In [158]:
# data20.to_csv('data/mapped_data_full.csv')

In [205]:
mapper_df[mapper_df.ID20==51499526003079]

Unnamed: 0,index,ID10,ID20,STATE_2020,ID10_mapped,ID10_mapped_bypass
214215,214215,51499526003114,51499526003079,5,{51499526003114: 1},{51499526003114: 1}


In [206]:
block10[block10.GEOID10==51499526003114]

Unnamed: 0,GEOID10,H7V001
3005741,51499526003114,8


In [207]:
merged_df[merged_df.GEOID10==51499526003114]

Unnamed: 0,GEOID10,P1,GEOID20,P2,STATEA,INTPTLAT,INTPTLON,COUNTY
227418,51499526003114,8,,,,,,
