In [None]:
import cudf
from pyproj import Transformer, CRS
import pandas as pd
import numpy as np
import sys,os,datetime,random
import geopandas as gpd
from shapely.geometry import Point

In [None]:
# Input/output table locations, relative to the notebook's working directory:
# the full census table, its reduced variant, and the per-individual output file.
full_path, small_path, small_indv_path = (
    'census_2020_data/census2020.csv',
    'census_2020_data/census2020_small.csv',
    'census_2020_data/census2020_individuals_sm.csv',
)

In [None]:
# Load the reduced census table, keeping only the columns this notebook uses.
# skiprows=[1] skips the second line of the file; afterwards the row labelled 0 is dropped.
# NOTE(review): with the second file line already skipped, .drop(0) removes the first
# remaining data row -- confirm that this is intended and not a lost census block.
df = pd.read_csv(
    small_path,
    skiprows=[1],
    encoding='unicode_escape',
    usecols=[
        'GEOCODE', 'STATE', 'STATEA', 'REGIONA', 'COUNTYA', 'BLOCKA',
        'AREALAND', 'AREAWATR', 'INTPTLAT', 'INTPTLON',
        'U7B001', 'U7B002', 'U7B009', 'U7B026', 'U7B047', 'U7B063', 'U7B070',
    ],
)
df = df.drop(0)
df.head()

### Generate random points inside each census block

In [None]:
# Numeric state code (as used in STATEA and in the tl_2021_<code> shapefile names)
# mapped to the abbreviation used for log messages and output file names.
states = {
    1: "AL",
    2: "AK",
    4: "AZ",
    5: "AR",
    6: "CA",
}

In [None]:
# Peek at the first two rows of the census table.
df.head(2)

In [None]:
# Load the block shapefile for state code 01 (AL in `states`) to inspect its columns.
path = 'census_2020_data/tl_2021_01_tabblock20/tl_2021_01_tabblock20.shp'
gpdf = gpd.read_file(path)
gpdf.head()

In [None]:
def random_points_in_polygon(number, polygon):
    """Draw `number` random points that lie inside `polygon` by rejection sampling.

    Candidates are drawn uniformly from the polygon's bounding box and kept only
    when ``polygon.contains`` accepts them.

    Parameters
    ----------
    number : int
        How many points to generate. Values <= 0 yield two empty arrays.
    polygon : shapely geometry
        Must expose ``bounds``, ``contains``, ``is_empty`` and ``area``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        x coordinates and y coordinates of the accepted points, in the
        polygon's own coordinate system (x first, y second).

    Raises
    ------
    ValueError
        If points are requested from an empty or zero-area polygon, for which
        rejection sampling would never terminate.
    """
    if not number > 0:
        return np.array([], dtype=float), np.array([], dtype=float)

    # No random candidate can ever fall inside such a geometry, so fail loudly
    # instead of looping forever.
    if polygon.is_empty or polygon.area == 0:
        raise ValueError('cannot sample points inside an empty or zero-area polygon')

    min_x, min_y, max_x, max_y = polygon.bounds
    xs, ys = [], []
    # Accumulate in Python lists; growing numpy arrays with np.append copies the
    # whole array on every accepted point.
    while len(xs) < number:
        candidate_x = random.uniform(min_x, max_x)
        candidate_y = random.uniform(min_y, max_y)
        if polygon.contains(Point(candidate_x, candidate_y)):
            xs.append(candidate_x)
            ys.append(candidate_y)
    return np.array(xs, dtype=float), np.array(ys, dtype=float)

In [None]:
def generate_data(state, df_temp, gpdf):
    """Scatter one random point per counted person inside each block and save them.

    For every row of `gpdf` whose GEOID20 has a positive count in `df_temp`,
    ``random_points_in_polygon`` is asked for that many points inside the row's
    geometry. All points are written, together with their block id, to
    ``census_2020_data/population_<state>.csv`` (columns GEOID20, x, y).

    Parameters
    ----------
    state : str
        Label used in progress messages and in the output file name.
    df_temp : pandas.Series
        Point count per block, indexed by block id.
    gpdf : geopandas.GeoDataFrame
        Must provide ``GEOID20`` (numeric block id) and ``geometry`` columns.

    Returns
    -------
    None
        The result is only written to disk.
    """
    t1 = datetime.datetime.now()

    # A dict gives constant-time lookups; scanning a numpy array with `in` for
    # every block made the loop quadratic in the number of blocks.
    population_by_block = df_temp.to_dict()
    total_blocks = len(gpdf)

    x_parts, y_parts, geoid_parts = [], [], []
    # `position` equals the row label for the default RangeIndex produced by gpd.read_file.
    for position, (block_id, polygon) in enumerate(zip(gpdf['GEOID20'], gpdf['geometry'])):
        num_points = population_by_block.get(block_id, 0)
        # `not (n > 0)` also rejects NaN counts and ids that are missing from df_temp.
        if polygon is None or not (num_points > 0):
            continue

        points_x, points_y = random_points_in_polygon(num_points, polygon)
        x_parts.append(points_x)
        y_parts.append(points_y)
        geoid_parts.append(np.full(len(points_x), int(block_id), dtype='int64'))

        print('Processing '+str(state)+' - Completed:', "{0:0.2f}".format((position/total_blocks)*100), '%', end='')
        print('', end='\r')

    print('Processing for '+str(state)+' complete \n total time', datetime.datetime.now() - t1)

    # Join the per-block chunks once instead of re-copying with np.append per block.
    final_points_x = np.concatenate(x_parts) if x_parts else np.array([])
    final_points_y = np.concatenate(y_parts) if y_parts else np.array([])
    geoid = np.concatenate(geoid_parts) if geoid_parts else np.array([], dtype='int64')

    df_fin = cudf.DataFrame({'GEOID20': geoid, 'x': final_points_x, 'y': final_points_y})
    # Convert the whole column: assigning a `[1:]` slice back aligned on the index
    # and left the first point's GEOID20 null in the written file.
    df_fin['GEOID20'] = df_fin['GEOID20'].astype('str')
    df_fin.to_csv('census_2020_data/population_'+str(state)+'.csv', index=False)

In [None]:
def exec_data(state_key_list):
    """Generate the per-state random-point files for the given state codes.

    For each code the matching block shapefile is read, the census rows of that
    state are reduced to a Series of ``U7B001`` counts indexed by block id, and
    both are handed to ``generate_data``. A state whose shapefile is missing is
    reported and skipped.

    Relies on the notebook globals ``df`` (census table) and ``states``
    (state code -> abbreviation).

    Parameters
    ----------
    state_key_list : iterable of int
        Numeric state codes; each must be a key of ``states``.
    """
    for i in state_key_list:
        print(i)
        i_str = str(i).zfill(2)  # shapefile folders use two-digit state codes
        path = 'census_2020_data/tl_2021_%s_tabblock20/tl_2021_%s_tabblock20.shp'%(i_str,i_str)
        print("started reading shape file for state ", states[i])
        if not os.path.isfile(path):
            print("shape file does not exist")
            continue

        gpdf = gpd.read_file(path)[['GEOID20', 'geometry']]
        # Convert every row: assigning a `[1:]` slice back aligned on the index,
        # turned row 0 into NaN (and the column into floats), so the first block
        # of every state was silently skipped.
        gpdf['GEOID20'] = gpdf['GEOID20'].astype('int64')
        print("completed reading shape file for state ", states[i])

        # The census key column is GEOCODE until a later cell renames it to
        # GEOID20 in place; accept either so this function can be re-run.
        id_col = 'GEOCODE' if 'GEOCODE' in df.columns else 'GEOID20'
        df_temp = df.loc[df['STATEA'] == i].set_index(id_col)['U7B001']

        print("starting to generate data for "+str(states[i])+"... ")
        generate_data(states[i], df_temp, gpdf)

In [None]:
# Generate and write the point file for every state code listed in `states`.
exec_data(states.keys())

### Concatenate the per-state results

In [None]:
# Show the first two census rows again before combining them with the generated points.
df.head(2)

In [None]:
# Read back the generated point file for AL as a quick look at its contents.
df3 = cudf.read_csv('census_2020_data/population_%s.csv'%('AL'), usecols=['GEOID20','x', 'y'])
df3.head(3)

In [None]:
# Scratch check of the coordinate transform on the AL points.
inProj = '+proj=aea +lat_1=29.5 +lat_2=45.5 +lat_0=37.5 +lon_0=-96 +x_0=0 +y_0=0 +datum=NAD83 +units=m +no_defs' # source CRS: Albers equal-area projection (+proj=aea), units in metres
outProj = 'epsg:4326' # target CRS: EPSG:4326 (WGS 84 longitude/latitude)
transformer = Transformer.from_crs(inProj, outProj, always_xy=True) # always_xy: coordinates are passed and returned in x, y order
transformer.transform(df3['x'].to_numpy(), df3['y'].to_numpy())

In [None]:
def read_state(state):
    """Load one state's generated point file and add transformed coordinates.

    Reads ``census_2020_data/population_<state>.csv`` (columns GEOID20, x, y),
    fills null GEOID20 values from neighbouring rows, and stores the result of
    transforming (x, y) from the Albers CRS defined below to EPSG:4326 in the
    new columns ``x1`` and ``y1``.

    Parameters
    ----------
    state : str
        State abbreviation used in the file name.

    Returns
    -------
    cudf.DataFrame
        Columns GEOID20, x, y, x1, y1.
    """
    print(state)
    print('reading '+state,end='\r')
    points = cudf.read_csv('census_2020_data/population_%s.csv'%(state), usecols=['GEOID20','x', 'y'])

    # Backward fill first so a null in the first row takes the next row's id,
    # then forward fill anything that is still null.
    back_filled = points.GEOID20.fillna(method='bfill')
    points.GEOID20 = back_filled
    points.GEOID20 = points.GEOID20.fillna(method='ffill')

    # Source CRS: Albers equal-area projection in metres; target: EPSG:4326 (lon/lat).
    # NOTE(review): x/y are read as-is from the point file -- confirm they really
    # are in this Albers CRS before relying on x1/y1.
    source_crs = '+proj=aea +lat_1=29.5 +lat_2=45.5 +lat_0=37.5 +lon_0=-96 +x_0=0 +y_0=0 +datum=NAD83 +units=m +no_defs'
    target_crs = 'epsg:4326'
    to_target = Transformer.from_crs(source_crs, target_crs, always_xy=True)

    xs = points['x'].to_numpy()
    ys = points['y'].to_numpy()
    transformed_x, transformed_y = to_target.transform(xs, ys)
    points['x1'] = transformed_x
    points['y1'] = transformed_y
    return points

In [None]:
# Rename the census key column to GEOID20 so it matches the key used by the point files.
# NOTE: this mutates `df` in place -- after this cell the column is named GEOID20, not GEOCODE.
df.rename(columns={"GEOCODE":"GEOID20"},inplace=True)
df.head(2)

In [None]:
# Load every state's point file and stack them into a single frame.
df1 = [read_state(abbr) for abbr in states.values()]
final_df = cudf.concat(df1)
del df1  # the per-state frames are no longer needed once concatenated

# Renumber the rows and use the new running index as a per-point id.
final_df = final_df.reset_index(drop=True)
final_df['p_id'] = final_df.index.astype('int32')

# Attach the census attributes of the containing block to every point (join on GEOID20).
point_columns = final_df[['GEOID20','x','y','p_id']]
dataset = cudf.merge(point_columns, cudf.from_pandas(df), on='GEOID20')
dataset.head()

In [None]:
# Round-trip through pandas to coerce every GEOID20 value with int(), then
# convert the result back into a cudf DataFrame.
# Note: `df3` is reused here; it previously held the AL point preview.
df3 = dataset.to_pandas()
df3['GEOID20'] = df3['GEOID20'].apply(int)
final_data = cudf.from_pandas(df3)
final_data.head(2)

In [None]:
# Inspect the last rows of the combined dataset.
final_data.tail()

In [None]:
# List the columns of the combined dataset.
final_data.columns

In [None]:
# final_data.to_csv('census_2020_data/census2020_individuals_sm.csv')