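This notebook takes the full set of Dewey/SafeGraph points of interest (PoIs), splits them by state, removes charging stations to avoid double counting, assigns FIPS codes (county, census tract, block group), and saves the result separately for each state.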

In [None]:
import os
import pandas as pd
import geopandas as geopd
import us
from tqdm import tqdm

In [None]:
# Base directory; every other location is built by appending to it
root = ''

# Input locations
path = root + 'Data/'
path_US_data = root + 'Data/geodata/'
path_IRA = root + 'Data/IRA/1.0-shapefile-codebook/usa/'

# Output location
result_path = root + 'final_data/'

In [None]:
# Create the output folder for the per-state Dewey files.
# exist_ok=True makes this a no-op when the folder is already there and
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(result_path + 'Dewey/', exist_ok=True)

# Read Dewey / Safegraph data: PoI

In [None]:
# Load the compiled Dewey PoI table, indexed by placekey
file = path + 'Dewey_original/compiled.csv'
df_dewey = pd.read_csv(file, index_col=['placekey'])

# Discard the stray index column carried over in the CSV
df_dewey = df_dewey.drop(columns='Unnamed: 0')

# Show which region codes occur in the data
print(df_dewey['region'].unique())

# Split Dewey data by state

In [None]:
# Load the station table (first CSV column is used as the index) and preview it
stations_csv = result_path + '00_alt_fuel_stations (Apr 3 2023)_wFIPS.csv'
df_stations_wbgs = pd.read_csv(stations_csv, index_col=0)
df_stations_wbgs.head(3)

In [None]:
# Get states as used by final station data.
# Missing values are dropped first: a NaN entry cannot be concatenated into
# the per-state file names or sorted together with the string codes later on.
states_AFDC = list(df_stations_wbgs['State'].dropna().unique())

In [None]:
# For every station-data state: write the PoIs of that state to disk,
# with the charging stations separated into their own file
for state_abbr in tqdm(states_AFDC):
    # All PoIs whose region matches this state
    df_dewey_state = df_dewey.loc[df_dewey['region'] == state_abbr]

    # Rows in this sub-category are charging stations (checked by the original
    # author); they are split off to avoid double counting them as PoIs
    is_charging_station = df_dewey_state['sub_category'] == 'Other Gasoline Stations'

    df_dewey_chargingstations = df_dewey_state.loc[is_charging_station]
    df_dewey_chargingstations.to_csv(result_path + 'Dewey/compiled_'+state_abbr+'_chargingstations.csv')

    # Everything else is kept as the state's PoI set
    df_dewey_state = df_dewey_state.loc[~is_charging_station]
    df_dewey_state.to_csv(result_path + 'Dewey/compiled_'+state_abbr+'.csv')

# Assign FIPS

In [None]:
# Spatial-join helper shared by the county / tract / block-group steps
def _sjoin_within(gdf_points, gdf_polygons, keep_cols, renames, failure_msg):
    """Left-join polygon attributes onto the points lying within them.

    Parameters:
        gdf_points: GeoDataFrame of point geometries (left side of the join).
        gdf_polygons: GeoDataFrame of polygons, same CRS as ``gdf_points``.
        keep_cols: list of polygon attribute columns to carry over.
        renames: mapping applied to the column names after the join.
        failure_msg: text printed when the join changed the number of rows
            (with a left join this presumably means a point matched more
            than one polygon).

    Returns:
        The joined GeoDataFrame without the ``index_right`` helper column.
    """
    gdf_joined = gdf_points.sjoin(gdf_polygons[keep_cols + ['geometry']], how='left', predicate='within')
    gdf_joined = gdf_joined.drop(columns='index_right').rename(columns=renames)
    if len(gdf_joined) != len(gdf_points):
        print(failure_msg)
    return gdf_joined


# Converts Dewey data to geodataframe and attaches county / tract / block group / state FIPS
def assign_FIPS(df_dewey_state,gdf_county,col):
    """Assign FIPS codes to the PoIs of one state by spatial joins.

    Points are built from the ``longitude`` / ``latitude`` columns (EPSG:4326)
    and joined against three polygon layers:

    * the county layer passed in (``GEOID`` -> ``COUNTYFP``),
    * the state's IRA shapefile read from ``path_IRA`` (``GEOID10`` -> ``TRACTFP``),
    * the state's 2020 block-group shapefile read from ``path``
      (``GEOID`` -> ``BGFP``, plus ``ALAND``).

    ``STATEFP`` is taken from the first two characters of ``COUNTYFP``. If more
    than one distinct ``STATEFP`` value is found, the share of rows outside the
    state is printed and those rows are dropped.

    Parameters:
        df_dewey_state: DataFrame of PoIs with ``longitude`` and ``latitude`` columns.
        gdf_county: GeoDataFrame of counties with ``GEOID`` and ``geometry``.
        col: state abbreviation (e.g. ``'CA'``); ``'DC'`` is handled explicitly.

    Returns:
        GeoDataFrame with the added columns ``COUNTYFP``, ``TRACTFP``, ``BGFP``,
        ``ALAND`` and ``STATEFP``.

    Raises:
        ValueError: if ``col`` is not recognised by ``us.states.lookup``.
    """
    # Convert to geodataframe
    gdf_dewey_state = geopd.GeoDataFrame(
        df_dewey_state,
        geometry=geopd.points_from_xy(df_dewey_state['longitude'], df_dewey_state['latitude'], crs="EPSG:4326"),
    )

    # County
    # Align the county layer with the points (same treatment as the IRA and
    # block-group layers below) instead of relying on an assert
    if gdf_county.crs != gdf_dewey_state.crs:
        gdf_county = gdf_county.to_crs(gdf_dewey_state.crs)
    # Use shapefile to identify county by geometry
    gdf_dewey_wcounties = _sjoin_within(
        gdf_dewey_state, gdf_county, ['GEOID'], {'GEOID': 'COUNTYFP'},
        'Test: gdf_dewey_state == gdf_dewey_wcounties failed',
    )

    # CT
    # Resolve state FIPS code and name; the name selects the IRA shapefile
    if col == 'DC':
        # NOTE(review): DC is hard-coded, presumably because us.states.lookup
        # does not resolve it by default - confirm against the installed version
        state_fips = '11'
        state_name = 'District of Columbia'
    else:
        state = us.states.lookup(col)
        if state is None:
            raise ValueError(f"Unknown state abbreviation: {col!r}")
        state_fips = state.fips
        state_name = state.name
    # Read IRA file
    gdf_IRA = geopd.read_file(path_IRA + state_name.replace(' ','') + '.shp')
    gdf_IRA = gdf_IRA.to_crs(gdf_dewey_wcounties.crs)
    # Assign tracts
    gdf_dewey_wCT = _sjoin_within(
        gdf_dewey_wcounties, gdf_IRA, ['GEOID10'], {'GEOID10': 'TRACTFP'},
        'Test: gdf_dewey_wcounties == gdf_dewey_wCT failed',
    )

    # BG
    # Read BG file
    file_bg = path + 'geodata/tl_bg/tl_2020_'+state_fips+'_bg/tl_2020_'+state_fips+'_bg.shp'
    gdf_bg = geopd.read_file(file_bg)
    gdf_bg = gdf_bg.to_crs(gdf_dewey_wCT.crs)
    # Assign block groups (ALAND is carried over under its original name)
    gdf_dewey_wBG = _sjoin_within(
        gdf_dewey_wCT, gdf_bg, ['GEOID', 'ALAND'], {'GEOID': 'BGFP'},
        'Test: gdf_dewey_wCT == gdf_dewey_wBG failed',
    )

    # Check for unique index
    if gdf_dewey_wBG.index.duplicated().sum() != 0:
        print('Test: unique index failed')

    # STATEFP
    gdf_dewey_wBG['STATEFP'] = gdf_dewey_wBG['COUNTYFP'].str[:2]
    n_total = len(gdf_dewey_wBG)
    state_codes = set(gdf_dewey_wBG['STATEFP'])
    # n_total guard: an empty frame has nothing to remove and would otherwise
    # cause a division by zero in the percentage below
    if n_total and len(state_codes) != 1:
        print('Test: Multiple STATEFPs')
        print(state_codes)

        # How many to remove?
        in_state = gdf_dewey_wBG['STATEFP'] == state_fips
        print('Removing [%]:')
        print((1.-int(in_state.sum())/n_total)*100)
        gdf_dewey_wBG = gdf_dewey_wBG.loc[in_state]

    return gdf_dewey_wBG

In [None]:
# Load the 2022 US county shapefile and reproject it to EPSG:4326
county_shapefile = path_US_data + 'tl_2022_us_county/tl_2022_us_county.shp'
gdf_county = geopd.read_file(county_shapefile).to_crs("EPSG:4326")

In [None]:
# Run the FIPS assignment state by state (alphabetical order) and store the result
for state_abbr in tqdm(sorted(states_AFDC)):
    print(state_abbr)

    # Per-state PoI file written by the split step
    state_csv = result_path + 'Dewey/compiled_'+state_abbr+'.csv'
    df_dewey_state = pd.read_csv(state_csv)

    # Attach county / tract / block group / state FIPS codes
    gdf_dewey_state = assign_FIPS(df_dewey_state, gdf_county, state_abbr)

    # Geometry is not needed in the CSV output
    df_dewey_state = gdf_dewey_state.drop(columns='geometry')
    out_csv = result_path + 'Dewey/01_compiled_'+state_abbr+'.csv'
    df_dewey_state.to_csv(out_csv)