This code reads in the original AFDC stations dataset, assigns each station a unique ID and FIPS codes down to the block-group (BG) level, and drops irrelevant columns and stations (e.g. stations outside the territory of interest).

In [None]:
import pandas as pd
import geopandas as geopd
import strgen
from tqdm import tqdm
import state_name_crs_mappings_ML as crsm
import us
import os

In [None]:
# Directory layout used throughout the notebook; every path is relative to `root`
root = ''
path = f'{root}Data/'  # raw input data
path_US_data = f'{root}Data/geodata/'  # Census shapefiles
result_path = f'{root}final_data/'  # all outputs are written here
path_IRA = f'{root}Data/IRA/1.0-shapefile-codebook/usa/'  # per-state IRA shapefiles

# Read in original station data

In [None]:
# Load the raw AFDC export and report how many stations it contains
afdc_csv = path + 'AFDC/alt_fuel_stations (Apr 3 2023).csv'
df_stations = pd.read_csv(afdc_csv)
print(df_stations.shape[0])

# Assign UID based on original dataset

In [None]:
# Create unique ID: <State><ZIP><street part (5 chars)><random part (5 chars)>

# A single generator is reused for every random token. The raw string keeps
# "\w" and "\d" from being treated as (invalid) Python escape sequences.
uid_token_generator = strgen.StringGenerator(r"[\w\d]{5}")

# Street part: first five characters of the street address, spaces removed
df_stations['temp'] = df_stations['Street Address'].str.replace(' ','').str[:5]
# For stations without street address, create a random string instead
no_street = df_stations['temp'].isna()
if no_street.any():
    df_stations.loc[no_street, 'temp'] = uid_token_generator.render_list(int(no_street.sum()))

# Random part: one token per station, generated in a single call.
# unique=True guarantees that no two stations share the same random part,
# so the combined ID cannot collide even for identical state/ZIP/street.
df_stations['temp2'] = uid_token_generator.render_list(len(df_stations), unique=True)

# Combine (result is NaN wherever 'State' is missing; handled in the next step)
df_stations['unique_ID'] = df_stations['State'] + df_stations['ZIP'].astype(str) + df_stations['temp'] + df_stations['temp2']

In [None]:
# The concatenated UID is NaN when a string component is missing (ZIP is cast
# to str, so in practice this means a missing state). Rebuild those IDs with
# the placeholder prefix 'US' instead of the state code.
index_UID_NaN = df_stations.index[df_stations['unique_ID'].isna()]
rows_without_uid = df_stations.loc[index_UID_NaN]
df_stations.loc[index_UID_NaN, 'unique_ID'] = (
    'US'
    + rows_without_uid['ZIP'].astype(str)
    + rows_without_uid['temp']
    + rows_without_uid['temp2']
)

In [None]:
# Remove the helper columns that were only needed to build the unique ID
df_stations.drop(columns=['temp', 'temp2'], inplace=True)

In [None]:
# Use the unique ID as the row index from here on
df_stations = df_stations.set_index('unique_ID')

In [None]:
# Persist the stations together with their unique IDs
uid_csv_out = result_path + '00_alt_fuel_stations (Apr 3 2023)_wUID.csv'
df_stations.to_csv(uid_csv_out)

# Drop irrelevant columns

In [None]:
# Reload the saved stations; the first column (unique_ID) becomes the index
uid_csv_in = result_path + '00_alt_fuel_stations (Apr 3 2023)_wUID.csv'
df_stations = pd.read_csv(uid_csv_in, index_col=0)
df_stations.head(3)

In [None]:
# Drop columns which are certainly not needed because they are empty or not
# relevant for EV charging. All columns are removed in one call, so a missing
# column name raises a KeyError before anything has been dropped.
irrelevant_columns = [
    'Plus4',
    'BD Blends',
    'NG Fill Type Code',
    'NG PSI',
    'Hydrogen Status Link',
    'NG Vehicle Class',
    'LPG Primary',
    'E85 Blender Pump',
    'Intersection Directions (French)',
    'Access Days Time (French)',
    'BD Blends (French)',
    'Groups With Access Code (French)',
    'Hydrogen Is Retail',
    'CNG Dispenser Num',
    'CNG On-Site Renewable Source',
    'CNG Total Compression Capacity',
    'CNG Storage Capacity',
    'LNG On-Site Renewable Source',
    'E85 Other Ethanol Blends',
    'EV Pricing (French)',
    'LPG Nozzle Types',
    'Hydrogen Pressures',
    'Hydrogen Standards',
    'CNG Fill Type Code',
    'CNG PSI',
    'CNG Vehicle Class',
    'LNG Vehicle Class',
    'RD Blends',
    'RD Blends (French)',
    'RD Blended with Biodiesel',
    'RD Maximum Biodiesel Level',
]
df_stations.drop(columns=irrelevant_columns, inplace=True)

# Assign FIPS information

In [None]:
# Build point geometries from the longitude/latitude columns (WGS84) and
# attach them to the station table
station_points = geopd.points_from_xy(
    df_stations['Longitude'],
    df_stations['Latitude'],
    crs="EPSG:4326",
)
gdf_stations = geopd.GeoDataFrame(df_stations, geometry=station_points)

## State info (given in the dataset, but missing for some stations)

In [None]:
# Load the state boundaries and reproject them to the CRS of the stations
state_shapefile = path_US_data + 'tl_2017_us_state/tl_2017_us_state.shp'
gdf_states = geopd.read_file(state_shapefile).to_crs(gdf_stations.crs)

In [None]:
# Spatially join each station to the state polygon it lies within, so the
# reported state can be cross-checked against the location.
# The geometry-derived state abbreviation ends up in column STUSPS.
state_polygons = gdf_states[['STUSPS', 'geometry']]
gdf_stations_wstates = gdf_stations.sjoin(state_polygons, how='left', predicate='within')
gdf_stations_wstates = gdf_stations_wstates.drop(columns='index_right')
gdf_stations_wstates.head(3)

In [None]:
# Add state FIPS code (str). Stations whose state cannot be resolved keep ''.
gdf_stations_wstates['STATEFP'] = ''
for state in gdf_stations_wstates['State'].unique():
    # Missing values and codes unknown to the `us` package have no match;
    # check for them explicitly instead of swallowing every exception.
    state_match = None if pd.isna(state) else us.states.lookup(str(state))
    if state_match is None:
        # Report missing or invalid state codes so they can be handled manually
        print(state)
        continue
    gdf_stations_wstates.loc[gdf_stations_wstates['State'] == state, 'STATEFP'] = state_match.fips

In [None]:
# The FIPS code for DC is set by hand
is_dc = gdf_stations_wstates['State'] == 'DC'
gdf_stations_wstates.loc[is_dc, 'STATEFP'] = '11'

In [None]:
# Correct the station that carries the QC state code but is located in CA.
# The unique ID ends in a randomly generated part, so it only exists in the
# dataset for which it was originally created. Assigning via .loc to a label
# that is not in the index would silently append a new, mostly empty row,
# hence the explicit membership check.
qc_station_uid = 'QC913165566Y8XvpW'
if qc_station_uid in gdf_stations_wstates.index:
    gdf_stations_wstates.loc[qc_station_uid, 'State'] = 'CA'
    gdf_stations_wstates.loc[qc_station_uid, 'STATEFP'] = '06'
else:
    print('Station ' + qc_station_uid + ' not found - QC/CA correction skipped')

In [None]:
# Stations with state code 'ON' are Canadian -- remove them
not_ontario = gdf_stations_wstates['State'] != 'ON'
gdf_stations_wstates = gdf_stations_wstates[not_ontario]

In [None]:
# Stations with state code 'KA' are wrongly listed -- remove them
not_ka = gdf_stations_wstates['State'] != 'KA'
gdf_stations_wstates = gdf_stations_wstates[not_ka]

In [None]:
# Stations that fall within no state polygon have a wrong geolocation:
# report how many there are, then remove them
has_geometric_state = gdf_stations_wstates['STUSPS'].notna()
print(int((~has_geometric_state).sum()))
gdf_stations_wstates = gdf_stations_wstates[has_geometric_state]

In [None]:
# Keep only stations whose reported state agrees with the state derived
# from their coordinates; print the number of disagreeing stations first
state_agrees = gdf_stations_wstates['State'] == gdf_stations_wstates['STUSPS']
print(int((~state_agrees).sum()))
gdf_stations_wstates = gdf_stations_wstates[state_agrees]

In [None]:
# Every remaining station has State == STUSPS, so STUSPS is redundant:
# verify this, then drop the column
all_states_agree = (gdf_stations_wstates['State'] == gdf_stations_wstates['STUSPS']).all()
assert all_states_agree
gdf_stations_wstates = gdf_stations_wstates.drop(columns=['STUSPS'])

In [None]:
# Remove stations with state code 'PR'
not_pr = gdf_stations_wstates['State'] != 'PR'
gdf_stations_wstates = gdf_stations_wstates[not_pr]

In [None]:
# Save the state-level result as CSV (without the geometry column).
# NOTE: the county, tract and BG steps further down write to this same file.
fips_csv_states = result_path + '00_alt_fuel_stations (Apr 3 2023)_wFIPS.csv'
df_stations_wstates = gdf_stations_wstates.drop(columns='geometry')
df_stations_wstates.to_csv(fips_csv_states)

# County

In [None]:
# Load the county boundaries and reproject them to the CRS of the stations
county_shapefile = path_US_data + 'tl_2022_us_county/tl_2022_us_county.shp'
gdf_county = geopd.read_file(county_shapefile).to_crs(gdf_stations.crs)
gdf_county.head(3)

In [None]:
# Use the county shapefile to identify each station's county by geometry;
# the county GEOID is stored in the new column COUNTYFP
county_polygons = gdf_county[['GEOID', 'geometry']]
gdf_stations_wcounties = gdf_stations_wstates.sjoin(county_polygons, how='left', predicate='within')
gdf_stations_wcounties = (
    gdf_stations_wcounties
    .drop(columns='index_right')
    .rename(columns={'GEOID': 'COUNTYFP'})
)
gdf_stations_wcounties.head(3)

In [None]:
# Save the county-level result as CSV (without the geometry column)
fips_csv_counties = result_path + '00_alt_fuel_stations (Apr 3 2023)_wFIPS.csv'
df_stations_wcounties = gdf_stations_wcounties.drop(columns='geometry')
df_stations_wcounties.to_csv(fips_csv_counties)

# Tract

In [None]:
# Attach the full state name to every station; it is needed below to build
# the file name of the per-state IRA shapefile
gdf_stations_wcounties['State_Name'] = None
for state_fips in gdf_stations_wcounties['STATEFP'].unique():
    # FIPS '11' (DC) is named by hand rather than looked up
    state_name = 'District of Columbia' if state_fips == '11' else us.states.lookup(state_fips).name
    rows_in_state = gdf_stations_wcounties['STATEFP'] == state_fips
    gdf_stations_wcounties.loc[rows_in_state, 'State_Name'] = state_name

In [None]:
# Assign tracts, one state at a time (the IRA shapefiles are stored per state)
tract_parts = []
for state in tqdm(gdf_stations_wcounties['STATEFP'].unique()):
    # Filter stations in state
    gdf = gdf_stations_wcounties.loc[gdf_stations_wcounties['STATEFP'] == state]
    state_name = gdf['State_Name'].iloc[0]
    # Read IRA file (file name is the state name without spaces)
    gdf_IRA = geopd.read_file(path_IRA + state_name.replace(' ','') + '.shp')
    gdf_IRA = gdf_IRA.to_crs(gdf_stations.crs)
    # Assign tracts: GEOID10 of the polygon the station lies within
    gdf = gdf.sjoin(gdf_IRA[['GEOID10','geometry']], how='left', predicate='within')
    gdf = gdf.drop(columns='index_right')
    tract_parts.append(gdf)

# Concatenate once at the end instead of re-copying the growing result in
# every iteration; with no states at all the result is an empty DataFrame.
gdf_stations_wtracts = pd.concat(tract_parts) if tract_parts else pd.DataFrame()

In [None]:
# The tract identifier from the IRA files (GEOID10) is stored as TRACTFP
gdf_stations_wtracts = gdf_stations_wtracts.rename(columns={'GEOID10': 'TRACTFP'})

In [None]:
# Save the tract-level result as CSV (without the geometry column)
fips_csv_tracts = result_path + '00_alt_fuel_stations (Apr 3 2023)_wFIPS.csv'
df_stations_wtracts = gdf_stations_wtracts.drop(columns='geometry')
df_stations_wtracts.to_csv(fips_csv_tracts)

# BGs

In [None]:
# Assign block groups (BGs), one state at a time (BG shapefiles are per state)
bg_parts = []
for fips in tqdm(gdf_stations_wtracts['STATEFP'].unique()):
    # Filter stations in state
    gdf = gdf_stations_wtracts.loc[gdf_stations_wtracts['STATEFP'] == fips]
    # Read BG file of this state
    file_bg = path + 'geodata/tl_bg/tl_2020_'+fips+'_bg/tl_2020_'+fips+'_bg.shp'
    gdf_bg = geopd.read_file(file_bg)
    gdf_bg = gdf_bg.to_crs(gdf_stations_wtracts.crs)
    # Assign block groups: GEOID and ALAND of the polygon the station lies within
    gdf = gdf.sjoin(gdf_bg[['GEOID','ALAND','geometry']], how='left', predicate='within')
    gdf = gdf.drop(columns='index_right')
    bg_parts.append(gdf)

# Concatenate once at the end instead of re-copying the growing result in
# every iteration; with no states at all the result is an empty DataFrame.
gdf_stations_wbgs = pd.concat(bg_parts) if bg_parts else pd.DataFrame()

In [None]:
# The block-group identifier (GEOID) is stored as BGFP
gdf_stations_wbgs = gdf_stations_wbgs.rename(columns={'GEOID': 'BGFP'})

In [None]:
# Write the final stations, including geometry, as a shapefile
fips_shapefile = result_path + '00_alt_fuel_stations (Apr 3 2023)_wFIPS.shp'
gdf_stations_wbgs.to_file(fips_shapefile)

In [None]:
# Save the final BG-level result as CSV (without the geometry column)
fips_csv_bgs = result_path + '00_alt_fuel_stations (Apr 3 2023)_wFIPS.csv'
df_stations_wbgs = gdf_stations_wbgs.drop(columns='geometry')
df_stations_wbgs.to_csv(fips_csv_bgs)