This file compiles the main station-level dataset used for the clustering and the definition of the station topology.

In [None]:
import pandas as pd
import us
from tqdm import tqdm
import numpy as np
import category_groupings_250403 as cg

In [None]:
# Project root; every folder below is built relative to it
root = ''
# Input folders (not referenced by the visible cells of this notebook)
path = f'{root}Data/'
path_US_data = f'{root}Data/geodata/'
# Folder the cells below read intermediate files from and write results to
result_path = f'{root}final_data/'
path_IRA = f'{root}Data/IRA/1.0-shapefile-codebook/usa/'

# Generate state-specific station-level datasets

In [None]:
# Which states to compile
# Abbreviations of every entry in us.states.STATES, with 'DC' appended explicitly
states = [us_state.abbr for us_state in us.states.STATES] + ['DC']
print(states)

In [None]:
# Label appended to the file name of the final US-wide compiled CSV
# (presumably a YYMMDD date stamp of the run -- TODO confirm)
label = '250415'

# Read in stations

In [None]:
# Read original stations wFIPS (first CSV column becomes the index)
stations_file = result_path + '00_alt_fuel_stations (Apr 3 2023)_wFIPS.csv'
df_stations = pd.read_csv(stations_file, index_col=0)
# Store the FIPS codes as zero-padded strings: 2 characters for the state, 5 for the county
for fips_col, width in (('STATEFP', 2), ('COUNTYFP', 5)):
    df_stations[fips_col] = df_stations[fips_col].astype(str).str.zfill(width)

In [None]:
# Remove NaN in BGFP and convert to str
# .copy() detaches the filtered frame from the original, so the column
# assignment below does not trigger pandas' SettingWithCopyWarning
df_stations = df_stations.loc[df_stations['BGFP'].notna()].copy()
# Cast through int first (the column held NaN, so it may have been parsed as
# float and would otherwise keep a trailing ".0"), then pad to 12 characters
df_stations['BGFP'] = df_stations['BGFP'].astype(int).astype(str).str.zfill(12)

In [None]:
# Filter for relevant states: keep stations whose 'State' value is in `states`
in_selected_states = df_stations['State'].isin(states)
df_stations = df_stations[in_selected_states]

## Add BG socioeconomics

In [None]:
# Read the BG-level socioeconomic table and inspect it
df_socioecon = pd.read_csv(result_path + 'BGlevel/level_BG.csv')
# FIPS identifiers as zero-padded strings (state: 2, county: 5, BG: 12 characters)
for fips_col, width in {'STATEFP': 2, 'COUNTYFP': 5, 'BGFP': 12}.items():
    df_socioecon[fips_col] = df_socioecon[fips_col].astype(str).str.zfill(width)
# BGFP becomes the index; the merge further down refers to it by name
df_socioecon = df_socioecon.set_index('BGFP')
df_socioecon.head(3)

In [None]:
# Add socioeconomic data and write one compiled station file per state
for state in states:
    print(state)
    # DC's FIPS code is set by hand; all others come from the `us` package
    fips = '11' if state == 'DC' else us.states.lookup(state).fips

    # Stations of this state, with the station ID moved from the index into a column
    df_stations_state = df_stations.loc[df_stations['STATEFP'] == fips].reset_index()

    # Socioeconomic rows of this state; COUNTYFP is dropped because the
    # stations already carry a column of that name
    df_socioecon_state = df_socioecon.loc[df_socioecon['STATEFP'] == fips]
    df_socioecon_state = df_socioecon_state.drop(columns=['COUNTYFP'])

    # Left merge keeps every station, matched on BG and state code
    df_stations_state = df_stations_state.merge(
        df_socioecon_state, how='left', on=['BGFP', 'STATEFP']
    )
    df_stations_state = df_stations_state.set_index('unique_ID')

    # Save
    df_stations_state.to_csv(result_path + 'stationlevel/20_level_stations_' + state + '_compiled.csv')

## Add nb income

In [None]:
# Column duplicates: these columns are dropped from each state's nb-income file
# before it is merged onto the stations (presumably because the stations already
# carry them from the BG socioeconomics merge -- TODO confirm)
cols_dup = ['median_household_income_byBG','total_pop_byBG']

In [None]:
# Add nb income: merge each state's nb-income file onto its compiled station file
# Empty placeholder frame; the PoI column-initialisation cell further down adds columns to it
df_stations_US = pd.DataFrame()
# Columns removed from the nb-income table before merging: the configured
# duplicates plus COUNTYFP, which the stations already carry
nbincome_drop_cols = list(cols_dup) + ['COUNTYFP']
for state in tqdm(states):
    state_file = result_path + 'stationlevel/20_level_stations_' + state + '_compiled.csv'
    # Read current station file and restore the zero-padded string FIPS codes
    df_stations_state = pd.read_csv(state_file, index_col=0)
    df_stations_state['STATEFP'] = df_stations_state['STATEFP'].astype(str).str.zfill(2)
    df_stations_state['COUNTYFP'] = df_stations_state['COUNTYFP'].astype(str).str.zfill(5)
    df_stations_state['BGFP'] = df_stations_state['BGFP'].astype(int).astype(str).str.zfill(12)
    # Read nb income; only the merge keys need their string form restored
    df_nbincome_state = pd.read_csv(result_path + 'BGlevel/level_BG_'+state+'_wnbincome.csv')
    df_nbincome_state['STATEFP'] = df_nbincome_state['STATEFP'].astype(str).str.zfill(2)
    df_nbincome_state['BGFP'] = df_nbincome_state['BGFP'].astype(str).str.zfill(12)
    df_nbincome_state = df_nbincome_state.drop(columns=nbincome_drop_cols)
    # merge() discards the index, so carry the station ID through as a column
    df_stations_state['unique_ID'] = df_stations_state.index
    df_stations_state = df_stations_state.merge(df_nbincome_state, how='left', on=['BGFP', 'STATEFP'])
    df_stations_state = df_stations_state.set_index('unique_ID')

    # Save data (overwrites the state's compiled file)
    df_stations_state.to_csv(state_file)

## Add PoI

In [None]:
# PoIs within which distance: threshold compared against the distance-matrix
# entries when counting PoIs and neighbouring stations around each station, and
# used to build column names such as 'nostations_nearby_500'
# (unit presumably metres -- TODO confirm against the distance matrices)
distance = 500

In [None]:
# Get all meta categories used
# With return_key_list=True the helper returns its list of keys
# (the 'a' argument looks like a placeholder in that mode -- TODO confirm)
grouping_keys = cg.category_grouping('a', return_key_list=True)
# Map every key to its meta category, de-duplicate and sort
list_top_cats = sorted({cg.category_grouping(grouping_key) for grouping_key in grouping_keys})
list_top_cats

In [None]:
# Include new columns: no of POI and share of each POI type
# NOTE(review): df_stations_US is an empty frame here and is rebuilt from the
# per-state CSVs in the final section, while the PoI loop works on
# df_stations_state -- so these columns do not appear to reach the output. Confirm.
radius_tag = '_' + str(distance)
df_stations_US['no_PoI' + radius_tag] = 0
for top_cat in list_top_cats:
    cat_tag = top_cat.replace(' ', '_')
    df_stations_US['no_' + cat_tag + radius_tag] = 0
    df_stations_US['share_' + cat_tag + radius_tag] = 0.0

In [None]:
# Add PoI number and shares
# For every station: count the PoIs whose distance-matrix entry is <= `distance`,
# then store the count and share of each top category among them.
# Column names are derived from `distance` so they always match the filter.
radius_tag = '_' + str(distance)
for state in states:
    print(state)
    state_file = result_path + 'stationlevel/20_level_stations_' + state + '_compiled.csv'
    # Read station data and restore the zero-padded string FIPS codes
    df_stations_state = pd.read_csv(state_file, index_col=0)
    df_stations_state['STATEFP'] = df_stations_state['STATEFP'].astype(str).str.zfill(2)
    df_stations_state['COUNTYFP'] = df_stations_state['COUNTYFP'].astype(str).str.zfill(5)
    df_stations_state['BGFP'] = df_stations_state['BGFP'].astype(int).astype(str).str.zfill(12)
    # Initialise every PoI column on the frame that is actually saved, so that
    # categories absent around a station are recorded as 0 instead of missing
    df_stations_state['no_PoI' + radius_tag] = 0
    for top_cat in list_top_cats:
        cat_tag = top_cat.replace(' ', '_')
        df_stations_state['no_' + cat_tag + radius_tag] = 0
        df_stations_state['share_' + cat_tag + radius_tag] = 0.0
    # Read PoI, indexed by placekey
    df_dewey_state = pd.read_csv(result_path + 'Dewey/04b_compiled_'+state+'_addinfo.csv', index_col=0)
    df_dewey_state.set_index('placekey', inplace=True)
    # Iterate over counties (one distance matrix file per county)
    for countyfp in tqdm(df_stations_state['COUNTYFP'].unique()):
        # Filter for stations
        df_stations_county = df_stations_state.loc[df_stations_state['COUNTYFP'] == countyfp]
        # Read distance matrix; station IDs are used as column labels below
        df_distance_matrix_county = pd.read_csv(result_path + 'distancematrices_uniqueID/'+state+'_'+countyfp + '_distancematrix.csv', index_col=0)
        # Iterate over stations in county
        for unique_ID in df_stations_county.index:
            # Rows of the matrix (PoIs) within range of this station
            df_in_range = df_distance_matrix_county.loc[df_distance_matrix_county[unique_ID] <= distance]
            n_poi = len(df_in_range)
            df_stations_state.loc[unique_ID, 'no_PoI' + radius_tag] = n_poi
            # Without any PoI in range the category columns keep their zeros
            if n_poi == 0:
                continue
            # value_counts() skips missing categories, as the explicit NaN check did
            cat_counts = df_dewey_state.loc[df_in_range.index, 'top_category_edit'].value_counts()
            for top_cat, n_cat in cat_counts.items():
                cat_tag = top_cat.replace(' ', '_')
                df_stations_state.loc[unique_ID, 'no_' + cat_tag + radius_tag] = n_cat
                df_stations_state.loc[unique_ID, 'share_' + cat_tag + radius_tag] = n_cat / n_poi
    df_stations_state.to_csv(state_file)


## Add patterns

In [None]:
# Read patterns data, index it by station ID and drop its 'State' column
df_patterns = (
    pd.read_csv(result_path + 'patternsonly_uniqueID_20250401.csv', index_col=0)
    .set_index('unique_ID')
    .drop(columns='State')
)
df_patterns.head()

In [None]:
# Merge patterns and stations (index-on-index left merge, keeps every station)
for state in states:
    state_file = result_path + 'stationlevel/20_level_stations_' + state + '_compiled.csv'
    df_stations_state = pd.read_csv(state_file, index_col=0)
    bg_codes = df_stations_state['BGFP'].astype(int).astype(str)
    df_stations_state['BGFP'] = bg_codes.str.zfill(12)
    df_stations_state = pd.merge(
        df_stations_state, df_patterns, how='left', left_index=True, right_index=True
    )
    df_stations_state.to_csv(state_file)

## Add county-level

In [None]:
# Read county-level data
gdf_county = pd.read_csv(result_path + 'level_county.csv', index_col=0)
county_codes = gdf_county['COUNTYFP'].astype(str)
gdf_county['COUNTYFP'] = county_codes.str.zfill(5)
# STATEFP is removed so that merging onto the stations does not duplicate it
gdf_county = gdf_county.drop(columns=['STATEFP'])
gdf_county.head(3)

In [None]:
# Merge county data and stations
for state in states:
    state_file = result_path + 'stationlevel/20_level_stations_' + state + '_compiled.csv'
    # Read stations and rebuild the 5-character county code
    df_stations_state = pd.read_csv(state_file, index_col=0)
    county_codes = df_stations_state['COUNTYFP'].astype(int).astype(str)
    df_stations_state['COUNTYFP'] = county_codes.str.zfill(5)
    # The merge discards the index, so keep the station ID as a column meanwhile
    df_stations_state['unique_ID'] = df_stations_state.index
    df_stations_state = pd.merge(
        df_stations_state, gdf_county, how='left', left_on='COUNTYFP', right_on='COUNTYFP'
    ).set_index('unique_ID')
    # Save
    df_stations_state.to_csv(state_file)

## Add state-level

In [None]:
# Read state-level info
df_state_level = pd.read_csv(result_path + 'level_state.csv')
# State FIPS code as a 2-character zero-padded string, matching the stations
state_codes = df_state_level['STATEFP'].astype(str)
df_state_level['STATEFP'] = state_codes.str.zfill(2)
df_state_level.head(3)

In [None]:
# Merge state-level data and stations
for state in states:
    state_file = result_path + 'stationlevel/20_level_stations_' + state + '_compiled.csv'
    # Read stations and rebuild the 2-character state code
    df_stations_state = pd.read_csv(state_file, index_col=0)
    state_codes = df_stations_state['STATEFP'].astype(int).astype(str)
    df_stations_state['STATEFP'] = state_codes.str.zfill(2)
    # The merge discards the index, so keep the station ID as a column meanwhile
    df_stations_state['unique_ID'] = df_stations_state.index
    df_stations_state = pd.merge(
        df_stations_state, df_state_level, how='left', on='STATEFP'
    ).set_index('unique_ID')
    # Save
    df_stations_state.to_csv(state_file)

## Add highways

In [None]:
# Merge hwys and stations
for state in states:
    state_file = result_path + 'stationlevel/20_level_stations_' + state + '_compiled.csv'
    # Read stations
    df_stations_state = pd.read_csv(state_file, index_col=0)
    bg_codes = df_stations_state['BGFP'].astype(int).astype(str)
    df_stations_state['BGFP'] = bg_codes.str.zfill(12)
    # Read the per-state highway file; its first column is used as the index
    hwy_file = result_path + 'stationlevel/level_stations_'+state+'_hwy.csv'
    gdf_hwy_state = pd.read_csv(hwy_file, index_col=0)
    # Index-on-index left merge keeps every station
    df_stations_state = pd.merge(
        df_stations_state, gdf_hwy_state, how='left', left_index=True, right_index=True
    )
    df_stations_state.to_csv(state_file)

## Add number of stations nearby

In [None]:
# Add number of stations nearby
nearby_col = 'nostations_nearby_' + str(distance)
for state in states:
    print(state)
    state_file = result_path + 'stationlevel/20_level_stations_' + state + '_compiled.csv'
    # Read station data
    df_stations_state = pd.read_csv(state_file, index_col=0)
    bg_codes = df_stations_state['BGFP'].astype(int).astype(str)
    df_stations_state['BGFP'] = bg_codes.str.zfill(12)
    df_stations_state[nearby_col] = 0
    # Read the state's station-to-station distance matrix
    df_distance_matrix = pd.read_csv(result_path + 'distancematrices_stations_uniqueID/'+state+'_stations_distancematrix.csv', index_col=0)
    # Iterate over stations in state
    for unique_ID in tqdm(df_stations_state.index):
        # Matrix rows within range of this station
        within_range = df_distance_matrix[unique_ID] <= distance
        # minus own station
        df_stations_state.loc[unique_ID, nearby_col] = int(within_range.sum()) - 1
    df_stations_state.to_csv(state_file)


# Combine state-specific datasets for entire US

In [None]:
# All states: every abbreviation in us.states.STATES, followed by 'DC'
states = [us_state.abbr for us_state in us.states.STATES]
states.append('DC')
print(states)

In [None]:
# Piece state-level datasets together
# Read every per-state compiled file first, then stack them with one concat;
# growing the frame inside the loop re-copies all earlier rows each time
state_frames = [
    pd.read_csv(result_path + 'stationlevel/20_level_stations_' + state + '_compiled.csv', index_col=0)
    for state in states
]
# pd.concat raises on an empty list, so keep the empty-frame result for that case
df_stations_US = pd.concat(state_frames, axis=0) if state_frames else pd.DataFrame()

In [None]:
# Rename both columns in a single call
column_renames = {
    'ResCars_pp_BG_byBG': 'ResCars_pp_BG',
    'PopDensity_byBG': 'PopDensity_inBG',
}
df_stations_US.rename(columns=column_renames, inplace=True)

In [None]:
# Save the US-wide dataset; the run label is part of the file name
us_file = f'{result_path}stationlevel/20_level_stations_US_compiled_{label}.csv'
df_stations_US.to_csv(us_file)