This notebook compiles the block-group (BG) level dataset used for the analysis of the neighborhood effect.

In [None]:
import pandas as pd
import us
from tqdm import tqdm
import numpy as np
import category_groupings_250403 as cg

In [None]:
# Base directory; all other locations are built by prefixing it
root = ''
# Input data locations
path = f'{root}Data/'
path_US_data = f'{root}Data/geodata/'
path_IRA = f'{root}Data/IRA/1.0-shapefile-codebook/usa/'
# Location the compiled tables are read from and written to
result_path = f'{root}final_data/'

# Compile state-specific BG-level datasets

In [None]:
# Which states to compile: every entry of us.states.STATES plus DC
states = [us_state.abbr for us_state in us.states.STATES]
# Depending on how the `us` package is configured, DC may already be part of
# STATES; add it only when missing so DC is never processed twice by the
# per-state loops (a second pass would merge onto the file a second time and
# repeat DC's rows in the combined table)
if 'DC' not in states:
    states.append('DC')
print(states)

## Load socioeconomics on BG-level

In [None]:
# Load the national BG-level table
df_socioecon = pd.read_csv(result_path + 'BGlevel/level_BG.csv')
# Zero-pad the FIPS codes to fixed width: state 2, county 5, BG 12 characters
for fips_col, fips_width in (('STATEFP', 2), ('COUNTYFP', 5), ('BGFP', 12)):
    df_socioecon[fips_col] = df_socioecon[fips_col].astype(str).str.zfill(fips_width)
# The BG code becomes the row index
df_socioecon = df_socioecon.set_index('BGFP')
# Inspect
df_socioecon.head(3)

In [None]:
# Write one BG-level file per state
for state in tqdm(states):
    # DC is hard-coded to FIPS '11'; every other code is looked up via `us`
    fips = '11' if state == 'DC' else us.states.lookup(state).fips

    # Keep only the BGs whose state code matches
    in_state = df_socioecon['STATEFP'] == fips
    df_socioecon_state = df_socioecon[in_state]

    # Save (the BGFP index is written as the first column)
    out_file = f'{result_path}BGlevel/21_level_BG_{state}_compiled.csv'
    df_socioecon_state.to_csv(out_file)

## Add nb income

In [None]:
# Columns that are removed from the nb-income table before it is merged,
# so that they do not show up twice in the compiled file
cols_dup = [
    'median_household_income_byBG',
    'total_pop_byBG',
    'STATEFP',
    'COUNTYFP',
]

In [None]:
# Add nb income: left-merge each state's nb-income table onto its compiled BG file
for state in tqdm(states):
    compiled_file = result_path + 'BGlevel/21_level_BG_' + state + '_compiled.csv'
    # Read current socioecon file and zero-pad its FIPS codes
    df_socioecon_state = pd.read_csv(compiled_file)
    df_socioecon_state['STATEFP'] = df_socioecon_state['STATEFP'].astype(str).str.zfill(2)
    df_socioecon_state['COUNTYFP'] = df_socioecon_state['COUNTYFP'].astype(str).str.zfill(5)
    df_socioecon_state['BGFP'] = df_socioecon_state['BGFP'].astype(int).astype(str).str.zfill(12)
    df_socioecon_state.set_index('BGFP', inplace=True)
    # Read nb income, keyed by the same zero-padded BG code
    df_nbincome_state = pd.read_csv(result_path + 'BGlevel/level_BG_'+state+'_wnbincome.csv')
    df_nbincome_state['BGFP'] = df_nbincome_state['BGFP'].astype(str).str.zfill(12)
    df_nbincome_state.set_index('BGFP', inplace=True)
    # Drop the columns the compiled file already has (raises KeyError if one is missing)
    df_nbincome_state = df_nbincome_state.drop(columns=cols_dup)
    # Merge on the BGFP index; BGs without nb-income rows are kept with NaN
    df_socioecon_state = df_socioecon_state.merge(df_nbincome_state, how='left', left_index=True, right_index=True)

    # Save data (overwrites the compiled file)
    df_socioecon_state.to_csv(compiled_file)

## Add Res Cars

In [None]:
# Rename the ResCars column in every state's compiled BG file
for state in tqdm(states):
    compiled_file = result_path + 'BGlevel/21_level_BG_' + state + '_compiled.csv'
    # Read current socioecon file and zero-pad its FIPS codes
    df_socioecon_state = pd.read_csv(compiled_file)
    df_socioecon_state['STATEFP'] = df_socioecon_state['STATEFP'].astype(str).str.zfill(2)
    df_socioecon_state['COUNTYFP'] = df_socioecon_state['COUNTYFP'].astype(str).str.zfill(5)
    df_socioecon_state['BGFP'] = df_socioecon_state['BGFP'].astype(int).astype(str).str.zfill(12)
    df_socioecon_state.set_index('BGFP', inplace=True)
    # Rename (a no-op when the old column name is absent)
    df_socioecon_state.rename(columns={'ResCars_pp_BG_byBG':'total_ResCars_byBG'}, inplace=True)
    # Save data (overwrites the compiled file)
    df_socioecon_state.to_csv(compiled_file)

## Add county-level

In [None]:
# Read county-level data
gdf_county = pd.read_csv(result_path + 'level_county.csv', index_col=0)
# Zero-pad the county code to 5 characters so it matches the BG files
gdf_county = gdf_county.assign(COUNTYFP=gdf_county['COUNTYFP'].astype(str).str.zfill(5))
# STATEFP already exists in the BG files; drop it to avoid double columns
gdf_county = gdf_county.drop(columns='STATEFP')
gdf_county.head(3)

In [None]:
# Attach the county-level columns to every state's compiled BG file
for state in tqdm(states):
    compiled_file = f'{result_path}BGlevel/21_level_BG_{state}_compiled.csv'
    # Read BGs and zero-pad their FIPS codes
    df_socioecon_state = pd.read_csv(compiled_file)
    for fips_col, fips_width in (('STATEFP', 2), ('COUNTYFP', 5)):
        df_socioecon_state[fips_col] = df_socioecon_state[fips_col].astype(str).str.zfill(fips_width)
    df_socioecon_state['BGFP'] = df_socioecon_state['BGFP'].astype(int).astype(str).str.zfill(12)
    # Left-merge on the county code, then index by BG code
    df_socioecon_state = df_socioecon_state.merge(gdf_county, how='left', on='COUNTYFP')
    df_socioecon_state = df_socioecon_state.set_index('BGFP')
    # Save (overwrites the compiled file)
    df_socioecon_state.to_csv(compiled_file)

## Add state-level

In [None]:
# Read state-level data
df_state_level = pd.read_csv(result_path + 'level_state.csv')
# Zero-pad the state code to 2 characters so it matches the BG files
df_state_level = df_state_level.assign(STATEFP=df_state_level['STATEFP'].astype(str).str.zfill(2))
df_state_level.head(3)

In [None]:
# Attach the state-level columns to every state's compiled BG file
for state in tqdm(states):
    compiled_file = f'{result_path}BGlevel/21_level_BG_{state}_compiled.csv'
    # Read BGs and zero-pad their FIPS codes
    df_socioecon_state = pd.read_csv(compiled_file)
    for fips_col, fips_width in (('STATEFP', 2), ('COUNTYFP', 5)):
        df_socioecon_state[fips_col] = df_socioecon_state[fips_col].astype(str).str.zfill(fips_width)
    df_socioecon_state['BGFP'] = df_socioecon_state['BGFP'].astype(int).astype(str).str.zfill(12)
    # Left-merge on the state code, then index by BG code
    df_socioecon_state = df_socioecon_state.merge(df_state_level, how='left', on='STATEFP')
    df_socioecon_state = df_socioecon_state.set_index('BGFP')
    # Save (overwrites the compiled file)
    df_socioecon_state.to_csv(compiled_file)

## Nearest highway

In [None]:
# Inspect the highway table of one state.
# The state is taken explicitly from `states` (its last entry) instead of
# relying on whatever value a previous cell's loop variable happened to leave behind.
gdf_hwy_state = pd.read_csv(result_path + 'BGlevel/level_bg_' + states[-1] + '_hwy.csv', index_col=0)
gdf_hwy_state.head()

In [None]:
# Attach the highway table to every state's compiled BG file
for state in tqdm(states):
    compiled_file = f'{result_path}BGlevel/21_level_BG_{state}_compiled.csv'
    # Read BGs
    df_socioecon_state = pd.read_csv(compiled_file)
    df_socioecon_state['BGFP'] = df_socioecon_state['BGFP'].astype(int).astype(str).str.zfill(12)
    # Read the highway table; its BG code column is called BGFIPS
    gdf_hwy_state = pd.read_csv(f'{result_path}BGlevel/level_bg_{state}_hwy.csv', index_col=0)
    gdf_hwy_state = gdf_hwy_state.rename(columns={'BGFIPS': 'BGFP'})
    gdf_hwy_state['BGFP'] = gdf_hwy_state['BGFP'].astype(int).astype(str).str.zfill(12)
    # Left-merge on the BG code, then index by it
    df_socioecon_state = df_socioecon_state.merge(gdf_hwy_state, how='left', on='BGFP')
    df_socioecon_state = df_socioecon_state.set_index('BGFP')
    # Save (overwrites the compiled file)
    df_socioecon_state.to_csv(compiled_file)

## Add number of stations

In [None]:
# Read original stations wFIPS
df_stations = pd.read_csv(result_path + '00_alt_fuel_stations (Apr 3 2023)_wFIPS.csv', index_col=0)
# Stations without a BG code cannot be assigned to a BG
has_bg = df_stations['BGFP'].notna()
df_stations = df_stations.loc[has_bg]
# NOTE(review): unlike BGFP, STATEFP/COUNTYFP are not cast to int before
# padding; if they are parsed as floats this yields values like '6.0' — confirm
df_stations['STATEFP'] = df_stations['STATEFP'].astype(str).str.zfill(2)
df_stations['COUNTYFP'] = df_stations['COUNTYFP'].astype(str).str.zfill(5)
df_stations['BGFP'] = df_stations['BGFP'].astype(int).astype(str).str.zfill(12)

In [None]:
# 10 largest charging networks, ranked by number of station rows
stations_per_network = df_stations.groupby('EV Network').size()
ranked_networks = stations_per_network.sort_values(ascending=False)
list_networks = ranked_networks.index[:10].tolist()

In [None]:
# Merge BG and no of stations
def _merge_station_count(df_bg, df_subset, col):
    """Return df_bg with column `col` holding the number of df_subset rows per BGFP.

    A pre-existing column `col` is replaced. BGs without any row in
    df_subset get 0.
    """
    counts = df_subset.groupby('BGFP').size().to_frame(col)
    df_bg = df_bg.drop(columns=col, errors='ignore')
    df_bg = df_bg.merge(counts, how='left', left_on='BGFP', right_index=True)
    df_bg[col] = df_bg[col].fillna(0)
    return df_bg


for state in tqdm(states):
    compiled_file = f'{result_path}BGlevel/21_level_BG_{state}_compiled.csv'
    # Read BGs
    df_socioecon_state = pd.read_csv(compiled_file)
    df_socioecon_state['BGFP'] = df_socioecon_state['BGFP'].astype(int).astype(str).str.zfill(12)
    # Stations located in this state
    df_stations_state = df_stations[df_stations['State'] == state]

    # All stations (public and private)
    df_socioecon_state = _merge_station_count(df_socioecon_state, df_stations_state, 'no_stations_all')
    # Keep name to avoid conflicts later
    df_socioecon_state['no_stations'] = df_socioecon_state['no_stations_all']

    access_code = df_stations_state['Access Code']
    has_dc_fast = df_stations_state['EV DC Fast Count'].notna()
    station_subsets = [
        # Everything that is not public (for robustness)
        ('no_stations_privateonly', df_stations_state[access_code != 'public']),
        # Everything that is not private
        ('no_stations_publiconly', df_stations_state[access_code != 'private']),
        # L2: no DC fast count recorded (based on all private + public stations)
        ('no_L2_stations', df_stations_state[~has_dc_fast]),
        # DC: a DC fast count is recorded (based on all private + public stations)
        ('no_DC_stations', df_stations_state[has_dc_fast]),
    ]
    # One column per large network (based on all private + public stations)
    station_subsets += [
        ('no_stations_' + network.replace(' ', '_'),
         df_stations_state[df_stations_state['EV Network'] == network])
        for network in list_networks
    ]
    for col_name, df_subset in station_subsets:
        df_socioecon_state = _merge_station_count(df_socioecon_state, df_subset, col_name)

    # Index by BG code and save (overwrites the compiled file)
    df_socioecon_state = df_socioecon_state.set_index('BGFP')
    df_socioecon_state.to_csv(compiled_file)

## Add no PoI

In [None]:
# Get all categories: the distinct groups that the raw keys map to, sorted
raw_category_keys = cg.category_grouping('key', return_key_list=True)
top_cats_edited = sorted({cg.category_grouping(raw_key) for raw_key in raw_category_keys})

In [None]:
# Merge BG and no of PoI
# Raw top category -> edited category group. This does not depend on the
# state, so it is built once instead of once per state.
category_to_group = {
    key: cg.category_grouping(key)
    for key in cg.category_grouping('key', return_key_list=True)
}

for state in tqdm(states):
    compiled_file = result_path + 'BGlevel/21_level_BG_' + state + '_compiled.csv'
    # Read BGs
    df_socioecon_state = pd.read_csv(compiled_file)
    df_socioecon_state['BGFP'] = df_socioecon_state['BGFP'].astype(int).astype(str).str.zfill(12)
    # Read PoI; rows without a BG code cannot be assigned to a BG
    df_dewey_state = pd.read_csv(result_path + 'Dewey/01_compiled_'+state+'.csv', index_col=0)
    df_dewey_state = df_dewey_state.dropna(subset=['BGFP'])
    df_dewey_state['BGFP'] = df_dewey_state['BGFP'].astype(int).astype(str).str.zfill(12)
    # Assign edited top categories (rows with an unmapped category stay missing)
    df_dewey_state['top_category_edit'] = df_dewey_state['top_category'].map(category_to_group)

    # All PoI first, then one subset per edited top category
    poi_subsets = [('no_PoI', df_dewey_state)]
    poi_subsets += [
        ('no_PoI_' + top_cat.replace(' ', '_'),
         df_dewey_state[df_dewey_state['top_category_edit'] == top_cat])
        for top_cat in top_cats_edited
    ]
    for col_name, df_subset in poi_subsets:
        counts = df_subset.groupby('BGFP').size().to_frame(col_name)
        # Replace a count column left by an earlier run; otherwise the merge
        # would create _x/_y suffixed columns and the fillna below would fail
        df_socioecon_state = df_socioecon_state.drop(columns=col_name, errors='ignore')
        df_socioecon_state = df_socioecon_state.merge(counts, how='left', left_on='BGFP', right_index=True)
        # BGs without any PoI in the subset get 0
        df_socioecon_state[col_name] = df_socioecon_state[col_name].fillna(0)

    # Index by BG code and save (overwrites the compiled file)
    df_socioecon_state.set_index('BGFP', inplace=True)
    df_socioecon_state.to_csv(compiled_file)

## Add IRA info

### Whether BG is IRA

In [None]:
# Merge BG and IRA info
for state in tqdm(states):
    compiled_file = result_path + 'BGlevel/21_level_BG_' + state + '_compiled.csv'
    # Read BGs
    df_socioecon_state = pd.read_csv(compiled_file)
    df_socioecon_state['BGFP'] = df_socioecon_state['BGFP'].astype(int).astype(str).str.zfill(12)
    # Read IRA
    df_IRA = pd.read_csv(result_path + 'BGlevel/level_BG_'+state+'_IRA.csv')
    # Build the merge key from the IRA table's own BG code (BGFIPS). Taking it
    # from the BG table instead would pair rows by position, attaching IRA
    # values to the wrong BG whenever the two files differ in order or length.
    df_IRA['BGFP'] = df_IRA['BGFIPS'].astype(int).astype(str).str.zfill(12)
    df_IRA = df_IRA.drop(columns='BGFIPS') # to avoid double columns
    # Merge; BGs without an IRA row are kept with NaN
    df_socioecon_state = df_socioecon_state.merge(df_IRA, how='left', on='BGFP')
    # Save (overwrites the compiled file)
    df_socioecon_state.set_index('BGFP', inplace=True)
    df_socioecon_state.to_csv(compiled_file)

### Whether BG has disadvantaged neighboring BGs

In [None]:
# NB IRA: merge the neighbor-IRA table onto every state's compiled BG file
for state in tqdm(states):
    compiled_file = result_path + 'BGlevel/21_level_BG_' + state + '_compiled.csv'
    # Read BGs
    df_socioecon_state = pd.read_csv(compiled_file)
    df_socioecon_state['BGFP'] = df_socioecon_state['BGFP'].astype(int).astype(str).str.zfill(12)
    # Read IRA
    df_IRA = pd.read_csv(result_path + 'BGlevel/level_BG_'+state+'_nbIRA.csv', index_col=0)
    # Build the merge key from the IRA table's own BG code (BGFIPS). Taking it
    # from the BG table instead would pair rows by index label, attaching
    # values to the wrong BG whenever the two files differ in order or length.
    df_IRA['BGFP'] = df_IRA['BGFIPS'].astype(int).astype(str).str.zfill(12)
    df_IRA = df_IRA.drop(columns='BGFIPS') # to avoid double columns
    # Merge; BGs without a row in the nbIRA table are kept with NaN
    df_socioecon_state = df_socioecon_state.merge(df_IRA, how='left', on='BGFP')
    # Save (overwrites the compiled file)
    df_socioecon_state.set_index('BGFP', inplace=True)
    df_socioecon_state.to_csv(compiled_file)

# Combine all states

In [None]:
# All states: every entry of us.states.STATES plus DC
states = [us_state.abbr for us_state in us.states.STATES]
# Depending on how the `us` package is configured, DC may already be part of
# STATES; add it only when missing so its rows are not read twice when the
# per-state files are combined
if 'DC' not in states:
    states.append('DC')
print(states)

In [None]:
# Combine to single US-level file
# Read every state's compiled file (first column is the BGFP index) ...
state_frames = [
    pd.read_csv(result_path + 'BGlevel/21_level_BG_' + state + '_compiled.csv', index_col=0)
    for state in states
]
# ... and stack them in one concat; concatenating inside the loop would
# re-copy the growing table once per state. An empty state list yields an
# empty frame, as before.
df_BG_US = pd.concat(state_frames, axis=0) if state_frames else pd.DataFrame()

# Save

In [None]:
# Write the combined table; `label` tags the output file name
label = '250702'
us_output_file = f'{result_path}BGlevel/21_level_BG_US_compiled_{label}.csv'
df_BG_US.to_csv(us_output_file)