# Run regressions for case study on the Bipartisan Infrastructure Law (BIL)
Using the BIL-suggested labeling of disadvantaged communities (DACs).

In [1]:
# Date stamp for this run (looks like YYYYMMDD). Presumably used to tag saved
# outputs; it is not referenced in the cells visible here -- TODO confirm.
savedate = '20251031'

In [2]:
# Prefix for the input data file. It is joined to the file name by plain string
# concatenation, so a non-empty value must end with a path separator.
data_path = '' # path to data

In [3]:
import pandas as pd
# Turn off pandas' chained-assignment warning for the whole notebook
pd.options.mode.chained_assignment = None  # default='warn'
import statsmodels.formula.api as smf
import statsmodels.api as sm
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
# Use Arial as the font family for all matplotlib text
matplotlib.rcParams['font.family'] = 'Arial'
from stargazer.stargazer import Stargazer, LineLocation
from statsmodels.iolib.summary2 import summary_col
import us
# Abbreviations of every state listed in us.states.STATES
state_list = [state.abbr for state in us.states.STATES]

# Read data

In [4]:
# Load the BG-level dataset; identifier columns are read as text so that
# leading zeros survive the CSV round trip
df_bg_level = pd.read_csv(
    data_path + '23_level_BG_US_imputedquantiles_20250722.zip',
    dtype={name: str for name in ('GEOID', 'STATEFP', 'COUNTYFP', 'BGFIPS')},
    index_col=0,
)

# Left-pad the state / county codes with zeros to a fixed width (2 and 5 characters)
df_bg_level = df_bg_level.assign(
    STATEFP=lambda frame: frame['STATEFP'].str.zfill(2),
    COUNTYFP=lambda frame: frame['COUNTYFP'].str.zfill(5),
)

# Rescaled copies of the income columns: '_k' = divided by 1,000, '_10k' = divided by 10,000
for col in ['median_household_income_imputed', 'av_income_10km_withoutBG_imputed', 'av_income_50km_withoutBG_imputed']:
    df_bg_level[f'{col}_k'] = df_bg_level[col] / 1e3
    df_bg_level[f'{col}_10k'] = df_bg_level[col] / 1e4

# Rescaled copy of the highway-distance column: '_100' = divided by 100
col = 'nearest_highway_km_BG'
df_bg_level[f'{col}_100'] = df_bg_level[col] / 100

# Regression 1: BIL labels as covariates in the income regression

In [5]:
# Run a single regression
def run_regression(df_data,dep_var,indep_var,covs=None,state_FE=False,which='OLS'):
    """Fit one OLS or Logit regression of `dep_var` on `indep_var` plus covariates.

    Rows with a missing value in `dep_var`, `indep_var` or any column listed in
    `covs` are dropped before fitting. A constant is added via `sm.add_constant`.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Input data containing all referenced columns.
    dep_var : str
        Name of the dependent-variable column.
    indep_var : str
        Name of the main regressor column.
    covs : list of str, optional
        Names of additional covariate columns. Defaults to no covariates.
    state_FE : bool or str, default False
        Falsy (False/None): no fixed effects. Truthy: add one dummy per level
        of the fixed-effect column, dropping the first level as reference.
        A string naming an existing column selects that column; any other
        truthy value uses 'STATEFP'.
    which : {'OLS', 'Logit'}, default 'OLS'
        Estimator to fit.

    Returns
    -------
    The fitted statsmodels results object returned by `.fit()`.

    Raises
    ------
    ValueError
        If `which` is not 'OLS' or 'Logit', or if no complete rows remain
        after dropping missing values.
    AssertionError
        If fixed effects are requested and the first value of the
        fixed-effect column is not a `str`.
    """
    # Fail fast on an unsupported estimator; otherwise neither branch below
    # would bind `model` and the function would die with UnboundLocalError.
    if which not in ('OLS', 'Logit'):
        raise ValueError("which must be 'OLS' or 'Logit', got " + repr(which))

    # Work on a private list so the caller's sequence is never shared or mutated
    covs = [] if covs is None else list(covs)

    df_data_noNaN = df_data.dropna(subset=[indep_var,dep_var] + covs)
    if df_data_noNaN.empty:
        raise ValueError('No complete observations left for ' + dep_var + ' ~ ' + indep_var
                         + ' after dropping rows with missing values')

    X = df_data_noNaN[[indep_var] + covs]  # Independent variable(s)
    if state_FE:
        # Callers may pass the column name (e.g. 'STATEFP') instead of True
        use_named_col = isinstance(state_FE, str) and state_FE in df_data_noNaN.columns
        fe_col = state_FE if use_named_col else 'STATEFP'
        # Ensure the fixed-effect codes are strings (checked on the first row only)
        assert type(df_data_noNaN[fe_col].iloc[0]) == str, fe_col + ' must hold strings'
        dummies = pd.get_dummies(df_data_noNaN[fe_col], dtype=int, drop_first=True)
        X = pd.concat([X, dummies], axis=1)
    X = sm.add_constant(X)  # Adds a constant term to the predictor
    Y = df_data_noNaN[dep_var]  # Dependent variable
    if which == 'OLS':
        model = sm.OLS(Y, X).fit()
    else:
        model = sm.Logit(Y, X).fit()
    return model

In [6]:
# Regressions for BG-level data (dependent variable: 'no_stations')

# (1) BG income + DAC label, no state fixed effects
OLS_corr = run_regression(
    df_bg_level,
    dep_var='no_stations',
    indep_var='median_household_income_imputed_10k',
    covs=['is_disadvantaged_IRA'],
    state_FE=None,
    which='OLS',
)

# (2) same specification with state fixed effects
OLS_StateFE = run_regression(
    df_bg_level,
    dep_var='no_stations',
    indep_var='median_household_income_imputed_10k',
    covs=['is_disadvantaged_IRA'],
    state_FE='STATEFP',
    which='OLS',
)

# (3) adds the 10 km neighbouring-income covariate
OLS_BG_10 = run_regression(
    df_bg_level,
    dep_var='no_stations',
    indep_var='median_household_income_imputed_10k',
    covs=['av_income_10km_withoutBG_imputed_10k', 'is_disadvantaged_IRA'],
    state_FE='STATEFP',
    which='OLS',
)

# (4) adds the 50 km neighbouring-income covariate
OLS_BG_50 = run_regression(
    df_bg_level,
    dep_var='no_stations',
    indep_var='median_household_income_imputed_10k',
    covs=['av_income_50km_withoutBG_imputed_10k', 'is_disadvantaged_IRA'],
    state_FE='STATEFP',
    which='OLS',
)

# (5) DAC label + 'shareIRA_10km'
OLS_BG_10_IRA = run_regression(
    df_bg_level,
    dep_var='no_stations',
    indep_var='is_disadvantaged_IRA',
    covs=['shareIRA_10km'],
    state_FE='STATEFP',
    which='OLS',
)

# (6) DAC label + 'shareIRA_50km'
OLS_BG_50_IRA = run_regression(
    df_bg_level,
    dep_var='no_stations',
    indep_var='is_disadvantaged_IRA',
    covs=['shareIRA_50km'],
    state_FE='STATEFP',
    which='OLS',
)

In [7]:
# Display labels for the regression covariates, keyed by column name
dict_rename_covs = {
    'median_household_income_imputed_10k': 'BG median household income [10k USD]',
    'av_income_10km_withoutBG_imputed_10k': 'Average neighbouring income [10k USD], BG/10km',
    'av_income_50km_withoutBG_imputed_10k': 'Average neighbouring income [10k USD], BG/50km',
    'is_disadvantaged_IRA': 'BG in DAC',
    'shareIRA_10km': 'DAC share, BG/10km',
    'shareIRA_50km': 'DAC share, BG/50km',
}

# Covariates to show in the table, in the same order as the label mapping above
list_covs_cleaned = list(dict_rename_covs)

In [8]:
# Collect the six Regression-1 models into one table, columns (1)-(6)
stargazer = Stargazer([
    OLS_corr,
    OLS_StateFE,
    OLS_BG_10,
    OLS_BG_50,
    OLS_BG_10_IRA,
    OLS_BG_50_IRA,
])

# Table-wide formatting: 3 digits, stars at p < 0.05 / 0.01 / 0.001, no d.o.f.
stargazer.significant_digits(3)
stargazer.significance_levels([0.05, 0.01, 0.001])
stargazer.show_degrees_of_freedom(False)
stargazer.dependent_variable_name('Number of stations')

# Which covariate rows to show, and under which labels
stargazer.covariate_order(list_covs_cleaned)
stargazer.rename_covariates(dict_rename_covs)

# Footer rows: every column is BG-level; only column (1) is fitted without state FE
stargazer.add_line('Unit of observation', ['BG'] * 6, LineLocation.FOOTER_TOP)
stargazer.add_line('State FE', ['No'] + ['Yes'] * 5, LineLocation.FOOTER_TOP)

stargazer

0,1,2,3,4,5,6
,,,,,,
,Dependent variable: Number of stations,Dependent variable: Number of stations,Dependent variable: Number of stations,Dependent variable: Number of stations,Dependent variable: Number of stations,Dependent variable: Number of stations
,,,,,,
,(1),(2),(3),(4),(5),(6)
,,,,,,
BG median household income [10k USD],0.008***,-0.003**,-0.015***,-0.009***,,
,(0.001),(0.001),(0.001),(0.001),,
"Average neighbouring income [10k USD], BG/10km",,,0.053***,,,
,,,(0.002),,,
"Average neighbouring income [10k USD], BG/50km",,,,0.052***,,


In [9]:
# Print the LaTeX source of the table currently held in `stargazer`
print(stargazer.render_latex())

\begin{table}[!htbp] \centering
\begin{tabular}{@{\extracolsep{5pt}}lcccccc}
\\[-1.8ex]\hline
\hline \\[-1.8ex]
& \multicolumn{6}{c}{\textit{Dependent variable: Number of stations}} \
\cr \cline{2-7}
\\[-1.8ex] & (1) & (2) & (3) & (4) & (5) & (6) \\
\hline \\[-1.8ex]
 BG median household income [10k USD] & 0.008$^{***}$ & -0.003$^{**}$ & -0.015$^{***}$ & -0.009$^{***}$ & & \\
& (0.001) & (0.001) & (0.001) & (0.001) & & \\
 Average neighbouring income [10k USD], BG/10km & & & 0.053$^{***}$ & & & \\
& & & (0.002) & & & \\
 Average neighbouring income [10k USD], BG/50km & & & & 0.052$^{***}$ & & \\
& & & & (0.003) & & \\
 BG in DAC & -0.088$^{***}$ & -0.111$^{***}$ & -0.073$^{***}$ & -0.103$^{***}$ & -0.065$^{***}$ & -0.071$^{***}$ \\
& (0.008) & (0.008) & (0.008) & (0.008) & (0.009) & (0.008) \\
 DAC share, BG/10km & & & & & -0.135$^{***}$ & \\
& & & & & (0.017) & \\
 DAC share, BG/50km & & & & & & -0.298$^{***}$ \\
& & & & & & (0.020) \\
\hline \\[-1.8ex]
 Unit of observation & BG & BG 

# Regression 2: Income regression within DAC- and non-DAC-labelled communities

In [10]:
# Fit the income regressions separately within DAC and non-DAC block groups.
# The DAC label is deliberately NOT passed as a covariate here: inside each
# subset it is constant (all 1 or all 0), so it would either duplicate the
# intercept or enter as an all-zero column, i.e. a degenerate regressor.
df_bg_DAC = df_bg_level[df_bg_level['is_disadvantaged_IRA']==1]
df_bg_nonDAC = df_bg_level[df_bg_level['is_disadvantaged_IRA']==0]

# DAC subset, with 10 km and 50 km neighbouring income respectively
OLS_BG_10_DAC = run_regression(df_bg_DAC,'no_stations','median_household_income_imputed_10k',['av_income_10km_withoutBG_imputed_10k'],state_FE='STATEFP',which='OLS')
OLS_BG_50_DAC = run_regression(df_bg_DAC,'no_stations','median_household_income_imputed_10k',['av_income_50km_withoutBG_imputed_10k'],state_FE='STATEFP',which='OLS')

# Non-DAC subset, same two specifications
OLS_BG_10_nonDAC = run_regression(df_bg_nonDAC,'no_stations','median_household_income_imputed_10k',['av_income_10km_withoutBG_imputed_10k'],state_FE='STATEFP',which='OLS')
OLS_BG_50_nonDAC = run_regression(df_bg_nonDAC,'no_stations','median_household_income_imputed_10k',['av_income_50km_withoutBG_imputed_10k'],state_FE='STATEFP',which='OLS')

In [11]:
# Only the three income covariates are shown for the DAC / non-DAC split
list_covs_cleaned = [
    'median_household_income_imputed_10k',
    'av_income_10km_withoutBG_imputed_10k',
    'av_income_50km_withoutBG_imputed_10k',
]

# Columns (1)-(2): DAC subset; columns (3)-(4): non-DAC subset
stargazer = Stargazer([
    OLS_BG_10_DAC,
    OLS_BG_50_DAC,
    OLS_BG_10_nonDAC,
    OLS_BG_50_nonDAC,
])

# Table-wide formatting: 3 digits, stars at p < 0.05 / 0.01 / 0.001, no d.o.f.
stargazer.significant_digits(3)
stargazer.significance_levels([0.05, 0.01, 0.001])
stargazer.show_degrees_of_freedom(False)
stargazer.dependent_variable_name('Number of stations')

# Which covariate rows to show, and under which labels
stargazer.covariate_order(list_covs_cleaned)
stargazer.rename_covariates(dict_rename_covs)

stargazer

0,1,2,3,4
,,,,
,Dependent variable: Number of stations,Dependent variable: Number of stations,Dependent variable: Number of stations,Dependent variable: Number of stations
,,,,
,(1),(2),(3),(4)
,,,,
BG median household income [10k USD],-0.008**,-0.003,-0.017***,-0.012***
,(0.003),(0.003),(0.001),(0.001)
"Average neighbouring income [10k USD], BG/10km",0.052***,,0.052***,
,(0.004),,(0.003),
"Average neighbouring income [10k USD], BG/50km",,0.030***,,0.065***


In [12]:
# Print the LaTeX source of the table currently held in `stargazer`
print(stargazer.render_latex())

\begin{table}[!htbp] \centering
\begin{tabular}{@{\extracolsep{5pt}}lcccc}
\\[-1.8ex]\hline
\hline \\[-1.8ex]
& \multicolumn{4}{c}{\textit{Dependent variable: Number of stations}} \
\cr \cline{2-5}
\\[-1.8ex] & (1) & (2) & (3) & (4) \\
\hline \\[-1.8ex]
 BG median household income [10k USD] & -0.008$^{**}$ & -0.003$^{}$ & -0.017$^{***}$ & -0.012$^{***}$ \\
& (0.003) & (0.003) & (0.001) & (0.001) \\
 Average neighbouring income [10k USD], BG/10km & 0.052$^{***}$ & & 0.052$^{***}$ & \\
& (0.004) & & (0.003) & \\
 Average neighbouring income [10k USD], BG/50km & & 0.030$^{***}$ & & 0.065$^{***}$ \\
& & (0.004) & & (0.004) \\
\hline \\[-1.8ex]
 Observations & 83508 & 83511 & 156258 & 156265 \\
 $R^2$ & 0.010 & 0.009 & 0.014 & 0.014 \\
 Adjusted $R^2$ & 0.009 & 0.008 & 0.014 & 0.013 \\
 Residual Std. Error & 1.508 & 1.509 & 1.706 & 1.706 \\
 F Statistic & 16.193$^{***}$ & 13.808$^{***}$ & 43.892$^{***}$ & 41.864$^{***}$ \\
\hline
\hline \\[-1.8ex]
\textit{Note:} & \multicolumn{4}{r}{$^{*}$p