# Run *# stations ~ income* regressions by cluster and by state

By cluster, we run the regression with local and 10 km neighborhood income, including state fixed effects.

By state, we run two regressions: one with local income only, and one with both local and 10 km neighborhood income.

In [1]:
# Folder holding the input data file. File names are appended to this value by
# plain string concatenation, so a non-empty value must end with a path separator.
data_path = '' # path to data
# Folder under which the 'FigureData/...' CSV outputs are written. File names are
# appended by string concatenation here too, so the same trailing-separator rule applies.
figure_data_folder = '' # where to save data for figures

In [2]:
import pandas as pd
# Turn off pandas' chained-assignment (SettingWithCopy) warning for this notebook.
pd.options.mode.chained_assignment = None 
import geopandas as geopd
import statsmodels.formula.api as smf
import statsmodels.api as sm
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
# Use Arial as the default font family for matplotlib figures.
matplotlib.rcParams['font.family'] = 'Arial'
import us
# Abbreviations of every entry in us.states.STATES. The by-state cells add 'DC'
# on top of this list, so it is presumably not part of STATES -- TODO confirm.
state_list = [state.abbr for state in us.states.STATES]

## Read data

In [3]:
# Load the input table; the identifier columns are read as strings so that they
# are not parsed as numbers.
df_bg_level = pd.read_csv(
    data_path + '23_level_BG_US_imputedquantiles_20250722.zip',
    dtype={'GEOID': str, 'STATEFP': str, 'COUNTYFP': str, 'BGFIPS': str},
    index_col=0,
)

# Left-pad the FIPS codes with zeros to their fixed widths.
for fips_col, width in (('STATEFP', 2), ('COUNTYFP', 5)):
    df_bg_level[fips_col] = df_bg_level[fips_col].str.zfill(width)

# Add rescaled copies of both income columns: '<col>_k' is the value divided by
# 1,000 and '<col>_10k' is the value divided by 10,000.
for income_col in ('median_household_income_imputed', 'av_income_10km_withoutBG_imputed'):
    for suffix, divisor in (('_k', 1000.0), ('_10k', 10000.0)):
        df_bg_level[income_col + suffix] = df_bg_level[income_col] / divisor

## Run regressions

In [4]:
def run_regression(df_data, dep_var, indep_var, covs=None, state_FE=False, which='OLS'):
    """Fit a regression of ``dep_var`` on ``indep_var`` plus optional covariates.

    Rows with a missing value in the dependent variable, the independent
    variable or any covariate are dropped before fitting. An intercept column
    is added with ``sm.add_constant``.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Input data. Must contain ``dep_var``, ``indep_var``, every column in
        ``covs`` and, when ``state_FE`` is truthy, a string column ``STATEFP``.
    dep_var : str
        Name of the dependent-variable column.
    indep_var : str
        Name of the main independent-variable column.
    covs : sequence of str, optional
        Names of additional regressor columns. Defaults to no covariates.
    state_FE : bool, default False
        If truthy, add one 0/1 dummy column per ``STATEFP`` value (the first
        level is dropped) as state fixed effects.
    which : {'OLS', 'Logit'}, default 'OLS'
        Which statsmodels estimator to fit.

    Returns
    -------
    The fitted statsmodels results object returned by ``.fit()``.

    Raises
    ------
    ValueError
        If ``which`` is neither ``'OLS'`` nor ``'Logit'``.
    TypeError
        If ``state_FE`` is truthy and ``STATEFP`` does not hold strings.
    """
    # Validate up front so a typo fails with a clear message instead of an
    # UnboundLocalError after the design matrix has already been built.
    if which not in ('OLS', 'Logit'):
        raise ValueError(f"which must be 'OLS' or 'Logit', got {which!r}")
    # A fresh list per call avoids sharing one mutable default between calls.
    covs = [] if covs is None else list(covs)

    # Prepare data
    regressors = [indep_var] + covs
    df_data_noNaN = df_data.dropna(subset=regressors + [dep_var])
    X = df_data_noNaN[regressors]  # Independent variable(s)
    if state_FE:
        state_codes = df_data_noNaN['STATEFP']
        # The dummy column names are the STATEFP values themselves, so they
        # must be strings; only the first value is inspected.
        if not isinstance(state_codes.iloc[0], str):
            raise TypeError("STATEFP must contain strings to build state fixed effects")
        dummies = pd.get_dummies(state_codes, dtype=int, drop_first=True)
        X = pd.concat([X, dummies], axis=1)
    X = sm.add_constant(X)  # Adds a constant term to the predictor
    Y = df_data_noNaN[dep_var]  # Dependent variable

    # Run model
    if which == 'OLS':
        return sm.OLS(Y, X).fit()
    return sm.Logit(Y, X).fit()

### By Cluster

In [5]:
# Cluster number -> name fragment used in the per-cluster station-count column
# names; a name's position in the list is its cluster number.
label_mapping = {
    'Column Names': dict(enumerate([
        'University_college_campuses',
        'Weekday_daytime_short_stops',
        'City_neighborhoods',
        'Weekday_daytime_long_stops',
        'Gas_stations_very_short_stops',
        'Weekend_evening_recreation',
        'High_density_city',
        'Hotels',
    ])),
}

# Order in which the eight clusters are processed.
cluster_order_numbers8_v3 = [1, 3, 5, 2, 6, 7, 4, 0]

In [6]:
# Run regression for each cluster: OLS of the cluster's station-count column on
# local income and 10km-neighbourhood income (both in units of 10,000), with
# state fixed effects. Keys are cluster numbers; values are the fitted results.
# state_FE is a boolean flag, so pass True rather than a column name.
bg10_all = {
    clustnum: run_regression(
        df_data=df_bg_level,
        dep_var='no_stations_k8_' + label_mapping['Column Names'][clustnum],
        indep_var='median_household_income_imputed_10k',
        covs=['av_income_10km_withoutBG_imputed_10k'],
        state_FE=True,
        which='OLS',
    )
    for clustnum in cluster_order_numbers8_v3
}

In [7]:
# Compile into df to save: for every cluster keep the intercept and the two
# income terms, with their coefficient, p-value and standard error side by
# side, tagged with the cluster number. The state dummies are left out.
reported_terms = ['const', 'median_household_income_imputed_10k', 'av_income_10km_withoutBG_imputed_10k']

cluster_tables = []
for clustnum in range(8):
    fit = bg10_all[clustnum]

    # One single-column frame per statistic, joined column-wise
    cluster_table = pd.concat(
        (
            pd.DataFrame(fit.params.loc[reported_terms], columns=['Coeff']),
            pd.DataFrame(fit.pvalues.loc[reported_terms], columns=['Pvals']),
            pd.DataFrame(fit.bse.loc[reported_terms], columns=['BSE']),
        ),
        axis=1,
    )
    cluster_table['Cluster'] = clustnum
    cluster_tables.append(cluster_table)

# Stack the per-cluster tables on top of each other
savedf = pd.concat(cluster_tables, axis=0)

In [8]:
# Write the stacked per-cluster coefficient table to CSV. The path is built by
# string concatenation; to_csv does not create the 'FigureData' folder.
savedf.to_csv(figure_data_folder + 'FigureData/figure4d_cluster_regression_coefficients.csv')

### By State

In [9]:
# Run regressions for each state (plus DC). For every state two OLS models of
# 'no_stations' are fitted on that state's rows only:
#   R0 - local income only
#   R1 - local and 10km-neighbourhood income
# param_dfs_states[state] holds coefficient / p-value / std-err columns for both
# models, restricted to the intercept and the two income terms. Cells that do
# not exist (the neighbourhood term in R0) are filled with '-'.
reported_terms = ['const', 'median_household_income_imputed_10k', 'av_income_10km_withoutBG_imputed_10k']

param_dfs_states = {}
for state in state_list + ['DC']:
    # DC's FIPS code is hard-coded; every other code comes from the `us` package
    fips = '11' if state == 'DC' else us.states.lookup(state).fips
    df_state = df_bg_level[df_bg_level['STATEFP'] == fips]

    fits = {
        # Local income only
        'R0': run_regression(df_state,
                             'no_stations', 'median_household_income_imputed_10k', [],
                             state_FE=False, which='OLS'),
        # Local and neighborhood income
        'R1': run_regression(df_state,
                             'no_stations', 'median_household_income_imputed_10k',
                             ['av_income_10km_withoutBG_imputed_10k'],
                             state_FE=False, which='OLS'),
    }

    # One three-column table per model, in R0, R1 order
    model_tables = []
    for tag, fit in fits.items():
        model_tables.append(pd.concat((pd.DataFrame(fit.params, columns=[tag+' Coeff']),
                                       pd.DataFrame(fit.pvalues, columns=[tag+' P-values']),
                                       pd.DataFrame(fit.bse, columns=[tag+' std err'])), axis=1))

    combined = pd.concat(model_tables, axis=1, ignore_index=False)
    param_dfs_states[state] = combined.loc[reported_terms, :].fillna('-')
    

In [10]:
def fill_sign_df(rowname, colname, row1, r1, df_input, df_output):
    """Write the sign/significance label of one regression term into ``df_output``.

    The p-value and coefficient are read from ``df_input`` at row ``row1`` and
    columns ``r1 + ' P-values'`` / ``r1 + ' Coeff'``. The cell
    ``df_output.loc[rowname, colname]`` is then set to:

    * ``'NEG'``   - p-value below 0.05 and coefficient below 0
    * ``'POS'``   - p-value below 0.05 and coefficient not below 0
    * ``'INSIG'`` - otherwise

    If the p-value is a string (a placeholder for a term that is not in the
    model), the cell is left untouched.

    Parameters
    ----------
    rowname, colname
        Row and column label of the cell to fill in ``df_output``.
    row1
        Row label of the regression term in ``df_input``.
    r1 : str
        Model prefix of the ``df_input`` columns (e.g. ``'R0'``).
    df_input : pandas.DataFrame
        Table holding the ``'<r1> Coeff'`` and ``'<r1> P-values'`` columns.
    df_output : pandas.DataFrame
        Table to fill; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df_output`` object that was passed in.
    """
    pval = df_input.loc[row1, r1 + ' P-values']

    # isinstance (rather than an exact type comparison) also catches str
    # subclasses such as numpy.str_, which cannot be compared with a float.
    if isinstance(pval, str):
        return df_output

    if pval < 0.05:
        # The coefficient is only needed, and only read, for significant terms.
        label = 'NEG' if df_input.loc[row1, r1 + ' Coeff'] < 0 else 'POS'
    else:
        label = 'INSIG'
    df_output.loc[rowname, colname] = label

    return df_output

In [11]:
# Compile into big df to save: one column per state (plus DC), one row per
# (model, term) combination, each cell holding the label set by fill_sign_df.
# Each spec is (output row, regression term, model prefix).
sign_specs = (
    ('localBG_only', 'median_household_income_imputed_10k', 'R0'),
    ('localBG_10km', 'median_household_income_imputed_10k', 'R1'),
    ('neighBG_10km', 'av_income_10km_withoutBG_imputed_10k', 'R1'),
)

state_columns = []
for state in state_list + ['DC']:
    # Start from an empty single-column frame and fill it row by row
    state_signs = pd.DataFrame(index=[out_row for out_row, _, _ in sign_specs], columns=['All'])
    for out_row, term, model_tag in sign_specs:
        state_signs = fill_sign_df(out_row, 'All', term, model_tag, param_dfs_states[state], state_signs)
    state_columns.append(state_signs.rename(columns={'All': state}))

# Put the per-state columns side by side
state_categories_all = pd.concat(state_columns, axis=1)

state_categories_all.to_csv(figure_data_folder + 'FigureData/figure6_nbdadvantage_significance_bystate.csv')

In [12]:
# Wide table of the R1 (local + 10km-neighbourhood income) coefficients: one
# column per state, one row per term kept in param_dfs_states.
# NOTE(review): this iterates state_list only, so 'DC' -- which is present in
# param_dfs_states -- is not written to this file. Confirm the omission is intended.
coeffs_by_state = pd.DataFrame({state:param_dfs_states[state]['R1 Coeff'] for state in state_list})
coeffs_by_state.to_csv(figure_data_folder + 'FigureData/figure6b_nbdadvantage_coefficients_bystate.csv')