In [16]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from data_gen_Monte_Carlo_AR1 import generate_staggered_law_ar1_data

n_simulations = 100 # Define the number of simulations
reject_count = 0     # Initialize the count of rejections

num_individuals = 500
T = 20
rho = 0.8
N = 50
for simulation in range(n_simulations):
   data = generate_staggered_law_ar1_data(N, T, rho, num_individuals)  # Replace with your data generation function

   cps_agg = data.groupby(['state', 'time'])[['value', 'TREATMENT']].mean().reset_index()

   # One-hot encode STATEFIP and YEAR
   dummy_df_state = pd.get_dummies(cps_agg['state'], prefix='state', drop_first=True)
   dummy_df_year = pd.get_dummies(cps_agg['time'], prefix='time', drop_first=True)

   # Concatenate the dummy variables with the original DataFrame
   cps_agg = pd.concat([cps_agg, dummy_df_state, dummy_df_year], axis=1)

   # Convert True and False to 1 and 0 in the specified columns
   boolean_columns = ['state_2', 'state_3', 'state_4', 'state_5',
      'state_6', 'state_7', 'state_8', 'state_9', 'state_10', 'state_11',
      'state_12', 'state_13', 'state_14', 'state_15', 'state_16', 'state_17',
      'state_18', 'state_19', 'state_20', 'state_21', 'state_22', 'state_23',
      'state_24', 'state_25', 'state_26', 'state_27', 'state_28', 'state_29',
      'state_30', 'state_31', 'state_32', 'state_33', 'state_34', 'state_35',
      'state_36', 'state_37', 'state_38', 'state_39', 'state_40', 'state_41',
      'state_42', 'state_43', 'state_44', 'state_45', 'state_46', 'state_47',
      'state_48', 'state_49', 'state_50', 'time_1', 'time_2', 'time_3',
      'time_4', 'time_5', 'time_6', 'time_7', 'time_8', 'time_9', 'time_10',
      'time_11', 'time_12', 'time_13', 'time_14', 'time_15', 'time_16',
      'time_17', 'time_18', 'time_19']

   cps_agg[boolean_columns] = cps_agg[boolean_columns].astype(int)

   data = cps_agg.copy()


   # First stage OLS regression
   model = smf.ols(formula="value ~ TREATMENT + 1", data=data).fit()
   
   # Extract residuals
   data['u'] = model.resid

   # Sorting and lagging
   df = data.copy()
   df.sort_values(['state', 'time'], inplace=True)
   df['u_1'] = df.groupby('state')['u'].shift(1)
   df = df.dropna()

   # Estimate AR(1) coefficient
   model_ar = sm.OLS(df['u'], sm.add_constant(df['u_1'])).fit()
   rho_estimate = model_ar.params['u_1']

   # Generate state and time dummies
   state_dummies = pd.get_dummies(data['state'], prefix='state', drop_first=True)
   time_dummies = pd.get_dummies(data['time'].astype(int), prefix='time', drop_first=True)

   # Concatenate dummies with the data
   data = pd.concat([data, state_dummies, time_dummies], axis=1)

   # Transformation for FGLS
   data['Y_lag'] = data.groupby('state')['value'].shift(1)
   data['Y_transformed'] = data['value'] - rho_estimate * data['Y_lag']
   
   data = data.dropna()

   boolean_columns = ['state_2', 'state_3', 'state_4', 'state_5',
      'state_6', 'state_7', 'state_8', 'state_9', 'state_10', 'state_11',
      'state_12', 'state_13', 'state_14', 'state_15', 'state_16', 'state_17',
      'state_18', 'state_19', 'state_20', 'state_21', 'state_22', 'state_23',
      'state_24', 'state_25', 'state_26', 'state_27', 'state_28', 'state_29',
      'state_30', 'state_31', 'state_32', 'state_33', 'state_34', 'state_35',
      'state_36', 'state_37', 'state_38', 'state_39', 'state_40', 'state_41',
      'state_42', 'state_43', 'state_44', 'state_45', 'state_46', 'state_47',
      'state_48', 'state_49', 'state_50', 'time_1', 'time_2', 'time_3',
      'time_4', 'time_5', 'time_6', 'time_7', 'time_8', 'time_9', 'time_10',
      'time_11', 'time_12', 'time_13', 'time_14', 'time_15', 'time_16',
      'time_17', 'time_18', 'time_19']

   data[boolean_columns] = data[boolean_columns].astype(int)

   # Define the model
   X = data[['TREATMENT',  'state_2', 'state_3', 'state_4', 'state_5',
      'state_6', 'state_7', 'state_8', 'state_9', 'state_10', 'state_11',
      'state_12', 'state_13', 'state_14', 'state_15', 'state_16', 'state_17',
      'state_18', 'state_19', 'state_20', 'state_21', 'state_22', 'state_23',
      'state_24', 'state_25', 'state_26', 'state_27', 'state_28', 'state_29',
      'state_30', 'state_31', 'state_32', 'state_33', 'state_34', 'state_35',
      'state_36', 'state_37', 'state_38', 'state_39', 'state_40', 'state_41',
      'state_42', 'state_43', 'state_44', 'state_45', 'state_46', 'state_47',
      'state_48', 'state_49', 'state_50', 'time_1', 'time_2', 'time_3',
      'time_4', 'time_5', 'time_6', 'time_7', 'time_8', 'time_9', 'time_10',
      'time_11', 'time_12', 'time_13', 'time_14', 'time_15', 'time_16',
      'time_17', 'time_18', 'time_19']]  # List all independent variables including dummies
   

   X = sm.add_constant(X)
   Y = data['Y_transformed']


   # FGLS regression
   model_fgls = sm.OLS(Y, X).fit()

   # Check the p-value for the treatment effect
   p_value = model_fgls.pvalues['TREATMENT']
   if p_value < 0.05:
      reject_count += 1

# Output the count of rejections
print(f"Number of rejections (out of {n_simulations} simulations): {reject_count}")

   
   

   

Number of rejections (out of 100 simulations): 5
