In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [2]:
def generate_powercheck_data(N, T, rho, num_individuals, mean=0, std_dev=1, rng=None):
    """Simulate a long-format panel of AR(1) series with state and time dummies.

    For each of ``N`` states and ``num_individuals`` individuals per state a
    series of length ``T`` is generated as ``y[0] = e[0]`` and
    ``y[t] = rho * y[t - 1] + e[t]``, where ``e`` is i.i.d. normal noise with
    the given ``mean`` and ``std_dev``.

    Parameters
    ----------
    N : int
        Number of states (labelled 1..N in the ``state`` column).
    T : int
        Number of time periods (labelled 0..T-1 in the ``time`` column).
    rho : float
        AR(1) coefficient applied to the previous period's value.
    num_individuals : int
        Individuals per state (labelled 1..num_individuals).
    mean, std_dev : float, optional
        Location and scale of the normal innovations.
    rng : optional
        Object exposing ``normal(loc, scale, size=...)``, e.g.
        ``np.random.default_rng(seed)``. When omitted, the global ``np.random``
        state is used, so existing callers behave as before.

    Returns
    -------
    pandas.DataFrame
        One row per (time, state, individual) combination with columns
        ``state``, ``individual``, ``time``, ``value``, followed by integer
        0/1 dummy columns ``state_2..state_N`` and ``time_1..time_{T-1}``
        (the first level of each is dropped).
    """
    noise_source = np.random if rng is None else rng
    white_noise = noise_source.normal(mean, std_dev, size=(N, num_individuals, T))

    # Only the time recursion is inherently sequential; every (state, individual)
    # pair is updated at once, which performs the same arithmetic per element
    # as a scalar triple loop.
    series = np.zeros((N, num_individuals, T))
    if T > 0:
        series[:, :, 0] = white_noise[:, :, 0]
    for t in range(1, T):
        series[:, :, t] = rho * series[:, :, t - 1] + white_noise[:, :, t]

    # Wide layout: one row per (state, individual), one column per period.
    wide = pd.DataFrame(
        series.reshape((N * num_individuals, T)),
        columns=[f'{t}' for t in range(T)],
    )
    wide['state'] = np.repeat(np.arange(1, N + 1), num_individuals)
    wide['individual'] = np.tile(np.arange(1, num_individuals + 1), N)

    # Long layout: the period labels become a 'time' column. They were string
    # column names above, so they are cast back to int here.
    data = pd.melt(wide, id_vars=['state', 'individual'], var_name='time', value_name='value')
    data['time'] = data['time'].astype(int)

    # Fixed-effect dummies as ints; the first level is dropped as the baseline.
    state_dummies = pd.get_dummies(data['state'], prefix='state', drop_first=True).astype(int)
    time_dummies = pd.get_dummies(data['time'], prefix='time', drop_first=True).astype(int)

    return pd.concat([data, state_dummies, time_dummies], axis=1)

In [6]:
# Monte Carlo rejection-rate check.
# Each replication simulates an AR(1) panel, marks a random half of the states
# as treated after a fixed period, scales the treated values, and regresses the
# outcome on the treatment indicator plus state and time dummies. The cell
# counts how often the 'Treatment' coefficient is significant at level `alpha`.
N_SIMULATIONS = 400          # number of Monte Carlo replications
N_STATES = 50
N_PERIODS = 20
N_INDIVIDUALS = 20           # individuals simulated per state
AR1_RHO = 0.44               # AR(1) coefficient of the simulated series
TREATMENT_MULTIPLIER = 1.02  # treated observations are scaled by this factor
SEED = 42

reject_count = 0  # Counter for the number of rejections
alpha = 0.05  # Significance level

# Both the data generator and np.random.choice draw from NumPy's global RNG,
# so seeding it once makes the reported count reproducible on re-run.
np.random.seed(SEED)

for _ in range(N_SIMULATIONS):

    data = generate_powercheck_data(N_STATES, N_PERIODS, AR1_RHO, N_INDIVIDUALS)

    # Step 1: fixed intervention period and a random half of the states
    intervention_year = 10
    unique_states = data['state'].unique()
    selected_industries = np.random.choice(unique_states, size=len(unique_states) // 2, replace=False)

    # Step 2: treatment = selected state AND period strictly after the intervention
    data['Treatment'] = (data['state'].isin(selected_industries) & (data['time'] > intervention_year)).astype(int)

    # Vectorised equivalent of a per-row "scale if treated" rule.
    # NOTE(review): the generator is called with its default mean=0, so scaling
    # by TREATMENT_MULTIPLIER rescales treated values around zero rather than
    # shifting their mean - confirm this is the intended treatment effect.
    data['outcome'] = np.where(data['Treatment'] == 1, data['value'] * TREATMENT_MULTIPLIER, data['value'])

    # Step 3: OLS of the outcome on treatment plus state and time fixed effects.
    # The dummy columns are taken from the generated frame (state dummies first,
    # then time dummies) so the regressor list follows N_STATES / N_PERIODS.
    dummy_columns = [col for col in data.columns if col.startswith(('state_', 'time_'))]
    X_second_regression = sm.add_constant(data[['Treatment'] + dummy_columns])
    Y_second_regression = data['outcome']
    # NOTE(review): .fit() uses the default (non-robust) covariance although the
    # simulated series are serially correlated - confirm whether cluster-robust
    # standard errors are wanted here.
    model_second_regression = sm.OLS(Y_second_regression, X_second_regression).fit()

    # Check if null hypothesis for the treatment coefficient is rejected
    if model_second_regression.pvalues['Treatment'] < alpha:
        reject_count += 1

# Print the number of rejections
print(f"Number of times null hypothesis is rejected: {reject_count} out of {N_SIMULATIONS} simulations")


Number of times null hypothesis is rejected: 75 out of 400 simulations
