In [22]:
import numpy as np
import pandas as pd

def generate_powercheck_data(N, T, rho, num_individuals, mean=0, std_dev=1):
    # Generate random white noise for each individual
    white_noise = np.random.normal(mean, std_dev, size=(N, num_individuals, T))

    # Initialize the array to store the data
    data = np.zeros((N, num_individuals, T))

    # Generate the AR(1) process data for each individual
    for i in range(N):
        for j in range(num_individuals):
            for t in range(T):
                if t == 0:
                    data[i, j, t] = white_noise[i, j, t]
                else:
                    data[i, j, t] = rho * data[i, j, t - 1] + white_noise[i, j, t]

    # Reshape the data array for easier DataFrame creation
    reshaped_data = data.reshape((N * num_individuals, T))

    # Create a DataFrame with column names as time periods
    df = pd.DataFrame(reshaped_data, columns=[f'{t}' for t in range(T)])

    # Add a new 'state' column with repeated state values
    df['state'] = np.repeat(np.arange(1, N + 1), num_individuals)

    # Add a new 'individual' column with repeated individual values
    df['individual'] = np.tile(np.arange(1, num_individuals + 1), N)

    melted_df = pd.melt(df, id_vars=['state', 'individual'], var_name='time', value_name='value')

    # Convert the 'time' column to int
    melted_df['time'] = melted_df['time'].astype(int)

    data = melted_df.copy()

    data['time'] = data['time'].astype(int)
    # Create state dummy variables
    state_dummies = pd.get_dummies(data['state'], prefix='state', drop_first = True)

    # Convert state dummy variables to int
    state_dummies = state_dummies.astype(int)

    # Create time dummy variables
    time_dummies = pd.get_dummies(data['time'].astype(int), prefix='time', drop_first = True)

    # Convert time dummy variables to int
    time_dummies = time_dummies.astype(int)

    data = pd.concat([data, state_dummies, time_dummies], axis=1)

    states = data['state'].unique()

    # Randomly select half of the states to be in the treatment group
    
    treatment_states = np.random.choice(states, size=len(states)//2, replace=False)

    # Assign treatment year to each treatment state, staggered between 1985 and 1995
    treatment_years = np.random.choice(range(5, 15), size=len(treatment_states), replace=True)
    state_to_treatment_year = dict(zip(treatment_states, treatment_years))

    # Add a treatment column to the DataFrame
    data['TREATMENT'] = data.apply(lambda x: 1 if x['state'] in treatment_states and x['time'] >= state_to_treatment_year[x['state']] else 0, axis=1)
    data['outcome'] = data.apply(lambda x: x['value']*(1.02) if x['TREATMENT'] == 1 else x['value'], axis=1)
   
    return data

In [30]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import scipy.stats as stats

np.random.seed(42)


def beta_Ztest_twosided(n, alpha, sigma, mu_0, mu):
    # (1-alpha/2)-quantile of N(0,1):
    z_upper = stats.norm.ppf(1 - alpha/2)
    
    # location shift under H_1:
    location_shift = np.sqrt(n) * (mu - mu_0) / sigma
    
    # compute power
    power = 1 - stats.norm.cdf(z_upper - location_shift)
    
    return power

beta1_estimates = []
reject_count = 0  # Counter for the number of rejections
reject_count1 = 0
alpha = 0.05  # Significance level
num_simulations = 400
bias_values = []
squared_error_values = []
N = 50
T = 20
rho = 0.2
num_individuals = 100

true_beta1_value = 0
standard_error_values = []
power_values = []  # Store the power values for each iteration

for _ in range(num_simulations):
    data = generate_powercheck_data(N, T, rho, num_individuals)

    # Step 3: Perform the OLS regression
    X = data[['TREATMENT', 'state_2', 'state_3', 'state_4', 'state_5',
              'state_6', 'state_7', 'state_8', 'state_9', 'state_10', 'state_11',
              'state_12', 'state_13', 'state_14', 'state_15', 'state_16', 'state_17',
              'state_18', 'state_19', 'state_20', 'state_21', 'state_22', 'state_23',
              'state_24', 'state_25', 'state_26', 'state_27', 'state_28', 'state_29',
              'state_30', 'state_31', 'state_32', 'state_33', 'state_34', 'state_35',
              'state_36', 'state_37', 'state_38', 'state_39', 'state_40', 'state_41',
              'state_42', 'state_43', 'state_44', 'state_45', 'state_46', 'state_47',
              'state_48', 'state_49', 'state_50', 'time_1', 'time_2', 'time_3',
              'time_4', 'time_5', 'time_6', 'time_7', 'time_8', 'time_9', 'time_10',
              'time_11', 'time_12', 'time_13', 'time_14', 'time_15', 'time_16',
              'time_17', 'time_18', 'time_19']]  # plus any other control variables
    X = sm.add_constant(X)
    Y = data['value']  # Replace 'outcome' with your dependent variable
    model = sm.OLS(Y, X).fit()

    standard_error = model.bse['TREATMENT']
    standard_error_values.append(standard_error)

    beta1_estimates.append(model.params['TREATMENT'])

    # Check if null hypothesis for beta1 is rejected
    if model.pvalues['TREATMENT'] < alpha:
        reject_count += 1

    # Calculate and store the statistical power for each iteration
    power = beta_Ztest_twosided(N, alpha, standard_error, true_beta1_value, model.params['TREATMENT'])
    power_values.append(power)

type1_error = reject_count / num_simulations
average_power = np.mean(power_values)
print(f"Type I Error Rate: {type1_error}")
print(f"Average Power: {average_power}")


Type I Error Rate: 0.1075
Average Power: 0.4095125571868492
