In [1]:
import numpy as np
import pandas as pd

def generate_ar1_data(N, T, rho, mean=0, std_dev=1):
    # Generate random white noise
    white_noise = np.random.normal(mean, std_dev, size=(N, T))

    # Initialize the array to store the data
    data = np.zeros((N, T))

    # Generate the AR(1) process data
    for i in range(N):
        for t in range(T):
            if t == 0:
                data[i, t] = white_noise[i, t]
            else:
                data[i, t] = rho * data[i, t - 1] + white_noise[i, t]

    # Create a DataFrame with column names as time periods
    df = pd.DataFrame(data, columns=[f'{t}' for t in range(T)])

    # Add a new 'state' column with random numbers between 1 and N
    df['state'] = np.arange(1, N + 1)

    melted_df = pd.melt(df, id_vars=['state'], var_name='time', value_name='value')
    
    
    data = melted_df.copy()

    data['time'] = data['time'].astype(int)
    # Create state dummy variables
    state_dummies = pd.get_dummies(data['state'], prefix='state', drop_first = True)

    # Convert state dummy variables to int
    state_dummies = state_dummies.astype(int)

    # Create time dummy variables
    time_dummies = pd.get_dummies(data['time'].astype(int), prefix='time', drop_first = True)

    # Convert time dummy variables to int
    time_dummies = time_dummies.astype(int)

    # Concatenate the dummy variables with the original DataFrame
    data = pd.concat([data, state_dummies, time_dummies], axis=1)

    return data

# Example usage:
N = 50
T = 20
rho = 0.5  # Set your desired autoregressive coefficient
generated_data = generate_ar1_data(N, T, rho)


In [9]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

np.random.seed(42)

beta1_estimates = []
reject_count = 0  # Counter for the number of rejections
alpha = 0.05  # Significance level
num_simulations = 1000
bias_values = []
squared_error_values = []
N = 50
T = 20
rho = 0.6
true_beta1_value = 0


for _ in range(num_simulations):

    data = generate_ar1_data( N, T, rho)
    states = data['state'].unique()

    treatment_states = np.random.choice(states, size=len(states), replace=False)

    # Assign treatment year to each treatment state, staggered between 1985 and 1995
    treatment_years = np.random.choice(range(5, 15), size=len(treatment_states)//2, replace=True)
    state_to_treatment_year = dict(zip(treatment_states, treatment_years))

    # Add a treatment column to the DataFrame
    data['TREATMENT'] = data.apply(lambda x: 1 if x['state'] in treatment_states and x['state'] in state_to_treatment_year and x['time'] >= state_to_treatment_year[x['state']] else 0, axis=1)


    a = state_to_treatment_year.keys()

    filtered_df = data[data['state'].isin(a)]
    

    # Separate pre-treatment and post-treatment data
    pre_treatment_df = filtered_df[filtered_df['TREATMENT'] == 0]
    post_treatment_df = filtered_df[filtered_df['TREATMENT'] == 1]

    # Calculate the average residuals for pre-treatment and post-treatment periods
    avg_residuals_pre_treatment = pre_treatment_df.groupby('state')['value'].mean().reset_index()
    avg_residuals_post_treatment = post_treatment_df.groupby('state')['value'].mean().reset_index()

    # Add a 'Treatment' column to indicate the treatment status for each period
    avg_residuals_pre_treatment['Treatment'] = 0
    avg_residuals_post_treatment['Treatment'] = 1

    # Combine the two DataFrames into a new DataFrame
    two_period_panel_df = pd.concat([avg_residuals_pre_treatment, avg_residuals_post_treatment], ignore_index=True)
    
    # Define the dependent and independent variables
    
    state_dummies = pd.get_dummies(two_period_panel_df['state'], prefix='state', drop_first = True)

    # Convert state dummy variables to int
    state_dummies = state_dummies.astype(int)
    two_period_panel_df = pd.concat([two_period_panel_df, state_dummies], axis=1)

    y = two_period_panel_df['value']
    X = two_period_panel_df.drop(columns=['value'])
    
    # Add a constant term (intercept) to the independent variables
    X = sm.add_constant(X)
    
    # Fit the regression model
    model = sm.OLS(y, X).fit()

    bias = model.params['Treatment'] - true_beta1_value
    
    bias_values.append(bias)

    squared_error = (model.params['Treatment'] - true_beta1_value) ** 2
    
    # Check if null hypothesis for beta1 is rejected
    if model.pvalues['Treatment'] < alpha:
        reject_count += 1



type1_error = reject_count / 1000

average_bias = np.mean(bias_values)
average_mse = np.mean(squared_error)


# Print the number of rejections
print(f"Number of times null hypothesis is rejected for {rho}: {reject_count} out of 1000 simulations")
print(f"Type 1 Error: {type1_error}")
print(f"Bias for Coefficient of Treatment (True Value = {true_beta1_value}): {average_bias}")
print(f"Average MSE for Coefficient of Treatment (True Value = {true_beta1_value}): {average_mse}")

Number of times null hypothesis is rejected for 0.6: 54 out of 1000 simulations
Type 1 Error: 0.054
Bias for Coefficient of Treatment (True Value = 0): -0.005828054621751731
Average MSE for Coefficient of Treatment (True Value = 0): 0.0005666982162468618
