In [4]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np
from cps_data_agg import process_cps_data


file_path = r'C:\Users\Biswajit Palit\Downloads\cps_00006.csv.gz'

n_simulations = 100 # Define the number of simulations
reject_count = 0     # Initialize the count of rejections

num_individuals = 500
T = 20
rho = 0.8
N = 50
for simulation in range(n_simulations):
    data = process_cps_data(file_path)  # Replace with your data generation function

    states = data['STATEFIP'].unique()

    treatment_states = np.random.choice(states, size=len(states)//2, replace=False)

    # Assign treatment year to each treatment state, staggered between 1985 and 1995
    treatment_years = np.random.choice(range(1985, 1996), size=len(treatment_states), replace=True)
    state_to_treatment_year = dict(zip(treatment_states, treatment_years))

    # Add a treatment column to the DataFrame
    data['TREATMENT'] = data.apply(lambda x: 1 if x['STATEFIP'] in treatment_states and x['YEAR'] >= state_to_treatment_year[x['STATEFIP']] else 0, axis=1)


    # First stage OLS regression
    model = smf.ols(formula="Residuals ~ TREATMENT + 1", data=data).fit()

    # Extract residuals
    data['u'] = model.resid

    # Sorting and lagging
    df = data.copy()
    df.sort_values(['STATEFIP', 'YEAR'], inplace=True)
    df['u_1'] = df.groupby('STATEFIP')['u'].shift(1)
    df = df.dropna()

    # Estimate AR(1) coefficient
    model_ar = sm.OLS(df['u'], sm.add_constant(df['u_1'])).fit()
    rho_estimate = model_ar.params['u_1']

    # Generate state and time dummies
    state_dummies = pd.get_dummies(data['STATEFIP'], prefix='state', drop_first=True)
    time_dummies = pd.get_dummies(data['YEAR'].astype(int), prefix='time', drop_first=True)

    # Concatenate dummies with the data
    data = pd.concat([data, state_dummies, time_dummies], axis=1)

    # Transformation for FGLS
    data['Y_lag'] = data.groupby('STATEFIP')['Residuals'].shift(1)
    data['Y_transformed'] = data['Residuals'] - rho_estimate * data['Y_lag']

    data = data.dropna()

    boolean_columns = ['STATEFIP_2',
    'STATEFIP_4', 'STATEFIP_5', 'STATEFIP_6', 'STATEFIP_8', 'STATEFIP_9',
    'STATEFIP_10', 'STATEFIP_12', 'STATEFIP_13', 'STATEFIP_15',
    'STATEFIP_16', 'STATEFIP_17', 'STATEFIP_18', 'STATEFIP_19',
    'STATEFIP_20', 'STATEFIP_21', 'STATEFIP_22', 'STATEFIP_23',
    'STATEFIP_24', 'STATEFIP_25', 'STATEFIP_26', 'STATEFIP_27',
    'STATEFIP_28', 'STATEFIP_29', 'STATEFIP_30', 'STATEFIP_31',
    'STATEFIP_32', 'STATEFIP_33', 'STATEFIP_34', 'STATEFIP_35',
    'STATEFIP_36', 'STATEFIP_37', 'STATEFIP_38', 'STATEFIP_39',
    'STATEFIP_40', 'STATEFIP_41', 'STATEFIP_42', 'STATEFIP_44',
    'STATEFIP_45', 'STATEFIP_46', 'STATEFIP_47', 'STATEFIP_48',
    'STATEFIP_49', 'STATEFIP_50', 'STATEFIP_51', 'STATEFIP_53',
    'STATEFIP_54', 'STATEFIP_55', 'STATEFIP_56', 'YEAR_1981', 'YEAR_1982',
    'YEAR_1983', 'YEAR_1984', 'YEAR_1985', 'YEAR_1986', 'YEAR_1987',
    'YEAR_1988', 'YEAR_1989', 'YEAR_1990', 'YEAR_1991', 'YEAR_1992',
    'YEAR_1993', 'YEAR_1994', 'YEAR_1995', 'YEAR_1996', 'YEAR_1997',
    'YEAR_1998', 'YEAR_1999', 'YEAR_2000']

    data[boolean_columns] = data[boolean_columns].astype(int)

    # Define the model
    X = data[['TREATMENT',  'STATEFIP_2',
    'STATEFIP_4', 'STATEFIP_5', 'STATEFIP_6', 'STATEFIP_8', 'STATEFIP_9',
    'STATEFIP_10', 'STATEFIP_12', 'STATEFIP_13', 'STATEFIP_15',
    'STATEFIP_16', 'STATEFIP_17', 'STATEFIP_18', 'STATEFIP_19',
    'STATEFIP_20', 'STATEFIP_21', 'STATEFIP_22', 'STATEFIP_23',
    'STATEFIP_24', 'STATEFIP_25', 'STATEFIP_26', 'STATEFIP_27',
    'STATEFIP_28', 'STATEFIP_29', 'STATEFIP_30', 'STATEFIP_31',
    'STATEFIP_32', 'STATEFIP_33', 'STATEFIP_34', 'STATEFIP_35',
    'STATEFIP_36', 'STATEFIP_37', 'STATEFIP_38', 'STATEFIP_39',
    'STATEFIP_40', 'STATEFIP_41', 'STATEFIP_42', 'STATEFIP_44',
    'STATEFIP_45', 'STATEFIP_46', 'STATEFIP_47', 'STATEFIP_48',
    'STATEFIP_49', 'STATEFIP_50', 'STATEFIP_51', 'STATEFIP_53',
    'STATEFIP_54', 'STATEFIP_55', 'STATEFIP_56', 'YEAR_1981', 'YEAR_1982',
    'YEAR_1983', 'YEAR_1984', 'YEAR_1985', 'YEAR_1986', 'YEAR_1987',
    'YEAR_1988', 'YEAR_1989', 'YEAR_1990', 'YEAR_1991', 'YEAR_1992',
    'YEAR_1993', 'YEAR_1994', 'YEAR_1995', 'YEAR_1996', 'YEAR_1997',
    'YEAR_1998', 'YEAR_1999', 'YEAR_2000']]  # List all independent variables including dummies


    X = sm.add_constant(X)
    Y = data['Y_transformed']


    # FGLS regression
    model_fgls = sm.OLS(Y, X).fit()

    # Check the p-value for the treatment effect
    p_value = model_fgls.pvalues['TREATMENT']
    if p_value < 0.05:
        reject_count += 1

# Output the count of rejections
print(f"Number of rejections (out of {n_simulations} simulations): {reject_count}")

   
   

   

Number of rejections (out of 100 simulations): 0
