In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from cps_data_agg import (process_cps_data)

# ---------------------------------------------------------------------------
# Simulation setup
# ---------------------------------------------------------------------------

# Input handed to process_cps_data() inside the simulation loop.
file_path = r'C:\Users\sneha\Downloads\cps_00006.csv.gz'

# Experiment parameters.
num_simulations = 400    # number of Monte Carlo replications
num_individuals = 20     # not referenced anywhere else in this cell
alpha = 0.05             # significance level used for the rejection test
true_beta1_value = 0     # value the TREATMENT coefficient is compared against

# Per-replication accumulators, filled by the simulation loop.
beta1_estimates = []          # estimated TREATMENT coefficients
standard_error_values = []    # reported standard errors of that coefficient
bias_values = []              # estimate minus true_beta1_value
squared_error_values = []     # squared deviation from true_beta1_value
reject_count = 0              # replications with p-value below alpha

# Seed NumPy's global generator so the random draws are reproducible.
np.random.seed(42)

# Monte Carlo loop: each replication assigns staggered treatment dates to a
# random half of the states, collapses the treated states to a two-period
# (pre/post) panel of mean residuals, regresses those means on TREATMENT plus
# state dummies, and records the TREATMENT coefficient and its test result.
for _ in range(num_simulations):
    # NOTE(review): the data set is rebuilt on every replication. If
    # process_cps_data() is deterministic this call can be hoisted above the
    # loop; confirm before moving it.
    data = process_cps_data(file_path)
    states = data['STATEFIP'].unique()

    # Shuffle every state and keep the first half as the treated group.
    shuffled_states = np.random.choice(states, size=len(states), replace=False)
    num_treated = len(shuffled_states) // 2
    treatment_states = shuffled_states[:num_treated]

    # Staggered adoption: one start year per treated state, drawn with
    # replacement from range(1985, 1995), i.e. 1985 through 1994 inclusive.
    treatment_years = np.random.choice(range(1985, 1995), size=num_treated, replace=True)
    state_to_treatment_year = dict(zip(treatment_states, treatment_years))

    # TREATMENT is 1 from a treated state's start year onward and 0 elsewhere.
    # States without a start year map to NaN, and any comparison against NaN
    # is False, so they stay at 0.
    adoption_year = data['STATEFIP'].map(state_to_treatment_year)
    data['TREATMENT'] = (data['YEAR'] >= adoption_year).astype(int)

    # Only the treated states enter the regression sample.
    filtered_df = data[data['STATEFIP'].isin(treatment_states)]

    # Collapse to one row per (period, state): the mean of 'Residuals' over
    # the pre-treatment (TREATMENT == 0) and post-treatment (TREATMENT == 1)
    # observations of each state.
    two_period_panel_df = (
        filtered_df.groupby(['TREATMENT', 'STATEFIP'])['Residuals']
        .mean()
        .reset_index()
    )

    # State fixed effects as 0/1 dummies; the first state is the baseline.
    state_dummies = pd.get_dummies(
        two_period_panel_df['STATEFIP'], prefix='State', drop_first=True
    ).astype(int)

    # Only TREATMENT and the state dummies enter the design matrix. The raw
    # STATEFIP code is excluded because it is an exact linear combination of
    # the intercept and the dummies and would make the matrix rank-deficient.
    y = two_period_panel_df['Residuals']
    X = pd.concat([two_period_panel_df[['TREATMENT']], state_dummies], axis=1)
    X = sm.add_constant(X)

    # Fit the regression model
    model = sm.OLS(y, X).fit()

    # Record this replication's results.
    beta1_hat = model.params['TREATMENT']
    estimation_error = beta1_hat - true_beta1_value

    standard_error_values.append(model.bse['TREATMENT'])
    bias_values.append(estimation_error)
    squared_error_values.append(estimation_error ** 2)

    # Count a rejection when the TREATMENT p-value falls below alpha.
    if model.pvalues['TREATMENT'] < alpha:
        reject_count += 1

    beta1_estimates.append(beta1_hat)

# ---------------------------------------------------------------------------
# Summary statistics over all replications
# ---------------------------------------------------------------------------
type1_error = reject_count / num_simulations           # rejection frequency
average_bias = np.mean(bias_values)
average_mse = np.mean(squared_error_values)
average_standard_error = np.mean(standard_error_values)

# Spread of the simulated estimates (np.std with its default ddof).
std_error_beta_distribution = np.std(beta1_estimates)

# Interval of +/- 1.96 standard deviations around the mean estimate.
mean_beta1_estimate = np.mean(beta1_estimates)
ci_half_width = 1.96 * std_error_beta_distribution
confidence_interval = (
    mean_beta1_estimate - ci_half_width,
    mean_beta1_estimate + ci_half_width,
)
ci_lower, ci_upper = confidence_interval

# ---------------------------------------------------------------------------
# Figure 1: histogram of the estimates with a KDE overlay
# ---------------------------------------------------------------------------
sns.histplot(beta1_estimates, kde=True)
plt.title('Distribution of Beta1 Estimates')
plt.xlabel('Beta1 Estimates')
plt.ylabel('Frequency')
plt.show()

# ---------------------------------------------------------------------------
# Figure 2: density histogram with the mean and the interval bounds marked
# ---------------------------------------------------------------------------
plt.figure(figsize=(10, 6))
plt.hist(beta1_estimates, bins=30, density=True, color='blue', alpha=0.7)
plt.axvline(mean_beta1_estimate, color='red', linestyle='dashed', linewidth=2, label='Mean Estimate')
# Only the lower bound carries a label so the legend shows a single CI entry.
plt.axvline(ci_lower, color='green', linestyle='dashed', linewidth=2, label='95% CI')
plt.axvline(ci_upper, color='green', linestyle='dashed', linewidth=2)
plt.title('Distribution and Confidence Interval of Treatment Coefficient')
plt.xlabel('Treatment Coefficient Estimate')
plt.ylabel('Density')
plt.legend()
plt.show()

# ---------------------------------------------------------------------------
# Text report
# ---------------------------------------------------------------------------
report_lines = [
    f"Number of times null hypothesis is rejected: {reject_count} out of {num_simulations} simulations",
    f"Type 1 Error: {type1_error}",
    f"Bias for Coefficient of Treatment (True Value = {true_beta1_value}): {average_bias}",
    f"Average MSE for Coefficient of Treatment (True Value = {true_beta1_value}): {average_mse}",
    f"Average Standard Error for Coefficient of Treatment: {average_standard_error}",
    f"Standard Error of the Distribution of Beta: {std_error_beta_distribution}",
]
for report_line in report_lines:
    print(report_line)

ModuleNotFoundError: No module named 'cps_data_agg'