# Latest system for difference-in-differences estimation

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats

In [None]:
# Define function to generate synthetic data with known true ATE


def generate_data_with_true_ate(n_samples, true_ate, seed):
    """Generate one synthetic sample for the difference-in-differences simulation.

    Parameters
    ----------
    n_samples : int
        Number of individuals (rows) to generate.
    true_ate : float
        Value returned unchanged as the second element of the result.
        NOTE(review): it is not used when generating any column; the only
        treatment-related effect in the data is the fixed 10% scaling of the
        male wage where ``interaction_effect == 1``. Confirm this is intended.
    seed : int
        Seed for the random draws; the same seed always yields the same data.

    Returns
    -------
    tuple
        ``(data, true_ate)`` where ``data`` is a DataFrame with the columns
        ``individual``, ``Age``, ``time``, ``dependent_variable``,
        ``education_level_encoded``, ``FQ_encoded``, ``Reform_encoded`` and
        ``interaction_effect``.
    """
    # A private RandomState produces exactly the stream that
    # ``np.random.seed(seed)`` + ``np.random.*`` would, but leaves the
    # process-wide NumPy generator untouched.
    rng = np.random.RandomState(seed)

    # The order of the draws below fixes the values obtained for a given seed,
    # so it must not be rearranged.
    control_data = pd.DataFrame(
        {
            "individual": range(1, n_samples + 1),
            "Age": rng.randint(20, 65, size=n_samples),
            "WagePartner_income": rng.normal(
                loc=30000,
                scale=5000,
                size=n_samples,
            ),
            "education_level": rng.choice(
                ["No High School", "High School", "Bachelor", "Master", "PhD"],
                size=n_samples,
            ),
            # Negative values are treated as pre-period and positive values as
            # post-period by the estimator; 0 is never drawn.
            "time": rng.choice([-2, -1, 1, 2, 3, 4, 5, 6, 7, 8], size=n_samples),
        },
    )

    categorical_data = pd.DataFrame(
        {
            "individual": range(1, n_samples + 1),
            "FQ": rng.choice(["Low", "High"], size=n_samples),
            "Reform": rng.choice(["Before", "After"], size=n_samples),
        },
    )

    # Binary encodings of the two categorical indicators and their product.
    categorical_data["FQ_encoded"] = categorical_data["FQ"].map({"Low": 0, "High": 1})
    categorical_data["Reform_encoded"] = categorical_data["Reform"].map(
        {"Before": 0, "After": 1},
    )
    categorical_data["interaction_effect"] = (
        categorical_data["FQ_encoded"] * categorical_data["Reform_encoded"]
    )

    # Male wage is scaled up by 10% where both indicators equal 1.
    control_data["wage_year_male"] = rng.normal(
        loc=30,
        scale=10,
        size=n_samples,
    ) * (1 + 0.1 * categorical_data["interaction_effect"])
    control_data["wage_year_female"] = control_data["WagePartner_income"]
    # Outcome analysed by the estimator: male wage minus female wage.
    control_data["dependent_variable"] = (
        control_data["wage_year_male"] - control_data["wage_year_female"]
    )

    # Ordinal encoding of the education level.
    education_encoding = {
        "No High School": 0,
        "High School": 1,
        "Bachelor": 2,
        "Master": 3,
        "PhD": 4,
    }
    control_data["education_level_encoded"] = control_data["education_level"].map(
        education_encoding,
    )

    data = pd.merge(control_data, categorical_data, on="individual")
    # Keep only encoded variables and the outcome; drop raw labels and the
    # intermediate wage columns.
    data = data.drop(
        [
            "Reform",
            "FQ",
            "education_level",
            "WagePartner_income",
            "wage_year_male",
            "wage_year_female",
        ],
        axis=1,
    )

    return data, true_ate

In [None]:
# Function to compute Difference-in-Differences (DiD) with known true ATE


def difference_in_differences_known_ate(data, true_ate):
    """Estimate the ATE as a 2x2 difference in differences of group means.

    Rows with ``Reform_encoded == 1`` form the treatment group and rows with
    ``Reform_encoded == 0`` the control group. Rows with ``time < 0`` count as
    pre-treatment and rows with ``time > 0`` as post-treatment; rows with
    ``time == 0`` are used in neither period.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``Reform_encoded``, ``time`` and
        ``dependent_variable``.
    true_ate : float
        Passed through unchanged as the first element of the result.

    Returns
    -------
    tuple
        ``(true_ate, ate)`` where ``ate`` is the post-period gap between the
        treatment and control means minus the pre-period gap.
    """
    outcome = "dependent_variable"

    is_treated = data["Reform_encoded"] == 1
    is_control = data["Reform_encoded"] == 0
    is_pre = data["time"] < 0
    is_post = data["time"] > 0

    # Mean outcome in each of the four group-by-period cells.
    treated_pre_mean = data.loc[is_treated & is_pre, outcome].mean()
    treated_post_mean = data.loc[is_treated & is_post, outcome].mean()
    control_pre_mean = data.loc[is_control & is_pre, outcome].mean()
    control_post_mean = data.loc[is_control & is_post, outcome].mean()

    # Treatment-minus-control gap in each period.
    gap_before = treated_pre_mean - control_pre_mean
    gap_after = treated_post_mean - control_post_mean

    return true_ate, gap_after - gap_before

In [None]:
# Settings for the Monte Carlo experiment with a known true ATE.
true_ate = 0.2  # value treated as the true ATE in every replication
seed_value = 634  # base seed; replication i is seeded with seed_value + i
num_simulations = 1000  # number of Monte Carlo replications

# Filled by the simulation loop with one (true_ate, estimated_ate) pair per run.
ate_results_with_true_ate = []

In [None]:
# Run the Monte Carlo replications: one synthetic sample and one DiD estimate
# per iteration. Results are gathered in a fresh list so that re-running this
# cell does not try to append to the ndarray created further down.
replication_results = []
for i in range(num_simulations):
    seed = seed_value + i  # distinct, reproducible seed for each replication
    synthetic_data, true_ate = generate_data_with_true_ate(
        n_samples=1000,
        true_ate=true_ate,
        seed=seed,
    )
    true_ate, ate = difference_in_differences_known_ate(synthetic_data, true_ate)
    replication_results.append((true_ate, ate))

# Column 0 holds the true ATE, column 1 the estimated ATE of each replication.
ate_results_with_true_ate = np.array(replication_results)
estimated_ates = ate_results_with_true_ate[:, 1]

# Mean of the estimates and the standard error of that mean.
mean_ate_with_true_ate = np.mean(estimated_ates)
std_err_ate_with_true_ate = np.std(estimated_ates, ddof=1) / np.sqrt(
    num_simulations,
)

# t-statistic for the null hypothesis that the mean estimated ATE is zero.
t_value_with_true_ate = mean_ate_with_true_ate / std_err_ate_with_true_ate

# Two-sided p-value. The lower-tail CDF alone is not a p-value: it tends to 1
# for large positive t, which would hide a clearly non-zero mean.
degrees_of_freedom = num_simulations - 1
p_value_with_true_ate = 2 * stats.t.sf(
    np.abs(t_value_with_true_ate),
    df=degrees_of_freedom,
)

# 95% interval of the simulation's margin of error, centred on the true ATE.
critical_value_with_true_ate = stats.t.ppf((1 + 0.95) / 2, df=degrees_of_freedom)
margin_of_error_with_true_ate = critical_value_with_true_ate * std_err_ate_with_true_ate
lower_bound_with_true_ate = true_ate - margin_of_error_with_true_ate
upper_bound_with_true_ate = true_ate + margin_of_error_with_true_ate

# 95% confidence interval centred on the mean simulated ATE. The mean, standard
# error and critical value are the same quantities as above, so they are reused.
mean_ate_simulated = mean_ate_with_true_ate
std_err_ate_simulated = std_err_ate_with_true_ate
critical_value_simulated = critical_value_with_true_ate
margin_of_error_simulated = critical_value_simulated * std_err_ate_simulated
lower_bound_simulated = mean_ate_simulated - margin_of_error_simulated
upper_bound_simulated = mean_ate_simulated + margin_of_error_simulated

In [None]:
# Print the simulation summary, one "label value" line per statistic.
summary_rows = [
    ("Mean ATE with known true ATE:", mean_ate_with_true_ate),
    ("Standard Error of ATE with known true ATE:", std_err_ate_with_true_ate),
    ("t-value with known true ATE:", t_value_with_true_ate),
    ("p-value with known true ATE:", p_value_with_true_ate),
    (
        "Confidence Interval for true ATE:",
        (lower_bound_with_true_ate, upper_bound_with_true_ate),
    ),
    (
        "Confidence Interval for simulated ATE:",
        (lower_bound_simulated, upper_bound_simulated),
    ),
    ("Mean ATE of simulated ATE:", mean_ate_simulated),
    ("Standard Error of simulated ATE:", std_err_ate_simulated),
    # The same t-value is reported under both headings.
    ("t-value of simulated ATE:", t_value_with_true_ate),
]
for label, value in summary_rows:
    print(label, value)

In [None]:
# Left panel of a 1x2 figure: histogram of the simulated ATE estimates with
# dashed vertical markers at their mean and at the true ATE.
simulated_ates = ate_results_with_true_ate[:, 1]
marker_style = {"linestyle": "--", "linewidth": 1.5}

plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)

plt.hist(
    simulated_ates,
    bins=30,
    color="skyblue",
    edgecolor="black",
    alpha=0.7,
    label="Simulated ATE",
)
plt.axvline(
    mean_ate_with_true_ate,
    color="red",
    label="Mean Simulated ATE",
    **marker_style,
)
plt.axvline(true_ate, color="green", label="True ATE", **marker_style)

plt.title("Distribution of Simulated Average Treatment Effects (ATE)")
plt.xlabel("Average Treatment Effect (ATE)")
plt.ylabel("Frequency")
plt.grid(True)
plt.legend()

In [None]:
# Right panel: boxplot of the simulated ATE estimates next to the single
# true-ATE value.
# NOTE(review): no figure is created here, so this relies on the figure from
# the histogram cell still being current -- confirm for the backend in use.
box_series = [ate_results_with_true_ate[:, 1], [true_ate]]
box_names = ["Simulated ATE", "True ATE"]

plt.subplot(1, 2, 2)
plt.boxplot(box_series, labels=box_names)
plt.title("Boxplot of Simulated and True Average Treatment Effects (ATE)")
plt.ylabel("Average Treatment Effect (ATE)")

plt.tight_layout()
plt.show()