# Monte Carlo System for a simple 2x2 Difference-in-Differences (DiD) design

ATE

$ Y_{it} = \beta_0 + \beta_1 \text{Post}_t + \beta_2 \text{Treatment}_i + \beta_3 (\text{Post}_t \times \text{Treatment}_i) + \sum_{k=1}^{K} \beta_{k+3} \left( \mathbb{1}[\text{X}_i = k] \times \text{Treatment}_i \right) + \epsilon_{it} $

Where:
- $ Y_{it} $ represents the outcome variable (e.g., wages) for individual $ i $ at time $ t $.
- $ \text{Post}_t $ is a binary variable indicating whether the observation is from the post-treatment period.
- $ \text{Treatment}_i $ is a binary variable indicating whether individual $ i $ is in the treatment group.
- $ \text{X}_i $ is a categorical variable representing a conditioning variable (e.g., education level) for individual $ i $.
- $ K $ is the total number of levels of the conditioning variable.
- $ \beta_{k+3} $ represents the coefficient for the interaction between the conditioning variable level $ k $ and the treatment indicator.
- $ \epsilon_{it} $ is the error term.

1. **Estimated Treatment Effect ($ \beta_3 $)**:
   The treatment effect ($ \beta_3 $) is the coefficient on the interaction term between the treatment indicator and the post-treatment period indicator in the equation above. Mathematically, it is given by:

   $ \beta_3 = \text{Coefficient of } (\text{Post}_t \times \text{Treatment}_i) $

2. **Standard Error ($ SE $)**:
   The standard error ($ SE $) of the treatment effect estimates how much the estimated treatment effect varies across different samples. It can be calculated as the square root of the variance of the coefficient estimate. 

3. **t-statistic ($ t $)**:
   The t-statistic ($ t $) is a measure of the signal-to-noise ratio in the estimated treatment effect. It is calculated by dividing the estimated treatment effect by its standard error. Mathematically, it can be expressed as:

   $ t = \frac{\hat{\beta}_3}{SE(\hat{\beta}_3)} $

- Finish the data generating process with the correct distribution
- Run the Monte Carlo simulation
- Compare the simulation results with the true values

# Monte Carlo for homogeneous Treatment effects

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Monte Carlo check of OLS on a 2x2 interaction design:
#   Wage = mean_wage + beta_A * A + beta_B * B + beta_C * (A * B)
#          + Control_1 + Control_2 + error
# Each replication draws a fresh sample, fits OLS and stores the estimated
# coefficients on A, B and C so their sampling distribution can be compared
# with the true values.

# Set a seed value for reproducibility
np.random.seed(42)

# Set the coefficients (betas)
true_beta_A = 2
true_beta_B = 3
true_beta_C = 6

# Number of Monte Carlo replications and number of observations drawn in
# each replication. They are separate knobs: the first controls how well the
# sampling distribution is approximated, the second the precision of each
# individual OLS fit.
num_simulations = 1000
sample_size = 1000

# Parameters of the data generating process (constant across replications)
mean_wage = 50  # Intercept of the wage equation
mean_error = 0  # Mean of the error term
std_error = 10  # Standard deviation of the error term

# Initialize lists to store estimated coefficients
estimated_beta_A_list = []
estimated_beta_B_list = []
estimated_beta_C_list = []

regressors = ["A", "B", "C", "Control_1", "Control_2"]

# Perform the simulations
for _ in range(num_simulations):
    # Binary regressors A and B, and their interaction C
    A = np.random.randint(0, 2, size=sample_size)
    B = np.random.randint(0, 2, size=sample_size)
    C = A * B  # Interaction term of A and B

    # Standard-normal control variables (each enters wage with coefficient 1)
    control_1 = np.random.normal(0, 1, size=sample_size)
    control_2 = np.random.normal(0, 1, size=sample_size)

    error = np.random.normal(mean_error, std_error, size=sample_size)

    # Outcome variable (wage) implied by the data generating process
    wage = (
        mean_wage
        + true_beta_A * A
        + true_beta_B * B
        + true_beta_C * C
        + control_1
        + control_2
        + error
    )

    # Create a DataFrame for the variables
    data = pd.DataFrame(
        {
            "A": A,
            "B": B,
            "C": C,
            "Control_1": control_1,
            "Control_2": control_2,
            "Wage": wage,
        },
    )

    # Fit OLS of wage on a constant and all regressors
    X = sm.add_constant(data[regressors])
    y = data["Wage"]
    results = sm.OLS(y, X).fit()

    # Extract the estimated coefficients and append to the lists
    estimated_beta_A_list.append(results.params["A"])
    estimated_beta_B_list.append(results.params["B"])
    estimated_beta_C_list.append(results.params["C"])

# Calculate the average estimated coefficients
average_estimated_beta_A = np.mean(estimated_beta_A_list)
average_estimated_beta_B = np.mean(estimated_beta_B_list)
average_estimated_beta_C = np.mean(estimated_beta_C_list)

# Print the true coefficients and the average estimated coefficients
print("True Coefficients:")
print("Beta_A:", true_beta_A)
print("Beta_B:", true_beta_B)
print("Beta_C:", true_beta_C)
print("\nAverage Estimated Coefficients:")
print("Beta_A (Average Estimated):", average_estimated_beta_A)
print("Beta_B (Average Estimated):", average_estimated_beta_B)
print("Beta_C (Average Estimated):", average_estimated_beta_C)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Combine the estimated coefficients into a single DataFrame
df = pd.DataFrame(
    {
        "A": estimated_beta_A_list,
        "B": estimated_beta_B_list,
        "C": estimated_beta_C_list,
    },
)

# True value of each coefficient, keyed like the DataFrame columns
true_betas = {"A": true_beta_A, "B": true_beta_B, "C": true_beta_C}

# One colour per coefficient, used for both the density and its reference
# line so the two can be matched visually.
coefficient_colors = dict(
    zip(df.columns, sns.color_palette("Set1", n_colors=len(df.columns))),
)

# Plot kernel density estimates for each coefficient.
# Each density is drawn with its own explicit label: a bare plt.legend()
# only lists labelled artists, so a single wide-form kdeplot call would lose
# the density entries from the legend. Drawing the curves separately also
# normalises each density to integrate to one on its own.
plt.figure(figsize=(10, 6))
for name, color in coefficient_colors.items():
    sns.kdeplot(
        x=df[name],
        fill=True,
        color=color,
        label=f"Estimated Beta {name}",
    )
    plt.axvline(
        x=true_betas[name],
        color=color,
        linestyle="--",
        label=f"True Beta {name}",
    )
plt.title("Kernel Density Plot of Estimated Coefficients")
plt.xlabel("Estimated Coefficient Value")
plt.ylabel("Density")
plt.legend()
plt.show()

# Monte Carlo for heterogeneous Treatment effects

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Monte Carlo comparison of two OLS specifications when the effects of A and
# B vary with a covariate X:
#   Wage = mean_wage + beta_A * A + beta_B * B + beta_C * (A * B)
#          + beta_A_X * A * X + beta_B_X * B * X
#          + Control_1 + Control_2 + error
# The "homogeneous" model omits the A*X and B*X interactions, the
# "heterogeneous" model includes them. For each replication the estimated
# coefficient on C from both models is stored.

# Set a seed value for reproducibility
np.random.seed(42)

# Set the true coefficients
true_beta_A = 2
true_beta_B = 3
true_beta_C = 6
true_beta_A_X = -20  # Interaction effect of A and covariate X
true_beta_B_X = 40  # Interaction effect of B and covariate X

# Number of Monte Carlo replications and number of observations drawn in
# each replication (kept separate so either can be changed on its own).
num_simulations = 1000
sample_size = 1000

# Parameters of the data generating process (constant across replications)
mean_wage = 50  # Intercept of the wage equation
mean_error = 0  # Mean of the error term
std_error = 10  # Standard deviation of the error term

# Initialize lists to store estimated coefficients for both models
estimated_beta_homogeneous_list = []
estimated_beta_heterogeneous_list = []

homogeneous_regressors = ["A", "B", "C", "X", "Control_1", "Control_2"]
heterogeneous_regressors = [
    "A",
    "B",
    "C",
    "X",
    "A_X",
    "B_X",
    "Control_1",
    "Control_2",
]

# Perform the simulations
for _ in range(num_simulations):
    # Binary regressors A and B. C is their interaction, exactly as in the
    # homogeneous simulation, so that the coefficient on C is the 2x2
    # interaction effect rather than the effect of an unrelated dummy.
    A = np.random.randint(0, 2, size=sample_size)
    B = np.random.randint(0, 2, size=sample_size)
    C = A * B

    # Generate random covariate X
    X = np.random.normal(0, 5, size=sample_size)

    # Standard-normal control variables (each enters wage with coefficient 1)
    control_1 = np.random.normal(0, 1, size=sample_size)
    control_2 = np.random.normal(0, 1, size=sample_size)

    error = np.random.normal(mean_error, std_error, size=sample_size)

    # Outcome variable (wage) implied by the data generating process
    wage = (
        mean_wage
        + true_beta_A * A
        + true_beta_B * B
        + true_beta_C * C
        + true_beta_A_X * A * X
        + true_beta_B_X * B * X
        + control_1
        + control_2
        + error
    )

    # Create a DataFrame with every regressor used by either model
    data = pd.DataFrame(
        {
            "A": A,
            "B": B,
            "C": C,
            "X": X,
            "A_X": A * X,
            "B_X": B * X,
            "Control_1": control_1,
            "Control_2": control_2,
            "Wage": wage,
        },
    )
    y = data["Wage"]

    # Homogeneous treatment effect model: no covariate interactions
    X_homogeneous = sm.add_constant(data[homogeneous_regressors])
    results_homogeneous = sm.OLS(y, X_homogeneous).fit()
    estimated_beta_homogeneous_list.append(results_homogeneous.params["C"])

    # Heterogeneous treatment effect model: adds A*X and B*X
    X_heterogeneous = sm.add_constant(data[heterogeneous_regressors])
    results_heterogeneous = sm.OLS(y, X_heterogeneous).fit()
    estimated_beta_heterogeneous_list.append(results_heterogeneous.params["C"])

# Calculate the average estimated coefficients for both models
average_estimated_beta_homogeneous = np.mean(estimated_beta_homogeneous_list)
average_estimated_beta_heterogeneous = np.mean(estimated_beta_heterogeneous_list)

# Print the true coefficient and the average estimated coefficients for both models
print("True Coefficient (Heterogeneous Treatment Effect):", true_beta_C)
print(
    "Average Estimated Coefficient (Homogeneous Treatment Effect Model):",
    average_estimated_beta_homogeneous,
)
print(
    "Average Estimated Coefficient (Heterogeneous Treatment Effect Model):",
    average_estimated_beta_heterogeneous,
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Define custom colors for coefficients
homogeneous_color = "blue"
heterogeneous_color = "orange"

# Combine the estimated coefficients into DataFrames
homogeneous_df = pd.DataFrame(
    {"Coefficient": estimated_beta_homogeneous_list, "Model": "Homogeneous"},
)
heterogeneous_df = pd.DataFrame(
    {"Coefficient": estimated_beta_heterogeneous_list, "Model": "Heterogeneous"},
)

# Concatenate DataFrames; ignore_index gives the result a unique index
# instead of repeating each input's 0..n-1 labels.
df = pd.concat([homogeneous_df, heterogeneous_df], ignore_index=True)

model_colors = {
    "Homogeneous": homogeneous_color,
    "Heterogeneous": heterogeneous_color,
}

# Plot kernel density estimates of the coefficient on C for each model.
# Each density is drawn with its own explicit label: a bare plt.legend()
# only lists labelled artists, so a single hue-based kdeplot call would lose
# the model entries from the legend. Drawing the curves separately also
# normalises each density to integrate to one on its own.
plt.figure(figsize=(10, 6))
for model_name, color in model_colors.items():
    sns.kdeplot(
        x=df.loc[df["Model"] == model_name, "Coefficient"],
        fill=True,
        color=color,
        label=f"{model_name} model",
    )
plt.axvline(x=true_beta_C, color="green", linestyle="--", label="True Beta C")
plt.title("Kernel Density Plot of Estimated Coefficients")
plt.xlabel("Estimated Coefficient Value")
plt.ylabel("Density")
plt.legend()
plt.show()

# econML library

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Simulated example for causal_nets: generate data with a known baseline
# mu0(x) and a known conditional treatment effect tau(x), estimate both with
# causal_net_estimate, compare predicted and true distributions, and compute
# the average treatment effect with a 95% confidence interval.
import os

# protobuf chooses its backend from this *environment variable*; assigning a
# Python variable of the same name has no effect. It must be set before
# protobuf is imported for the first time (presumably via causal_nets'
# TensorFlow dependency - confirm), hence the placement above that import.
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

import numpy as np  # noqa: E402
from causal_nets import causal_net_estimate  # noqa: E402
from scipy.stats import norm  # noqa: E402
from sklearn.model_selection import train_test_split  # noqa: E402

# Setting the seeds
np.random.seed(3)

# Generating the fake data
N = 10000
X = np.random.uniform(low=0, high=1, size=[N, 10])
# True baseline outcome mu0(x)
mu0_real = (
    1.5
    + 0.012 * X[:, 3]
    - 0.75 * X[:, 5] * X[:, 7]
    - 0.9 * X[:, 4]
    - np.mean(X, axis=1)
)
# True conditional treatment effect tau(x)
tau_real = X[:, 2] + 0.04 * X[:, 9] - 0.35 * np.log(X[:, 3])
prob_of_T = 0.5
T = np.random.binomial(size=N, n=1, p=prob_of_T)
normal_errors = np.random.normal(size=N, loc=0.0, scale=1.0)
Y = mu0_real + tau_real * T + normal_errors

# Creating training and validation dataset
X_train, X_valid, T_train, T_valid, Y_train, Y_valid = train_test_split(
    X,
    T,
    Y,
    test_size=0.2,
    random_state=42,
)


# Getting causal estimates.
# NOTE(review): seed=None is passed here, so the network training itself may
# not be reproducible even though the NumPy seed is fixed - confirm against
# the causal_nets documentation.
(
    tau_pred,
    mu0_pred,
    prob_t_pred,
    psi_0,
    psi_1,
    history,
    history_ps,
) = causal_net_estimate(
    [X_train, T_train, Y_train],
    [X_valid, T_valid, Y_valid],
    [X, T, Y],
    [60, 30],
    dropout_rates=None,
    batch_size=None,
    alpha=0.0,
    r_par=0.2,
    optimizer="Adam",
    learning_rate=0.0009,
    max_epochs_without_change=30,
    max_nepochs=5000,
    seed=None,
    estimate_ps=False,
    verbose=True,
)

# Plotting estimated coefficient vs true coefficients
plt.figure(figsize=(10, 5))

# Left panel: predicted vs true tau, on shared bins spanning both ranges
plt.subplot(1, 2, 1)
bins = np.linspace(
    min(float(np.min(tau_pred)), float(np.min(tau_real))),
    max(float(np.max(tau_pred)), float(np.max(tau_real))),
    15,
)
plt.hist(tau_pred, alpha=0.6, label=r"$\tau~_{pred}$", density=True, bins=bins)
plt.hist(
    tau_real,
    label=r"$\tau~_{ real}$",
    histtype="step",
    density=True,
    linewidth=2.5,
    bins=bins,
)
plt.legend(loc="upper right")
plt.title("CATE(Conditional average treatment effect)")
plt.xlabel(r"$\tau$", fontsize=14)
plt.ylabel("Density")

# Right panel: predicted vs true mu0, on shared bins spanning both ranges
plt.subplot(1, 2, 2)
bins = np.linspace(
    min(float(np.min(mu0_pred)), float(np.min(mu0_real))),
    max(float(np.max(mu0_pred)), float(np.max(mu0_real))),
    15,
)
plt.hist(mu0_pred, alpha=0.7, label=r"$\mu_{0~pred}$", density=True, bins=bins)
plt.hist(
    mu0_real,
    label=r"$\mu_{0~real}$",
    histtype="step",
    density=True,
    linewidth=2.5,
    bins=bins,
)
plt.legend(loc="upper right")
plt.title(r"$\mu_0(x)$")
plt.xlabel(r"$\mu_0$", fontsize=14)
plt.ylabel("Density")

plt.tight_layout()
plt.show()

# Calculate the average treatment effect
psi_diff = psi_1 - psi_0
ate = np.mean(psi_diff)

# Calculate the 95% confidence interval for average treatment effect
# (normal approximation: ate +/- z_0.975 * std / sqrt(n))
ate_margin = norm.ppf(0.975) * np.std(psi_diff) / np.sqrt(len(psi_0))
CI_lowerbound = ate - ate_margin
CI_upperbound = ate + ate_margin

In [None]:
# Recompute the 95% confidence interval for the average treatment effect
# from the already-available `ate`, `psi_0` and `psi_1`
# (normal approximation: ate -/+ z_0.975 * std / sqrt(n)).
_ci_margin = norm.ppf(0.975) * np.std(psi_1 - psi_0) / np.sqrt(len(psi_0))
CI_lowerbound, CI_upperbound = ate - _ci_margin, ate + _ci_margin

- pip install tensorflow==2.10.0
- pip install protobuf==3.11.3
- pip uninstall protobuf
- conda install protobuf
- pip install 


In [None]:
# Simulated example for causal_nets: generate data with a known baseline
# mu0(x) and a known conditional treatment effect tau(x), estimate both with
# causal_net_estimate, compare predicted and true distributions, and compute
# the average treatment effect with a 95% confidence interval.
import os

# protobuf chooses its backend from this *environment variable*; assigning a
# Python variable of the same name has no effect. It must be set before
# protobuf is imported for the first time in the process (presumably via
# causal_nets' TensorFlow dependency - confirm), hence the placement above
# that import.
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

import numpy as np  # noqa: E402
from causal_nets import causal_net_estimate  # noqa: E402
from scipy.stats import norm  # noqa: E402
from sklearn.model_selection import train_test_split  # noqa: E402

# Setting the seeds
np.random.seed(3)

# Generating the fake data
N = 10000
X = np.random.uniform(low=0, high=1, size=[N, 10])
# True baseline outcome mu0(x)
mu0_real = (
    1.5
    + 0.012 * X[:, 3]
    - 0.75 * X[:, 5] * X[:, 7]
    - 0.9 * X[:, 4]
    - np.mean(X, axis=1)
)
# True conditional treatment effect tau(x)
tau_real = X[:, 2] + 0.04 * X[:, 9] - 0.35 * np.log(X[:, 3])
prob_of_T = 0.5
T = np.random.binomial(size=N, n=1, p=prob_of_T)
normal_errors = np.random.normal(size=N, loc=0.0, scale=1.0)
Y = mu0_real + tau_real * T + normal_errors

# Creating training and validation dataset
X_train, X_valid, T_train, T_valid, Y_train, Y_valid = train_test_split(
    X,
    T,
    Y,
    test_size=0.2,
    random_state=42,
)


# Getting causal estimates.
# NOTE(review): seed=None is passed here, so the network training itself may
# not be reproducible even though the NumPy seed is fixed - confirm against
# the causal_nets documentation.
(
    tau_pred,
    mu0_pred,
    prob_t_pred,
    psi_0,
    psi_1,
    history,
    history_ps,
) = causal_net_estimate(
    [X_train, T_train, Y_train],
    [X_valid, T_valid, Y_valid],
    [X, T, Y],
    [60, 30],
    dropout_rates=None,
    batch_size=None,
    alpha=0.0,
    r_par=0.2,
    optimizer="Adam",
    learning_rate=0.0009,
    max_epochs_without_change=30,
    max_nepochs=5000,
    seed=None,
    estimate_ps=False,
    verbose=True,
)

# Plotting estimated coefficient vs true coefficients
plt.figure(figsize=(10, 5))

# Left panel: predicted vs true tau, on shared bins spanning both ranges
plt.subplot(1, 2, 1)
bins = np.linspace(
    min(float(np.min(tau_pred)), float(np.min(tau_real))),
    max(float(np.max(tau_pred)), float(np.max(tau_real))),
    15,
)
plt.hist(tau_pred, alpha=0.6, label=r"$\tau~_{pred}$", density=True, bins=bins)
plt.hist(
    tau_real,
    label=r"$\tau~_{ real}$",
    histtype="step",
    density=True,
    linewidth=2.5,
    bins=bins,
)
plt.legend(loc="upper right")
plt.title("CATE(Conditional average treatment effect)")
plt.xlabel(r"$\tau$", fontsize=14)
plt.ylabel("Density")

# Right panel: predicted vs true mu0, on shared bins spanning both ranges
plt.subplot(1, 2, 2)
bins = np.linspace(
    min(float(np.min(mu0_pred)), float(np.min(mu0_real))),
    max(float(np.max(mu0_pred)), float(np.max(mu0_real))),
    15,
)
plt.hist(mu0_pred, alpha=0.7, label=r"$\mu_{0~pred}$", density=True, bins=bins)
plt.hist(
    mu0_real,
    label=r"$\mu_{0~real}$",
    histtype="step",
    density=True,
    linewidth=2.5,
    bins=bins,
)
plt.legend(loc="upper right")
plt.title(r"$\mu_0(x)$")
plt.xlabel(r"$\mu_0$", fontsize=14)
plt.ylabel("Density")

plt.tight_layout()
plt.show()

# Calculate the average treatment effect
psi_diff = psi_1 - psi_0
ate = np.mean(psi_diff)

# Calculate the 95% confidence interval for average treatment effect
# (normal approximation: ate +/- z_0.975 * std / sqrt(n))
ate_margin = norm.ppf(0.975) * np.std(psi_diff) / np.sqrt(len(psi_0))
CI_lowerbound = ate - ate_margin
CI_upperbound = ate + ate_margin