## Grupo 4

3) Potencial Outcomes and RCTs


In [1]:
# Librerias
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind
import statsmodels.api as sm
import statsmodels.formula.api as smf

# 3.1 Simulacion Datos

In [22]:
np.random.seed(1234)
n = 1000

x1 = np.random.normal(0, 1, n)
x2 = np.random.binomial(1, 0.5, n)
x3 = np.random.normal(2, 2, n)
x4 = np.random.uniform(0, 1, n)
    
D = np.random.binomial(1, 0.5, n)

epsilon = np.random.normal(0, 1, n)

Y = 2*D + 0.5*x1 - 0.3*x2 + 0.2*x3 + epsilon

df = pd.DataFrame({
    "Y": Y, "D": D, "x1": x1, "x2": x2, "x3": x3, "x4": x4
})


print(df.head(5))

          Y  D        x1  x2        x3        x4
0  1.117718  0  0.471435   0  2.168089  0.499604
1 -1.387826  0 -1.190976   0  4.656033  0.981560
2  2.172508  1  1.432707   0  1.049496  0.137759
3  1.761708  1 -0.312652   1  1.779408  0.628980
4 -0.215448  1 -0.720589   0  2.731973  0.851738


In [7]:

# Separar tratamiento y control
treated = df[df["D"] == 1]
control = df[df["D"] == 0]
rows = []
for var in ["x1", "x2", "x3", "x4"]:
    t_stat, p_val = ttest_ind(treated[var], control[var])
    rows.append({
        "Variable": var,
        "Treated_Mean": treated[var].mean(),
        "Control_Mean": control[var].mean(),
        "Difference": treated[var].mean() - control[var].mean(),
        "t_stat": t_stat,
        "p_value": p_val
    })

balance_table = pd.DataFrame(rows)
print(balance_table)

  Variable  Treated_Mean  Control_Mean  Difference    t_stat   p_value
0       x1      0.026877      0.005665    0.021213  0.343932  0.730970
1       x2      0.490526      0.516190   -0.025664 -0.810038  0.418111
2       x3      2.056260      2.152219   -0.095959 -0.773543  0.439384
3       x4      0.507233      0.484243    0.022990  1.268364  0.204964


## 3.2 Estimating the Average Treatment Effect

In [None]:
# Estimate the treatment effect (ATE) using a simple regression: (1 point)
model1 = smf.ols("Y ~ D", data=df).fit()
print(model1.summary().tables[1])

                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.3194      0.053      6.075      0.000       0.216       0.423
D              1.9418      0.076     25.455      0.000       1.792       2.091


In [9]:
# Estimate the ATE controlling for all covariates: (1 point)
model2 = smf.ols("Y ~ D + x1 + x2 + x3 + x4", data=df).fit()
print(model2.summary().tables[1])


                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.0241      0.086     -0.279      0.780      -0.194       0.145
D              1.9441      0.065     29.962      0.000       1.817       2.071
x1             0.4927      0.033     14.807      0.000       0.427       0.558
x2            -0.2181      0.065     -3.368      0.001      -0.345      -0.091
x3             0.2011      0.017     12.159      0.000       0.169       0.234
x4             0.0422      0.113      0.373      0.710      -0.180       0.264


In [14]:
# Compare the two estimates. Answer the following questions: (1 point)
coef_treated = model1.params["D"]
coef_control = model2.params["D"]
se_treated = model1.bse["D"]
se_control = model2.bse["D"]

print(f"ATE (sin controles): {coef_treated:.3f} (SE={se_treated:.3f})")
print(f"ATE (con controles): {coef_control:.3f} (SE={se_control:.3f})")
print(f"Difference in ATE (con - sin): {coef_treated - coef_control:.3f} (SE={se_control - se_treated:.3f})")


ATE (sin controles): 1.942 (SE=0.076)
ATE (con controles): 1.944 (SE=0.065)
Difference in ATE (con - sin): -0.002 (SE=-0.011)


# 3.3) LASSO and Variable Selection


In [24]:
from sklearn.linear_model import LassoCV
import numpy as np

In [33]:
# Prepare data for LASSO
X = df[['x1','x2','x3','x4']].to_numpy()
Y = df['Y'].to_numpy()

# Cross-validation (10 folds by default)
lasso = LassoCV(cv=10, random_state=1234).fit(X, Y)

coef = pd.Series(lasso.coef_, index=['x1','x2','x3','x4'])
selected = coef[coef != 0].index.tolist()


print("Optimal alpha (λ):", lasso.alpha_)

Optimal alpha (λ): 0.0007465970591700263


In [38]:
# Re-estimate the ATE with only the covariates selected by LASSO: (1 point)

X_selected = df[['D'] + selected]
X_selected = sm.add_constant(X_selected, has_constant='add')
X_lasso = sm.OLS(df['Y'], X_selected).fit()
ate_lasso = X_lasso.params['D']
se_lasso  = X_lasso.bse['D']
ci_lasso  = X_lasso.conf_int().loc['D'].tolist()


print(f"ATE (coef D): {ate_lasso:.4f}\nSE: {se_lasso:.4f}\n95% CI: [{ci_lasso[0]:.4f}, {ci_lasso[1]:.4f}]")


ATE (coef D): 1.9441
SE: 0.0649
95% CI: [1.8167, 2.0714]


In [40]:
print(f"Simple ATE (3.2.1): {coef_treated:.4f}")
print(f"ATE with all controls (3.2.2): {coef_control:.4f}")
print(f"ATE with controls selected by LASSO (3.3.2): {ate_lasso:.4f}")
print("Comment: The ATE should be stable because the treatment was randomly assigned.")
print("Using LASSO can reduce noise, improve precision, and select only the most relevant covariates.")

Simple ATE (3.2.1): 1.9418
ATE with all controls (3.2.2): 1.9441
ATE with controls selected by LASSO (3.3.2): 1.9441
Comment: The ATE should be stable because the treatment was randomly assigned.
Using LASSO can reduce noise, improve precision, and select only the most relevant covariates.
