# Librerias

In [9]:
import pandas as pd

import statsmodels.formula.api as smf
import matplotlib.pyplot as plt



Matplotlib is building the font cache; this may take a moment.


# Question 1 — TWFE & Event-Study

## a) TWFE

Note that asmrs is the outcome variable, pcinc, asmrh and cases are controls. The dataset already includes treatment and post-treatment variables.

Estimate a Two-Way Fixed Effects (TWFE) regression with unit and time fixed effects.

# Two-Way Fixed Effects DID:
outcome: asmrs

controls: pcinc, asmrh, cases

treatment: post

unit FE: stfips

time FE: year

In [3]:


url_git = "https://raw.githubusercontent.com/LOST-STATS/LOST-STATS.github.io/master/Model_Estimation/Data/Event_Study_DiD/bacon_example.csv"

df = pd.read_csv(url_git)

df.head()

Unnamed: 0,stfips,year,_nfd,post,asmrs,pcinc,asmrh,cases,weight,copop
0,1,1964,1971.0,0,35.639885,12406.178537,5.007341,0.012312,1715156.0,1715156.0
1,1,1965,1971.0,0,41.543755,13070.206738,4.425367,0.010419,1715156.0,1725186.0
2,1,1966,1971.0,0,34.252335,13526.663217,4.874819,0.0099,1715156.0,1735219.0
3,1,1967,1971.0,0,34.465023,13918.189823,5.362014,0.009975,1715156.0,1745250.0
4,1,1968,1971.0,0,40.440105,14684.808682,4.643759,0.012401,1715156.0,1755283.0


In [5]:
df.size

16170

In [6]:
df.describe()

Unnamed: 0,stfips,year,_nfd,post,asmrs,pcinc,asmrh,cases,weight,copop
count,1617.0,1617.0,1188.0,1617.0,1617.0,1617.0,1617.0,1617.0,1617.0,1617.0
mean,29.795918,1980.0,1973.583333,0.684601,52.166413,26080.967473,3.439058,0.024491,1959531.0,2365343.0
std,15.376316,9.52485,3.539969,0.464818,19.621876,6472.084916,1.901716,0.012091,2032263.0,2494629.0
min,1.0,1964.0,1969.0,0.0,7.477235,10274.976602,0.0,0.003131,162168.8,162168.8
25%,18.0,1972.0,1971.0,0.0,40.237625,21462.046505,2.166024,0.01599,486287.6,608496.0
50%,30.0,1980.0,1973.0,1.0,48.842182,25783.268441,3.207187,0.02315,1407318.0,1642584.0
75%,42.0,1988.0,1974.25,1.0,60.035744,30008.4207,4.356279,0.031429,2298065.0,2798542.0
max,56.0,1996.0,1985.0,1.0,185.970886,48822.390731,19.134295,0.088966,9028069.0,16038240.0


In [10]:
twfe_model = smf.ols(
    "asmrs ~ cases +post + asmrh  + pcinc + C(stfips) + C(year)",
    data=df
).fit(
    cov_type="cluster", 
    cov_kwds={"groups": df["stfips"]} 
)

print(twfe_model.summary())
print("\nCoeficiente del tratamiento (post):", twfe_model.params["post"])
print("Error estándar del tratamiento (post):", twfe_model.bse["post"])

                            OLS Regression Results                            
Dep. Variable:                  asmrs   R-squared:                       0.710
Model:                            OLS   Adj. R-squared:                  0.694
Method:                 Least Squares   F-statistic:                     32.13
Date:                Fri, 05 Dec 2025   Prob (F-statistic):           2.00e-23
Time:                        02:08:15   Log-Likelihood:                -6107.4
No. Observations:                1617   AIC:                         1.238e+04
Df Residuals:                    1532   BIC:                         1.284e+04
Df Model:                          84                                         
Covariance Type:              cluster                                         
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          51.8784      3.392     



# b) Cleaning for Event-Study


In [11]:
# 1. Get treatment time by unit
treat_time_by_state = (
    df[df["post"] == 1]
    .groupby("stfips")["year"]
    .min()
)

df["treat_time"] = df["stfips"].map(treat_time_by_state)

# 2. Create event time
df["event_time"] = df["year"] - df["treat_time"]

# 3. Frequency table and descriptives
freq = df["event_time"].value_counts(dropna=False).sort_index()
print(freq)

print(df["event_time"].dropna().describe())

# 4. Choose bounds (example)
# Definir límites para agrupar extremos
lb, ub = -10, 20

# Recodificar los tiempos extremos en un solo paso
df["event_time_group"] = (
    df["event_time"]
    .clip(lower=lb, upper=ub)   # agrupa ≤ lb y ≥ ub automáticamente
)

# Convertir a entero (manteniendo NA si existen)
df["event_time_group_int"] = df["event_time_group"].astype("Int64")

# Crear dummies de tiempo relativo
event_dummies = pd.get_dummies(
    df["event_time_group_int"],
    prefix="ev",
    dummy_na=False
)

# Elegir el periodo base y eliminarlo para no caer en trampa de dummy's
ref = -1
event_dummies = event_dummies.drop(columns=[f"ev_{ref}"])

# Merge dummies al DataFrame original
df = pd.concat([df, event_dummies], axis=1)

event_time
-21.0      1
-20.0      2
-19.0      2
-18.0      2
-17.0      2
-16.0      3
-15.0      3
-14.0      3
-13.0      6
-12.0      7
-11.0      9
-10.0     12
-9.0      22
-8.0      25
-7.0      32
-6.0      34
-5.0      36
-4.0      36
-3.0      36
-2.0      36
-1.0      36
 0.0      44
 1.0      44
 2.0      44
 3.0      44
 4.0      44
 5.0      44
 6.0      44
 7.0      44
 8.0      44
 9.0      44
 10.0     44
 11.0     44
 12.0     43
 13.0     42
 14.0     42
 15.0     42
 16.0     42
 17.0     41
 18.0     41
 19.0     41
 20.0     38
 21.0     37
 22.0     35
 23.0     32
 24.0     22
 25.0     19
 26.0     12
 27.0     10
 28.0      8
 29.0      8
 30.0      8
 31.0      8
 32.0      8
 NaN     165
Name: count, dtype: int64
count    1452.000000
mean        8.159091
std        10.707574
min       -21.000000
25%         0.000000
50%         8.000000
75%        17.000000
max        32.000000
Name: event_time, dtype: float64


In [19]:
df.columns

Index(['stfips', 'year', '_nfd', 'post', 'asmrs', 'pcinc', 'asmrh', 'cases',
       'weight', 'copop', 'treat_time', 'event_time', 'event_time_group',
       'event_time_group_int', 'ev_-10', 'ev_-9', 'ev_-8', 'ev_-7', 'ev_-6',
       'ev_-5', 'ev_-4', 'ev_-3', 'ev_-2', 'ev_0', 'ev_1', 'ev_2', 'ev_3',
       'ev_4', 'ev_5', 'ev_6', 'ev_7', 'ev_8', 'ev_9', 'ev_10', 'ev_11',
       'ev_12', 'ev_13', 'ev_14', 'ev_15', 'ev_16', 'ev_17', 'ev_18', 'ev_19',
       'ev_20'],
      dtype='object')

**Why do we usually group very distant event times together?**

We group very distant event times because extreme leads or lags typically contain very few observations. This produces highly noisy and unstable coefficient estimates, inflates standard errors, and can distort the event-study plot. Grouping the tails (binning) improves statistical precision, avoids overfitting, and focuses the analysis on periods where there is meaningful support in the data.

# c) Event-Study Estimation


1 Even study

In [None]:
# Nos quedamos solo con unidades con event_time_group definido
df_es = df.dropna(subset=["event_time_group"]).copy()

# Seleccionar las dummies creadas para el event study
event_cols = [
    col for col in df_es.columns
    if col.startswith("ev_") and "group" not in col
]

print("Dummies incluidas en el modelo:", event_cols)

# Construir el lado derecho de las dummies usando Q("")
rhs_events = " + ".join([f'Q("{col}")' for col in event_cols])

# Fórmula del modelo event-study con FE y controles
formula_es = (
    "asmrs ~ " + rhs_events +
    " + pcinc + asmrh + cases"
    " + C(stfips) + C(year)"
)

print("Fórmula usada:\n", formula_es)

# Estimar el TWFE Event Study
es_model = smf.ols(
    formula_es,
    data=df_es
).fit(
    cov_type="cluster",
    cov_kwds={"groups": df_es["stfips"]}
)

print(es_model.summary())


Dummies incluidas en el modelo: ['event_time']
Fórmula usada:
 asmrs ~ Q("event_time") + pcinc + asmrh + cases + C(stfips) + C(year)
                            OLS Regression Results                            
Dep. Variable:                  asmrs   R-squared:                       0.719
Model:                            OLS   Adj. R-squared:                  0.703
Method:                 Least Squares   F-statistic:                     49.18
Date:                Fri, 05 Dec 2025   Prob (F-statistic):           3.62e-25
Time:                        02:17:34   Log-Likelihood:                -5496.0
No. Observations:                1452   AIC:                         1.115e+04
Df Residuals:                    1373   BIC:                         1.157e+04
Df Model:                          78                                         
Covariance Type:              cluster                                         
                      coef    std err          z      P>|z|      [0.025      



# 2) Storing de coeficientes + SE en un dataframe

In [17]:
es_model.params.index

Index(['Intercept', 'C(stfips)[T.4]', 'C(stfips)[T.6]', 'C(stfips)[T.8]',
       'C(stfips)[T.9]', 'C(stfips)[T.11]', 'C(stfips)[T.12]',
       'C(stfips)[T.13]', 'C(stfips)[T.16]', 'C(stfips)[T.17]',
       'C(stfips)[T.18]', 'C(stfips)[T.19]', 'C(stfips)[T.20]',
       'C(stfips)[T.21]', 'C(stfips)[T.22]', 'C(stfips)[T.23]',
       'C(stfips)[T.24]', 'C(stfips)[T.25]', 'C(stfips)[T.26]',
       'C(stfips)[T.27]', 'C(stfips)[T.29]', 'C(stfips)[T.30]',
       'C(stfips)[T.31]', 'C(stfips)[T.32]', 'C(stfips)[T.33]',
       'C(stfips)[T.34]', 'C(stfips)[T.35]', 'C(stfips)[T.37]',
       'C(stfips)[T.38]', 'C(stfips)[T.39]', 'C(stfips)[T.40]',
       'C(stfips)[T.41]', 'C(stfips)[T.42]', 'C(stfips)[T.44]',
       'C(stfips)[T.45]', 'C(stfips)[T.46]', 'C(stfips)[T.48]',
       'C(stfips)[T.49]', 'C(stfips)[T.50]', 'C(stfips)[T.51]',
       'C(stfips)[T.53]', 'C(stfips)[T.54]', 'C(stfips)[T.55]',
       'C(stfips)[T.56]', 'C(year)[T.1965]', 'C(year)[T.1966]',
       'C(year)[T.1967]', 'C(ye