In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the marketing panel and cast the `date` column to datetime.
# NOTE(review): the path below is absolute and machine-specific; a fresh
# checkout on another machine will need it pointed at its own data directory.
mkt_data = pd.read_csv(
    r"C:\Users\matte\OneDrive\Desktop\GitHub\data\causal\short_offline_mkt_south.csv"
).astype({"date": "datetime64[ns]"})

mkt_data.head()

Unnamed: 0,date,city,region,treated,tau,downloads,post
0,2021-05-01,5,S,0,0.0,51.0,0
1,2021-05-02,5,S,0,0.0,51.0,0
2,2021-05-03,5,S,0,0.0,51.0,0
3,2021-05-04,5,S,0,0.0,50.0,0
4,2021-05-05,5,S,0,0.0,49.0,0


In [3]:
# First and last date observed for w = treated * post
# (w == 1 only for treated rows in the post period).
# The aggregations are given by name ("min"/"max") rather than as the Python
# builtins: passing builtins to groupby.agg is deprecated in pandas >= 2.1
# and raises a FutureWarning, while the string form yields the same columns.
(mkt_data
 .assign(w=lambda d: d["treated"] * d["post"])
 .groupby(["w"])
 .agg({"date": ["min", "max"]}))

  .agg({"date":[min, max]}))
  .agg({"date":[min, max]}))


Unnamed: 0_level_0,date,date
Unnamed: 0_level_1,min,max
w,Unnamed: 1_level_2,Unnamed: 2_level_2
0,2021-05-01,2021-06-01
1,2021-05-15,2021-06-01


In [4]:
# 2x2 summary table: mean downloads and earliest date for every
# (treated, post) combination. The result is indexed by (treated, post).
did_data = (
    mkt_data
    .groupby(["treated", "post"])
    .agg(downloads=("downloads", "mean"),
         date=("date", "min"))
)

did_data

Unnamed: 0_level_0,Unnamed: 1_level_0,downloads,date
treated,post,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,50.335034,2021-05-01
0,1,50.556878,2021-05-15
1,0,50.944444,2021-05-01
1,1,51.858025,2021-05-15


In [5]:
# Estimated untreated outcome for the treated group in the post period:
# the treated group's pre-period mean plus the pre -> post change observed
# in the control group.
y0_est = (
    did_data.loc[(1, 0), "downloads"]                 # treated baseline
    + (did_data.loc[(0, 1), "downloads"]              # control evolution
       - did_data.loc[(0, 0), "downloads"])
)

# Difference between the observed treated post-period mean and that estimate.
att = did_data.loc[(1, 1), "downloads"] - y0_est
att

0.6917359536407233

In [8]:
# Average of the `tau` column over treated rows in the post period.
# NOTE(review): presumably `tau` is the known per-row effect used as the
# benchmark for the estimate above — confirm against the data source.
(mkt_data
 .query("post==1")
 .query("treated==1")
 .loc[:, "tau"]
 .mean())

0.7660316402518457

In [9]:
# Per-city mean downloads before and after the post flag switches on.
pre = (mkt_data
       .query("post==0")
       .groupby("city")["downloads"]
       .mean())
post = (mkt_data
        .query("post==1")
        .groupby("city")["downloads"]
        .mean())

# One row per city: change in mean downloads (post minus pre), joined with
# the city's treatment dummy (max of `treated` within the city).
delta_y = (
    post.sub(pre)
    .to_frame("delta_y")
    .join(mkt_data.groupby("city")["treated"].max())
)

delta_y.tail()

Unnamed: 0_level_0,delta_y,treated
city,Unnamed: 1_level_1,Unnamed: 2_level_1
192,0.555556,0
193,0.166667,0
195,0.420635,0
196,0.119048,0
197,1.595238,1


In [10]:
# Collapse the daily panel to one row per (city, post): mean downloads,
# first date of the period, and the city's treatment flag.
# Note: this rebinds `did_data`, which earlier held the (treated, post) table.
did_data = (
    mkt_data
    .groupby(["city", "post"])
    .agg(downloads=("downloads", "mean"),
         date=("date", "min"),
         treated=("treated", "max"))
    .reset_index()
)

did_data.head()

Unnamed: 0,city,post,downloads,date,treated
0,5,0,50.642857,2021-05-01,0
1,5,1,50.166667,2021-05-15,0
2,15,0,49.142857,2021-05-01,0
3,15,1,49.166667,2021-05-15,0
4,20,0,48.785714,2021-05-01,0


In [11]:
import statsmodels.formula.api as smf

# OLS of downloads on treated, post and their interaction, fitted on the
# city-by-period table; the interaction coefficient is displayed.
(smf
 .ols(formula='downloads ~ treated*post', data=did_data)
 .fit()
 .params["treated:post"])

0.6917359536407144

In [12]:
# Same interaction term, but with one dummy per city and per period
# (C(city), C(post)) in place of the treated/post main effects.
m = smf.ols(
    formula='downloads ~ treated:post + C(city) + C(post)',
    data=did_data,
).fit()

m.params["treated:post"]

0.6917359536407073

In [13]:
# Interaction regression fitted directly on the daily rows of `mkt_data`
# rather than on the city-by-period aggregates.
m = (smf
     .ols('downloads ~ treated*post', data=mkt_data)
     .fit())

m.params["treated:post"]

0.6917359536407045

In [14]:
def block_sample(df, unit_col):
    """Resample whole units (blocks of rows) with replacement.

    Draws as many unit labels as there are distinct values in
    ``df[unit_col]``, with replacement, using NumPy's global random state,
    and returns every row belonging to each drawn label. A label drawn k
    times contributes its rows k times.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column ``unit_col``.
    unit_col : str
        Name of the column identifying the units to resample.

    Returns
    -------
    pandas.DataFrame
        The resampled rows, with ``unit_col`` restored as a regular column.
    """
    unique_units = df[unit_col].unique()
    n_units = len(unique_units)
    drawn = np.random.choice(unique_units, size=n_units, replace=True)

    # Label-based lookup on the unit index pulls all rows of each drawn unit,
    # in draw order.
    by_unit = df.set_index(unit_col)
    resampled = by_unit.loc[drawn]
    return resampled.reset_index(level=[unit_col])

In [15]:
from joblib import Parallel, delayed

def block_bootstrap(data, est_fn, unit_col,
                    rounds=200, seed=123, pcts=(2.5, 97.5), n_jobs=4):
    """Percentile interval for an estimator via block bootstrap.

    Each round resamples whole units with ``block_sample`` and applies
    ``est_fn`` to the resampled frame; the requested percentiles of the
    resulting statistics are returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame passed to ``block_sample``.
    est_fn : callable
        Called with one resampled frame per round; its return values are
        collected and passed to ``np.percentile``.
    unit_col : str
        Column identifying the units that are resampled as blocks.
    rounds : int, default 200
        Number of bootstrap replications.
    seed : int, default 123
        Seed for NumPy's global random state, set at the start of the call.
    pcts : sequence of float, default (2.5, 97.5)
        Percentiles to compute. The default is a tuple so that no mutable
        object is shared between calls.
    n_jobs : int, default 4
        Number of joblib workers used to evaluate ``est_fn``.

    Returns
    -------
    numpy.ndarray
        The percentiles in ``pcts`` of the bootstrap statistics.

    Notes
    -----
    This reseeds NumPy's global random state as a side effect.
    """
    np.random.seed(seed)

    # block_sample(...) is evaluated as an argument while the generator is
    # consumed, before est_fn is dispatched, so the seed set above governs
    # the resampling; only est_fn itself is handed to the workers.
    stats = Parallel(n_jobs=n_jobs)(
        delayed(est_fn)(block_sample(data, unit_col=unit_col))
        for _ in range(rounds))

    return np.percentile(stats, pcts)

In [16]:
def est_fn(df):
    """Return the ``treated:post`` coefficient from an OLS fit on ``df``.

    The regression includes one dummy per city and per date, so ``df`` must
    provide the columns ``downloads``, ``treated``, ``post``, ``city`` and
    ``date``.
    """
    twfe_formula = 'downloads ~ treated:post + C(city) + C(date)'
    fitted = smf.ols(twfe_formula, data=df).fit()
    return fitted.params["treated:post"]

# Bootstrap percentile interval for the coefficient, resampling by city.
block_bootstrap(mkt_data, est_fn, "city")

array([0.23162214, 1.14002646])

In [17]:
# City and date dummies plus the treated:post term, with the covariance
# matrix clustered on city.
m = (smf
     .ols('downloads ~ treated:post + C(city) + C(date)', data=mkt_data)
     .fit(cov_type='cluster', cov_kwds={'groups': mkt_data['city']}))

print("ATT:", m.params["treated:post"])
# Confidence interval row for the interaction coefficient.
m.conf_int().loc["treated:post"]

ATT: 0.6917359536406645


0    0.296101
1    1.087370
Name: treated:post, dtype: float64