In [1]:
import os

import pandas as pd
import numpy as np

# Location of the marketing panel CSV. The path used to be hard-coded to one
# user's home directory, which breaks the notebook on any other machine.
# Set the MKT_DATA_PATH environment variable to point at the file; when it is
# unset the original location is used, so existing runs behave as before.
DEFAULT_MKT_DATA_PATH = r'C:\Users\matte\OneDrive\Desktop\GitHub\data\causal\short_offline_mkt_south.csv'
path1 = os.environ.get("MKT_DATA_PATH", DEFAULT_MKT_DATA_PATH)

# Read the raw file and cast `date` to a proper datetime column up front.
mkt_data = pd.read_csv(path1).astype({"date": "datetime64[ns]"})

mkt_data.head()

Unnamed: 0,date,city,region,treated,tau,downloads,post
0,2021-05-01,5,S,0,0.0,51.0,0
1,2021-05-02,5,S,0,0.0,51.0,0
2,2021-05-03,5,S,0,0.0,51.0,0
3,2021-05-04,5,S,0,0.0,50.0,0
4,2021-05-05,5,S,0,0.0,49.0,0


In [2]:
# One row per (treated, post) combination: average downloads in that cell and
# the earliest date observed in it.
did_data = mkt_data.groupby(["treated", "post"]).agg(
    downloads=("downloads", "mean"),
    date=("date", "min"),
)

did_data

Unnamed: 0_level_0,Unnamed: 1_level_0,downloads,date
treated,post,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,50.335034,2021-05-01
0,1,50.556878,2021-05-15
1,0,50.944444,2021-05-01
1,1,51.858025,2021-05-15


In [11]:
# Rows of did_data are keyed by (treated, post).
treated_pre = did_data.loc[(1, 0), "downloads"]
treated_post = did_data.loc[(1, 1), "downloads"]

# Change in the control group's mean downloads from post == 0 to post == 1.
control_change = (did_data.loc[(0, 1), "downloads"]
                  - did_data.loc[(0, 0), "downloads"])

# Estimated untreated outcome for the treated group in the post period:
# its own baseline shifted by the control group's change.
y0_est = treated_pre + control_change

att = treated_post - y0_est
att

0.6917359536407233

In [16]:
# Per-city average downloads in each period.
pre = mkt_data.loc[mkt_data["post"] == 0].groupby("city")["downloads"].mean()
post = mkt_data.loc[mkt_data["post"] == 1].groupby("city")["downloads"].mean()

# Treatment dummy per city (max of `treated` over the city's rows).
treated_by_city = mkt_data.groupby("city")["treated"].max()

# Per-city change in average downloads, with the treatment dummy alongside.
delta_y = post.sub(pre).to_frame("delta_y").join(treated_by_city)

delta_y.tail()

Unnamed: 0_level_0,delta_y,treated
city,Unnamed: 1_level_1,Unnamed: 2_level_1
192,0.555556,0
193,0.166667,0
195,0.420635,0
196,0.119048,0
197,1.595238,1


In [18]:
# Difference between the mean per-city change of treated cities and that of
# untreated cities.
is_treated = delta_y["treated"] == 1
is_control = delta_y["treated"] == 0

(delta_y.loc[is_treated, "delta_y"].mean()
    - delta_y.loc[is_control, "delta_y"].mean())

0.6917359536407155

In [19]:
# Collapse the panel to one row per (city, post): mean downloads, earliest
# date, and the city's treatment flag, with the group keys back as columns.
did_data = (
    mkt_data
    .groupby(["city", "post"])
    .agg(
        downloads=("downloads", "mean"),
        date=("date", "min"),
        treated=("treated", "max"),
    )
    .reset_index()
)

did_data.head()

Unnamed: 0,city,post,downloads,date,treated
0,5,0,50.642857,2021-05-01,0
1,5,1,50.166667,2021-05-15,0
2,15,0,49.142857,2021-05-01,0
3,15,1,49.166667,2021-05-15,0
4,20,0,48.785714,2021-05-01,0


In [21]:
import statsmodels.formula.api as smf

# OLS with treated, post and their interaction; the `treated:post`
# coefficient is the difference-in-differences term.
did_model = smf.ols('downloads ~ treated*post', data=did_data)
did_fit = did_model.fit()

did_fit.params

Intercept       50.335034
treated          0.609410
post             0.221844
treated:post     0.691736
dtype: float64

In [22]:
def block_sample(df, unit_col):
    """Resample whole units from `df` with replacement.

    Draws as many unit labels as there are distinct values in `df[unit_col]`
    (using the global NumPy random state) and returns every row belonging to
    each drawn label, repeated once per draw.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing a `unit_col` column.
    unit_col : str
        Name of the column identifying the units to resample.

    Returns
    -------
    pandas.DataFrame
        The resampled rows, with `unit_col` restored as a regular column.
    """
    units = df[unit_col].unique()
    drawn_units = np.random.choice(units, size=len(units), replace=True)

    # Index by unit so a single .loc lookup pulls every row of each drawn
    # unit; labels drawn more than once contribute their rows more than once.
    by_unit = df.set_index(unit_col)
    resampled = by_unit.loc[drawn_units]

    return resampled.reset_index(level=[unit_col])

In [23]:
from joblib import Parallel, delayed

def block_bootstrap(data, est_fn, unit_col,
                    rounds=200, seed=123, pcts=(2.5, 97.5), n_jobs=4):
    """Percentile interval for an estimator via block bootstrap.

    Applies `est_fn` to `rounds` resamples of `data`, each produced by
    `block_sample(data, unit_col=unit_col)`, and returns the requested
    percentiles of the resulting statistics.

    Parameters
    ----------
    data : pandas.DataFrame
        Data to resample.
    est_fn : callable
        Takes a resampled frame and returns the statistic of interest.
    unit_col : str
        Column identifying the units that are resampled as whole blocks.
    rounds : int, default 200
        Number of bootstrap resamples.
    seed : int, default 123
        Passed to `np.random.seed`; note this reseeds NumPy's global RNG.
    pcts : sequence of float, default (2.5, 97.5)
        Percentiles to report. The default is an immutable tuple rather than
        a list so it cannot be altered between calls.
    n_jobs : int, default 4
        Number of joblib workers used to evaluate `est_fn`.

    Returns
    -------
    numpy.ndarray
        `np.percentile` of the bootstrap statistics at `pcts`.
    """
    np.random.seed(seed)

    # block_sample(...) is evaluated here, while this generator is consumed,
    # so the resampling draws follow the seed set above; only est_fn is
    # handed to the joblib workers.
    stats = Parallel(n_jobs=n_jobs)(
        delayed(est_fn)(block_sample(data, unit_col=unit_col))
        for _ in range(rounds))

    return np.percentile(stats, pcts)

In [24]:
def est_fn(df):
    """Return the `treated:post` coefficient from a fixed-effects OLS fit.

    Regresses `downloads` on the `treated:post` interaction plus categorical
    terms for `city` and `date`, fitted on `df`.
    """
    twfe_formula = 'downloads ~ treated:post + C(city) + C(date)'
    fitted = smf.ols(twfe_formula, data=df).fit()
    return fitted.params["treated:post"]

# Bootstrap interval for the treated:post estimate, resampling whole cities.
block_bootstrap(data=mkt_data, est_fn=est_fn, unit_col="city")

array([0.23162214, 1.14002646])