In [1]:
import os

import numpy as np
import pandas as pd

# Location of the management-training dataset. The default is the original
# machine-specific path; set the MANAGEMENT_TRAINING_CSV environment variable
# to point at a local copy so the notebook can run on other machines.
DATA_PATH = os.environ.get(
    "MANAGEMENT_TRAINING_CSV",
    r'C:\Users\matte\OneDrive\Desktop\GitHub\data\causal\management_training.csv',
)

df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,departament_id,intervention,engagement_score,tenure,n_of_reports,gender,role,last_engagement_score,department_score,department_size
0,76,1,0.277359,6,4,2,4,0.614261,0.224077,843
1,76,1,-0.449646,4,8,2,4,0.069636,0.224077,843
2,76,1,0.769703,6,4,2,4,0.866918,0.224077,843
3,76,1,-0.121763,6,4,2,4,0.029071,0.224077,843
4,76,1,1.526147,6,4,1,4,0.589857,0.224077,843


In [2]:
import statsmodels.formula.api as smf

# Baseline: regress the engagement score on the treatment flag alone,
# with no other covariates, and show the coefficient table.
naive_fit = smf.ols("engagement_score ~ intervention", data=df).fit()
naive_fit.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.2347,0.014,-16.619,0.000,-0.262,-0.207
intervention,0.4346,0.019,22.616,0.000,0.397,0.472


In [3]:
# Covariate-adjusted regression: the treatment flag plus numeric covariates,
# with gender and role entered as categorical terms via C().
adjusted_formula = """engagement_score ~ intervention 
        + tenure + last_engagement_score + department_score
        + n_of_reports + C(gender) + C(role)"""
adjusted_spec = smf.ols(adjusted_formula, data=df)
model = adjusted_spec.fit()

# Coefficient table only (second table of the summary).
model.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-1.7206,0.052,-32.942,0.000,-1.823,-1.618
C(gender)[T.2],-0.2705,0.017,-15.942,0.000,-0.304,-0.237
C(role)[T.1],-0.4742,0.035,-13.602,0.000,-0.542,-0.406
C(role)[T.2],-0.2395,0.037,-6.518,0.000,-0.311,-0.167
C(role)[T.3],-0.4003,0.037,-10.705,0.000,-0.474,-0.327
C(role)[T.4],-0.1068,0.034,-3.179,0.001,-0.173,-0.041
intervention,0.2678,0.017,15.343,0.000,0.234,0.302
tenure,0.3694,0.008,47.742,0.000,0.354,0.385
last_engagement_score,-0.0102,0.007,-1.537,0.124,-0.023,0.003


In [4]:
# Propensity-score model: logistic regression of the treatment flag on the
# covariates. Note that n_of_reports is treated as categorical here.
ps_formula = """intervention ~ 
        tenure + last_engagement_score + department_score
        + C(n_of_reports) + C(gender) + C(role)"""
ps_spec = smf.logit(ps_formula, data=df)
ps_model = ps_spec.fit(disp=0)  # disp=0 silences the optimizer's convergence printout

In [6]:
# Score every row of df with the fitted propensity model and attach the
# prediction as a new column; df itself is left unchanged.
fitted_propensity = ps_model.predict(df)
data_ps = df.assign(propensity_score=fitted_propensity)

# Preview the treatment, the outcome and the new score side by side.
preview_columns = ["intervention", "engagement_score", "propensity_score"]
data_ps[preview_columns].head()

Unnamed: 0,intervention,engagement_score,propensity_score
0,1,0.277359,0.596106
1,1,-0.449646,0.391138
2,1,0.769703,0.602578
3,1,-0.121763,0.58099
4,1,1.526147,0.619976


In [7]:
# Regress the outcome on the treatment flag while controlling only for the
# propensity score. This rebinds `model`, replacing any earlier fit stored
# under that name.
ps_regression = smf.ols(
    "engagement_score ~ intervention + propensity_score",
    data=data_ps,
)
model = ps_regression.fit()

# Estimated coefficient on the treatment flag.
model.params["intervention"]

0.26331267490276516

In [8]:
from sklearn.neighbors import KNeighborsRegressor

# Column names for treatment, matching feature and outcome.
T, X, Y = "intervention", "propensity_score", "engagement_score"

# Split the sample by treatment status.
treated = data_ps.query(f"{T}==1")
untreated = data_ps.query(f"{T}==0")

# One-nearest-neighbour regressors on the propensity score:
# mt0 is fitted on untreated rows, mt1 on treated rows.
mt0 = KNeighborsRegressor(n_neighbors=1)
mt0.fit(untreated[[X]], untreated[Y])

mt1 = KNeighborsRegressor(n_neighbors=1)
mt1.fit(treated[[X]], treated[Y])

# For each treated row, `match` is the outcome predicted by the untreated
# model, i.e. the outcome of its closest untreated neighbour.
treated_with_match = treated.assign(match=mt0.predict(treated[[X]]))

# For each untreated row, `match` comes from the treated model instead.
untreated_with_match = untreated.assign(match=mt1.predict(untreated[[X]]))

predicted = pd.concat([treated_with_match, untreated_with_match])

predicted.head()

Unnamed: 0,departament_id,intervention,engagement_score,tenure,n_of_reports,gender,role,last_engagement_score,department_score,department_size,propensity_score,match
0,76,1,0.277359,6,4,2,4,0.614261,0.224077,843,0.596106,0.55768
1,76,1,-0.449646,4,8,2,4,0.069636,0.224077,843,0.391138,-0.952622
2,76,1,0.769703,6,4,2,4,0.866918,0.224077,843,0.602578,-0.618381
3,76,1,-0.121763,6,4,2,4,0.029071,0.224077,843,0.58099,-1.404962
4,76,1,1.526147,6,4,1,4,0.589857,0.224077,843,0.619976,0.000354


In [9]:
# Inverse-propensity-weighted estimates of the two potential-outcome means.
# Treated rows are weighted by 1/ps and untreated rows by 1/(1-ps); each
# weighted sum is divided by the size of the full sample (not the arm size).

# Filter each arm once instead of re-running the same query per column.
treated_rows = data_ps.query("intervention==1")
untreated_rows = data_ps.query("intervention==0")

weight_t = 1 / treated_rows["propensity_score"]
weight_nt = 1 / (1 - untreated_rows["propensity_score"])
t1 = treated_rows["engagement_score"]
t0 = untreated_rows["engagement_score"]

n_total = len(data_ps)

# Vectorized reduction instead of Python's builtin sum over a Series.
# skipna=False lets a missing value propagate to the result rather than being
# silently dropped, which matters because the denominator counts every row.
y1 = (t1 * weight_t).sum(skipna=False) / n_total
y0 = (t0 * weight_nt).sum(skipna=False) / n_total

print("E[Y1]:", y1)
print("E[Y0]:", y0)
print("ATE", y1 - y0)

E[Y1]: 0.11656317232946992
E[Y0]: -0.14941553647814657
ATE 0.2659787088076165
