In [51]:
# --- Required Libraries ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from econml.dml import CausalForestDML
from linearmodels.iv import IV2SLS

# --- Load Datasets ---
did = pd.read_csv("dataset_did.csv")
psm = pd.read_csv("dataset_psm.csv")
cf = pd.read_csv("dataset_causal_forest.csv")
rd = pd.read_csv("dataset_rd.csv").rename(columns={'running_variable':'score'})
iv = pd.read_csv("dataset_iv.csv")

# --- 1. Difference-in-Differences ---
# did['treatment'] = (did['group'] == 'treatment') & (did['period'] == 'post')
did['period'] = pd.Categorical(did['period'], ['pre','post'])
did_model = smf.ols('conversion ~ group * period', data=did).fit()
did_estimate = did_model.params['group[T.treatment]:period[T.post]']

# --- 2. Propensity Score Matching ---
psm_encoded = pd.get_dummies(psm, columns=['device'], drop_first=True)
X = psm_encoded[['age', 'pre_cr', 'device_mobile', 'device_tablet']]
y = psm_encoded['treated']

model = LogisticRegression()
model.fit(X, y)
psm_encoded['propensity_score'] = model.predict_proba(X)[:, 1]

treated = psm_encoded[psm_encoded['treated'] == 1]
control = psm_encoded[psm_encoded['treated'] == 0]

nn = NearestNeighbors(n_neighbors=1)
nn.fit(control[['propensity_score']])
_, indices = nn.kneighbors(treated[['propensity_score']])
matched_control = control.iloc[indices.flatten()]
psm_att = (treated['cr'].values - matched_control['cr'].values).mean()

treated['weights'] = 1 / treated['propensity_score']
control['weights'] = 1 / (1 - control['propensity_score'])

ate_ipw = (
    np.sum(treated['weights'] * treated['cr']) / np.sum(treated['weights']) -
    np.sum(control['weights'] * control['cr']) / np.sum(control['weights'])
)

# --- 3. Causal Forests (CATE) ---
X_cf = cf[['age', 'pre_cr']]
T_cf = cf['treated']
Y_cf = cf['cr']

cf_model = CausalForestDML(
    model_y=RandomForestRegressor(),
    model_t=RandomForestClassifier(),
    discrete_treatment=True,
    random_state=0
)
cf_model.fit(Y_cf, T_cf, X=X_cf)
cf['cate'] = cf_model.effect(X_cf)
cf_cate_mean = cf['cate'].mean()

# --- 4. Regression Discontinuity ---
rd['above'] = (rd['score'] >= 50).astype(int)
rd_model = smf.ols('conversion ~ above + score', data=rd).fit()
rd_estimate = rd_model.params['above']

# --- 5. Instrumental Variables (2SLS) ---
iv_model = IV2SLS.from_formula('outcome ~ 1 + [treatment ~ instrument]', data=iv).fit()
iv_estimate = iv_model.params['treatment']

# --- Final Summary ---
print("===== SUMMARY OF CAUSAL EFFECTS =====")
print(f"DiD Estimate: {did_estimate:.4f}")
print(f"PSM ATT: {psm_att:.4f}")
print(f"PSM ATE: {ate_ipw:.4f}")
print(f"Causal Forest Mean CATE: {cf_cate_mean:.4f}")
print(f"RD Estimate: {rd_estimate:.4f}")
print(f"IV Estimate: {iv_estimate:.4f}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  treated['weights'] = 1 / treated['propensity_score']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  control['weights'] = 1 / (1 - control['propensity_score'])


===== SUMMARY OF CAUSAL EFFECTS =====
DiD Estimate: 0.0139
PSM ATT: 0.0351
PSM ATE: 0.0474
Causal Forest Mean CATE: 0.0290
RD Estimate: 0.0183
IV Estimate: 2.1057
