# __Econometric Game__: Effect of electoral accountability on corruption?
   
### Team 3: Katheryn Ding, Amber Wei, Max Ye



## Set up 

In [103]:
import numpy as np
import pandas as pd
import tensorflow as tf
from scipy.special import expit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")


In [104]:
corruption_df = pd.read_stata("corruptiondata.dta")
corruption_df.columns

Index(['uf', 'nsorteio', 'totrecursos', 'tot_os', 'pop', 'purb',
       'p_secundario', 'cod_ibge6', 'pib_capita_02', 'op_01_04',
       ...
       'uf_d18', 'uf_d19', 'uf_d20', 'uf_d21', 'uf_d22', 'uf_d23', 'uf_d24',
       'uf_d25', 'uf_d26', 'esample2'],
      dtype='object', length=116)

In [105]:
# Define covariates for each category

# Mayor characteristics
mayor_covariates = [
    "pref_idade_tse",  # Age
    "pref_masc",       # Gender
    "pref_escola",     # Schooling
    "winmargin2000",   # Margin of victory in 2000
    "exp_prefeito"     # Was previously a mayor in a consecutive term
] + [col for col in corruption_df.columns if col.startswith("party_d")]

# Municipal characteristics
municipal_covariates = [
    "lpop",           # Log of population in 2000
    "purb",           # Percentage of population in urban sectors
    "p_secundario",   # Percentage with at least secondary education
    "mun_novo",       # New municipality indicator
    "lpib02",         # Log of GDP per capita in 2002
    "gini_ipea"       # Gini coefficient
]

# Political and Judicial characteristics
political_judicial_covariates = [
    "ENEP2000",  # Effective number of parties in 2000 mayor elections
    "ENLP2000",  # Effective number of parties in 2000 legislative elections
    "p_cad_pref" # Proportion of legislators from the same party as the mayor
]

# Dummies
dummy_covariates = [
    col for col in corruption_df.columns if col.startswith("uf_d") or col.startswith("sorteio")
]


print("Mayor Covariates:", mayor_covariates)
print("Municipal Covariates:", municipal_covariates)
print("Political and Judicial Covariates:", political_judicial_covariates)
print("Dummy Covariates:", dummy_covariates)

Mayor Covariates: ['pref_idade_tse', 'pref_masc', 'pref_escola', 'winmargin2000', 'exp_prefeito', 'party_d1', 'party_d3', 'party_d4', 'party_d5', 'party_d6', 'party_d7', 'party_d8', 'party_d9', 'party_d10', 'party_d11', 'party_d12', 'party_d13', 'party_d14', 'party_d15', 'party_d16', 'party_d17', 'party_d18']
Municipal Covariates: ['lpop', 'purb', 'p_secundario', 'mun_novo', 'lpib02', 'gini_ipea']
Political and Judicial Covariates: ['ENEP2000', 'ENLP2000', 'p_cad_pref']
Dummy Covariates: ['sorteio1', 'sorteio2', 'sorteio3', 'sorteio4', 'sorteio5', 'sorteio6', 'sorteio7', 'sorteio8', 'sorteio9', 'sorteio10', 'uf_d1', 'uf_d2', 'uf_d3', 'uf_d4', 'uf_d5', 'uf_d6', 'uf_d7', 'uf_d8', 'uf_d9', 'uf_d10', 'uf_d11', 'uf_d12', 'uf_d13', 'uf_d14', 'uf_d15', 'uf_d16', 'uf_d17', 'uf_d18', 'uf_d19', 'uf_d20', 'uf_d21', 'uf_d22', 'uf_d23', 'uf_d24', 'uf_d25', 'uf_d26']


In [106]:
corruption_df["log_valor_corrupt"] = np.log(corruption_df["valor_corrupt"] + 1)

# Define the treatment and outcome variables
treatment = "first"  
outcomes = ["pcorrupt", "ncorrupt_os", 'log_valor_corrupt'] 

# Use all covariates
all_covariates = (
    mayor_covariates +
    municipal_covariates +
    political_judicial_covariates +
    dummy_covariates
)
#set up dataset:
required_columns = [treatment] + outcomes + all_covariates
double_ml_dataset = corruption_df[required_columns].dropna()

print("Dataset Columns:", double_ml_dataset.columns)
print("Number of rows in the Dataset:", double_ml_dataset.shape[0])

Dataset Columns: Index(['first', 'pcorrupt', 'ncorrupt_os', 'log_valor_corrupt',
       'pref_idade_tse', 'pref_masc', 'pref_escola', 'winmargin2000',
       'exp_prefeito', 'party_d1', 'party_d3', 'party_d4', 'party_d5',
       'party_d6', 'party_d7', 'party_d8', 'party_d9', 'party_d10',
       'party_d11', 'party_d12', 'party_d13', 'party_d14', 'party_d15',
       'party_d16', 'party_d17', 'party_d18', 'lpop', 'purb', 'p_secundario',
       'mun_novo', 'lpib02', 'gini_ipea', 'ENEP2000', 'ENLP2000', 'p_cad_pref',
       'sorteio1', 'sorteio2', 'sorteio3', 'sorteio4', 'sorteio5', 'sorteio6',
       'sorteio7', 'sorteio8', 'sorteio9', 'sorteio10', 'uf_d1', 'uf_d2',
       'uf_d3', 'uf_d4', 'uf_d5', 'uf_d6', 'uf_d7', 'uf_d8', 'uf_d9', 'uf_d10',
       'uf_d11', 'uf_d12', 'uf_d13', 'uf_d14', 'uf_d15', 'uf_d16', 'uf_d17',
       'uf_d18', 'uf_d19', 'uf_d20', 'uf_d21', 'uf_d22', 'uf_d23', 'uf_d24',
       'uf_d25', 'uf_d26'],
      dtype='object')
Number of rows in the Dataset: 467


## Lasso 

In [107]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

#### Assuming Heterogeneosity 

In [147]:
# Function to estimate propensity scores using Lasso
def estimate_propensity_scores_lasso(X_train, D_train, X_test):
    lasso_pscore = LassoCV(alphas=np.logspace(-4, 0, 50), cv=5, random_state=42).fit(X_train, D_train)
    D_pred = lasso_pscore.predict(X_test)
    propensity_scores = expit(D_pred)  # Convert to probabilities
    return propensity_scores

In [148]:
# Function to estimate outcome model using Lasso for treated and untreated groups
def estimate_outcome_lasso(X_train, Y_train, D_train, X_test, treated=True):
    lasso_model = LassoCV(alphas=np.logspace(-4, 0, 50), cv=5, random_state=42)
    if treated:
        lasso_model.fit(X_train[D_train == 1], Y_train[D_train == 1])
    else:
        lasso_model.fit(X_train[D_train == 0], Y_train[D_train == 0])
    return lasso_model.predict(X_test)


In [149]:
# Function to perform doubly robust estimation with cross-fitting
def doubly_robust_estimation_lasso(X, D, Y, k_folds=5):
    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
    W_hat_dr_values = []
    mse_fold = []
    
    for train_index, test_index in kf.split(X_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        D_train, D_test = D.iloc[train_index], D.iloc[test_index]
        Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
        
        # Step 1: Estimate propensity scores
        propensity_scores = estimate_propensity_scores_lasso(X_train, D_train, X_test)
        
        # Trim propensity scores
        trimmed_indices = (propensity_scores > 0.01) & (propensity_scores < 0.99)
        trimmed_X = X_test[trimmed_indices]
        trimmed_D = D_test.iloc[trimmed_indices]
        trimmed_Y = Y_test.iloc[trimmed_indices]
        trimmed_pscore = propensity_scores[trimmed_indices]
        
        # Step 2: Estimate outcome models for treated and untreated groups
        gamma1 = estimate_outcome_lasso(X_train, Y_train, D_train, trimmed_X, treated=True)
        gamma0 = estimate_outcome_lasso(X_train, Y_train, D_train, trimmed_X, treated=False)
        
        # Step 3: Doubly Robust Estimates
        trimmed_data = pd.DataFrame({
            "gamma1": gamma1,
            "gamma0": gamma0,
            "D": trimmed_D.values,
            "Y": trimmed_Y.values,
            "pscore": trimmed_pscore,
        })
        
        trimmed_data['Y1_dr'] = trimmed_data['gamma1'] + \
            (trimmed_data['D'] / trimmed_data['pscore']) * (trimmed_data['Y'] - trimmed_data['gamma1'])
        
        trimmed_data['Y0_dr'] = trimmed_data['gamma0'] + \
            ((1 - trimmed_data['D']) / (1 - trimmed_data['pscore'])) * (trimmed_data['Y'] - trimmed_data['gamma0'])
        
        # Step 4: Calculate treatment effect and MSE
        W_hat_dr_values.append(np.mean(trimmed_data['Y1_dr'] - trimmed_data['Y0_dr']))
        mse_fold.append(mean_squared_error(
            trimmed_data['Y'], 
            trimmed_data['Y1_dr'] * trimmed_data['D'] + trimmed_data['Y0_dr'] * (1 - trimmed_data['D'])
        ))
    
    # Return average treatment effect and MSE
    return np.mean(W_hat_dr_values), np.mean(mse_fold)

In [150]:
# Main function to loop through outcomes and store results
def estimate_dr_for_outcomes_lasso(double_ml_dataset, all_covariates, treatment, outcomes):
    theta_hat_dr_values = []
    mse_values = []

    for outcome in outcomes:
        X = double_ml_dataset[all_covariates]
        D = double_ml_dataset[treatment]
        Y = double_ml_dataset[outcome]
        
        dr_ate, mse = doubly_robust_estimation_lasso(X, D, Y)
        theta_hat_dr_values.append(dr_ate)
        mse_values.append(mse)
    
    # Create Results DataFrame
    result_df = pd.DataFrame({
        "Outcome": outcomes,
        "Estimate": theta_hat_dr_values,
        "MSE": mse_values
    })
    return result_df

In [151]:
result_df = estimate_dr_for_outcomes_lasso(double_ml_dataset, all_covariates, treatment, outcomes)
print("Doubly Robust Treatment Effect Estimates with Cross-Fitting using Lasso:")
print(result_df)

Doubly Robust Treatment Effect Estimates with Cross-Fitting using Lasso:
             Outcome  Estimate        MSE
0           pcorrupt -0.022035   0.017367
1        ncorrupt_os -0.008515   0.003262
2  log_valor_corrupt -1.067475  36.140774


### Balance Checking

In [162]:
def compute_propensity_scores(X, D):
    lasso_pscore = LassoCV(alphas=np.logspace(-4, 0, 50), cv=5, random_state=42)
    lasso_pscore.fit(X, D)
    propensity_scores = expit(lasso_pscore.predict(X))  # Convert to probabilities using expit
    return propensity_scores
def compute_smd(double_ml_dataset, covariates, treatment):
    X = double_ml_dataset[covariates].values
    D = double_ml_dataset[treatment].values
    # Estimate propensity scores
    pscore = compute_propensity_scores(X, D)
    
    # Create weights
    weights_treated = 1 / pscore
    weights_control = 1 / (1 - pscore)
    
    # Treated and control indices
    treated_indices = (D == 1)
    control_indices = (D == 0)
    
    # Compute weighted means
    weighted_means_treated = np.average(X[treated_indices], axis=0, weights=weights_treated[treated_indices])
    weighted_means_control = np.average(X[control_indices], axis=0, weights=weights_control[control_indices])
    
    # Compute pooled standard deviation
    pooled_std = np.sqrt((np.var(X[treated_indices], axis=0) + np.var(X[control_indices], axis=0)) / 2)
    
    # Compute Standardized Mean Differences (SMD)
    smd = (weighted_means_treated - weighted_means_control) / pooled_std
    
    return smd

# Example usage
smd_values = compute_smd(double_ml_dataset, all_covariates, treatment)


# Print only the SMD
print("Standardized Mean Differences (SMD):")
print(smd_values)

Standardized Mean Differences (SMD):
[ 0.104372   -0.04765426  0.02968724 -0.38068098  0.38181138  0.19310077
 -0.0484961  -0.12084576  0.10655728  0.06272838  0.08174942  0.08930089
  0.0956954   0.16994452 -0.07496204 -0.141793    0.11795554  0.14579037
  0.13770705  0.08130773  0.09544342 -0.09380843  0.06517223  0.17611495
  0.2541373  -0.08120754  0.10690512  0.127647    0.30606577  0.32192838
 -0.4765111   0.00553448 -0.09751498 -0.03301595  0.02807158  0.00833995
 -0.03725498 -0.00081405  0.04965926 -0.07079715  0.14603858  0.07495855
 -0.10515903 -0.12186608  0.22405127  0.08188919  0.04810288  0.00320493
  0.03633524 -0.08662325  0.05773501  0.01315004  0.00978864  0.05808311
 -0.11713681 -0.11842766 -0.04898335 -0.07461609 -0.1673562  -0.06620508
 -0.13794576  0.03507964  0.1753901   0.05958109 -0.09663015  0.12857486
  0.04806826]


Not very balanced.

#### Assuming Homogeneous treatment effct 

In [152]:
def estimate_outcome_lasso(X_train, Y_train, X_test):
    outcome_model = LassoCV(alphas=np.logspace(-4, 0, 50), cv=5, random_state=42)
    outcome_model.fit(X_train, Y_train)
    gamma_hat = outcome_model.predict(X_test)
    return gamma_hat

In [153]:
# Function to estimate E[D|X] using Lasso
def estimate_propensity_lasso(X_train, D_train, X_test):
    pscore_model = LassoCV(alphas=np.logspace(-4, 0, 50), cv=5, random_state=42)
    pscore_model.fit(X_train, D_train)
    pi_hat = pscore_model.predict(X_test)
    return pi_hat

In [154]:
# Function to perform doubly robust estimation with cross-fitting
def doubly_robust_crossfit_lasso(X, D, Y, k_folds=5):
    # Standardize features within this function
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Initialize variables
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
    W_hat, V_hat = np.zeros(len(Y)), np.zeros(len(Y))
    mse_fold = []  # List to store MSE for each fold
    
    for train_index, test_index in kf.split(X_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        D_train, D_test = D[train_index], D[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        
        # Step 1: Estimate outcome E[Y|X]
        gamma_hat = estimate_outcome_lasso(X_train, Y_train, X_test)
        
        # Step 2: Estimate propensity scores E[D|X]
        pi_hat = estimate_propensity_lasso(X_train, D_train, X_test)
        
        # Step 3: Calculate residuals
        W_hat[test_index] = Y_test - gamma_hat
        V_hat[test_index] = D_test - pi_hat
        
        # Step 4: Calculate MSE for this fold
        mse_fold.append(mean_squared_error(Y_test, gamma_hat))
    
    # Step 5: Regress residuals W_hat on V_hat to estimate theta_0
    regression = LinearRegression()
    regression.fit(V_hat.reshape(-1, 1), W_hat)
    theta_hat = regression.coef_[0]
    
    return theta_hat, np.mean(mse_fold)

In [155]:
def estimate_dr_homogeneous_lasso(double_ml_dataset, all_covariates, treatment, outcomes):
    theta_hat_dr_values = []
    mse_values = []

    for outcome in outcomes:
        X = double_ml_dataset[all_covariates].values
        D = double_ml_dataset[treatment].values
        Y = double_ml_dataset[outcome].values
        
        theta_hat, mse = doubly_robust_crossfit_lasso(X, D, Y)
        theta_hat_dr_values.append(theta_hat)
        mse_values.append(mse)
    
    # Create Results DataFrame
    result_df = pd.DataFrame({
        "Outcome": outcomes,
        "Estimate": theta_hat_dr_values,
        "MSE": mse_values
    })
    return result_df

In [156]:
ls_result_df_homo = estimate_dr_homogeneous_lasso(double_ml_dataset, all_covariates, treatment, outcomes)

# Print the results
print("Doubly Robust Treatment Effect Estimates with Cross-Fitting (Homogeneous):")
print(ls_result_df_homo)

Doubly Robust Treatment Effect Estimates with Cross-Fitting (Homogeneous):
             Outcome  Estimate        MSE
0           pcorrupt -0.023304   0.010267
1        ncorrupt_os -0.008581   0.002078
2  log_valor_corrupt -1.205078  24.754757


## Random Forest

In [111]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

#### Assuming Heterogeneous Treatment Effect

In [112]:
def estimate_propensity_scores_rf(X_train, D_train, X_test, n_estimators=100, max_depth=5, random_state=42):
    rf_pscore = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state)
    rf_pscore.fit(X_train, D_train)
    propensity_scores = rf_pscore.predict_proba(X_test)[:, 1]  # Probability of treatment
    return propensity_scores

In [113]:
# Function to estimate outcome models for treated and untreated groups using Random Forest
def estimate_outcome_rf(X_train, Y_train, X_test, D_train, treated=True, n_estimators=100, max_depth=5, random_state=42):
    rf_model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state)
    if treated:
        rf_model.fit(X_train[D_train == 1], Y_train[D_train == 1])
    else:
        rf_model.fit(X_train[D_train == 0], Y_train[D_train == 0])
    return rf_model.predict(X_test)

In [114]:
# Function to calculate doubly robust estimates for a single outcome
def calculate_dr_estimates_rf(X, D, Y, k_folds=5):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
    dr_ate_fold = []
    mse_fold = []

    for train_index, test_index in kf.split(X_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        D_train, D_test = D.iloc[train_index], D.iloc[test_index]
        Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
        
        # Step 1: Estimate propensity scores
        propensity_scores = estimate_propensity_scores_rf(X_train, D_train, X_test)

        # Step 2: Trim propensity scores
        trimmed_indices = (propensity_scores > 0.01) & (propensity_scores < 0.99)
        trimmed_X = X_test[trimmed_indices]
        trimmed_D = D_test.iloc[trimmed_indices]
        trimmed_Y = Y_test.iloc[trimmed_indices]
        trimmed_pscore = propensity_scores[trimmed_indices]

        # Step 3: Estimate outcome models for treated and untreated
        gamma1 = estimate_outcome_rf(X_train, Y_train, trimmed_X, D_train, treated=True)
        gamma0 = estimate_outcome_rf(X_train, Y_train, trimmed_X, D_train, treated=False)

        # Step 4: Construct doubly robust estimates
        trimmed_data = pd.DataFrame({
            "gamma1": gamma1,
            "gamma0": gamma0,
            "D": trimmed_D.values,
            "Y": trimmed_Y.values,
            "pscore": trimmed_pscore
        })

        trimmed_data['Y1_dr'] = (
            trimmed_data['gamma1'] +
            (trimmed_data['D'] / trimmed_data['pscore']) * (trimmed_data['Y'] - trimmed_data['gamma1'])
        )
        trimmed_data['Y0_dr'] = (
            trimmed_data['gamma0'] +
            ((1 - trimmed_data['D']) / (1 - trimmed_data['pscore'])) * (trimmed_data['Y'] - trimmed_data['gamma0'])
        )

        # Step 5: Calculate treatment effect and MSE
        dr_ate_fold.append(np.mean(trimmed_data['Y1_dr'] - trimmed_data['Y0_dr']))
        mse_fold.append(mean_squared_error(
            trimmed_data['Y'], 
            trimmed_data['Y1_dr'] * trimmed_data['D'] + trimmed_data['Y0_dr'] * (1 - trimmed_data['D'])
        ))
    
    # Average results across folds
    dr_ate = np.mean(dr_ate_fold)
    mse = np.mean(mse_fold)
    return dr_ate, mse

In [115]:
# Function to loop through all outcomes and calculate results
def run_dr_estimation_rf(dataset, covariates, treatment, outcomes):
    theta_hat_dr_values = []
    mse_values = []

    for outcome in outcomes:
        print(f"Processing outcome: {outcome}")
        X = dataset[covariates]
        D = dataset[treatment]
        Y = dataset[outcome]
        
        dr_ate, mse = calculate_dr_estimates_rf(X, D, Y)
        theta_hat_dr_values.append(dr_ate)
        mse_values.append(mse)
    
    # Create Results DataFrame
    results_df = pd.DataFrame({
        "Outcome": outcomes,
        "Estimate": theta_hat_dr_values,
        "MSE": mse_values
    })
    return results_df

In [116]:
rf_result_df_hetero = run_dr_estimation_rf(
    dataset=double_ml_dataset, 
    covariates=all_covariates, 
    treatment=treatment, 
    outcomes=outcomes
)

# Print the final results
print("Doubly Robust Treatment Effect Estimates with Cross-Fitting (Random Forest):")
print(rf_result_df_hetero)

Processing outcome: pcorrupt
Processing outcome: ncorrupt_os
Processing outcome: log_valor_corrupt
Doubly Robust Treatment Effect Estimates with Cross-Fitting (Random Forest):
             Outcome  Estimate        MSE
0           pcorrupt -0.026246   0.018019
1        ncorrupt_os -0.007434   0.002945
2  log_valor_corrupt -0.910680  33.156128


### Balance Checking

In [163]:
def compute_propensity_scores(X, D):
    rf_pscore = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
    rf_pscore.fit(X, D)
    propensity_scores = rf_pscore.predict_proba(X)[:, 1]  # Probability of treatment (D=1)
    return propensity_scores

smd_values = compute_smd(double_ml_dataset, all_covariates, treatment)


# Print only the SMD
print("Standardized Mean Differences (SMD):")
print(smd_values)

Standardized Mean Differences (SMD):
[ 0.06558709 -0.04899405 -0.00106626 -0.29304137  0.36982794  0.20952734
 -0.0443829  -0.10723244  0.08466159  0.07859063  0.07887369  0.08302273
  0.06478372  0.17108093 -0.06431373 -0.15535929  0.12530652  0.14007413
  0.13658106  0.08332839  0.08777559 -0.09634819 -0.0112643   0.05041207
  0.12867463 -0.0300527   0.04889403  0.12869706  0.19468053  0.20856626
 -0.35231704  0.00814893 -0.08209678 -0.0428761   0.022087    0.02805476
 -0.03007489  0.00071295  0.02754137 -0.06592755  0.1337333   0.09644191
 -0.0837574  -0.11902339  0.20411795  0.09864509  0.07200407 -0.03468904
  0.03025831 -0.06524535  0.02816652  0.00475782  0.00601626  0.06005698
 -0.08238073 -0.10178711 -0.01367828 -0.10251496 -0.19683067 -0.03652178
 -0.13499757  0.00736641  0.16936891  0.04743023 -0.08062995  0.10341781
  0.04708293]


Balanced

#### Assuming Homogeneous Treatment Effect

In [118]:
# Function to estimate E[Y|X] using Random Forest for the outcome model
def estimate_outcome_rf_homo(X_train, Y_train, X_test, n_estimators=100, max_depth=5, random_state=42):
    rf_model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state)
    rf_model.fit(X_train, Y_train)
    return rf_model.predict(X_test)

In [119]:
# Function to calculate residuals (W_hat and V_hat)
def calculate_residuals_rf_homo(X_train, X_test, Y_train, Y_test, D_train, D_test):
    # Estimate E[Y|X] using Random Forest (Outcome model)
    gamma_hat = estimate_outcome_rf_homo(X_train, Y_train, X_test)

    # Estimate E[D|X] using Random Forest (Propensity score model)
    pi_hat = estimate_propensity_scores_rf(X_train, D_train, X_test)
    
    W_hat = Y_test - gamma_hat
    V_hat = D_test - pi_hat
    
    return W_hat, V_hat, gamma_hat

In [120]:
# Function to calculate doubly robust estimates for homogeneous effects
def calculate_dr_estimates_rf_homo(X, D, Y, k_folds=5):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
    W_hat, V_hat = np.zeros(len(Y)), np.zeros(len(Y))
    mse_fold = []  # List to store MSE for each fold

    for train_index, test_index in kf.split(X_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        D_train, D_test = D[train_index], D[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        
        # Calculate residuals
        W_hat[test_index], V_hat[test_index], gamma_hat = calculate_residuals(
            X_train, X_test, Y_train, Y_test, D_train, D_test
        )
        
        # Calculate MSE for this fold
        mse_fold.append(mean_squared_error(Y_test, gamma_hat))

    # Regress W_hat on V_hat to estimate theta_0
    regression = LinearRegression()
    regression.fit(V_hat.reshape(-1, 1), W_hat)
    theta_hat = regression.coef_[0]

    return theta_hat, np.mean(mse_fold)

In [121]:
# Function to run DR estimation for multiple outcomes
def run_dr_estimation_rf_homo(dataset, covariates, treatment, outcomes):
    theta_hat_dr_values = []
    mse_values = []

    for outcome in outcomes:
        print(f"Processing outcome: {outcome}")
        X = dataset[covariates].values
        D = dataset[treatment].values
        Y = dataset[outcome].values
        
        theta_hat, mse = calculate_dr_estimates_rf_homo(X, D, Y)
        theta_hat_dr_values.append(theta_hat)
        mse_values.append(mse)
    
    # Create Results DataFrame
    results_df = pd.DataFrame({
        "Outcome": outcomes,
        "Estimate": theta_hat_dr_values,
        "MSE": mse_values
    })
    return results_df

In [122]:
rf_result_df_homo = run_dr_estimation_rf_homo(
    dataset=double_ml_dataset, 
    covariates=all_covariates, 
    treatment=treatment, 
    outcomes=outcomes
)

# Print the results
print("Doubly Robust Treatment Effect Estimates with Cross-Fitting (Homogeneous) using Random Forest:")
print(rf_result_df_homo)

Processing outcome: pcorrupt
Processing outcome: ncorrupt_os
Processing outcome: log_valor_corrupt
Doubly Robust Treatment Effect Estimates with Cross-Fitting (Homogeneous) using Random Forest:
             Outcome  Estimate        MSE
0           pcorrupt -0.021572   0.011728
1        ncorrupt_os -0.006105   0.002650
2  log_valor_corrupt -0.876484  26.331269


In [123]:
theta_hat_dr_values = []
mse_values = []

# Loop through each outcome
for outcome in outcomes:
    
    X = double_ml_dataset[all_covariates].values
    D = double_ml_dataset[treatment].values
    Y = double_ml_dataset[outcome].values

    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # K-Fold Cross-Fitting
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    W_hat, V_hat = np.zeros(len(Y)), np.zeros(len(Y))
    mse_fold = []  # List to store MSE for each fold
    
    for train_index, test_index in kf.split(X_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        D_train, D_test = D[train_index], D[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]

        # Step 1: Estimate E[Y|X] using Random Forest (Outcome model)
        outcome_model = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
        outcome_model.fit(X_train, Y_train)
        gamma_hat = outcome_model.predict(X_test)

        # Step 2: Estimate E[D|X] using Random Forest (Propensity score model)
        pscore_model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
        pscore_model.fit(X_train, D_train)
        pi_hat = pscore_model.predict_proba(X_test)[:, 1]  # Probability of treatment
        
        # Step 3: Calculate residuals W_hat and V_hat
        W_hat[test_index] = Y_test - gamma_hat
        V_hat[test_index] = D_test - pi_hat

        # Step 4: Calculate MSE for this fold
        mse_fold.append(mean_squared_error(Y_test, gamma_hat))

    # Step 5: Regress W_hat on V_hat to estimate theta_0
    regression = LinearRegression()
    regression.fit(V_hat.reshape(-1, 1), W_hat)
    theta_hat = regression.coef_[0]
    
    # Store results
    theta_hat_dr_values.append(theta_hat)
    mse_values.append(np.mean(mse_fold))  # Average MSE across folds

# Step 6: Create Results DataFrame
rf_result_df_homo = pd.DataFrame({
    "Outcome": outcomes,
    "Estimate": theta_hat_dr_values,
    "MSE": mse_values
})

# Display the results DataFrame
print("Doubly Robust Treatment Effect Estimates with Cross-Fitting (Homogeneous) using Random Forest:")
print(rf_result_df_homo)

Doubly Robust Treatment Effect Estimates with Cross-Fitting (Homogeneous) using Random Forest:
             Outcome  Estimate        MSE
0           pcorrupt -0.024575   0.010883
1        ncorrupt_os -0.007833   0.002269
2  log_valor_corrupt -0.966588  26.056496


## Multiple-Layer Perceptron

In [124]:
from sklearn.neural_network import MLPRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [125]:
# Function to build and train an MLP model
def train_mlp(X_train, Y_train, input_dim, output_activation='linear', loss='mse', verbose=0):
    model = Sequential([
        Dense(128, activation='relu', input_shape=(input_dim,)),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1, activation=output_activation)
    ])
    model.compile(optimizer='adam', loss=loss, metrics=['mae'])
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    model.fit(X_train, Y_train, validation_split=0.2, epochs=100, batch_size=32,
              callbacks=[early_stopping], verbose=verbose)
    return model

In [126]:
# Function to calculate doubly robust treatment effects using MLP
def double_ml_mlp_heterogeneous(X, D, Y, outcomes):
    theta_hat_dr_values = []
    mse_values = []

    # Standardize covariates
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # 5-Fold Cross-Fitting
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    
    for outcome in outcomes:
        dr_ate_fold = []
        mse_fold = []
        
        for train_index, test_index in kf.split(X_scaled):
            X_train, X_test = X_scaled[train_index], X_scaled[test_index]
            D_train, D_test = D[train_index], D[test_index]
            Y_train, Y_test = Y[train_index], Y[test_index]
            
            # Step 1: Propensity score estimation using MLP
            model_pscore = train_mlp(X_train, D_train, input_dim=X_train.shape[1], 
                                     output_activation='sigmoid', loss='binary_crossentropy')
            propensity_scores = model_pscore.predict(X_test).flatten()

            # Trim p-scores (0.01, 0.99)
            trimmed_indices = (propensity_scores > 0.01) & (propensity_scores < 0.99)
            trimmed_X = X_test[trimmed_indices]
            trimmed_D = D_test[trimmed_indices]
            trimmed_Y = Y_test[trimmed_indices]
            trimmed_pscore = propensity_scores[trimmed_indices]

            # Step 2: Fit outcome models for treated and untreated groups
            model_treated = train_mlp(X_train[D_train == 1], Y_train[D_train == 1], input_dim=X_train.shape[1])
            gamma1 = model_treated.predict(trimmed_X).flatten()

            model_untreated = train_mlp(X_train[D_train == 0], Y_train[D_train == 0], input_dim=X_train.shape[1])
            gamma0 = model_untreated.predict(trimmed_X).flatten()

            # Step 3: Construct doubly robust estimates
            trimmed_data = pd.DataFrame({
                "gamma1": gamma1,
                "gamma0": gamma0,
                "D": trimmed_D,
                "Y": trimmed_Y,
                "pscore": trimmed_pscore
            })
            trimmed_data['Y1_dr'] = (
                trimmed_data['gamma1'] +
                (trimmed_data['D'] / trimmed_data['pscore']) * (trimmed_data['Y'] - trimmed_data['gamma1'])
            )
            trimmed_data['Y0_dr'] = (
                trimmed_data['gamma0'] +
                ((1 - trimmed_data['D']) / (1 - trimmed_data['pscore'])) * (trimmed_data['Y'] - trimmed_data['gamma0'])
            )

            # Calculate treatment effect and MSE for the fold
            dr_ate_fold.append(np.mean(trimmed_data['Y1_dr'] - trimmed_data['Y0_dr']))
            mse_fold.append(mean_squared_error(
                trimmed_data['Y'], 
                trimmed_data['Y1_dr'] * trimmed_data['D'] + trimmed_data['Y0_dr'] * (1 - trimmed_data['D'])
            ))

        # Store average estimates and MSE
        theta_hat_dr_values.append(np.mean(dr_ate_fold))
        mse_values.append(np.mean(mse_fold))

    results_df = pd.DataFrame({
        "Outcome": outcomes,
        "Estimate": theta_hat_dr_values,
        "MSE": mse_values
    })
    return results_df

In [127]:
# Function for easy call 
def run_double_ml_mlp(dataset, covariants, outcomes):
    X = dataset[covariants].values
    D = dataset[treatment].values
    results = []
    
    for outcome in outcomes:
        Y = dataset[outcome].values
        print(f"Processing outcome: {outcome}")
        result = double_ml_mlp_heterogeneous(X, D, Y, [outcome])
        results.append(result)
    
    final_results = pd.concat(results, ignore_index=True)
    return final_results

In [128]:
#mlp_result with all covariants
mlp_result_df_hetero = run_double_ml_mlp(double_ml_dataset, all_covariates, outcomes)
print("Doubly Robust Treatment Effect Estimates with Cross-Fitting (Heterogeneous) using MLP:")
mlp_result_df_hetero

Processing outcome: pcorrupt
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━

Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.004678,0.085434
1,ncorrupt_os,-0.009873,0.097025
2,log_valor_corrupt,-0.469029,81.910507


### Balance Checking

In [165]:
def compute_propensity_scores(X, D):
    # Standardize the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Define MLP Model
    model = Sequential([
        Dense(128, activation='relu', input_shape=(X_scaled.shape[1],)),  # Input layer
        Dense(64, activation='relu'),                                    # Hidden layer 1
        Dense(32, activation='relu'),                                    # Hidden layer 2
        Dense(1, activation='sigmoid')                                   # Output layer for probabilities
    ])
    
    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    # Early stopping to prevent overfitting
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    
    # Train the MLP model
    model.fit(X_scaled, D, validation_split=0.2, epochs=100, batch_size=32, 
              callbacks=[early_stopping], verbose=0)
    
    # Predict propensity scores
    propensity_scores = model.predict(X_scaled).flatten()
    return propensity_scores

smd_values = compute_smd(double_ml_dataset, all_covariates, treatment)


# Print only the SMD
print("Standardized Mean Differences (SMD):")
print(smd_values)

[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Standardized Mean Differences (SMD):
[ 0.10255492 -0.09378493 -0.04928849 -0.40372005  0.39896184  0.22005796
 -0.04089811 -0.13214932  0.11899886  0.09692479  0.06045583  0.0717589
  0.02363408  0.14680792 -0.16189747 -0.09694399  0.10458215  0.12504125
  0.17975597  0.05891138  0.01978472 -0.17008045 -0.0182573   0.10930817
  0.19023459 -0.08055138  0.12916945  0.04918699  0.17569396  0.1997095
 -0.38437688  0.0193055  -0.07962076 -0.09172229  0.00982546  0.06538795
 -0.03104956  0.00810566  0.07776392 -0.10873184  0.13596366  0.03891975
 -0.05494297 -0.16956018  0.19507578  0.08128857  0.0560813  -0.06510179
  0.08022884 -0.10965527  0.07538258 -0.07897831  0.00331136  0.1015876
 -0.08391009 -0.16466638 -0.01569909 -0.07827742 -0.24601324 -0.10535873
 -0.1218379   0.02186373  0.2896228   0.12830795 -0.05722529  0.09750834
  0.01336877]


Balanced

### Assuming Homogeneity

In [130]:
def train_mlp(X_train, Y_train, input_dim, output_activation='linear', loss='mse', verbose=0):
    model = Sequential([
        Dense(128, activation='relu', input_shape=(input_dim,)),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1, activation=output_activation)
    ])
    model.compile(optimizer='adam', loss=loss, metrics=['mae'])
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    model.fit(X_train, Y_train, validation_split=0.2, epochs=100, batch_size=32,
              callbacks=[early_stopping], verbose=verbose)
    return model

In [131]:
# Function to perform Double ML for a single outcome
def double_ml_single_outcome_homogeneous(X, D, Y):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Prepare for K-Fold Cross-Fitting
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    W_hat, V_hat = np.zeros(len(Y)), np.zeros(len(Y))
    mse_fold = []  # List to store MSE for each fold

    for train_index, test_index in kf.split(X_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        D_train, D_test = D[train_index], D[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]

        # Step 1: Estimate E[Y|X] using MLP (Outcome model)
        outcome_model = train_mlp(X_train, Y_train, input_dim=X_train.shape[1])
        gamma_hat = outcome_model.predict(X_test).flatten()

        # Step 2: Estimate E[D|X] using MLP (Propensity score model)
        pscore_model = train_mlp(X_train, D_train, input_dim=X_train.shape[1], 
                                 output_activation='sigmoid', loss='binary_crossentropy')
        pi_hat = pscore_model.predict(X_test).flatten()

        # Step 3: Calculate residuals W_hat and V_hat
        W_hat[test_index] = Y_test - gamma_hat
        V_hat[test_index] = D_test - pi_hat

        # Step 4: Calculate MSE for this fold
        mse_fold.append(mean_squared_error(Y_test, gamma_hat))

    # Step 5: Regress W_hat on V_hat to estimate theta_0
    regression = LinearRegression()
    regression.fit(V_hat.reshape(-1, 1), W_hat)
    theta_hat = regression.coef_[0]
    
    return theta_hat, np.mean(mse_fold)

In [132]:
# Main function to loop through multiple outcomes
def run_double_ml_mlp_homogeneous(dataset, covariates, treatment, outcomes):
    theta_hat_dr_values = []
    mse_values = []

    for outcome in outcomes:
        print(f"Processing outcome: {outcome}")
        
        X = dataset[covariates].values
        D = dataset[treatment].values
        Y = dataset[outcome].values

        # Perform Double ML for the single outcome
        theta_hat, mse = double_ml_single_outcome_homogeneous(X, D, Y)
        
        theta_hat_dr_values.append(theta_hat)
        mse_values.append(mse)

    # Create results DataFrame
    results_df = pd.DataFrame({
        "Outcome": outcomes,
        "Estimate": theta_hat_dr_values,
        "MSE": mse_values
    })
    return results_df


In [133]:
# Example Usage
mlp_result_df_homo = run_double_ml_mlp_homogeneous(
    dataset=double_ml_dataset, 
    covariates=all_covariates, 
    treatment=treatment, 
    outcomes=outcomes
)
mlp_result_df_homo

Processing outcome: pcorrupt
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
Processing outcome: ncorrupt_os
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1

Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.046082,0.038488
1,ncorrupt_os,0.003424,0.019541
2,log_valor_corrupt,-0.952303,30.573879


In [134]:
mlp_result_df_homo

Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.046082,0.038488
1,ncorrupt_os,0.003424,0.019541
2,log_valor_corrupt,-0.952303,30.573879


## Gradient Boosting


In [135]:
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor

#### Assuming Heteogeneous treatment effct 

In [136]:
# Function to train propensity score model using Gradient Boosting
def train_gb_pscore(X_train, D_train, X_test, n_estimators=100, max_depth=3, random_state=42):
    gb_pscore = GradientBoostingClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state)
    gb_pscore.fit(X_train, D_train)
    propensity_scores = gb_pscore.predict_proba(X_test)[:, 1]
    return propensity_scores

In [137]:
# Function to train outcome model for treated/untreated groups
def train_gb_outcome(X_train, Y_train, X_test, n_estimators=100, max_depth=3, random_state=42):
    gb_model = GradientBoostingRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state)
    gb_model.fit(X_train, Y_train)
    predictions = gb_model.predict(X_test)
    return predictions

In [138]:
# Function to calculate the doubly robust treatment effect
def calculate_dr_treatment_effect(X, D, Y, k_folds=5):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # K-Fold Cross-Fitting
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
    dr_ate_fold = []
    mse_fold = []

    for train_index, test_index in kf.split(X_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        D_train, D_test = D.iloc[train_index], D.iloc[test_index]
        Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
        
        # Step 1: Estimate propensity scores
        propensity_scores = train_gb_pscore(X_train, D_train, X_test)
        
        # Step 2: Trim p-scores (0.01, 0.99)
        trimmed_indices = (propensity_scores > 0.01) & (propensity_scores < 0.99)
        trimmed_X = X_test[trimmed_indices]
        trimmed_D = D_test.iloc[trimmed_indices]
        trimmed_Y = Y_test.iloc[trimmed_indices]
        trimmed_pscore = propensity_scores[trimmed_indices]
        
        # Step 3: Fit outcome models for treated and untreated groups
        gamma1 = train_gb_outcome(X_train[D_train == 1], Y_train[D_train == 1], trimmed_X)
        gamma0 = train_gb_outcome(X_train[D_train == 0], Y_train[D_train == 0], trimmed_X)
        
        # Step 4: Calculate doubly robust estimates
        trimmed_data = pd.DataFrame({
            "gamma1": gamma1,
            "gamma0": gamma0,
            "D": trimmed_D.values,
            "Y": trimmed_Y.values,
            "pscore": trimmed_pscore
        })
        
        trimmed_data['Y1_dr'] = (
            trimmed_data['gamma1'] +
            (trimmed_data['D'] / trimmed_data['pscore']) * (trimmed_data['Y'] - trimmed_data['gamma1'])
        )
        
        trimmed_data['Y0_dr'] = (
            trimmed_data['gamma0'] +
            ((1 - trimmed_data['D']) / (1 - trimmed_data['pscore'])) * (trimmed_data['Y'] - trimmed_data['gamma0'])
        )
        
        # Step 5: Calculate treatment effect and MSE for the fold
        dr_ate_fold.append(np.mean(trimmed_data['Y1_dr'] - trimmed_data['Y0_dr']))
        mse_fold.append(mean_squared_error(
            trimmed_data['Y'],
            trimmed_data['Y1_dr'] * trimmed_data['D'] + trimmed_data['Y0_dr'] * (1 - trimmed_data['D'])
        ))
    
    # Average treatment effect and MSE across folds
    dr_ate = np.mean(dr_ate_fold)
    mse = np.mean(mse_fold)
    return dr_ate, mse

In [139]:
def run_gradient_boosting_dml(dataset, covariates, treatment, outcomes):
    theta_hat_dr_values = []
    mse_values = []

    for outcome in outcomes:
        print(f"Processing outcome: {outcome}")
        X = dataset[covariates]
        D = dataset[treatment]
        Y = dataset[outcome]

        dr_ate, mse = calculate_dr_treatment_effect(X, D, Y)
        theta_hat_dr_values.append(dr_ate)
        mse_values.append(mse)
    
    # Create Results DataFrame
    results_df = pd.DataFrame({
        "Outcome": outcomes,
        "Estimate": theta_hat_dr_values,
        "MSE": mse_values
    })
    return results_df

In [140]:
gb_result_df_hetero = run_gradient_boosting_dml(
    dataset=double_ml_dataset, 
    covariates=all_covariates, 
    treatment=treatment, 
    outcomes=outcomes
)

Processing outcome: pcorrupt
Processing outcome: ncorrupt_os
Processing outcome: log_valor_corrupt


In [141]:
print("Doubly Robust Treatment Effect Estimates with Cross-Fitting (Gradient Boosting):")
print(gb_result_df_hetero)

Doubly Robust Treatment Effect Estimates with Cross-Fitting (Gradient Boosting):
             Outcome  Estimate         MSE
0           pcorrupt -0.025620    0.141620
1        ncorrupt_os -0.006851    0.023093
2  log_valor_corrupt -0.247871  302.724438


### Balance Checking

In [166]:
def compute_propensity_scores(X, D):
    # Standardize the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Define Gradient Boosting Classifier
    gb_model = GradientBoostingClassifier(n_estimators=100, max_depth=3, learning_rate=0.1, random_state=42)
    
    # Train the Gradient Boosting model
    gb_model.fit(X_scaled, D)
    
    # Predict propensity scores
    propensity_scores = gb_model.predict_proba(X_scaled)[:, 1]  # Probability of treatment
    return propensity_scores

smd_values = compute_smd(double_ml_dataset, all_covariates, treatment)


# Print only the SMD
print("Standardized Mean Differences (SMD):")
print(smd_values)

Standardized Mean Differences (SMD):
[ 0.09030618 -0.05935758  0.00913778 -0.27511533  0.3215796   0.19087747
 -0.07261261 -0.11765553  0.10332962  0.06669827  0.0727699   0.07814006
  0.08917901  0.15880263 -0.04890726 -0.12775864  0.11462099  0.14063892
  0.1361354   0.07878148  0.07599797 -0.08790506  0.04971291  0.10499881
  0.18966639 -0.0531381   0.07196117  0.09511551  0.25702069  0.2637618
 -0.37182915 -0.01454558 -0.06225104 -0.01587706  0.03335009  0.01973417
 -0.02683086 -0.02058486  0.0215772  -0.08640551  0.14543476  0.0617282
 -0.10543865 -0.09541292  0.20161219  0.08325514  0.07075639 -0.03389309
  0.02665818 -0.07137314  0.04641801  0.00743784  0.00947106  0.06753383
 -0.12445269 -0.08719094 -0.03280061 -0.07907675 -0.13699253 -0.0589075
 -0.12388311  0.02808309  0.14579819  0.03994939 -0.1023675   0.12261033
  0.037131  ]


Balanced

#### Assuming Homogeneous treatment effct 

In [143]:
def calculate_dr_homogeneous_gb(X, D, Y, k_folds=5):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # K-Fold Cross-Fitting
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
    W_hat, V_hat = np.zeros(len(Y)), np.zeros(len(Y))
    mse_fold = []

    for train_index, test_index in kf.split(X_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        D_train, D_test = D[train_index], D[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]

        # Step 1: Estimate propensity scores using the provided function
        pi_hat = train_gb_pscore(X_train, D_train, X_test)  # Function from the image

        # Trim propensity scores (0.01, 0.99)
        trimmed_indices = (pi_hat > 0.01) & (pi_hat < 0.99)
        trimmed_X, trimmed_D, trimmed_Y = X_test[trimmed_indices], D_test[trimmed_indices], Y_test[trimmed_indices]
        trimmed_pscore = pi_hat[trimmed_indices]

        # Step 2: Estimate E[Y|X] using the provided function
        gamma_hat = train_gb_outcome(X_train, Y_train, trimmed_X)  # Function from the image

        # Step 3: Calculate residuals W_hat and V_hat
        W_hat[test_index[trimmed_indices]] = trimmed_Y - gamma_hat
        V_hat[test_index[trimmed_indices]] = trimmed_D - trimmed_pscore

        # Step 4: Calculate MSE for this fold
        mse_fold.append(mean_squared_error(trimmed_Y, gamma_hat))
    
    # Step 5: Regress W_hat on V_hat to estimate theta_0
    regression = LinearRegression()
    regression.fit(V_hat.reshape(-1, 1), W_hat)
    theta_hat = regression.coef_[0]

    return theta_hat, np.mean(mse_fold)

In [144]:
def run_dr_homogeneous_gb(dataset, covariates, treatment, outcomes):
    theta_hat_dr_values = []
    mse_values = []

    for outcome in outcomes:
        print(f"Processing outcome: {outcome}")
        X = dataset[covariates].values
        D = dataset[treatment].values
        Y = dataset[outcome].values

        # Call the homogeneous treatment effect function
        dr_ate, mse = calculate_dr_homogeneous_gb(X, D, Y)
        theta_hat_dr_values.append(dr_ate)
        mse_values.append(mse)

    # Create Results DataFrame
    results_df = pd.DataFrame({
        "Outcome": outcomes,
        "Estimate": theta_hat_dr_values,
        "MSE": mse_values
    })
    return results_df

In [145]:
gb_result_df_homo = run_dr_homogeneous_gb(
    dataset=double_ml_dataset, 
    covariates=all_covariates, 
    treatment=treatment, 
    outcomes=outcomes
)

Processing outcome: pcorrupt
Processing outcome: ncorrupt_os
Processing outcome: log_valor_corrupt


In [146]:
print("Doubly Robust Treatment Effect Estimates with Cross-Fitting (Homogeneous, Gradient Boosting):")
print(gb_result_df_homo)

Doubly Robust Treatment Effect Estimates with Cross-Fitting (Homogeneous, Gradient Boosting):
             Outcome  Estimate        MSE
0           pcorrupt -0.018570   0.012256
1        ncorrupt_os -0.006758   0.002253
2  log_valor_corrupt -0.805899  28.775320


## Combining Estimates:

### Lasso:

In [168]:
type(all_covariates)

list

In [None]:
# No covariants:
result_df = estimate_dr_for_outcomes_lasso(double_ml_dataset, , treatment, outcomes)

KeyError: ()

## Conclusion