# __Econometric Game__: Effect of electoral accountability on corruption?
   
### Team 3: Katheryn Ding, Amber Wei, Max Ye



## Set up 

In [102]:
import numpy as np
import pandas as pd
import tensorflow as tf
from scipy.special import expit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")


In [103]:
corruption_df = pd.read_stata("corruptiondata.dta")
corruption_df.columns

Index(['uf', 'nsorteio', 'totrecursos', 'tot_os', 'pop', 'purb',
       'p_secundario', 'cod_ibge6', 'pib_capita_02', 'op_01_04',
       ...
       'uf_d18', 'uf_d19', 'uf_d20', 'uf_d21', 'uf_d22', 'uf_d23', 'uf_d24',
       'uf_d25', 'uf_d26', 'esample2'],
      dtype='object', length=116)

In [104]:
# Define covariates for each category

# Mayor characteristics
mayor_covariates = [
    "pref_idade_tse",  # Age
    "pref_masc",       # Gender
    "pref_escola",     # Schooling
    "winmargin2000",   # Margin of victory in 2000
    "exp_prefeito",     # Was previously a mayor in a consecutive term
    "samepartygov98"     #Indicator if mayor is same party as the 1998 governor
] + [col for col in corruption_df.columns if col.startswith("party_d")]

# Municipal characteristics
municipal_covariates = [
    "lpop",           # Log of population in 2000
    "purb",           # Percentage of population in urban sectors
    "p_secundario",   # Percentage with at least secondary education
    "mun_novo",       # New municipality indicator
    "lpib02",         # Log of GDP per capita in 2002
    "gini_ipea",       # Gini coefficient
    "media2"      #indicator if municipality has a AM radio station and local newspape
]

# Political and Judicial characteristics
political_judicial_covariates = [
    "ENEP2000",  # Effective number of parties in 2000 mayor elections
    "ENLP2000",  # Effective number of parties in 2000 legislative elections
    "p_cad_pref" # Proportion of legislators from the same party as the mayor
]

# Dummies
dummy_covariates = [
    col for col in corruption_df.columns if col.startswith("uf_d") or col.startswith("sorteio")
]


print("Mayor Covariates:", mayor_covariates)
print("Municipal Covariates:", municipal_covariates)
print("Political and Judicial Covariates:", political_judicial_covariates)
print("Dummy Covariates:", dummy_covariates)

Mayor Covariates: ['pref_idade_tse', 'pref_masc', 'pref_escola', 'winmargin2000', 'exp_prefeito', 'samepartygov98', 'party_d1', 'party_d3', 'party_d4', 'party_d5', 'party_d6', 'party_d7', 'party_d8', 'party_d9', 'party_d10', 'party_d11', 'party_d12', 'party_d13', 'party_d14', 'party_d15', 'party_d16', 'party_d17', 'party_d18']
Municipal Covariates: ['lpop', 'purb', 'p_secundario', 'mun_novo', 'lpib02', 'gini_ipea', 'media2']
Political and Judicial Covariates: ['ENEP2000', 'ENLP2000', 'p_cad_pref']
Dummy Covariates: ['sorteio1', 'sorteio2', 'sorteio3', 'sorteio4', 'sorteio5', 'sorteio6', 'sorteio7', 'sorteio8', 'sorteio9', 'sorteio10', 'uf_d1', 'uf_d2', 'uf_d3', 'uf_d4', 'uf_d5', 'uf_d6', 'uf_d7', 'uf_d8', 'uf_d9', 'uf_d10', 'uf_d11', 'uf_d12', 'uf_d13', 'uf_d14', 'uf_d15', 'uf_d16', 'uf_d17', 'uf_d18', 'uf_d19', 'uf_d20', 'uf_d21', 'uf_d22', 'uf_d23', 'uf_d24', 'uf_d25', 'uf_d26']


In [105]:
corruption_df["log_valor_corrupt"] = np.log(corruption_df["valor_corrupt"] + 1)

# Define the treatment and outcome variables
treatment = "first"  
outcomes = ["pcorrupt", "ncorrupt_os", 'log_valor_corrupt'] 

# Use all covariates
all_covariates = (
    mayor_covariates +
    municipal_covariates +
    political_judicial_covariates +
    dummy_covariates
)
#set up dataset:
required_columns = [treatment] + outcomes + all_covariates
double_ml_dataset = corruption_df[required_columns].dropna()

print("Dataset Columns:", double_ml_dataset.columns)
print("Number of rows in the Dataset:", double_ml_dataset.shape[0])

Dataset Columns: Index(['first', 'pcorrupt', 'ncorrupt_os', 'log_valor_corrupt',
       'pref_idade_tse', 'pref_masc', 'pref_escola', 'winmargin2000',
       'exp_prefeito', 'samepartygov98', 'party_d1', 'party_d3', 'party_d4',
       'party_d5', 'party_d6', 'party_d7', 'party_d8', 'party_d9', 'party_d10',
       'party_d11', 'party_d12', 'party_d13', 'party_d14', 'party_d15',
       'party_d16', 'party_d17', 'party_d18', 'lpop', 'purb', 'p_secundario',
       'mun_novo', 'lpib02', 'gini_ipea', 'media2', 'ENEP2000', 'ENLP2000',
       'p_cad_pref', 'sorteio1', 'sorteio2', 'sorteio3', 'sorteio4',
       'sorteio5', 'sorteio6', 'sorteio7', 'sorteio8', 'sorteio9', 'sorteio10',
       'uf_d1', 'uf_d2', 'uf_d3', 'uf_d4', 'uf_d5', 'uf_d6', 'uf_d7', 'uf_d8',
       'uf_d9', 'uf_d10', 'uf_d11', 'uf_d12', 'uf_d13', 'uf_d14', 'uf_d15',
       'uf_d16', 'uf_d17', 'uf_d18', 'uf_d19', 'uf_d20', 'uf_d21', 'uf_d22',
       'uf_d23', 'uf_d24', 'uf_d25', 'uf_d26'],
      dtype='object')
Number of rows 

## Lasso 

In [106]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

#### Assuming Heterogeneosity 

In [107]:
# Function to estimate propensity scores using Lasso
def estimate_propensity_scores_lasso(X_train, D_train, X_test):
    lasso_pscore = LassoCV(alphas=np.logspace(-4, 0, 50), cv=5, random_state=42).fit(X_train, D_train)
    D_pred = lasso_pscore.predict(X_test)
    propensity_scores = expit(D_pred)  
    return propensity_scores

In [108]:
# Function to estimate outcome model using Lasso for treated and untreated groups
def estimate_outcome_lasso_he(X_train, Y_train, D_train, X_test, treated=True):
    if treated:
        X_train_filtered = X_train[D_train == 1]
        Y_train_filtered = Y_train[D_train == 1]
    else:
        X_train_filtered = X_train[D_train == 0]
        Y_train_filtered = Y_train[D_train == 0]

    # Fit Lasso model
    lasso_model = LassoCV(alphas=np.logspace(-4, 0, 50), cv=5, random_state=42)
    lasso_model.fit(X_train_filtered, Y_train_filtered)
    
    # Predict on test set
    return lasso_model.predict(X_test)


In [109]:
# Function to perform doubly robust estimation with cross-fitting
def doubly_robust_estimation_lasso(X, D, Y, k_folds=5):
    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
    W_hat_dr_values = []
    mse_fold = []
    
    for train_index, test_index in kf.split(X_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        D_train, D_test = D.iloc[train_index], D.iloc[test_index]
        Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
        # Step 1: Estimate propensity scores
        propensity_scores = estimate_propensity_scores_lasso(X_train, D_train, X_test)
        
        # Trim propensity scores
        trimmed_indices = (propensity_scores > 0.01) & (propensity_scores < 0.99)
        trimmed_X = X_test[trimmed_indices]
        trimmed_D = D_test.iloc[trimmed_indices]
        trimmed_Y = Y_test.iloc[trimmed_indices]
        trimmed_pscore = propensity_scores[trimmed_indices]
        
        # Step 2: Estimate outcome models for treated and untreated groups
        gamma1 = estimate_outcome_lasso_he(X_train, Y_train, D_train, trimmed_X, treated=True)
        gamma0 = estimate_outcome_lasso_he(X_train, Y_train, D_train, trimmed_X, treated=False)
        
        # Step 3: Doubly Robust Estimates
        trimmed_data = pd.DataFrame({
            "gamma1": gamma1,
            "gamma0": gamma0,
            "D": trimmed_D.values,
            "Y": trimmed_Y.values,
            "pscore": trimmed_pscore,
        })
        
        trimmed_data['Y1_dr'] = trimmed_data['gamma1'] + \
            (trimmed_data['D'] / trimmed_data['pscore']) * (trimmed_data['Y'] - trimmed_data['gamma1'])
        
        trimmed_data['Y0_dr'] = trimmed_data['gamma0'] + \
            ((1 - trimmed_data['D']) / (1 - trimmed_data['pscore'])) * (trimmed_data['Y'] - trimmed_data['gamma0'])
        
        # Step 4: Calculate treatment effect and MSE
        W_hat_dr_values.append(np.mean(trimmed_data['Y1_dr'] - trimmed_data['Y0_dr']))
        mse_fold.append(mean_squared_error(
            trimmed_data['Y'], 
            trimmed_data['Y1_dr'] * trimmed_data['D'] + trimmed_data['Y0_dr'] * (1 - trimmed_data['D'])
        ))
    
    # Return average treatment effect and MSE
    return np.mean(W_hat_dr_values), np.mean(mse_fold)

In [110]:
# Main function to loop through outcomes and store results
def estimate_dr_for_outcomes_lasso(double_ml_dataset, covariates, treatment, outcomes):
    theta_hat_dr_values = []
    mse_values = []

    for outcome in outcomes:
        X = double_ml_dataset[covariates]
        D = double_ml_dataset[treatment]
        Y = double_ml_dataset[outcome]
        
        dr_ate, mse = doubly_robust_estimation_lasso(X, D, Y)
        theta_hat_dr_values.append(dr_ate)
        mse_values.append(mse)
    
    # Create Results DataFrame
    result_df = pd.DataFrame({
        "Outcome": outcomes,
        "Estimate": theta_hat_dr_values,
        "MSE": mse_values
    })
    return result_df

### Balance Checking

In [111]:
def compute_propensity_scores(X, D):
    lasso_pscore = LassoCV(alphas=np.logspace(-4, 0, 50), cv=5, random_state=42)
    lasso_pscore.fit(X, D)
    propensity_scores = expit(lasso_pscore.predict(X))  # Convert to probabilities using expit
    return propensity_scores
def compute_smd(double_ml_dataset, covariates, treatment):
    X = double_ml_dataset[covariates].values
    D = double_ml_dataset[treatment].values
    # Estimate propensity scores
    pscore = compute_propensity_scores(X, D)
    
    # Create weights
    weights_treated = 1 / pscore
    weights_control = 1 / (1 - pscore)
    
    # Treated and control indices
    treated_indices = (D == 1)
    control_indices = (D == 0)
    
    # Compute weighted means
    weighted_means_treated = np.average(X[treated_indices], axis=0, weights=weights_treated[treated_indices])
    weighted_means_control = np.average(X[control_indices], axis=0, weights=weights_control[control_indices])
    
    # Compute pooled standard deviation
    pooled_std = np.sqrt((np.var(X[treated_indices], axis=0) + np.var(X[control_indices], axis=0)) / 2)
    
    # Compute Standardized Mean Differences (SMD)
    smd = (weighted_means_treated - weighted_means_control) / pooled_std
    
    return smd

# Example usage
smd_values = compute_smd(double_ml_dataset, all_covariates, treatment)


# Print only the SMD
print("Standardized Mean Differences (SMD):")
print(smd_values)

Standardized Mean Differences (SMD):
[ 0.10444128 -0.04778303  0.02943226 -0.38081515  0.3817461  -0.15100151
  0.19332846 -0.04916417 -0.1210468   0.10652541  0.06225735  0.08172923
  0.08921463  0.09542131  0.16976179 -0.07561675 -0.14200997  0.11768471
  0.14571364  0.13787584  0.08129106  0.09512347 -0.09401499  0.06487443
  0.17607577  0.25416455 -0.08146641  0.1068664   0.1276873   0.02848511
  0.30597427  0.32191885 -0.47661838  0.00551382 -0.09696279 -0.03323501
  0.02808839  0.0083512  -0.03747251 -0.00090737  0.04986335 -0.07095242
  0.14596204  0.0746449  -0.10573535 -0.12193336  0.2238265   0.08195173
  0.04812928  0.00348178  0.03644304 -0.08590718  0.057608    0.01254421
  0.00954509  0.05833079 -0.11724194 -0.11866081 -0.04932118 -0.07436268
 -0.1675658  -0.06508778 -0.13768427  0.03472234  0.17528316  0.05955687
 -0.09679345  0.12851685  0.04831812]


Not very balanced.

#### Assuming Homogeneous treatment effct 

In [112]:
def estimate_outcome_lasso_ho(X_train, Y_train, X_test):
    outcome_model = LassoCV(alphas=np.logspace(-4, 0, 50), cv=5, random_state=42)
    outcome_model.fit(X_train, Y_train)
    gamma_hat = outcome_model.predict(X_test)
    return gamma_hat

In [113]:
# Function to estimate E[D|X] using Lasso
def estimate_propensity_lasso(X_train, D_train, X_test):
    pscore_model = LassoCV(alphas=np.logspace(-4, 0, 50), cv=5, random_state=42)
    pscore_model.fit(X_train, D_train)
    pi_hat = pscore_model.predict(X_test)
    return pi_hat

In [114]:
# Function to perform doubly robust estimation with cross-fitting
def doubly_robust_crossfit_lasso(X, D, Y, k_folds=5):
    # Standardize features within this function
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Initialize variables
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
    W_hat, V_hat = np.zeros(len(Y)), np.zeros(len(Y))
    mse_fold = []  # List to store MSE for each fold
    
    for train_index, test_index in kf.split(X_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        D_train, D_test = D[train_index], D[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        
        # Step 1: Estimate outcome E[Y|X]
        gamma_hat = estimate_outcome_lasso_ho(X_train, Y_train, X_test)
        
        # Step 2: Estimate propensity scores E[D|X]
        pi_hat = estimate_propensity_lasso(X_train, D_train, X_test)
        
        # Step 3: Calculate residuals
        W_hat[test_index] = Y_test - gamma_hat
        V_hat[test_index] = D_test - pi_hat
        
        # Step 4: Calculate MSE for this fold
        mse_fold.append(mean_squared_error(Y_test, gamma_hat))
    
    # Step 5: Regress residuals W_hat on V_hat to estimate theta_0
    regression = LinearRegression()
    regression.fit(V_hat.reshape(-1, 1), W_hat)
    theta_hat = regression.coef_[0]
    
    return theta_hat, np.mean(mse_fold)

In [115]:
def estimate_dr_homogeneous_lasso(double_ml_dataset, all_covariates, treatment, outcomes):
    theta_hat_dr_values = []
    mse_values = []

    for outcome in outcomes:
        X = double_ml_dataset[all_covariates].values
        D = double_ml_dataset[treatment].values
        Y = double_ml_dataset[outcome].values
        
        theta_hat, mse = doubly_robust_crossfit_lasso(X, D, Y)
        theta_hat_dr_values.append(theta_hat)
        mse_values.append(mse)
    
    # Create Results DataFrame
    result_df = pd.DataFrame({
        "Outcome": outcomes,
        "Estimate": theta_hat_dr_values,
        "MSE": mse_values
    })
    return result_df

In [116]:
ls_result_df_homo = estimate_dr_homogeneous_lasso(double_ml_dataset, all_covariates, treatment, outcomes)
print("Doubly Robust Treatment Effect Estimates with Cross-Fitting (Homogeneous):")
print(ls_result_df_homo)

Doubly Robust Treatment Effect Estimates with Cross-Fitting (Homogeneous):
             Outcome  Estimate        MSE
0           pcorrupt -0.023743   0.010310
1        ncorrupt_os -0.008505   0.002078
2  log_valor_corrupt -1.190014  24.754757


## Random Forest

In [117]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

#### Assuming Heterogeneous Treatment Effect

In [118]:
def estimate_propensity_scores_rf(X_train, D_train, X_test, n_estimators=100, max_depth=5, random_state=42):
    rf_pscore = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state)
    rf_pscore.fit(X_train, D_train)
    propensity_scores = rf_pscore.predict_proba(X_test)[:, 1]  # Probability of treatment
    return propensity_scores

In [119]:
# Function to estimate outcome models for treated and untreated groups using Random Forest
def estimate_outcome_rf(X_train, Y_train, X_test, D_train, treated=True, n_estimators=100, max_depth=5, random_state=42):
    rf_model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state)
    if treated:
        rf_model.fit(X_train[D_train == 1], Y_train[D_train == 1])
    else:
        rf_model.fit(X_train[D_train == 0], Y_train[D_train == 0])
    return rf_model.predict(X_test)

In [120]:
# Function to calculate doubly robust estimates for a single outcome
def calculate_dr_estimates_rf(X, D, Y, k_folds=5):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
    dr_ate_fold = []
    mse_fold = []

    for train_index, test_index in kf.split(X_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        D_train, D_test = D.iloc[train_index], D.iloc[test_index]
        Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
        
        # Step 1: Estimate propensity scores
        propensity_scores = estimate_propensity_scores_rf(X_train, D_train, X_test)

        # Step 2: Trim propensity scores
        trimmed_indices = (propensity_scores > 0.01) & (propensity_scores < 0.99)
        trimmed_X = X_test[trimmed_indices]
        trimmed_D = D_test.iloc[trimmed_indices]
        trimmed_Y = Y_test.iloc[trimmed_indices]
        trimmed_pscore = propensity_scores[trimmed_indices]

        # Step 3: Estimate outcome models for treated and untreated
        gamma1 = estimate_outcome_rf(X_train, Y_train, trimmed_X, D_train, treated=True)
        gamma0 = estimate_outcome_rf(X_train, Y_train, trimmed_X, D_train, treated=False)

        # Step 4: Construct doubly robust estimates
        trimmed_data = pd.DataFrame({
            "gamma1": gamma1,
            "gamma0": gamma0,
            "D": trimmed_D.values,
            "Y": trimmed_Y.values,
            "pscore": trimmed_pscore
        })

        trimmed_data['Y1_dr'] = (
            trimmed_data['gamma1'] +
            (trimmed_data['D'] / trimmed_data['pscore']) * (trimmed_data['Y'] - trimmed_data['gamma1'])
        )
        trimmed_data['Y0_dr'] = (
            trimmed_data['gamma0'] +
            ((1 - trimmed_data['D']) / (1 - trimmed_data['pscore'])) * (trimmed_data['Y'] - trimmed_data['gamma0'])
        )

        # Step 5: Calculate treatment effect and MSE
        dr_ate_fold.append(np.mean(trimmed_data['Y1_dr'] - trimmed_data['Y0_dr']))
        mse_fold.append(mean_squared_error(
            trimmed_data['Y'], 
            trimmed_data['Y1_dr'] * trimmed_data['D'] + trimmed_data['Y0_dr'] * (1 - trimmed_data['D'])
        ))
    
    # Average results across folds
    dr_ate = np.mean(dr_ate_fold)
    mse = np.mean(mse_fold)
    return dr_ate, mse

In [121]:
# Function to loop through all outcomes and calculate results
def run_dr_estimation_rf(dataset, covariates, treatment, outcomes):
    theta_hat_dr_values = []
    mse_values = []

    for outcome in outcomes:
        print(f"Processing outcome: {outcome}")
        X = dataset[covariates]
        D = dataset[treatment]
        Y = dataset[outcome]
        
        dr_ate, mse = calculate_dr_estimates_rf(X, D, Y)
        theta_hat_dr_values.append(dr_ate)
        mse_values.append(mse)
    
    # Create Results DataFrame
    results_df = pd.DataFrame({
        "Outcome": outcomes,
        "Estimate": theta_hat_dr_values,
        "MSE": mse_values
    })
    return results_df

In [122]:
rf_result_df_hetero = run_dr_estimation_rf(
    dataset=double_ml_dataset, 
    covariates=all_covariates, 
    treatment=treatment, 
    outcomes=outcomes
)

# Print the final results
print("Doubly Robust Treatment Effect Estimates with Cross-Fitting (Random Forest):")
print(rf_result_df_hetero)

Processing outcome: pcorrupt
Processing outcome: ncorrupt_os
Processing outcome: log_valor_corrupt
Doubly Robust Treatment Effect Estimates with Cross-Fitting (Random Forest):
             Outcome  Estimate        MSE
0           pcorrupt -0.027213   0.019589
1        ncorrupt_os -0.007590   0.002966
2  log_valor_corrupt -0.975908  33.768337


### Balance Checking

In [123]:
def compute_propensity_scores(X, D):
    rf_pscore = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
    rf_pscore.fit(X, D)
    propensity_scores = rf_pscore.predict_proba(X)[:, 1]  # Probability of treatment (D=1)
    return propensity_scores

smd_values = compute_smd(double_ml_dataset, all_covariates, treatment)


# Print only the SMD
print("Standardized Mean Differences (SMD):")
print(smd_values)

Standardized Mean Differences (SMD):
[ 0.05401538 -0.05122315 -0.0115647  -0.30088451  0.3623659  -0.14696552
  0.20912083 -0.04102491 -0.10924533  0.08184664  0.07256128  0.07612164
  0.08663987  0.07463451  0.16772991 -0.06704582 -0.15241486  0.12495408
  0.1450842   0.13925536  0.08484242  0.08386355 -0.11150428 -0.02378073
  0.05198437  0.12500133 -0.038089    0.04915379  0.12069529 -0.027809
  0.18444536  0.20041131 -0.35951335  0.00753286 -0.08747367 -0.03310904
  0.02442544  0.02126029 -0.02639725 -0.00615057  0.0301662  -0.06298168
  0.13165383  0.08528576 -0.09050799 -0.12275228  0.20394276  0.10648058
  0.07194429 -0.04818617  0.03829937 -0.07007226  0.04206149  0.01176175
  0.0132209   0.05957888 -0.08506353 -0.1005578  -0.02192311 -0.095596
 -0.19606632 -0.04403612 -0.13292273  0.01575297  0.17337446  0.05002352
 -0.09249015  0.0939428   0.04134072]


Balanced

#### Assuming Homogeneous Treatment Effect

In [124]:
# Function to estimate E[Y|X] using Random Forest for the outcome model
def estimate_outcome_rf_homo(X_train, Y_train, X_test, n_estimators=100, max_depth=5, random_state=42):
    rf_model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state)
    rf_model.fit(X_train, Y_train)
    return rf_model.predict(X_test)

In [125]:
# Function to calculate residuals (W_hat and V_hat)
def calculate_residuals_rf_homo(X_train, X_test, Y_train, Y_test, D_train, D_test):
    # Estimate E[Y|X] using Random Forest (Outcome model)
    gamma_hat = estimate_outcome_rf_homo(X_train, Y_train, X_test)

    # Estimate E[D|X] using Random Forest (Propensity score model)
    pi_hat = estimate_propensity_scores_rf(X_train, D_train, X_test)
    
    W_hat = Y_test - gamma_hat
    V_hat = D_test - pi_hat
    
    return W_hat, V_hat, gamma_hat

In [126]:
# Function to calculate doubly robust estimates for homogeneous effects
def calculate_dr_estimates_rf_homo(X, D, Y, k_folds=5):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
    W_hat, V_hat = np.zeros(len(Y)), np.zeros(len(Y))
    mse_fold = []  # List to store MSE for each fold

    for train_index, test_index in kf.split(X_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        D_train, D_test = D[train_index], D[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        
        # Calculate residuals
        W_hat[test_index], V_hat[test_index], gamma_hat = calculate_residuals_rf_homo(
            X_train, X_test, Y_train, Y_test, D_train, D_test
        )
        
        # Calculate MSE for this fold
        mse_fold.append(mean_squared_error(Y_test, gamma_hat))

    # Regress W_hat on V_hat to estimate theta_0
    regression = LinearRegression()
    regression.fit(V_hat.reshape(-1, 1), W_hat)
    theta_hat = regression.coef_[0]

    return theta_hat, np.mean(mse_fold)

In [127]:
# Function to run DR estimation for multiple outcomes
def run_dr_estimation_rf_homo(dataset, covariates, treatment, outcomes):
    theta_hat_dr_values = []
    mse_values = []

    for outcome in outcomes:
        print(f"Processing outcome: {outcome}")
        X = dataset[covariates].values
        D = dataset[treatment].values
        Y = dataset[outcome].values
        
        theta_hat, mse = calculate_dr_estimates_rf_homo(X, D, Y)
        theta_hat_dr_values.append(theta_hat)
        mse_values.append(mse)
    
    # Create Results DataFrame
    results_df = pd.DataFrame({
        "Outcome": outcomes,
        "Estimate": theta_hat_dr_values,
        "MSE": mse_values
    })
    return results_df

In [128]:
theta_hat_dr_values = []
mse_values = []

# Loop through each outcome
for outcome in outcomes:
    
    X = double_ml_dataset[all_covariates].values
    D = double_ml_dataset[treatment].values
    Y = double_ml_dataset[outcome].values

    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # K-Fold Cross-Fitting
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    W_hat, V_hat = np.zeros(len(Y)), np.zeros(len(Y))
    mse_fold = []  # List to store MSE for each fold
    
    for train_index, test_index in kf.split(X_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        D_train, D_test = D[train_index], D[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]

        # Step 1: Estimate E[Y|X] using Random Forest (Outcome model)
        outcome_model = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
        outcome_model.fit(X_train, Y_train)
        gamma_hat = outcome_model.predict(X_test)

        # Step 2: Estimate E[D|X] using Random Forest (Propensity score model)
        pscore_model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
        pscore_model.fit(X_train, D_train)
        pi_hat = pscore_model.predict_proba(X_test)[:, 1]  # Probability of treatment
        
        # Step 3: Calculate residuals W_hat and V_hat
        W_hat[test_index] = Y_test - gamma_hat
        V_hat[test_index] = D_test - pi_hat

        # Step 4: Calculate MSE for this fold
        mse_fold.append(mean_squared_error(Y_test, gamma_hat))

    # Step 5: Regress W_hat on V_hat to estimate theta_0
    regression = LinearRegression()
    regression.fit(V_hat.reshape(-1, 1), W_hat)
    theta_hat = regression.coef_[0]
    
    # Store results
    theta_hat_dr_values.append(theta_hat)
    mse_values.append(np.mean(mse_fold))  # Average MSE across folds

# Step 6: Create Results DataFrame
rf_result_df_homo = pd.DataFrame({
    "Outcome": outcomes,
    "Estimate": theta_hat_dr_values,
    "MSE": mse_values
})

# Display the results DataFrame
print("Doubly Robust Treatment Effect Estimates with Cross-Fitting (Homogeneous) using Random Forest:")
print(rf_result_df_homo)

Doubly Robust Treatment Effect Estimates with Cross-Fitting (Homogeneous) using Random Forest:
             Outcome  Estimate        MSE
0           pcorrupt -0.023355   0.010866
1        ncorrupt_os -0.007674   0.002254
2  log_valor_corrupt -0.938316  25.969169


## Multiple-Layer Perceptron

In [129]:
from sklearn.neural_network import MLPRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [130]:
# Function to build and train an MLP model
def train_mlp(X_train, Y_train, input_dim, output_activation='linear', loss='mse', verbose=0):
    model = Sequential([
        Dense(128, activation='relu', input_shape=(input_dim,)),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1, activation=output_activation)
    ])
    model.compile(optimizer='adam', loss=loss, metrics=['mae'])
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    model.fit(X_train, Y_train, validation_split=0.2, epochs=100, batch_size=32,
              callbacks=[early_stopping], verbose=verbose)
    return model

In [131]:
# Function to calculate doubly robust treatment effects using MLP
def double_ml_mlp_heterogeneous(X, D, Y, outcomes):
    theta_hat_dr_values = []
    mse_values = []

    # Standardize covariates
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # 5-Fold Cross-Fitting
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    
    for outcome in outcomes:
        dr_ate_fold = []
        mse_fold = []
        
        for train_index, test_index in kf.split(X_scaled):
            X_train, X_test = X_scaled[train_index], X_scaled[test_index]
            D_train, D_test = D[train_index], D[test_index]
            Y_train, Y_test = Y[train_index], Y[test_index]
            
            # Step 1: Propensity score estimation using MLP
            model_pscore = train_mlp(X_train, D_train, input_dim=X_train.shape[1], 
                                     output_activation='sigmoid', loss='binary_crossentropy')
            propensity_scores = model_pscore.predict(X_test).flatten()

            # Trim p-scores (0.01, 0.99)
            trimmed_indices = (propensity_scores > 0.01) & (propensity_scores < 0.99)
            trimmed_X = X_test[trimmed_indices]
            trimmed_D = D_test[trimmed_indices]
            trimmed_Y = Y_test[trimmed_indices]
            trimmed_pscore = propensity_scores[trimmed_indices]

            # Step 2: Fit outcome models for treated and untreated groups
            model_treated = train_mlp(X_train[D_train == 1], Y_train[D_train == 1], input_dim=X_train.shape[1])
            gamma1 = model_treated.predict(trimmed_X).flatten()

            model_untreated = train_mlp(X_train[D_train == 0], Y_train[D_train == 0], input_dim=X_train.shape[1])
            gamma0 = model_untreated.predict(trimmed_X).flatten()

            # Step 3: Construct doubly robust estimates
            trimmed_data = pd.DataFrame({
                "gamma1": gamma1,
                "gamma0": gamma0,
                "D": trimmed_D,
                "Y": trimmed_Y,
                "pscore": trimmed_pscore
            })
            trimmed_data['Y1_dr'] = (
                trimmed_data['gamma1'] +
                (trimmed_data['D'] / trimmed_data['pscore']) * (trimmed_data['Y'] - trimmed_data['gamma1'])
            )
            trimmed_data['Y0_dr'] = (
                trimmed_data['gamma0'] +
                ((1 - trimmed_data['D']) / (1 - trimmed_data['pscore'])) * (trimmed_data['Y'] - trimmed_data['gamma0'])
            )

            # Calculate treatment effect and MSE for the fold
            dr_ate_fold.append(np.mean(trimmed_data['Y1_dr'] - trimmed_data['Y0_dr']))
            mse_fold.append(mean_squared_error(
                trimmed_data['Y'], 
                trimmed_data['Y1_dr'] * trimmed_data['D'] + trimmed_data['Y0_dr'] * (1 - trimmed_data['D'])
            ))

        # Store average estimates and MSE
        theta_hat_dr_values.append(np.mean(dr_ate_fold))
        mse_values.append(np.mean(mse_fold))

    results_df = pd.DataFrame({
        "Outcome": outcomes,
        "Estimate": theta_hat_dr_values,
        "MSE": mse_values
    })
    return results_df

In [132]:
# Function for easy call 
def run_double_ml_mlp(dataset, covariants, outcomes):
    X = dataset[covariants].values
    D = dataset[treatment].values
    results = []
    
    for outcome in outcomes:
        Y = dataset[outcome].values
        print(f"Processing outcome: {outcome}")
        result = double_ml_mlp_heterogeneous(X, D, Y, [outcome])
        results.append(result)
    
    final_results = pd.concat(results, ignore_index=True)
    return final_results

In [133]:
#mlp_result with all covariants
mlp_result_df_hetero = run_double_ml_mlp(double_ml_dataset, all_covariates, outcomes)
print("Doubly Robust Treatment Effect Estimates with Cross-Fitting (Heterogeneous) using MLP:")
mlp_result_df_hetero

Processing outcome: pcorrupt
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━

Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.02993,0.059714
1,ncorrupt_os,-0.022342,0.047328
2,log_valor_corrupt,-1.283951,82.593216


### Balance Checking

In [134]:
def compute_propensity_scores(X, D):
    # Standardize the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Define MLP Model
    model = Sequential([
        Dense(128, activation='relu', input_shape=(X_scaled.shape[1],)),  # Input layer
        Dense(64, activation='relu'),                                    # Hidden layer 1
        Dense(32, activation='relu'),                                    # Hidden layer 2
        Dense(1, activation='sigmoid')                                   # Output layer for probabilities
    ])
    
    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    # Early stopping to prevent overfitting
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    
    # Train the MLP model
    model.fit(X_scaled, D, validation_split=0.2, epochs=100, batch_size=32, 
              callbacks=[early_stopping], verbose=0)
    
    # Predict propensity scores
    propensity_scores = model.predict(X_scaled).flatten()
    return propensity_scores

smd_values = compute_smd(double_ml_dataset, all_covariates, treatment)


# Print only the SMD
print("Standardized Mean Differences (SMD):")
print(smd_values)

[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Standardized Mean Differences (SMD):
[ 0.11269243 -0.09711623 -0.05244851 -0.29805818  0.2632233  -0.08070832
  0.2378292  -0.0513742  -0.06588445 -0.01309078  0.00995616  0.0599221
  0.06187376  0.07496508  0.12600891 -0.05636164 -0.11672065  0.04035265
  0.10744961  0.0901299   0.05939301 -0.00352721 -0.05672462 -0.08789636
 -0.04418197  0.06295685 -0.06079189  0.02176711 -0.00374919  0.02468247
  0.12393344  0.1457257  -0.24717459 -0.0633526  -0.15973091  0.04947739
  0.05715809  0.06567763  0.02059464 -0.01212692 -0.14213994  0.0106164
  0.14912196  0.03362967 -0.11117943 -0.08278649  0.16690487  0.05176123
  0.03951028  0.04089146  0.00596661 -0.04709987 -0.00723532 -0.06032001
 -0.15180053  0.06382515  0.00709551  0.0245404  -0.02327517 -0.03751536
 -0.08109072 -0.02818362 -0.06984321  0.02224928  0.11978235  0.05601641
 -0.05426168  0.0437335   0.0146101 ]


Balanced

### Assuming Homogeneity

In [135]:
def train_mlp(X_train, Y_train, input_dim, output_activation='linear', loss='mse', verbose=0):
    model = Sequential([
        Dense(128, activation='relu', input_shape=(input_dim,)),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1, activation=output_activation)
    ])
    model.compile(optimizer='adam', loss=loss, metrics=['mae'])
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    model.fit(X_train, Y_train, validation_split=0.2, epochs=100, batch_size=32,
              callbacks=[early_stopping], verbose=verbose)
    return model

In [136]:
# Function to perform Double ML for a single outcome
def double_ml_single_outcome_homogeneous(X, D, Y):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Prepare for K-Fold Cross-Fitting
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    W_hat, V_hat = np.zeros(len(Y)), np.zeros(len(Y))
    mse_fold = []  # List to store MSE for each fold

    for train_index, test_index in kf.split(X_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        D_train, D_test = D[train_index], D[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]

        # Step 1: Estimate E[Y|X] using MLP (Outcome model)
        outcome_model = train_mlp(X_train, Y_train, input_dim=X_train.shape[1])
        gamma_hat = outcome_model.predict(X_test).flatten()

        # Step 2: Estimate E[D|X] using MLP (Propensity score model)
        pscore_model = train_mlp(X_train, D_train, input_dim=X_train.shape[1], 
                                 output_activation='sigmoid', loss='binary_crossentropy')
        pi_hat = pscore_model.predict(X_test).flatten()

        # Step 3: Calculate residuals W_hat and V_hat
        W_hat[test_index] = Y_test - gamma_hat
        V_hat[test_index] = D_test - pi_hat

        # Step 4: Calculate MSE for this fold
        mse_fold.append(mean_squared_error(Y_test, gamma_hat))

    # Step 5: Regress W_hat on V_hat to estimate theta_0
    regression = LinearRegression()
    regression.fit(V_hat.reshape(-1, 1), W_hat)
    theta_hat = regression.coef_[0]
    
    return theta_hat, np.mean(mse_fold)

In [137]:
# Main function to loop through multiple outcomes
def run_double_ml_mlp_homogeneous(dataset, covariates, treatment, outcomes):
    theta_hat_dr_values = []
    mse_values = []

    for outcome in outcomes:
        print(f"Processing outcome: {outcome}")
        
        X = dataset[covariates].values
        D = dataset[treatment].values
        Y = dataset[outcome].values

        # Perform Double ML for the single outcome
        theta_hat, mse = double_ml_single_outcome_homogeneous(X, D, Y)
        
        theta_hat_dr_values.append(theta_hat)
        mse_values.append(mse)

    # Create results DataFrame
    results_df = pd.DataFrame({
        "Outcome": outcomes,
        "Estimate": theta_hat_dr_values,
        "MSE": mse_values
    })
    return results_df


In [138]:
mlp_result_df_homo = run_double_ml_mlp_homogeneous(
    dataset=double_ml_dataset, 
    covariates=all_covariates, 
    treatment=treatment, 
    outcomes=outcomes
)
mlp_result_df_homo

Processing outcome: pcorrupt
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
Processing outcome: ncorrupt_os
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1

Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.001675,0.027615
1,ncorrupt_os,0.008892,0.021563
2,log_valor_corrupt,-0.842197,28.887939


## Gradient Boosting


In [139]:
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor

#### Assuming Heteogeneous treatment effct 

In [140]:
# Function to train propensity score model using Gradient Boosting
def train_gb_pscore(X_train, D_train, X_test, n_estimators=100, max_depth=3, random_state=42):
    gb_pscore = GradientBoostingClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state)
    gb_pscore.fit(X_train, D_train)
    propensity_scores = gb_pscore.predict_proba(X_test)[:, 1]
    return propensity_scores

In [141]:
# Function to train outcome model for treated/untreated groups
def train_gb_outcome(X_train, Y_train, X_test, n_estimators=100, max_depth=3, random_state=42):
    gb_model = GradientBoostingRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state)
    gb_model.fit(X_train, Y_train)
    predictions = gb_model.predict(X_test)
    return predictions

In [142]:
# Function to calculate the doubly robust treatment effect
def calculate_dr_treatment_effect(X, D, Y, k_folds=5):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # K-Fold Cross-Fitting
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
    dr_ate_fold = []
    mse_fold = []

    for train_index, test_index in kf.split(X_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        D_train, D_test = D.iloc[train_index], D.iloc[test_index]
        Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
        
        # Step 1: Estimate propensity scores
        propensity_scores = train_gb_pscore(X_train, D_train, X_test)
        
        # Step 2: Trim p-scores (0.01, 0.99)
        trimmed_indices = (propensity_scores > 0.01) & (propensity_scores < 0.99)
        trimmed_X = X_test[trimmed_indices]
        trimmed_D = D_test.iloc[trimmed_indices]
        trimmed_Y = Y_test.iloc[trimmed_indices]
        trimmed_pscore = propensity_scores[trimmed_indices]
        
        # Step 3: Fit outcome models for treated and untreated groups
        gamma1 = train_gb_outcome(X_train[D_train == 1], Y_train[D_train == 1], trimmed_X)
        gamma0 = train_gb_outcome(X_train[D_train == 0], Y_train[D_train == 0], trimmed_X)
        
        # Step 4: Calculate doubly robust estimates
        trimmed_data = pd.DataFrame({
            "gamma1": gamma1,
            "gamma0": gamma0,
            "D": trimmed_D.values,
            "Y": trimmed_Y.values,
            "pscore": trimmed_pscore
        })
        
        trimmed_data['Y1_dr'] = (
            trimmed_data['gamma1'] +
            (trimmed_data['D'] / trimmed_data['pscore']) * (trimmed_data['Y'] - trimmed_data['gamma1'])
        )
        
        trimmed_data['Y0_dr'] = (
            trimmed_data['gamma0'] +
            ((1 - trimmed_data['D']) / (1 - trimmed_data['pscore'])) * (trimmed_data['Y'] - trimmed_data['gamma0'])
        )
        
        # Step 5: Calculate treatment effect and MSE for the fold
        dr_ate_fold.append(np.mean(trimmed_data['Y1_dr'] - trimmed_data['Y0_dr']))
        mse_fold.append(mean_squared_error(
            trimmed_data['Y'],
            trimmed_data['Y1_dr'] * trimmed_data['D'] + trimmed_data['Y0_dr'] * (1 - trimmed_data['D'])
        ))
    
    # Average treatment effect and MSE across folds
    dr_ate = np.mean(dr_ate_fold)
    mse = np.mean(mse_fold)
    return dr_ate, mse

In [143]:
def run_gradient_boosting_dml(dataset, covariates, treatment, outcomes):
    theta_hat_dr_values = []
    mse_values = []

    for outcome in outcomes:
        print(f"Processing outcome: {outcome}")
        X = dataset[covariates]
        D = dataset[treatment]
        Y = dataset[outcome]

        dr_ate, mse = calculate_dr_treatment_effect(X, D, Y)
        theta_hat_dr_values.append(dr_ate)
        mse_values.append(mse)
    
    # Create Results DataFrame
    results_df = pd.DataFrame({
        "Outcome": outcomes,
        "Estimate": theta_hat_dr_values,
        "MSE": mse_values
    })
    return results_df

In [144]:
gb_result_df_hetero = run_gradient_boosting_dml(
    dataset=double_ml_dataset, 
    covariates=all_covariates, 
    treatment=treatment, 
    outcomes=outcomes
)

Processing outcome: pcorrupt
Processing outcome: ncorrupt_os
Processing outcome: log_valor_corrupt


In [145]:
print("Doubly Robust Treatment Effect Estimates with Cross-Fitting (Gradient Boosting):")
print(gb_result_df_hetero)

Doubly Robust Treatment Effect Estimates with Cross-Fitting (Gradient Boosting):
             Outcome  Estimate         MSE
0           pcorrupt -0.033108    0.115371
1        ncorrupt_os -0.010750    0.014521
2  log_valor_corrupt -0.372401  231.858423


### Balance Checking

In [146]:
def compute_propensity_scores(X, D):
    # Standardize the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Define Gradient Boosting Classifier
    gb_model = GradientBoostingClassifier(n_estimators=100, max_depth=3, learning_rate=0.1, random_state=42)
    
    # Train the Gradient Boosting model
    gb_model.fit(X_scaled, D)
    
    # Predict propensity scores
    propensity_scores = gb_model.predict_proba(X_scaled)[:, 1]  # Probability of treatment
    return propensity_scores

smd_values = compute_smd(double_ml_dataset, all_covariates, treatment)


# Print only the SMD
print("Standardized Mean Differences (SMD):")
print(smd_values)

Standardized Mean Differences (SMD):
[ 0.09740257 -0.05703785  0.00375209 -0.2861346   0.33890762 -0.15714109
  0.18962049 -0.0734889  -0.11348257  0.09824413  0.06337119  0.0752497
  0.07866989  0.09582446  0.16119766 -0.05315704 -0.13047893  0.11468225
  0.1455305   0.13890832  0.07855953  0.09294581 -0.09203957  0.05327385
  0.11897655  0.19542845 -0.06425797  0.06975095  0.10708625 -0.01069329
  0.26140846  0.2713531  -0.38254248 -0.02070274 -0.07629457 -0.0226028
  0.03844784  0.01549377 -0.02792332 -0.00804527  0.01678751 -0.07548997
  0.1514819   0.06192979 -0.10066815 -0.09796073  0.20619283  0.07976528
  0.06707955 -0.02203439  0.02908488 -0.07383468  0.04749452  0.01066028
  0.00645857  0.06366897 -0.11920983 -0.07374604 -0.03772157 -0.07910712
 -0.14829466 -0.07260076 -0.12621894  0.02692657  0.14937885  0.03353344
 -0.09532402  0.12659189  0.0433435 ]


Balanced

#### Assuming Homogeneous treatment effct 

In [147]:
def calculate_dr_homogeneous_gb(X, D, Y, k_folds=5):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # K-Fold Cross-Fitting
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
    W_hat, V_hat = np.zeros(len(Y)), np.zeros(len(Y))
    mse_fold = []

    for train_index, test_index in kf.split(X_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        D_train, D_test = D[train_index], D[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]

        # Step 1: Estimate propensity scores using the provided function
        pi_hat = train_gb_pscore(X_train, D_train, X_test)  # Function from the image

        # Trim propensity scores (0.01, 0.99)
        trimmed_indices = (pi_hat > 0.01) & (pi_hat < 0.99)
        trimmed_X, trimmed_D, trimmed_Y = X_test[trimmed_indices], D_test[trimmed_indices], Y_test[trimmed_indices]
        trimmed_pscore = pi_hat[trimmed_indices]

        # Step 2: Estimate E[Y|X] using the provided function
        gamma_hat = train_gb_outcome(X_train, Y_train, trimmed_X)  # Function from the image

        # Step 3: Calculate residuals W_hat and V_hat
        W_hat[test_index[trimmed_indices]] = trimmed_Y - gamma_hat
        V_hat[test_index[trimmed_indices]] = trimmed_D - trimmed_pscore

        # Step 4: Calculate MSE for this fold
        mse_fold.append(mean_squared_error(trimmed_Y, gamma_hat))
    
    # Step 5: Regress W_hat on V_hat to estimate theta_0
    regression = LinearRegression()
    regression.fit(V_hat.reshape(-1, 1), W_hat)
    theta_hat = regression.coef_[0]

    return theta_hat, np.mean(mse_fold)

In [148]:
def run_dr_homogeneous_gb(dataset, covariates, treatment, outcomes):
    theta_hat_dr_values = []
    mse_values = []

    for outcome in outcomes:
        print(f"Processing outcome: {outcome}")
        X = dataset[covariates].values
        D = dataset[treatment].values
        Y = dataset[outcome].values

        # Call the homogeneous treatment effect function
        dr_ate, mse = calculate_dr_homogeneous_gb(X, D, Y)
        theta_hat_dr_values.append(dr_ate)
        mse_values.append(mse)

    # Create Results DataFrame
    results_df = pd.DataFrame({
        "Outcome": outcomes,
        "Estimate": theta_hat_dr_values,
        "MSE": mse_values
    })
    return results_df

In [149]:
gb_result_df_homo = run_dr_homogeneous_gb(
    dataset=double_ml_dataset, 
    covariates=all_covariates, 
    treatment=treatment, 
    outcomes=outcomes
)

Processing outcome: pcorrupt
Processing outcome: ncorrupt_os
Processing outcome: log_valor_corrupt


In [150]:
print("Doubly Robust Treatment Effect Estimates with Cross-Fitting (Homogeneous, Gradient Boosting):")
print(gb_result_df_homo)

Doubly Robust Treatment Effect Estimates with Cross-Fitting (Homogeneous, Gradient Boosting):
             Outcome  Estimate        MSE
0           pcorrupt -0.019908   0.012329
1        ncorrupt_os -0.006349   0.002226
2  log_valor_corrupt -0.823704  28.820632


## Combining Estimates:

### Lasso: (hetero)

In [151]:
# Only mayor_covariates:(hetero)
ls_result_df_1_he = estimate_dr_for_outcomes_lasso(double_ml_dataset, mayor_covariates, treatment, outcomes)
ls_result_df_1_he

Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.020254,0.016171
1,ncorrupt_os,-0.008128,0.003728
2,log_valor_corrupt,-0.992246,39.054955


In [152]:
# Only mayor_covariates:(homo)
ls_result_df_1_ho = estimate_dr_homogeneous_lasso(double_ml_dataset, mayor_covariates, treatment, outcomes)
ls_result_df_1_ho

Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.01792,0.010611
1,ncorrupt_os,-0.009267,0.002861
2,log_valor_corrupt,-0.913043,28.48884


In [153]:
# mayor_covariates + municipal_covariates
ma_mu=(mayor_covariates +municipal_covariates)
ls_result_df_2_he = estimate_dr_for_outcomes_lasso(double_ml_dataset, ma_mu, treatment, outcomes)
ls_result_df_2_he

Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.020004,0.016512
1,ncorrupt_os,-0.00675,0.003707
2,log_valor_corrupt,-0.905899,38.035667


In [154]:
ls_result_df_2_ho = estimate_dr_homogeneous_lasso(double_ml_dataset, ma_mu, treatment, outcomes)
ls_result_df_2_ho

Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.017183,0.010455
1,ncorrupt_os,-0.006682,0.002634
2,log_valor_corrupt,-0.769087,26.503841


In [155]:
# mayor_covariants + municipal_covariantes + political_judicial_covariants
ma_mu_pl=(
    mayor_covariates +
    municipal_covariates +
    political_judicial_covariates)

ls_result_df_3_he = estimate_dr_for_outcomes_lasso(double_ml_dataset, ma_mu_pl, treatment, outcomes)
ls_result_df_3_he


Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.021204,0.016981
1,ncorrupt_os,-0.007074,0.003574
2,log_valor_corrupt,-1.012787,37.229637


In [156]:
ls_result_df_3_ho = estimate_dr_homogeneous_lasso(double_ml_dataset, ma_mu_pl, treatment, outcomes)
ls_result_df_3_ho

Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.021765,0.010458
1,ncorrupt_os,-0.007514,0.002631
2,log_valor_corrupt,-1.001436,26.460489


In [157]:
# All Covariants (plus two dummies)
ls_result_df_4_he = estimate_dr_for_outcomes_lasso(double_ml_dataset, all_covariates, treatment, outcomes)
ls_result_df_4_he

Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.021965,0.017303
1,ncorrupt_os,-0.00906,0.003216
2,log_valor_corrupt,-1.067526,36.28524


In [158]:
# All Covariants (plus two dummies)
ls_result_df_4_ho = estimate_dr_homogeneous_lasso(double_ml_dataset, all_covariates, treatment, outcomes)
ls_result_df_4_ho

Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.023743,0.01031
1,ncorrupt_os,-0.008505,0.002078
2,log_valor_corrupt,-1.190014,24.754757


### Random Forest: 

In [159]:
# Only mayor_covariates:
rf_result_df_1_he = run_dr_estimation_rf(double_ml_dataset, mayor_covariates, treatment, outcomes)
rf_result_df_1_he

Processing outcome: pcorrupt
Processing outcome: ncorrupt_os
Processing outcome: log_valor_corrupt


Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.024551,0.016579
1,ncorrupt_os,-0.009267,0.003814
2,log_valor_corrupt,-1.013124,39.744113


In [160]:
# Only mayor_covariates:
rf_result_df_1_ho = run_dr_estimation_rf_homo(double_ml_dataset, mayor_covariates, treatment, outcomes)
rf_result_df_1_ho

Processing outcome: pcorrupt
Processing outcome: ncorrupt_os
Processing outcome: log_valor_corrupt


Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.022994,0.011305
1,ncorrupt_os,-0.009108,0.002943
2,log_valor_corrupt,-0.974373,29.498879


In [161]:
# mayor_covariates + municipal_covariates
rf_result_df_2_he = run_dr_estimation_rf(double_ml_dataset, ma_mu, treatment, outcomes)
rf_result_df_2_he

Processing outcome: pcorrupt
Processing outcome: ncorrupt_os
Processing outcome: log_valor_corrupt


Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.018236,0.016606
1,ncorrupt_os,-0.003394,0.003918
2,log_valor_corrupt,-0.816997,36.712125


In [162]:
rf_result_df_2_ho = run_dr_estimation_rf_homo(double_ml_dataset, ma_mu, treatment, outcomes)
rf_result_df_2_ho

Processing outcome: pcorrupt
Processing outcome: ncorrupt_os
Processing outcome: log_valor_corrupt


Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.019338,0.011028
1,ncorrupt_os,-0.004336,0.002763
2,log_valor_corrupt,-0.773453,27.397548


In [163]:
# mayor_covariants + municipal_covariantes + political_judicial_covariants
rf_result_df_3_he = run_dr_estimation_rf(double_ml_dataset, ma_mu_pl, treatment, outcomes)
rf_result_df_3_he

Processing outcome: pcorrupt
Processing outcome: ncorrupt_os
Processing outcome: log_valor_corrupt


Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.020898,0.019889
1,ncorrupt_os,-0.004831,0.003618
2,log_valor_corrupt,-0.848086,39.230805


In [164]:
rf_result_df_3_ho = run_dr_estimation_rf_homo(double_ml_dataset, ma_mu_pl, treatment, outcomes)
rf_result_df_3_ho

Processing outcome: pcorrupt
Processing outcome: ncorrupt_os
Processing outcome: log_valor_corrupt


Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.019102,0.011039
1,ncorrupt_os,-0.004482,0.002717
2,log_valor_corrupt,-0.872685,27.287176


In [165]:
# All Covariants (plus two dummies)
rf_result_df_4_he = run_dr_estimation_rf(double_ml_dataset, all_covariates, treatment, outcomes)
rf_result_df_4_he

Processing outcome: pcorrupt
Processing outcome: ncorrupt_os
Processing outcome: log_valor_corrupt


Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.027213,0.019589
1,ncorrupt_os,-0.00759,0.002966
2,log_valor_corrupt,-0.975908,33.768337


In [166]:
rf_result_df_4_ho = run_dr_estimation_rf_homo(double_ml_dataset, all_covariates, treatment, outcomes)
rf_result_df_4_ho

Processing outcome: pcorrupt
Processing outcome: ncorrupt_os
Processing outcome: log_valor_corrupt


Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.023355,0.010866
1,ncorrupt_os,-0.007674,0.002254
2,log_valor_corrupt,-0.938316,25.969169


### MLP: 

In [167]:
# Only mayor_covariates:
mlp_result_df_1_he = run_double_ml_mlp(double_ml_dataset, mayor_covariates, outcomes)
mlp_result_df_1_he

Processing outcome: pcorrupt
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━

Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.070568,0.489341
1,ncorrupt_os,0.025076,0.413284
2,log_valor_corrupt,-1.023083,105.423218


In [168]:
mlp_result_df_1_ho = run_double_ml_mlp_homogeneous(double_ml_dataset, mayor_covariates, treatment, outcomes)
mlp_result_df_1_ho

Processing outcome: pcorrupt
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
Processing outcome: ncorrupt_os
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1

Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.031959,0.017039
1,ncorrupt_os,-0.012347,0.009806
2,log_valor_corrupt,-1.001793,33.15033


In [169]:
# mayor_covariates + municipal_covariates
mlp_result_df_2_he = run_double_ml_mlp(double_ml_dataset, ma_mu, outcomes)
mlp_result_df_2_he

Processing outcome: pcorrupt
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━

Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.012248,0.190388
1,ncorrupt_os,-0.010301,0.04248
2,log_valor_corrupt,0.489262,908.862427


In [170]:
mlp_result_df_2_ho = run_double_ml_mlp_homogeneous(double_ml_dataset, ma_mu, treatment,outcomes)
mlp_result_df_2_ho

Processing outcome: pcorrupt
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
Processing outcome: ncorrupt_os
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1

Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.009896,0.025246
1,ncorrupt_os,-0.018507,0.01431
2,log_valor_corrupt,-0.66836,28.804699


In [171]:
# mayor_covariants + municipal_covariantes + political_judicial_covariants
mlp_result_df_3_he = run_double_ml_mlp(double_ml_dataset, ma_mu_pl, outcomes)
mlp_result_df_3_he

Processing outcome: pcorrupt
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━

Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.007341,0.043469
1,ncorrupt_os,0.001751,0.018468
2,log_valor_corrupt,-0.777292,47.814293


In [172]:
mlp_result_df_3_ho = run_double_ml_mlp_homogeneous(double_ml_dataset, ma_mu_pl, treatment,outcomes)
mlp_result_df_3_ho

Processing outcome: pcorrupt
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
Processing outcome: ncorrupt_os
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1

Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.033487,0.019224
1,ncorrupt_os,-0.007227,0.01191
2,log_valor_corrupt,-0.514015,29.386236


In [173]:
# All Covariants (plus two dummies)
mlp_result_df_4_he = run_double_ml_mlp(double_ml_dataset, all_covariates,outcomes)
mlp_result_df_4_he

Processing outcome: pcorrupt
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━

Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.017534,0.082121
1,ncorrupt_os,-0.024469,0.055118
2,log_valor_corrupt,-0.758514,63.923859


In [174]:
mlp_result_df_4_ho = run_double_ml_mlp_homogeneous(double_ml_dataset, all_covariates,treatment, outcomes)
mlp_result_df_4_ho

Processing outcome: pcorrupt
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
Processing outcome: ncorrupt_os
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1

Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.014225,0.034814
1,ncorrupt_os,-0.025236,0.022335
2,log_valor_corrupt,-1.306737,29.128323


### Gradiant Boosting: (hetero)

In [175]:
# Only mayor_covariants
gb_result_df_1_he = run_gradient_boosting_dml(double_ml_dataset, mayor_covariates, treatment, outcomes)
gb_result_df_1_he

Processing outcome: pcorrupt
Processing outcome: ncorrupt_os
Processing outcome: log_valor_corrupt


Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.020351,0.066814
1,ncorrupt_os,-0.006718,0.01455
2,log_valor_corrupt,-0.193866,357.192553


In [176]:
gb_result_df_1_ho = run_dr_homogeneous_gb(double_ml_dataset, mayor_covariates, treatment, outcomes)
gb_result_df_1_ho

Processing outcome: pcorrupt
Processing outcome: ncorrupt_os
Processing outcome: log_valor_corrupt


Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.018237,0.012909
1,ncorrupt_os,-0.009557,0.003252
2,log_valor_corrupt,-1.047505,33.206817


In [177]:
# mayor_covariates + municipal_covariates
gb_result_df_2_he = run_gradient_boosting_dml(double_ml_dataset, ma_mu, treatment, outcomes)
gb_result_df_2_he

Processing outcome: pcorrupt
Processing outcome: ncorrupt_os
Processing outcome: log_valor_corrupt


Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.019443,0.058081
1,ncorrupt_os,-0.00997,0.031864
2,log_valor_corrupt,-1.009129,190.921643


In [178]:
gb_result_df_2_ho = run_dr_homogeneous_gb(double_ml_dataset, ma_mu, treatment, outcomes)
gb_result_df_2_ho

Processing outcome: pcorrupt
Processing outcome: ncorrupt_os
Processing outcome: log_valor_corrupt


Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.020428,0.012555
1,ncorrupt_os,-0.006281,0.003009
2,log_valor_corrupt,-0.804077,28.99438


In [179]:
# mayor_covariants + municipal_covariantes + political_judicial_covariants
gb_result_df_3_he = run_gradient_boosting_dml(double_ml_dataset, ma_mu_pl, treatment, outcomes)
gb_result_df_3_he

Processing outcome: pcorrupt
Processing outcome: ncorrupt_os
Processing outcome: log_valor_corrupt


Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.015212,0.137367
1,ncorrupt_os,-0.015407,0.054797
2,log_valor_corrupt,-0.856813,506.335021


In [180]:
gb_result_df_3_ho = run_dr_homogeneous_gb(double_ml_dataset, ma_mu_pl, treatment, outcomes)
gb_result_df_3_ho

Processing outcome: pcorrupt
Processing outcome: ncorrupt_os
Processing outcome: log_valor_corrupt


Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.019965,0.012114
1,ncorrupt_os,-0.004934,0.002953
2,log_valor_corrupt,-0.519726,28.185947


In [181]:
# All Covariants (plus two dummies)
gb_result_df_4_he = run_gradient_boosting_dml(double_ml_dataset, all_covariates,treatment, outcomes)
gb_result_df_4_he

Processing outcome: pcorrupt
Processing outcome: ncorrupt_os
Processing outcome: log_valor_corrupt


Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.033108,0.115371
1,ncorrupt_os,-0.01075,0.014521
2,log_valor_corrupt,-0.372401,231.858423


In [182]:
gb_result_df_4_ho = run_dr_homogeneous_gb(double_ml_dataset, all_covariates,treatment, outcomes)
gb_result_df_4_ho

Processing outcome: pcorrupt
Processing outcome: ncorrupt_os
Processing outcome: log_valor_corrupt


Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.019908,0.012329
1,ncorrupt_os,-0.006349,0.002226
2,log_valor_corrupt,-0.823704,28.820632


## Combining datasets (hetero)

In [183]:
def extract_estimate_mse(result_df, outcome_name):
    row = result_df[result_df['Outcome'] == outcome_name]
    estimate = row['Estimate'].values[0]
    mse = row['MSE'].values[0]
    return f"{estimate:.6f} ({mse:.6f})"
# Create a list of models and their corresponding result dataframes
models = ["Lasso", "Random Forest", "Multiple-layer Perceptron", "Gradient Boosting"]
result_dfs_1 = [ls_result_df_1_he, rf_result_df_1_he, mlp_result_df_1_he, gb_result_df_1_he]
result_dfs_2 = [ls_result_df_2_he, rf_result_df_2_he, mlp_result_df_2_he, gb_result_df_2_he]
result_dfs_3 = [ls_result_df_3_he, rf_result_df_3_he, mlp_result_df_3_he, gb_result_df_3_he]
result_dfs_4 = [ls_result_df_4_he, rf_result_df_4_he, mlp_result_df_4_he, gb_result_df_4_he]



In [184]:

outcome_name = "pcorrupt"

# Combine results into a single DataFrame
combined_results_pc = pd.DataFrame({
    "ML Method": models,
    "Covariates_1": [extract_estimate_mse(df, outcome_name) for df in result_dfs_1],
    "Covariates_2": [extract_estimate_mse(df, outcome_name) for df in result_dfs_2],
    "Covariates_3": [extract_estimate_mse(df, outcome_name) for df in result_dfs_3],
    "Covariates_4": [extract_estimate_mse(df, outcome_name) for df in result_dfs_4]
})

# Display the combined results
print(combined_results_pc)

                   ML Method          Covariates_1          Covariates_2  \
0                      Lasso  -0.020254 (0.016171)  -0.020004 (0.016512)   
1              Random Forest  -0.024551 (0.016579)  -0.018236 (0.016606)   
2  Multiple-layer Perceptron  -0.070568 (0.489341)  -0.012248 (0.190388)   
3          Gradient Boosting  -0.020351 (0.066814)  -0.019443 (0.058081)   

           Covariates_3          Covariates_4  
0  -0.021204 (0.016981)  -0.021965 (0.017303)  
1  -0.020898 (0.019889)  -0.027213 (0.019589)  
2  -0.007341 (0.043469)  -0.017534 (0.082121)  
3  -0.015212 (0.137367)  -0.033108 (0.115371)  


In [185]:
outcome_name = "ncorrupt_os"

# Combine results into a single DataFrame
combined_results_nc = pd.DataFrame({
    "ML Method": models,
    "Covariates_1": [extract_estimate_mse(df, outcome_name) for df in result_dfs_1],
    "Covariates_2": [extract_estimate_mse(df, outcome_name) for df in result_dfs_2],
    "Covariates_3": [extract_estimate_mse(df, outcome_name) for df in result_dfs_3],
    "Covariates_4": [extract_estimate_mse(df, outcome_name) for df in result_dfs_4]
})

# Display the combined results
print(combined_results_nc)

                   ML Method          Covariates_1          Covariates_2  \
0                      Lasso  -0.008128 (0.003728)  -0.006750 (0.003707)   
1              Random Forest  -0.009267 (0.003814)  -0.003394 (0.003918)   
2  Multiple-layer Perceptron   0.025076 (0.413284)  -0.010301 (0.042480)   
3          Gradient Boosting  -0.006718 (0.014550)  -0.009970 (0.031864)   

           Covariates_3          Covariates_4  
0  -0.007074 (0.003574)  -0.009060 (0.003216)  
1  -0.004831 (0.003618)  -0.007590 (0.002966)  
2   0.001751 (0.018468)  -0.024469 (0.055118)  
3  -0.015407 (0.054797)  -0.010750 (0.014521)  


In [186]:
outcome_name = "log_valor_corrupt"

# Combine results into a single DataFrame
combined_results_lv = pd.DataFrame({
    "ML Method": models,
    "Covariates_1": [extract_estimate_mse(df, outcome_name) for df in result_dfs_1],
    "Covariates_2": [extract_estimate_mse(df, outcome_name) for df in result_dfs_2],
    "Covariates_3": [extract_estimate_mse(df, outcome_name) for df in result_dfs_3],
    "Covariates_4": [extract_estimate_mse(df, outcome_name) for df in result_dfs_4]
})

# Display the combined results
print(combined_results_lv)

                   ML Method            Covariates_1            Covariates_2  \
0                      Lasso   -0.992246 (39.054955)   -0.905899 (38.035667)   
1              Random Forest   -1.013124 (39.744113)   -0.816997 (36.712125)   
2  Multiple-layer Perceptron  -1.023083 (105.423218)   0.489262 (908.862427)   
3          Gradient Boosting  -0.193866 (357.192553)  -1.009129 (190.921643)   

             Covariates_3            Covariates_4  
0   -1.012787 (37.229637)   -1.067526 (36.285240)  
1   -0.848086 (39.230805)   -0.975908 (33.768337)  
2   -0.777292 (47.814293)   -0.758514 (63.923859)  
3  -0.856813 (506.335021)  -0.372401 (231.858423)  


## Combining datasets (homo)

In [187]:
result_dfs_1_ho = [ls_result_df_1_ho, rf_result_df_1_ho, mlp_result_df_1_ho, gb_result_df_1_ho]
result_dfs_2_ho = [ls_result_df_2_ho, rf_result_df_2_ho, mlp_result_df_2_ho, gb_result_df_2_ho]
result_dfs_3_ho = [ls_result_df_3_ho, rf_result_df_3_ho, mlp_result_df_3_ho, gb_result_df_3_ho]
result_dfs_4_ho = [ls_result_df_4_ho, rf_result_df_4_ho, mlp_result_df_4_ho, gb_result_df_4_ho]



In [188]:
outcome_name = "pcorrupt"

# Combine results into a single DataFrame
combined_results_pc_ho = pd.DataFrame({
    "ML Method": models,
    "Covariates_1": [extract_estimate_mse(df, outcome_name) for df in result_dfs_1_ho],
    "Covariates_2": [extract_estimate_mse(df, outcome_name) for df in result_dfs_2_ho],
    "Covariates_3": [extract_estimate_mse(df, outcome_name) for df in result_dfs_3_ho],
    "Covariates_4": [extract_estimate_mse(df, outcome_name) for df in result_dfs_4_ho]
})

# Display the combined results
print(combined_results_pc_ho)

                   ML Method          Covariates_1          Covariates_2  \
0                      Lasso  -0.017920 (0.010611)  -0.017183 (0.010455)   
1              Random Forest  -0.022994 (0.011305)  -0.019338 (0.011028)   
2  Multiple-layer Perceptron  -0.031959 (0.017039)  -0.009896 (0.025246)   
3          Gradient Boosting  -0.018237 (0.012909)  -0.020428 (0.012555)   

           Covariates_3          Covariates_4  
0  -0.021765 (0.010458)  -0.023743 (0.010310)  
1  -0.019102 (0.011039)  -0.023355 (0.010866)  
2  -0.033487 (0.019224)  -0.014225 (0.034814)  
3  -0.019965 (0.012114)  -0.019908 (0.012329)  


In [189]:
outcome_name = "ncorrupt_os"

# Combine results into a single DataFrame
combined_results_nc_ho = pd.DataFrame({
    "ML Method": models,
    "Covariates_1": [extract_estimate_mse(df, outcome_name) for df in result_dfs_1_ho],
    "Covariates_2": [extract_estimate_mse(df, outcome_name) for df in result_dfs_2_ho],
    "Covariates_3": [extract_estimate_mse(df, outcome_name) for df in result_dfs_3_ho],
    "Covariates_4": [extract_estimate_mse(df, outcome_name) for df in result_dfs_4_ho]
})

# Display the combined results
print(combined_results_nc_ho)

                   ML Method          Covariates_1          Covariates_2  \
0                      Lasso  -0.009267 (0.002861)  -0.006682 (0.002634)   
1              Random Forest  -0.009108 (0.002943)  -0.004336 (0.002763)   
2  Multiple-layer Perceptron  -0.012347 (0.009806)  -0.018507 (0.014310)   
3          Gradient Boosting  -0.009557 (0.003252)  -0.006281 (0.003009)   

           Covariates_3          Covariates_4  
0  -0.007514 (0.002631)  -0.008505 (0.002078)  
1  -0.004482 (0.002717)  -0.007674 (0.002254)  
2  -0.007227 (0.011910)  -0.025236 (0.022335)  
3  -0.004934 (0.002953)  -0.006349 (0.002226)  


In [190]:
outcome_name = "log_valor_corrupt"

# Combine results into a single DataFrame
combined_results_lv_ho = pd.DataFrame({
    "ML Method": models,
    "Covariates_1": [extract_estimate_mse(df, outcome_name) for df in result_dfs_1_ho],
    "Covariates_2": [extract_estimate_mse(df, outcome_name) for df in result_dfs_2_ho],
    "Covariates_3": [extract_estimate_mse(df, outcome_name) for df in result_dfs_3_ho],
    "Covariates_4": [extract_estimate_mse(df, outcome_name) for df in result_dfs_4_ho]
})

# Display the combined results
print(combined_results_lv_ho)

                   ML Method           Covariates_1           Covariates_2  \
0                      Lasso  -0.913043 (28.488840)  -0.769087 (26.503841)   
1              Random Forest  -0.974373 (29.498879)  -0.773453 (27.397548)   
2  Multiple-layer Perceptron  -1.001793 (33.150330)  -0.668360 (28.804699)   
3          Gradient Boosting  -1.047505 (33.206817)  -0.804077 (28.994380)   

            Covariates_3           Covariates_4  
0  -1.001436 (26.460489)  -1.190014 (24.754757)  
1  -0.872685 (27.287176)  -0.938316 (25.969169)  
2  -0.514015 (29.386236)  -1.306737 (29.128323)  
3  -0.519726 (28.185947)  -0.823704 (28.820632)  
