# __Econometric Game__: Effect of electoral accountability on corruption?
   
### Team 3: Katheryn Ding, Amber Wei, Max Ye



## Set up 

In [38]:
import numpy as np
import pandas as pd
import tensorflow as tf
from scipy.special import expit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")


In [39]:
corruption_df = pd.read_stata("corruptiondata.dta")
corruption_df.columns

Index(['uf', 'nsorteio', 'totrecursos', 'tot_os', 'pop', 'purb',
       'p_secundario', 'cod_ibge6', 'pib_capita_02', 'op_01_04',
       ...
       'uf_d18', 'uf_d19', 'uf_d20', 'uf_d21', 'uf_d22', 'uf_d23', 'uf_d24',
       'uf_d25', 'uf_d26', 'esample2'],
      dtype='object', length=116)

In [40]:
# Define covariates for each category

# Mayor characteristics
mayor_covariates = [
    "pref_idade_tse",  # Age
    "pref_masc",       # Gender
    "pref_escola",     # Schooling
    "winmargin2000",   # Margin of victory in 2000
    "exp_prefeito"     # Was previously a mayor in a consecutive term
] + [col for col in corruption_df.columns if col.startswith("party_d")]

# Municipal characteristics
municipal_covariates = [
    "lpop",           # Log of population in 2000
    "purb",           # Percentage of population in urban sectors
    "p_secundario",   # Percentage with at least secondary education
    "mun_novo",       # New municipality indicator
    "lpib02",         # Log of GDP per capita in 2002
    "gini_ipea"       # Gini coefficient
]

# Political and Judicial characteristics
political_judicial_covariates = [
    "ENEP2000",  # Effective number of parties in 2000 mayor elections
    "ENLP2000",  # Effective number of parties in 2000 legislative elections
    "p_cad_pref" # Proportion of legislators from the same party as the mayor
]

# Dummies
dummy_covariates = [
    col for col in corruption_df.columns if col.startswith("uf_d") or col.startswith("sorteio")
]


print("Mayor Covariates:", mayor_covariates)
print("Municipal Covariates:", municipal_covariates)
print("Political and Judicial Covariates:", political_judicial_covariates)
print("Dummy Covariates:", dummy_covariates)

Mayor Covariates: ['pref_idade_tse', 'pref_masc', 'pref_escola', 'winmargin2000', 'exp_prefeito', 'party_d1', 'party_d3', 'party_d4', 'party_d5', 'party_d6', 'party_d7', 'party_d8', 'party_d9', 'party_d10', 'party_d11', 'party_d12', 'party_d13', 'party_d14', 'party_d15', 'party_d16', 'party_d17', 'party_d18']
Municipal Covariates: ['lpop', 'purb', 'p_secundario', 'mun_novo', 'lpib02', 'gini_ipea']
Political and Judicial Covariates: ['ENEP2000', 'ENLP2000', 'p_cad_pref']
Dummy Covariates: ['sorteio1', 'sorteio2', 'sorteio3', 'sorteio4', 'sorteio5', 'sorteio6', 'sorteio7', 'sorteio8', 'sorteio9', 'sorteio10', 'uf_d1', 'uf_d2', 'uf_d3', 'uf_d4', 'uf_d5', 'uf_d6', 'uf_d7', 'uf_d8', 'uf_d9', 'uf_d10', 'uf_d11', 'uf_d12', 'uf_d13', 'uf_d14', 'uf_d15', 'uf_d16', 'uf_d17', 'uf_d18', 'uf_d19', 'uf_d20', 'uf_d21', 'uf_d22', 'uf_d23', 'uf_d24', 'uf_d25', 'uf_d26']


In [41]:
corruption_df["log_valor_corrupt"] = np.log(corruption_df["valor_corrupt"] + 1)

# Define the treatment and outcome variables
treatment = "first"  
outcomes = ["pcorrupt", "ncorrupt_os", "valor_corrupt",'log_valor_corrupt'] 

# Use all covariates
all_covariates = (
    mayor_covariates +
    municipal_covariates +
    political_judicial_covariates +
    dummy_covariates
)
#set up dataset:
required_columns = [treatment] + outcomes + all_covariates
double_ml_dataset = corruption_df[required_columns].dropna()

print("Dataset Columns:", double_ml_dataset.columns)
print("Number of rows in the Dataset:", double_ml_dataset.shape[0])

Dataset Columns: Index(['first', 'pcorrupt', 'ncorrupt_os', 'valor_corrupt',
       'log_valor_corrupt', 'pref_idade_tse', 'pref_masc', 'pref_escola',
       'winmargin2000', 'exp_prefeito', 'party_d1', 'party_d3', 'party_d4',
       'party_d5', 'party_d6', 'party_d7', 'party_d8', 'party_d9', 'party_d10',
       'party_d11', 'party_d12', 'party_d13', 'party_d14', 'party_d15',
       'party_d16', 'party_d17', 'party_d18', 'lpop', 'purb', 'p_secundario',
       'mun_novo', 'lpib02', 'gini_ipea', 'ENEP2000', 'ENLP2000', 'p_cad_pref',
       'sorteio1', 'sorteio2', 'sorteio3', 'sorteio4', 'sorteio5', 'sorteio6',
       'sorteio7', 'sorteio8', 'sorteio9', 'sorteio10', 'uf_d1', 'uf_d2',
       'uf_d3', 'uf_d4', 'uf_d5', 'uf_d6', 'uf_d7', 'uf_d8', 'uf_d9', 'uf_d10',
       'uf_d11', 'uf_d12', 'uf_d13', 'uf_d14', 'uf_d15', 'uf_d16', 'uf_d17',
       'uf_d18', 'uf_d19', 'uf_d20', 'uf_d21', 'uf_d22', 'uf_d23', 'uf_d24',
       'uf_d25', 'uf_d26'],
      dtype='object')
Number of rows in the Data

## Lasso 

In [42]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

#### Assuming Heterogeneosity 

In [43]:

theta_hat_dr_values = []
mse_values = []

# Loop through each outcome
for outcome in outcomes:
    
    X = double_ml_dataset[all_covariates]
    D = double_ml_dataset[treatment]
    Y = double_ml_dataset[outcome]
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Prepare 5-Fold Cross-Fitting
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    dr_ate_fold = []
    mse_fold = []
    
    for train_index, test_index in kf.split(X_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        D_train, D_test = D.iloc[train_index], D.iloc[test_index]
        Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
        
        # Estimate propensity scores using Lasso
        lasso_pscore = LassoCV(alphas=np.logspace(-4, 0, 50), cv=5, random_state=42).fit(X_train, D_train)
        D_pred = lasso_pscore.predict(X_test)
        propensity_scores = expit(D_pred) 
        
        # Trim p-scores (0.01, 0.99)
        trimmed_indices = (propensity_scores > 0.01) & (propensity_scores < 0.99)
        trimmed_X = X_test[trimmed_indices]
        trimmed_D = D_test.iloc[trimmed_indices]
        trimmed_Y = Y_test.iloc[trimmed_indices]
        trimmed_pscore = propensity_scores[trimmed_indices]
        
        # Fit outcome models for treated (D=1) and untreated (D=0) groups
        # Treated model
        treated_model = LassoCV(alphas=np.logspace(-4, 0, 50), cv=5, random_state=42)
        treated_model.fit(X_train[D_train == 1], Y_train[D_train == 1])
        gamma1 = treated_model.predict(trimmed_X)
        
        # Untreated model
        untreated_model = LassoCV(alphas=np.logspace(-4, 0, 50), cv=5, random_state=42)
        untreated_model.fit(X_train[D_train == 0], Y_train[D_train == 0])
        gamma0 = untreated_model.predict(trimmed_X)
        
        # Step 3: Calculate doubly robust estimates for potential outcomes
        trimmed_data = pd.DataFrame({
            "gamma1": gamma1,
            "gamma0": gamma0,
            "D": trimmed_D.values,
            "Y": trimmed_Y.values,
            "pscore": trimmed_pscore,
        })
        
        # DR Estimate for Y(1)
        trimmed_data['Y1_dr'] = (
            trimmed_data['gamma1'] +
            (trimmed_data['D'] / trimmed_data['pscore']) * (trimmed_data['Y'] - trimmed_data['gamma1'])
        )
        
        # DR Estimate for Y(0)
        trimmed_data['Y0_dr'] = (
            trimmed_data['gamma0'] +
            ((1 - trimmed_data['D']) / (1 - trimmed_data['pscore'])) * (trimmed_data['Y'] - trimmed_data['gamma0'])
        )
        
        # Calculate treatment effect for the fold
        dr_ate_fold.append(np.mean(trimmed_data['Y1_dr'] - trimmed_data['Y0_dr']))
        
        # Calculate MSE for the fold
        mse_fold.append(mean_squared_error(
        trimmed_data['Y'], 
        trimmed_data['Y1_dr'] * trimmed_data['D'] + trimmed_data['Y0_dr'] * (1 - trimmed_data['D'])
        ))
    
    # Average treatment effect and MSE across folds
    dr_ate = np.mean(dr_ate_fold)
    mse = np.mean(mse_fold)
    
    # Store results
    theta_hat_dr_values.append(dr_ate)
    mse_values.append(mse)

# Create results DataFrame
ls_result_df_hetero = pd.DataFrame({
    "Outcome": outcomes,
    "Estimate": theta_hat_dr_values,
    "MSE": mse_values
})

# Print the final results DataFrame
print("Doubly Robust Treatment Effect Estimates with Cross-Fitting:")
print(ls_result_df_hetero)

Doubly Robust Treatment Effect Estimates with Cross-Fitting:
             Outcome       Estimate           MSE
0           pcorrupt      -0.022035  1.736679e-02
1        ncorrupt_os      -0.008515  3.261818e-03
2      valor_corrupt -148571.437500  1.017620e+12
3  log_valor_corrupt      -1.067475  3.614077e+01


In [44]:
# Checking for balance
weights_treated = 1 / propensity_scores[trimmed_D == 1]
weights_untreated = 1 / (1 - propensity_scores[trimmed_D == 0])

# Create DataFrames for treated and untreated groups
treated_data = trimmed_X[trimmed_D == 1]
untreated_data = trimmed_X[trimmed_D == 0]

# Compute weighted means for each covariate
treated_means = np.average(treated_data, axis=0, weights=weights_treated)
untreated_means = np.average(untreated_data, axis=0, weights=weights_untreated)

# Check absolute standardized mean differences
smd = np.abs(treated_means - untreated_means) / np.std(trimmed_X, axis=0)
print("Standardized Mean Differences (SMD):", smd)

Standardized Mean Differences (SMD): [0.06885722 0.2901101  0.02904402 0.6848417  0.21398759 0.06088972
 0.25428903 0.41079307 0.09445282 0.04607635 0.6        0.6
 0.35854152 0.24288547 0.24353866 0.03111833 0.16987991 0.2
 0.17568141 0.1651254  0.21418196 0.6        0.02996126 0.04102391
 0.18319435 0.20693757 0.4547287  0.39264885 0.3478793  0.23060311
 0.2800635  0.00521759 0.20551337 0.15774368 0.38865724 0.08364097
 0.05980109 0.2552489  0.47479638 0.35756427 0.02450931 0.18689632
 0.19421004 0.06287777 0.35430452 0.26188627 0.01593611 0.25
 0.46131605 0.3687714  0.05878664 0.0249002  0.26350877 0.01907789
 0.1791523  0.16089885 0.17051342 0.3866724  0.17795302 0.25078654
 0.23934054 0.625      0.04331093 0.07217446 0.17713386 0.03650402
 0.4       ]


Not very balanced.

#### Assuming Homogeneous treatment effct 

In [45]:
theta_hat_dr_values = []
mse_values = []

# Loop through each outcome
for outcome in outcomes:
    
    X = double_ml_dataset[all_covariates].values
    D = double_ml_dataset[treatment].values
    Y = double_ml_dataset[outcome].values

    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # K-Fold Cross-Fitting
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    W_hat, V_hat = np.zeros(len(Y)), np.zeros(len(Y))
    mse_fold = []  # List to store MSE for each fold
    
    for train_index, test_index in kf.split(X_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        D_train, D_test = D[train_index], D[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]

        # Step 1: Estimate E[Y|X] using Lasso (Outcome model)
        outcome_model = LassoCV(alphas=np.logspace(-4, 0, 50), cv=5, random_state=42)
        outcome_model.fit(X_train, Y_train)
        gamma_hat = outcome_model.predict(X_test)

        # Step 2: Estimate E[D|X] using Lasso (Propensity score model)
        pscore_model = LassoCV(alphas=np.logspace(-4, 0, 50), cv=5, random_state=42)
        pscore_model.fit(X_train, D_train)
        pi_hat = pscore_model.predict(X_test)
        
        # Step 3: Calculate residuals W_hat and V_hat
        W_hat[test_index] = Y_test - gamma_hat
        V_hat[test_index] = D_test - pi_hat

        # Step 4: Calculate MSE for this fold
        mse_fold.append(mean_squared_error(Y_test, gamma_hat))

    # Step 5: Regress W_hat on V_hat to estimate theta_0
    regression = LinearRegression()
    regression.fit(V_hat.reshape(-1, 1), W_hat)
    theta_hat = regression.coef_[0]
    
    # Store results
    theta_hat_dr_values.append(theta_hat)
    mse_values.append(np.mean(mse_fold))  # Average MSE across folds

# Step 6: Create Results DataFrame
ls_result_df_homo = pd.DataFrame({
    "Outcome": outcomes,
    "Estimate": theta_hat_dr_values,
    "MSE": mse_values
})

# Display the results DataFrame
print("Doubly Robust Treatment Effect Estimates with Cross-Fitting (Homogeneous):")
print(ls_result_df_homo)

Doubly Robust Treatment Effect Estimates with Cross-Fitting (Homogeneous):
             Outcome       Estimate           MSE
0           pcorrupt      -0.023304  1.026658e-02
1        ncorrupt_os      -0.008581  2.078044e-03
2      valor_corrupt -139784.148524  3.580824e+11
3  log_valor_corrupt      -1.205078  2.475476e+01


## Random Forest

In [46]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

#### Assuming Heterogeneous Treatment Effect

In [47]:
theta_hat_dr_values = []
mse_values = []

# Loop through each outcome
for outcome in outcomes:
    
    X = double_ml_dataset[all_covariates]
    D = double_ml_dataset[treatment]
    Y = double_ml_dataset[outcome]
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Prepare 5-Fold Cross-Fitting
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    dr_ate_fold = []
    mse_fold = []
    
    for train_index, test_index in kf.split(X_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        D_train, D_test = D.iloc[train_index], D.iloc[test_index]
        Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
        
        # Estimate propensity scores using Random Forest
        rf_pscore = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
        rf_pscore.fit(X_train, D_train)
        propensity_scores = rf_pscore.predict_proba(X_test)[:, 1]  # Probability of treatment
        
        # Trim p-scores (0.01, 0.99)
        trimmed_indices = (propensity_scores > 0.01) & (propensity_scores < 0.99)
        trimmed_X = X_test[trimmed_indices]
        trimmed_D = D_test.iloc[trimmed_indices]
        trimmed_Y = Y_test.iloc[trimmed_indices]
        trimmed_pscore = propensity_scores[trimmed_indices]
        
        # Fit outcome models for treated (D=1) and untreated (D=0) groups using Random Forest
        treated_model = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
        treated_model.fit(X_train[D_train == 1], Y_train[D_train == 1])
        gamma1 = treated_model.predict(trimmed_X)
        
        untreated_model = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
        untreated_model.fit(X_train[D_train == 0], Y_train[D_train == 0])
        gamma0 = untreated_model.predict(trimmed_X)
        
        trimmed_data = pd.DataFrame({
            "gamma1": gamma1,
            "gamma0": gamma0,
            "D": trimmed_D.values,
            "Y": trimmed_Y.values,
            "pscore": trimmed_pscore,
        })
        
        # DR Estimate for Y(1)
        trimmed_data['Y1_dr'] = (
            trimmed_data['gamma1'] +
            (trimmed_data['D'] / trimmed_data['pscore']) * (trimmed_data['Y'] - trimmed_data['gamma1'])
        )
        
        # DR Estimate for Y(0)
        trimmed_data['Y0_dr'] = (
            trimmed_data['gamma0'] +
            ((1 - trimmed_data['D']) / (1 - trimmed_data['pscore'])) * (trimmed_data['Y'] - trimmed_data['gamma0'])
        )
        
        # Calculate treatment effect and MSE for the fold
        dr_ate_fold.append(np.mean(trimmed_data['Y1_dr'] - trimmed_data['Y0_dr']))
        mse_fold.append(mean_squared_error(
            trimmed_data['Y'], 
            trimmed_data['Y1_dr'] * trimmed_data['D'] + trimmed_data['Y0_dr'] * (1 - trimmed_data['D'])
        ))
    
    # Average treatment effect and MSE across folds
    dr_ate = np.mean(dr_ate_fold)
    mse = np.mean(mse_fold)
    
    theta_hat_dr_values.append(dr_ate)
    mse_values.append(mse)

# Create results DataFrame
rf_result_df_hetero = pd.DataFrame({
    "Outcome": outcomes,
    "Estimate": theta_hat_dr_values,
    "MSE": mse_values
})

# Print the final results
print("Doubly Robust Treatment Effect Estimates with Cross-Fitting:")
print(rf_result_df_hetero)

Doubly Robust Treatment Effect Estimates with Cross-Fitting:
             Outcome       Estimate           MSE
0           pcorrupt      -0.026246  1.801923e-02
1        ncorrupt_os      -0.007434  2.945456e-03
2      valor_corrupt -144203.110003  8.236523e+11
3  log_valor_corrupt      -0.910680  3.315613e+01


In [48]:
# Checking for balance
weights_treated = 1 / propensity_scores[trimmed_D == 1]
weights_untreated = 1 / (1 - propensity_scores[trimmed_D == 0])

# Create DataFrames for treated and untreated groups
treated_data = trimmed_X[trimmed_D == 1]
untreated_data = trimmed_X[trimmed_D == 0]

# Compute weighted means for each covariate
treated_means = np.average(treated_data, axis=0, weights=weights_treated)
untreated_means = np.average(untreated_data, axis=0, weights=weights_untreated)

# Check absolute standardized mean differences
smd = np.abs(treated_means - untreated_means) / np.std(trimmed_X, axis=0)
print("Standardized Mean Differences (SMD):", smd)

Standardized Mean Differences (SMD): [6.50356092e-02 2.62891458e-01 4.46274547e-02 5.68645557e-01
 1.40846193e-01 2.85167214e-02 2.39017798e-01 4.21856346e-01
 1.70203834e-01 3.12581267e-02 1.11758709e-09 1.11758709e-09
 3.40814162e-01 2.19392841e-01 2.04254184e-01 6.68983541e-04
 1.36899855e-01 1.11758709e-09 1.48401816e-01 1.57523424e-01
 1.77873843e-01 1.11758709e-09 1.64725845e-01 1.52697392e-01
 9.21535698e-03 2.80392638e-01 3.54954060e-01 4.13530648e-01
 1.65639872e-01 5.90287328e-02 1.05886511e-01 7.97249442e-03
 9.18892102e-02 1.52543983e-01 4.31016975e-01 9.91344798e-02
 1.11597675e-01 2.86269278e-01 4.33117663e-01 4.24619704e-01
 2.35066993e-02 1.68955984e-01 1.73020941e-01 1.04110585e-01
 3.15033490e-01 2.44864719e-01 7.19494572e-02 4.65661287e-10
 4.64575176e-01 3.91367717e-01 2.11395745e-02 8.00353031e-03
 2.68994830e-01 1.66763040e-03 1.63270268e-01 1.08851813e-01
 8.79607502e-02 4.67152370e-01 2.46684250e-01 2.45766903e-01
 1.55956523e-01 2.32830644e-10 2.61510988e-02 8.

Balanced

#### Assuming Homogeneous Treatment Effect

In [49]:
theta_hat_dr_values = []
mse_values = []

# Loop through each outcome
for outcome in outcomes:
    
    X = double_ml_dataset[all_covariates].values
    D = double_ml_dataset[treatment].values
    Y = double_ml_dataset[outcome].values

    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # K-Fold Cross-Fitting
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    W_hat, V_hat = np.zeros(len(Y)), np.zeros(len(Y))
    mse_fold = []  # List to store MSE for each fold
    
    for train_index, test_index in kf.split(X_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        D_train, D_test = D[train_index], D[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]

        # Step 1: Estimate E[Y|X] using Random Forest (Outcome model)
        outcome_model = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
        outcome_model.fit(X_train, Y_train)
        gamma_hat = outcome_model.predict(X_test)

        # Step 2: Estimate E[D|X] using Random Forest (Propensity score model)
        pscore_model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
        pscore_model.fit(X_train, D_train)
        pi_hat = pscore_model.predict_proba(X_test)[:, 1]  # Probability of treatment
        
        # Step 3: Calculate residuals W_hat and V_hat
        W_hat[test_index] = Y_test - gamma_hat
        V_hat[test_index] = D_test - pi_hat

        # Step 4: Calculate MSE for this fold
        mse_fold.append(mean_squared_error(Y_test, gamma_hat))

    # Step 5: Regress W_hat on V_hat to estimate theta_0
    regression = LinearRegression()
    regression.fit(V_hat.reshape(-1, 1), W_hat)
    theta_hat = regression.coef_[0]
    
    # Store results
    theta_hat_dr_values.append(theta_hat)
    mse_values.append(np.mean(mse_fold))  # Average MSE across folds

# Step 6: Create Results DataFrame
rf_result_df_homo = pd.DataFrame({
    "Outcome": outcomes,
    "Estimate": theta_hat_dr_values,
    "MSE": mse_values
})

# Display the results DataFrame
print("Doubly Robust Treatment Effect Estimates with Cross-Fitting (Homogeneous) using Random Forest:")
print(rf_result_df_homo)

Doubly Robust Treatment Effect Estimates with Cross-Fitting (Homogeneous) using Random Forest:
             Outcome       Estimate           MSE
0           pcorrupt      -0.024575  1.088266e-02
1        ncorrupt_os      -0.007833  2.269427e-03
2      valor_corrupt -132970.875697  3.256660e+11
3  log_valor_corrupt      -0.966588  2.605650e+01


## Multiple-Layer Perceptron

In [58]:
from sklearn.neural_network import MLPRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [59]:
# Function to build and train an MLP model
def train_mlp(X_train, Y_train, input_dim, output_activation='linear', loss='mse', verbose=0):
    model = Sequential([
        Dense(128, activation='relu', input_shape=(input_dim,)),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1, activation=output_activation)
    ])
    model.compile(optimizer='adam', loss=loss, metrics=['mae'])
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    model.fit(X_train, Y_train, validation_split=0.2, epochs=100, batch_size=32,
              callbacks=[early_stopping], verbose=verbose)
    return model

In [60]:
# Function to calculate doubly robust treatment effects using MLP
def double_ml_mlp_heterogeneous(X, D, Y, outcomes):
    theta_hat_dr_values = []
    mse_values = []

    # Standardize covariates
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # 5-Fold Cross-Fitting
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    
    for outcome in outcomes:
        dr_ate_fold = []
        mse_fold = []
        
        for train_index, test_index in kf.split(X_scaled):
            X_train, X_test = X_scaled[train_index], X_scaled[test_index]
            D_train, D_test = D[train_index], D[test_index]
            Y_train, Y_test = Y[train_index], Y[test_index]
            
            # Step 1: Propensity score estimation using MLP
            model_pscore = train_mlp(X_train, D_train, input_dim=X_train.shape[1], 
                                     output_activation='sigmoid', loss='binary_crossentropy')
            propensity_scores = model_pscore.predict(X_test).flatten()

            # Trim p-scores (0.01, 0.99)
            trimmed_indices = (propensity_scores > 0.01) & (propensity_scores < 0.99)
            trimmed_X = X_test[trimmed_indices]
            trimmed_D = D_test[trimmed_indices]
            trimmed_Y = Y_test[trimmed_indices]
            trimmed_pscore = propensity_scores[trimmed_indices]

            # Step 2: Fit outcome models for treated and untreated groups
            model_treated = train_mlp(X_train[D_train == 1], Y_train[D_train == 1], input_dim=X_train.shape[1])
            gamma1 = model_treated.predict(trimmed_X).flatten()

            model_untreated = train_mlp(X_train[D_train == 0], Y_train[D_train == 0], input_dim=X_train.shape[1])
            gamma0 = model_untreated.predict(trimmed_X).flatten()

            # Step 3: Construct doubly robust estimates
            trimmed_data = pd.DataFrame({
                "gamma1": gamma1,
                "gamma0": gamma0,
                "D": trimmed_D,
                "Y": trimmed_Y,
                "pscore": trimmed_pscore
            })
            trimmed_data['Y1_dr'] = (
                trimmed_data['gamma1'] +
                (trimmed_data['D'] / trimmed_data['pscore']) * (trimmed_data['Y'] - trimmed_data['gamma1'])
            )
            trimmed_data['Y0_dr'] = (
                trimmed_data['gamma0'] +
                ((1 - trimmed_data['D']) / (1 - trimmed_data['pscore'])) * (trimmed_data['Y'] - trimmed_data['gamma0'])
            )

            # Calculate treatment effect and MSE for the fold
            dr_ate_fold.append(np.mean(trimmed_data['Y1_dr'] - trimmed_data['Y0_dr']))
            mse_fold.append(mean_squared_error(
                trimmed_data['Y'], 
                trimmed_data['Y1_dr'] * trimmed_data['D'] + trimmed_data['Y0_dr'] * (1 - trimmed_data['D'])
            ))

        # Store average estimates and MSE
        theta_hat_dr_values.append(np.mean(dr_ate_fold))
        mse_values.append(np.mean(mse_fold))

    results_df = pd.DataFrame({
        "Outcome": outcomes,
        "Estimate": theta_hat_dr_values,
        "MSE": mse_values
    })
    return results_df

In [66]:
# Function for easy call 
def run_double_ml_mlp(dataset, covariants, outcomes):
    X = dataset[covariants].values
    D = dataset[treatment].values
    results = []
    
    for outcome in outcomes:
        Y = dataset[outcome].values
        print(f"Processing outcome: {outcome}")
        result = double_ml_mlp_heterogeneous(X, D, Y, [outcome])
        results.append(result)
    
    final_results = pd.concat(results, ignore_index=True)
    return final_results

In [None]:
#mlp_result with all covariants
mlp_result_df_hetero = run_double_ml_mlp(double_ml_dataset, all_covariates, outcomes)
print("Doubly Robust Treatment Effect Estimates with Cross-Fitting (Heterogeneous) using MLP:")
mlp_result_df_hetero

Processing outcome: pcorrupt
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━

In [68]:
# Checking for balance
weights_treated = 1 / propensity_scores[trimmed_D == 1]
weights_untreated = 1 / (1 - propensity_scores[trimmed_D == 0])

# Create DataFrames for treated and untreated groups
treated_data = trimmed_X[trimmed_D == 1]
untreated_data = trimmed_X[trimmed_D == 0]

# Compute weighted means for each covariate
treated_means = np.average(treated_data, axis=0, weights=weights_treated)
untreated_means = np.average(untreated_data, axis=0, weights=weights_untreated)

# Check absolute standardized mean differences
smd = np.abs(treated_means - untreated_means) / np.std(trimmed_X, axis=0)
print("Standardized Mean Differences (SMD):", smd)

Standardized Mean Differences (SMD): [0.06431737 0.17436421 0.08196042 0.42424825 0.2740144  0.05741303
 0.26795167 0.24450615 0.15918534 0.37349612 0.6        0.6
 0.28162652 0.24226102 0.18890478 0.16642605 0.11068814 0.2
 0.11865247 0.13747789 0.21630324 0.6        0.3615755  0.13864243
 0.1238498  0.25766122 0.26216054 0.34757185 0.14859101 0.13102074
 0.12994067 0.02177119 0.07538263 0.18149064 0.41006213 0.1398001
 0.15731958 0.28548157 0.4361691  0.5003572  0.08816264 0.13607545
 0.11496648 0.4749314  0.27814373 0.24964912 0.00266549 0.25
 0.44307894 0.3509568  0.01910833 0.05646941 0.28625125 0.08249972
 0.05899588 0.20087045 0.1198876  0.38367516 0.2754754  0.27015147
 0.13837029 0.5        0.21252668 0.14661519 0.31280857 0.28590313
 0.1       ]


Balanced

### Assuming Homogeneity

In [None]:
def train_mlp(X_train, Y_train, input_dim, output_activation='linear', loss='mse', verbose=0):
    model = Sequential([
        Dense(128, activation='relu', input_shape=(input_dim,)),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1, activation=output_activation)
    ])
    model.compile(optimizer='adam', loss=loss, metrics=['mae'])
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    model.fit(X_train, Y_train, validation_split=0.2, epochs=100, batch_size=32,
              callbacks=[early_stopping], verbose=verbose)
    return model

In [70]:
# Function to perform Double ML for a single outcome
def double_ml_single_outcome_homogeneous(X, D, Y):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Prepare for K-Fold Cross-Fitting
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    W_hat, V_hat = np.zeros(len(Y)), np.zeros(len(Y))
    mse_fold = []  # List to store MSE for each fold

    for train_index, test_index in kf.split(X_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        D_train, D_test = D[train_index], D[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]

        # Step 1: Estimate E[Y|X] using MLP (Outcome model)
        outcome_model = train_mlp(X_train, Y_train, input_dim=X_train.shape[1])
        gamma_hat = outcome_model.predict(X_test).flatten()

        # Step 2: Estimate E[D|X] using MLP (Propensity score model)
        pscore_model = train_mlp(X_train, D_train, input_dim=X_train.shape[1], 
                                 output_activation='sigmoid', loss='binary_crossentropy')
        pi_hat = pscore_model.predict(X_test).flatten()

        # Step 3: Calculate residuals W_hat and V_hat
        W_hat[test_index] = Y_test - gamma_hat
        V_hat[test_index] = D_test - pi_hat

        # Step 4: Calculate MSE for this fold
        mse_fold.append(mean_squared_error(Y_test, gamma_hat))

    # Step 5: Regress W_hat on V_hat to estimate theta_0
    regression = LinearRegression()
    regression.fit(V_hat.reshape(-1, 1), W_hat)
    theta_hat = regression.coef_[0]
    
    return theta_hat, np.mean(mse_fold)

In [71]:
# Main function to loop through multiple outcomes
def run_double_ml_mlp_homogeneous(dataset, covariates, treatment, outcomes):
    theta_hat_dr_values = []
    mse_values = []

    for outcome in outcomes:
        print(f"Processing outcome: {outcome}")
        
        X = dataset[covariates].values
        D = dataset[treatment].values
        Y = dataset[outcome].values

        # Perform Double ML for the single outcome
        theta_hat, mse = double_ml_single_outcome_homogeneous(X, D, Y)
        
        theta_hat_dr_values.append(theta_hat)
        mse_values.append(mse)

    # Create results DataFrame
    results_df = pd.DataFrame({
        "Outcome": outcomes,
        "Estimate": theta_hat_dr_values,
        "MSE": mse_values
    })
    return results_df


In [72]:
# Example Usage
mlp_result_df_homo = run_double_ml_mlp_homogeneous(
    dataset=double_ml_dataset, 
    covariates=all_covariates, 
    treatment=treatment, 
    outcomes=outcomes
)
mlp_result_df_homo

Processing outcome: pcorrupt
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
Processing outcome: ncorrupt_os
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1

In [73]:
mlp_result_df_homo

Unnamed: 0,Outcome,Estimate,MSE
0,pcorrupt,-0.014981,0.03877918
1,ncorrupt_os,0.012934,0.02207904
2,valor_corrupt,-123446.626703,329932900000.0
3,log_valor_corrupt,-0.744095,30.68615


## Neural Network

## Gradient Boosting


In [52]:
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor

#### Assuming Heteogeneous treatment effct 

In [75]:
# Function to train propensity score model using Gradient Boosting
def train_gb_pscore(X_train, D_train, X_test, n_estimators=100, max_depth=3, random_state=42):
    gb_pscore = GradientBoostingClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state)
    gb_pscore.fit(X_train, D_train)
    propensity_scores = gb_pscore.predict_proba(X_test)[:, 1]
    return propensity_scores

In [76]:
# Function to train outcome model for treated/untreated groups
def train_gb_outcome(X_train, Y_train, X_test, n_estimators=100, max_depth=3, random_state=42):
    gb_model = GradientBoostingRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state)
    gb_model.fit(X_train, Y_train)
    predictions = gb_model.predict(X_test)
    return predictions

In [77]:
# Function to calculate the doubly robust treatment effect
def calculate_dr_treatment_effect(X, D, Y, k_folds=5):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # K-Fold Cross-Fitting
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
    dr_ate_fold = []
    mse_fold = []

    for train_index, test_index in kf.split(X_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        D_train, D_test = D.iloc[train_index], D.iloc[test_index]
        Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
        
        # Step 1: Estimate propensity scores
        propensity_scores = train_gb_pscore(X_train, D_train, X_test)
        
        # Step 2: Trim p-scores (0.01, 0.99)
        trimmed_indices = (propensity_scores > 0.01) & (propensity_scores < 0.99)
        trimmed_X = X_test[trimmed_indices]
        trimmed_D = D_test.iloc[trimmed_indices]
        trimmed_Y = Y_test.iloc[trimmed_indices]
        trimmed_pscore = propensity_scores[trimmed_indices]
        
        # Step 3: Fit outcome models for treated and untreated groups
        gamma1 = train_gb_outcome(X_train[D_train == 1], Y_train[D_train == 1], trimmed_X)
        gamma0 = train_gb_outcome(X_train[D_train == 0], Y_train[D_train == 0], trimmed_X)
        
        # Step 4: Calculate doubly robust estimates
        trimmed_data = pd.DataFrame({
            "gamma1": gamma1,
            "gamma0": gamma0,
            "D": trimmed_D.values,
            "Y": trimmed_Y.values,
            "pscore": trimmed_pscore
        })
        
        trimmed_data['Y1_dr'] = (
            trimmed_data['gamma1'] +
            (trimmed_data['D'] / trimmed_data['pscore']) * (trimmed_data['Y'] - trimmed_data['gamma1'])
        )
        
        trimmed_data['Y0_dr'] = (
            trimmed_data['gamma0'] +
            ((1 - trimmed_data['D']) / (1 - trimmed_data['pscore'])) * (trimmed_data['Y'] - trimmed_data['gamma0'])
        )
        
        # Step 5: Calculate treatment effect and MSE for the fold
        dr_ate_fold.append(np.mean(trimmed_data['Y1_dr'] - trimmed_data['Y0_dr']))
        mse_fold.append(mean_squared_error(
            trimmed_data['Y'],
            trimmed_data['Y1_dr'] * trimmed_data['D'] + trimmed_data['Y0_dr'] * (1 - trimmed_data['D'])
        ))
    
    # Average treatment effect and MSE across folds
    dr_ate = np.mean(dr_ate_fold)
    mse = np.mean(mse_fold)
    return dr_ate, mse

In [78]:
def run_gradient_boosting_dml(dataset, covariates, treatment, outcomes):
    theta_hat_dr_values = []
    mse_values = []

    for outcome in outcomes:
        print(f"Processing outcome: {outcome}")
        X = dataset[covariates]
        D = dataset[treatment]
        Y = dataset[outcome]

        dr_ate, mse = calculate_dr_treatment_effect(X, D, Y)
        theta_hat_dr_values.append(dr_ate)
        mse_values.append(mse)
    
    # Create Results DataFrame
    results_df = pd.DataFrame({
        "Outcome": outcomes,
        "Estimate": theta_hat_dr_values,
        "MSE": mse_values
    })
    return results_df

In [79]:
gb_result_df_hetero = run_gradient_boosting_dml(
    dataset=double_ml_dataset, 
    covariates=all_covariates, 
    treatment=treatment, 
    outcomes=outcomes
)

Processing outcome: pcorrupt
Processing outcome: ncorrupt_os
Processing outcome: valor_corrupt
Processing outcome: log_valor_corrupt


In [80]:
print("Doubly Robust Treatment Effect Estimates with Cross-Fitting (Gradient Boosting):")
print(gb_result_df_hetero)

Doubly Robust Treatment Effect Estimates with Cross-Fitting (Gradient Boosting):
             Outcome       Estimate           MSE
0           pcorrupt      -0.025620  1.416204e-01
1        ncorrupt_os      -0.006851  2.309279e-02
2      valor_corrupt -100519.402595  3.362692e+12
3  log_valor_corrupt      -0.247871  3.027244e+02


In [82]:
# Checking for balance
weights_treated = 1 / propensity_scores[trimmed_D == 1]
weights_untreated = 1 / (1 - propensity_scores[trimmed_D == 0])

# Create DataFrames for treated and untreated groups
treated_data = trimmed_X[trimmed_D == 1]
untreated_data = trimmed_X[trimmed_D == 0]

# Compute weighted means for each covariate
treated_means = np.average(treated_data, axis=0, weights=weights_treated)
untreated_means = np.average(untreated_data, axis=0, weights=weights_untreated)

# Check absolute standardized mean differences
smd = np.abs(treated_means - untreated_means) / np.std(trimmed_X, axis=0)
print("Standardized Mean Differences (SMD):", smd)

Standardized Mean Differences (SMD): [0.06431737 0.17436421 0.08196042 0.42424825 0.2740144  0.05741303
 0.26795167 0.24450615 0.15918534 0.37349612 0.6        0.6
 0.28162652 0.24226102 0.18890478 0.16642605 0.11068814 0.2
 0.11865247 0.13747789 0.21630324 0.6        0.3615755  0.13864243
 0.1238498  0.25766122 0.26216054 0.34757185 0.14859101 0.13102074
 0.12994067 0.02177119 0.07538263 0.18149064 0.41006213 0.1398001
 0.15731958 0.28548157 0.4361691  0.5003572  0.08816264 0.13607545
 0.11496648 0.4749314  0.27814373 0.24964912 0.00266549 0.25
 0.44307894 0.3509568  0.01910833 0.05646941 0.28625125 0.08249972
 0.05899588 0.20087045 0.1198876  0.38367516 0.2754754  0.27015147
 0.13837029 0.5        0.21252668 0.14661519 0.31280857 0.28590313
 0.1       ]


Balanced

#### Assuming Homogeneous treatment effct 

In [83]:
def calculate_dr_homogeneous_gb(X, D, Y, k_folds=5):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # K-Fold Cross-Fitting
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
    W_hat, V_hat = np.zeros(len(Y)), np.zeros(len(Y))
    mse_fold = []

    for train_index, test_index in kf.split(X_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        D_train, D_test = D[train_index], D[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]

        # Step 1: Estimate propensity scores using the provided function
        pi_hat = train_gb_pscore(X_train, D_train, X_test)  # Function from the image

        # Trim propensity scores (0.01, 0.99)
        trimmed_indices = (pi_hat > 0.01) & (pi_hat < 0.99)
        trimmed_X, trimmed_D, trimmed_Y = X_test[trimmed_indices], D_test[trimmed_indices], Y_test[trimmed_indices]
        trimmed_pscore = pi_hat[trimmed_indices]

        # Step 2: Estimate E[Y|X] using the provided function
        gamma_hat = train_gb_outcome(X_train, Y_train, trimmed_X)  # Function from the image

        # Step 3: Calculate residuals W_hat and V_hat
        W_hat[test_index[trimmed_indices]] = trimmed_Y - gamma_hat
        V_hat[test_index[trimmed_indices]] = trimmed_D - trimmed_pscore

        # Step 4: Calculate MSE for this fold
        mse_fold.append(mean_squared_error(trimmed_Y, gamma_hat))
    
    # Step 5: Regress W_hat on V_hat to estimate theta_0
    regression = LinearRegression()
    regression.fit(V_hat.reshape(-1, 1), W_hat)
    theta_hat = regression.coef_[0]

    return theta_hat, np.mean(mse_fold)

In [84]:
def run_dr_homogeneous_gb(dataset, covariates, treatment, outcomes):
    theta_hat_dr_values = []
    mse_values = []

    for outcome in outcomes:
        print(f"Processing outcome: {outcome}")
        X = dataset[covariates].values
        D = dataset[treatment].values
        Y = dataset[outcome].values

        # Call the homogeneous treatment effect function
        dr_ate, mse = calculate_dr_homogeneous_gb(X, D, Y)
        theta_hat_dr_values.append(dr_ate)
        mse_values.append(mse)

    # Create Results DataFrame
    results_df = pd.DataFrame({
        "Outcome": outcomes,
        "Estimate": theta_hat_dr_values,
        "MSE": mse_values
    })
    return results_df

In [85]:
gb_result_df_homo = run_dr_homogeneous_gb(
    dataset=double_ml_dataset, 
    covariates=all_covariates, 
    treatment=treatment, 
    outcomes=outcomes
)

Processing outcome: pcorrupt
Processing outcome: ncorrupt_os
Processing outcome: valor_corrupt
Processing outcome: log_valor_corrupt


In [86]:
print("Doubly Robust Treatment Effect Estimates with Cross-Fitting (Homogeneous, Gradient Boosting):")
print(gb_result_df_homo)

Doubly Robust Treatment Effect Estimates with Cross-Fitting (Homogeneous, Gradient Boosting):
             Outcome      Estimate           MSE
0           pcorrupt     -0.018570  1.225573e-02
1        ncorrupt_os     -0.006758  2.253265e-03
2      valor_corrupt -94125.137029  3.690273e+11
3  log_valor_corrupt     -0.805899  2.877532e+01


## Analysis 

compare mse/theta,


## Conclusion