# __Econometric Game__: Effect of electoral accountability on corruption?
   
### Team 3: Katheryn Ding, Amber Wei, Max Ye



## Set up 

In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LassoCV
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from scipy.special import expit
from sklearn.linear_model import LogisticRegression

In [3]:
corruption_df = pd.read_stata("corruptiondata.dta")
corruption_df.columns

Index(['uf', 'nsorteio', 'totrecursos', 'tot_os', 'pop', 'purb',
       'p_secundario', 'cod_ibge6', 'pib_capita_02', 'op_01_04',
       ...
       'uf_d18', 'uf_d19', 'uf_d20', 'uf_d21', 'uf_d22', 'uf_d23', 'uf_d24',
       'uf_d25', 'uf_d26', 'esample2'],
      dtype='object', length=116)

In [4]:
# Define covariates for each category

# Mayor characteristics
mayor_covariates = [
    "pref_idade_tse",  # Age
    "pref_masc",       # Gender
    "pref_escola",     # Schooling
    "winmargin2000",   # Margin of victory in 2000
    "exp_prefeito"     # Was previously a mayor in a consecutive term
] + [col for col in corruption_df.columns if col.startswith("party_d")]

# Municipal characteristics
municipal_covariates = [
    "lpop",           # Log of population in 2000
    "purb",           # Percentage of population in urban sectors
    "p_secundario",   # Percentage with at least secondary education
    "mun_novo",       # New municipality indicator
    "lpib02",         # Log of GDP per capita in 2002
    "gini_ipea"       # Gini coefficient
]

# Political and Judicial characteristics
political_judicial_covariates = [
    "ENEP2000",  # Effective number of parties in 2000 mayor elections
    "ENLP2000",  # Effective number of parties in 2000 legislative elections
    "p_cad_pref" # Proportion of legislators from the same party as the mayor
]

# Dummies
dummy_covariates = [
    col for col in corruption_df.columns if col.startswith("uf_d") or col.startswith("sorteio")
]


print("Mayor Covariates:", mayor_covariates)
print("Municipal Covariates:", municipal_covariates)
print("Political and Judicial Covariates:", political_judicial_covariates)
print("Dummy Covariates:", dummy_covariates)

Mayor Covariates: ['pref_idade_tse', 'pref_masc', 'pref_escola', 'winmargin2000', 'exp_prefeito', 'party_d1', 'party_d3', 'party_d4', 'party_d5', 'party_d6', 'party_d7', 'party_d8', 'party_d9', 'party_d10', 'party_d11', 'party_d12', 'party_d13', 'party_d14', 'party_d15', 'party_d16', 'party_d17', 'party_d18']
Municipal Covariates: ['lpop', 'purb', 'p_secundario', 'mun_novo', 'lpib02', 'gini_ipea']
Political and Judicial Covariates: ['ENEP2000', 'ENLP2000', 'p_cad_pref']
Dummy Covariates: ['sorteio1', 'sorteio2', 'sorteio3', 'sorteio4', 'sorteio5', 'sorteio6', 'sorteio7', 'sorteio8', 'sorteio9', 'sorteio10', 'uf_d1', 'uf_d2', 'uf_d3', 'uf_d4', 'uf_d5', 'uf_d6', 'uf_d7', 'uf_d8', 'uf_d9', 'uf_d10', 'uf_d11', 'uf_d12', 'uf_d13', 'uf_d14', 'uf_d15', 'uf_d16', 'uf_d17', 'uf_d18', 'uf_d19', 'uf_d20', 'uf_d21', 'uf_d22', 'uf_d23', 'uf_d24', 'uf_d25', 'uf_d26']


In [5]:
corruption_df["log_valor_corrupt"] = np.log(corruption_df["valor_corrupt"] + 1)

# Define the treatment and outcome variables
treatment = "first"  
outcomes = ["pcorrupt", "ncorrupt_os", "valor_corrupt",'log_valor_corrupt'] 

# Use all covariates
all_covariates = (
    mayor_covariates +
    municipal_covariates +
    political_judicial_covariates +
    dummy_covariates
)
#set up dataset:
required_columns = [treatment] + outcomes + all_covariates
double_ml_dataset = corruption_df[required_columns].dropna()

print("Dataset Columns:", double_ml_dataset.columns)
print("Number of rows in the Dataset:", double_ml_dataset.shape[0])

Dataset Columns: Index(['first', 'pcorrupt', 'ncorrupt_os', 'valor_corrupt',
       'log_valor_corrupt', 'pref_idade_tse', 'pref_masc', 'pref_escola',
       'winmargin2000', 'exp_prefeito', 'party_d1', 'party_d3', 'party_d4',
       'party_d5', 'party_d6', 'party_d7', 'party_d8', 'party_d9', 'party_d10',
       'party_d11', 'party_d12', 'party_d13', 'party_d14', 'party_d15',
       'party_d16', 'party_d17', 'party_d18', 'lpop', 'purb', 'p_secundario',
       'mun_novo', 'lpib02', 'gini_ipea', 'ENEP2000', 'ENLP2000', 'p_cad_pref',
       'sorteio1', 'sorteio2', 'sorteio3', 'sorteio4', 'sorteio5', 'sorteio6',
       'sorteio7', 'sorteio8', 'sorteio9', 'sorteio10', 'uf_d1', 'uf_d2',
       'uf_d3', 'uf_d4', 'uf_d5', 'uf_d6', 'uf_d7', 'uf_d8', 'uf_d9', 'uf_d10',
       'uf_d11', 'uf_d12', 'uf_d13', 'uf_d14', 'uf_d15', 'uf_d16', 'uf_d17',
       'uf_d18', 'uf_d19', 'uf_d20', 'uf_d21', 'uf_d22', 'uf_d23', 'uf_d24',
       'uf_d25', 'uf_d26'],
      dtype='object')
Number of rows in the Data

## Lasso 

In [6]:
theta_hat_dr_values = []

# Loop through each outcome
for outcome in outcomes:
    
    X = double_ml_dataset[all_covariates]
    D = double_ml_dataset[treatment]
    Y = double_ml_dataset[outcome]
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Prepare 5-Fold Cross-Fitting
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    dr_ate_fold = []
    
    for train_index, test_index in kf.split(X_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        D_train, D_test = D.iloc[train_index], D.iloc[test_index]
        Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
        
        # Estimate propensity scores using Lasso
        lasso_pscore = LassoCV(alphas=np.logspace(-4, 0, 50), cv=5, random_state=42).fit(X_train, D_train)
        D_pred = lasso_pscore.predict(X_test)
        propensity_scores = expit(D_pred) 
        
        # Trim p-scores (0.01, 0.99)
        trimmed_indices = (propensity_scores > 0.01) & (propensity_scores < 0.99)
        trimmed_X = X_test[trimmed_indices]
        trimmed_D = D_test.iloc[trimmed_indices]
        trimmed_Y = Y_test.iloc[trimmed_indices]
        trimmed_pscore = propensity_scores[trimmed_indices]
        
        # Fit outcome models for treated (D=1) and untreated (D=0) groups
        # Treated model
        treated_model = LassoCV(alphas=np.logspace(-4, 0, 50), cv=5, random_state=42)
        treated_model.fit(X_train[D_train == 1], Y_train[D_train == 1])
        gamma1 = treated_model.predict(trimmed_X)
        
        # Untreated model
        untreated_model = LassoCV(alphas=np.logspace(-4, 0, 50), cv=5, random_state=42)
        untreated_model.fit(X_train[D_train == 0], Y_train[D_train == 0])
        gamma0 = untreated_model.predict(trimmed_X)
        
        # Step 3: Calculate doubly robust estimates for potential outcomes
        trimmed_data = pd.DataFrame({
            "gamma1": gamma1,
            "gamma0": gamma0,
            "D": trimmed_D.values,
            "Y": trimmed_Y.values,
            "pscore": trimmed_pscore,
        })
        
        # DR Estimate for Y(1)
        trimmed_data['Y1_dr'] = (
            trimmed_data['gamma1'] +
            (trimmed_data['D'] / trimmed_data['pscore']) * (trimmed_data['Y'] - trimmed_data['gamma1'])
        )
        
        # DR Estimate for Y(0)
        trimmed_data['Y0_dr'] = (
            trimmed_data['gamma0'] +
            ((1 - trimmed_data['D']) / (1 - trimmed_data['pscore'])) * (trimmed_data['Y'] - trimmed_data['gamma0'])
        )
        
        # Calculate treatment effect for the fold
        dr_ate_fold.append(np.mean(trimmed_data['Y1_dr'] - trimmed_data['Y0_dr']))
    
    # Average treatment effect across folds
    dr_ate = np.mean(dr_ate_fold)
    theta_hat_dr_values.append(dr_ate)

# Print the final treatment effect estimates
print("Doubly Robust Treatment Effect Estimates with Cross-Fitting:")
print(theta_hat_dr_values)


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

Doubly Robust Treatment Effect Estimates with Cross-Fitting:
[np.float32(-0.0220353), np.float32(-0.0085152965), np.float32(-148572.12), np.float32(-1.0674746)]


In [7]:
# Checking for balance
weights_treated = 1 / propensity_scores[trimmed_D == 1]
weights_untreated = 1 / (1 - propensity_scores[trimmed_D == 0])

# Create DataFrames for treated and untreated groups
treated_data = trimmed_X[trimmed_D == 1]
untreated_data = trimmed_X[trimmed_D == 0]

# Compute weighted means for each covariate
treated_means = np.average(treated_data, axis=0, weights=weights_treated)
untreated_means = np.average(untreated_data, axis=0, weights=weights_untreated)

# Check absolute standardized mean differences
smd = np.abs(treated_means - untreated_means) / np.std(trimmed_X, axis=0)
print("Standardized Mean Differences (SMD):", smd)

Standardized Mean Differences (SMD): [0.06885722 0.29011008 0.02904402 0.6848417  0.21398757 0.06088972
 0.25428903 0.41079307 0.09445282 0.04607635 0.6        0.6
 0.35854152 0.24288547 0.24353866 0.03111836 0.16987991 0.2
 0.17568141 0.1651254  0.21418199 0.6        0.02996127 0.04102391
 0.18319434 0.20693754 0.4547287  0.39264885 0.3478793  0.23060311
 0.28006348 0.00521759 0.20551337 0.15774375 0.38865724 0.08364097
 0.05980109 0.2552489  0.47479638 0.35756424 0.02450932 0.18689632
 0.19421004 0.06287776 0.35430452 0.26188627 0.01593608 0.25
 0.46131605 0.3687714  0.05878664 0.0249002  0.26350877 0.01907789
 0.1791523  0.16089886 0.17051342 0.3866724  0.177953   0.25078654
 0.23934054 0.625      0.04331091 0.07217444 0.17713386 0.03650402
 0.4       ]


Not very balanced.

#### Assuming Homogeneous treatment effct 

In [8]:
theta_hat_dr_values = []

# Loop through each outcome
for outcome in outcomes:
    
    X = double_ml_dataset[all_covariates]
    D = double_ml_dataset[treatment]
    Y = double_ml_dataset[outcome]
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Prepare 5-Fold Cross-Fitting
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    dr_ate_fold = []
    
    for train_index, test_index in kf.split(X_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        D_train, D_test = D.iloc[train_index], D.iloc[test_index]
        Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
        
        # Estimate propensity scores using Lasso
        lasso_pscore = LassoCV(alphas=np.logspace(-4, 0, 50), cv=5, random_state=42).fit(X_train, D_train)
        D_pred = lasso_pscore.predict(X_test)
        propensity_scores = expit(D_pred) 
        
        # Trim p-scores (0.01, 0.99)
        trimmed_indices = (propensity_scores > 0.01) & (propensity_scores < 0.99)
        trimmed_X = X_test[trimmed_indices]
        trimmed_D = D_test.iloc[trimmed_indices]
        trimmed_Y = Y_test.iloc[trimmed_indices]
        trimmed_pscore = propensity_scores[trimmed_indices]
        
        # Fit a single outcome model for homogeneous treatment effect
        outcome_model = LassoCV(alphas=np.logspace(-4, 0, 50), cv=5, random_state=42)
        outcome_model.fit(X_train, Y_train)
        gamma = outcome_model.predict(trimmed_X)
        
        # Step 3: Calculate doubly robust estimates for potential outcomes
        trimmed_data = pd.DataFrame({
            "gamma": gamma,
            "D": trimmed_D.values,
            "Y": trimmed_Y.values,
            "pscore": trimmed_pscore,
        })
        
        # DR Estimate for Y(1)
        trimmed_data['Y1_dr'] = (
            trimmed_data['gamma'] +
            (trimmed_data['D'] / trimmed_data['pscore']) * (trimmed_data['Y'] - trimmed_data['gamma'])
        )
        
        # DR Estimate for Y(0)
        trimmed_data['Y0_dr'] = (
            trimmed_data['gamma'] +
            ((1 - trimmed_data['D']) / (1 - trimmed_data['pscore'])) * (trimmed_data['Y'] - trimmed_data['gamma'])
        )
        
        # Calculate treatment effect for the fold
        dr_ate_fold.append(np.mean(trimmed_data['Y1_dr'] - trimmed_data['Y0_dr']))
    
    # Average treatment effect across folds
    dr_ate = np.mean(dr_ate_fold)
    theta_hat_dr_values.append(dr_ate)

# Print the final treatment effect estimates
print("Doubly Robust Treatment Effect Estimates with Cross-Fitting (Homogeneous):")
print(theta_hat_dr_values)

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

Doubly Robust Treatment Effect Estimates with Cross-Fitting (Homogeneous):
[np.float32(-0.02297019), np.float32(-0.008827692), np.float32(-133287.66), np.float32(-1.0866188)]


## Random Forest

In [9]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

#### Assuming Heterogeneous Treatment Effect

In [10]:
# Assuming Heterogeneity

theta_hat_dr_values = []

# Loop through each outcome
for outcome in outcomes:
    
    X = double_ml_dataset[all_covariates]
    D = double_ml_dataset[treatment]
    Y = double_ml_dataset[outcome]
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Prepare 5-Fold Cross-Fitting
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    dr_ate_fold = []
    
    for train_index, test_index in kf.split(X_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        D_train, D_test = D.iloc[train_index], D.iloc[test_index]
        Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
        
        # Estimate propensity scores using Random Forest
        rf_pscore = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
        rf_pscore.fit(X_train, D_train)
        propensity_scores = rf_pscore.predict_proba(X_test)[:, 1]  # Probability of treatment
        
        # Trim p-scores (0.01, 0.99)
        trimmed_indices = (propensity_scores > 0.01) & (propensity_scores < 0.99)
        trimmed_X = X_test[trimmed_indices]
        trimmed_D = D_test.iloc[trimmed_indices]
        trimmed_Y = Y_test.iloc[trimmed_indices]
        trimmed_pscore = propensity_scores[trimmed_indices]
        
        # Fit outcome models for treated (D=1) and untreated (D=0) groups using Random Forest
        # Treated model
        treated_model = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
        treated_model.fit(X_train[D_train == 1], Y_train[D_train == 1])
        gamma1 = treated_model.predict(trimmed_X)
        
        # Untreated model
        untreated_model = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
        untreated_model.fit(X_train[D_train == 0], Y_train[D_train == 0])
        gamma0 = untreated_model.predict(trimmed_X)
        
        trimmed_data = pd.DataFrame({
            "gamma1": gamma1,
            "gamma0": gamma0,
            "D": trimmed_D.values,
            "Y": trimmed_Y.values,
            "pscore": trimmed_pscore,
        })
        
        # DR Estimate for Y(1)
        trimmed_data['Y1_dr'] = (
            trimmed_data['gamma1'] +
            (trimmed_data['D'] / trimmed_data['pscore']) * (trimmed_data['Y'] - trimmed_data['gamma1'])
        )
        
        # DR Estimate for Y(0)
        trimmed_data['Y0_dr'] = (
            trimmed_data['gamma0'] +
            ((1 - trimmed_data['D']) / (1 - trimmed_data['pscore'])) * (trimmed_data['Y'] - trimmed_data['gamma0'])
        )
        
        dr_ate_fold.append(np.mean(trimmed_data['Y1_dr'] - trimmed_data['Y0_dr']))
    
    # Average treatment effect across folds
    dr_ate = np.mean(dr_ate_fold)
    theta_hat_dr_values.append(dr_ate)

# Print the final treatment effect estimates
print("Doubly Robust Treatment Effect Estimates with Cross-Fitting:")
print(theta_hat_dr_values)

Doubly Robust Treatment Effect Estimates with Cross-Fitting:
[np.float64(-0.02629609321501742), np.float64(-0.007445517008494921), np.float64(-144487.7141861997), np.float64(-0.9063899219840519)]


In [11]:
# Checking for balance
weights_treated = 1 / propensity_scores[trimmed_D == 1]
weights_untreated = 1 / (1 - propensity_scores[trimmed_D == 0])

# Create DataFrames for treated and untreated groups
treated_data = trimmed_X[trimmed_D == 1]
untreated_data = trimmed_X[trimmed_D == 0]

# Compute weighted means for each covariate
treated_means = np.average(treated_data, axis=0, weights=weights_treated)
untreated_means = np.average(untreated_data, axis=0, weights=weights_untreated)

# Check absolute standardized mean differences
smd = np.abs(treated_means - untreated_means) / np.std(trimmed_X, axis=0)
print("Standardized Mean Differences (SMD):", smd)

Standardized Mean Differences (SMD): [6.49184294e-02 2.62760235e-01 4.43222224e-02 5.68753905e-01
 1.40551625e-01 2.85807686e-02 2.39056540e-01 4.21860903e-01
 1.70018020e-01 3.12105272e-02 0.00000000e+00 0.00000000e+00
 3.40814863e-01 2.19399499e-01 2.04180331e-01 7.14762632e-04
 1.36900994e-01 3.72529030e-10 1.48405111e-01 1.57528656e-01
 1.77906427e-01 0.00000000e+00 1.64458273e-01 1.52965149e-01
 8.95551596e-03 2.80502021e-01 3.54546774e-01 4.13814144e-01
 1.65974856e-01 5.93788159e-02 1.06164538e-01 8.12075760e-03
 9.16461234e-02 1.52552942e-01 4.30769391e-01 9.90179767e-02
 1.11759226e-01 2.86542727e-01 4.33120980e-01 4.24938690e-01
 2.33273409e-02 1.68963928e-01 1.72929756e-01 1.03936479e-01
 3.15043425e-01 2.44930492e-01 7.20176146e-02 9.31322575e-10
 4.64608657e-01 3.91183930e-01 2.13956919e-02 8.02028429e-03
 2.69014499e-01 1.58192704e-03 1.63177218e-01 1.08772665e-01
 8.76095229e-02 4.67996839e-01 2.46536518e-01 2.45335631e-01
 1.55909249e-01 0.00000000e+00 2.62421393e-02 8.

Balanced

#### Assuming Homogeneous Treatment Effect

In [12]:
# Assuming Homogeneous effect 
theta_hat_dr_values = []

# Loop through each outcome
for outcome in outcomes:
    
    X = double_ml_dataset[all_covariates]
    D = double_ml_dataset[treatment]
    Y = double_ml_dataset[outcome]
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    dr_ate_fold = []
    
    for train_index, test_index in kf.split(X_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        D_train, D_test = D.iloc[train_index], D.iloc[test_index]
        Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
        
        # Estimate propensity scores using Random Forest
        rf_pscore = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
        rf_pscore.fit(X_train, D_train)
        propensity_scores = rf_pscore.predict_proba(X_test)[:, 1]
        
        # Trim p-scores (0.01, 0.99)
        trimmed_indices = (propensity_scores > 0.01) & (propensity_scores < 0.99)
        trimmed_X = X_test[trimmed_indices]
        trimmed_D = D_test.iloc[trimmed_indices]
        trimmed_Y = Y_test.iloc[trimmed_indices]
        trimmed_pscore = propensity_scores[trimmed_indices]
        

        outcome_model = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
        outcome_model.fit(X_train, Y_train)
        gamma = outcome_model.predict(trimmed_X)
        
        # Calculate doubly robust estimates for potential outcomes
        trimmed_data = pd.DataFrame({
            "gamma": gamma,
            "D": trimmed_D.values,
            "Y": trimmed_Y.values,
            "pscore": trimmed_pscore,
        })
        
        # DR Estimate for Y(1)
        trimmed_data['Y1_dr'] = (
            trimmed_data['gamma'] +
            (trimmed_data['D'] / trimmed_data['pscore']) * (trimmed_data['Y'] - trimmed_data['gamma'])
        )
        
        # DR Estimate for Y(0)
        trimmed_data['Y0_dr'] = (
            trimmed_data['gamma'] +
            ((1 - trimmed_data['D']) / (1 - trimmed_data['pscore'])) * (trimmed_data['Y'] - trimmed_data['gamma'])
        )
        
        dr_ate_fold.append(np.mean(trimmed_data['Y1_dr'] - trimmed_data['Y0_dr']))
    
    # Average treatment effect across folds
    dr_ate = np.mean(dr_ate_fold)
    theta_hat_dr_values.append(dr_ate)

# Print the final treatment effect estimates
print("Doubly Robust Treatment Effect Estimates with Cross-Fitting (Homogeneous):")
print(theta_hat_dr_values)

Doubly Robust Treatment Effect Estimates with Cross-Fitting (Homogeneous):
[np.float64(-0.02465544643359933), np.float64(-0.007469718397917146), np.float64(-130335.32695346007), np.float64(-0.8881962872137134)]


## Neural Network

## Gradient Boosting


In [13]:
from sklearn.ensemble import GradientBoostingRegressor

In [14]:
gb_results = []

# Loop through each outcome in the list of outcomes
for outcome in outcomes:
    
    X = double_ml_dataset[all_covariates]
    D = double_ml_dataset[treatment]
    Y = double_ml_dataset[outcome]
    
    # Standardize the covariates
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Prepare 5-Fold Cross-Fitting
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    
    folds = []
    for train_index, test_index in kf.split(X_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        D_train, D_test = D.iloc[train_index], D.iloc[test_index]
        Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
        folds.append((X_train, X_test, D_train, D_test, Y_train, Y_test))
    
    fold_estimates = []
    fold_mse = []
    
    # Loop through folds for cross-fitting
    for fold_idx, (X_train, X_test, D_train, D_test, Y_train, Y_test) in enumerate(folds):
        # Train Gradient Boosting model
        gb_model = GradientBoostingRegressor(random_state=42, n_estimators=100, learning_rate=0.1, max_depth=3)
        gb_model.fit(X_train, Y_train)
        
        # Predict on test data
        Y_pred = gb_model.predict(X_test)
        
        # Theta (Treatment Effect)
        treatment_effect = (
            np.mean(Y_pred[D_test.values == 1]) - np.mean(Y_pred[D_test.values == 0])
        )
        fold_estimates.append(treatment_effect)
        
        # MSE
        mse = mean_squared_error(Y_test, Y_pred)
        fold_mse.append(mse)
    
    # Compute overall average estimate and MSE for the current outcome
    average_estimate = np.mean(fold_estimates)
    average_mse = np.mean(fold_mse)
    
    # Store the results
    gb_results.append({"Outcome": outcome, "Estimate": average_estimate, "MSE": average_mse})

gb_results_df = pd.DataFrame(gb_results)

print(gb_results_df)

NameError: name 'mean_squared_error' is not defined

In [None]:
outcome = outcomes[0]  
X = double_ml_dataset[all_covariates]
D = double_ml_dataset[treatment]
Y = double_ml_dataset[outcome]

# Standardize the covariates
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Prepare 5-Fold Cross-Fitting
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Create train-test splits
folds = []
for train_index, test_index in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    D_train, D_test = D.iloc[train_index], D.iloc[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
    folds.append((X_train, X_test, D_train, D_test, Y_train, Y_test))

# Regularization Path for Lasso
alphas = np.logspace(-4, 0, 50)  # 50 values between 10^-4 and 10^0

In [None]:

# Initialize lists to store fold estimates and MSE
fold_estimates = []
fold_mse = []


# Loop through folds for cross-fitting
for fold_idx, (X_train, X_test, D_train, D_test, Y_train, Y_test) in enumerate(folds):
    # Train Gradient Boosting model
    gb_model = GradientBoostingRegressor(random_state=42).fit(X_train, Y_train)
    
    # Predict on test data
    Y_pred = gb_model.predict(X_test)
    
    # Calculate the treatment effect
    treatment_effect = (
        np.mean(Y_pred[D_test.values == 1]) - np.mean(Y_pred[D_test.values == 0])
    )
    fold_estimates.append(treatment_effect)
    
    # MSE
    mse = mean_squared_error(Y_test, Y_pred)
    fold_mse.append(mse)
    
    print(f"Fold {fold_idx + 1}:")
    print(f"  Treatment Effect Estimate: {treatment_effect}")
    print(f"  MSE: {mse}")
    print()

# Compute overall average estimate and MSE
average_estimate = np.mean(fold_estimates)
average_mse = np.mean(fold_mse)

print("Final Results:")
print(f" Average Treatment Effect Estimate: {average_estimate}")
print(f" Average MSE: {average_mse}")

Fold 1:
  Treatment Effect Estimate: 0.00016003802844498372
  MSE: 0.015647467677191204

Fold 2:
  Treatment Effect Estimate: -0.0006888164775919192
  MSE: 0.01649880032634493

Fold 3:
  Treatment Effect Estimate: -0.013099382864855375
  MSE: 0.009203978743665922

Fold 4:
  Treatment Effect Estimate: 0.0018513191815970748
  MSE: 0.008108130135200117

Fold 5:
  Treatment Effect Estimate: -0.006359663711762623
  MSE: 0.011820252893298648

Final Results:
 Average Treatment Effect Estimate: -0.0036273011688335716
 Average MSE: 0.012255725955140165


## Analysis 

compare mse/theta,


## Conclusion