# __Econometric Game__: Effect of electoral accountability on corruption?
   
### Team 3: Katheryn Ding, Amber Wei, Max Ye



In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.formula.api import ols
import matplotlib.ticker as ticker
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler


## Set up 

In [9]:
corruption_df = pd.read_stata("corruptiondata.dta")
corruption_df.columns

Index(['uf', 'nsorteio', 'totrecursos', 'tot_os', 'pop', 'purb',
       'p_secundario', 'cod_ibge6', 'pib_capita_02', 'op_01_04',
       ...
       'uf_d18', 'uf_d19', 'uf_d20', 'uf_d21', 'uf_d22', 'uf_d23', 'uf_d24',
       'uf_d25', 'uf_d26', 'esample2'],
      dtype='object', length=116)

In [10]:
# Define covariates for each category

# Mayor characteristics
mayor_covariates = [
    "pref_idade_tse",  # Age
    "pref_masc",       # Gender
    "pref_escola",     # Schooling
    "winmargin2000",   # Margin of victory in 2000
    "exp_prefeito"     # Was previously a mayor in a consecutive term
] + [col for col in corruption_df.columns if col.startswith("party_d")]

# Municipal characteristics
municipal_covariates = [
    "lpop",           # Log of population in 2000
    "purb",           # Percentage of population in urban sectors
    "p_secundario",   # Percentage with at least secondary education
    "mun_novo",       # New municipality indicator
    "lpib02",         # Log of GDP per capita in 2002
    "gini_ipea"       # Gini coefficient
]

# Political and Judicial characteristics
political_judicial_covariates = [
    "ENEP2000",  # Effective number of parties in 2000 mayor elections
    "ENLP2000",  # Effective number of parties in 2000 legislative elections
    "p_cad_pref" # Proportion of legislators from the same party as the mayor
]

# Dummies
dummy_covariates = [
    col for col in corruption_df.columns if col.startswith("uf_d") or col.startswith("sorteio")
]


print("Mayor Covariates:", mayor_covariates)
print("Municipal Covariates:", municipal_covariates)
print("Political and Judicial Covariates:", political_judicial_covariates)
print("Dummy Covariates:", dummy_covariates)

Mayor Covariates: ['pref_idade_tse', 'pref_masc', 'pref_escola', 'winmargin2000', 'exp_prefeito', 'party_d1', 'party_d3', 'party_d4', 'party_d5', 'party_d6', 'party_d7', 'party_d8', 'party_d9', 'party_d10', 'party_d11', 'party_d12', 'party_d13', 'party_d14', 'party_d15', 'party_d16', 'party_d17', 'party_d18']
Municipal Covariates: ['lpop', 'purb', 'p_secundario', 'mun_novo', 'lpib02', 'gini_ipea']
Political and Judicial Covariates: ['ENEP2000', 'ENLP2000', 'p_cad_pref']
Dummy Covariates: ['sorteio1', 'sorteio2', 'sorteio3', 'sorteio4', 'sorteio5', 'sorteio6', 'sorteio7', 'sorteio8', 'sorteio9', 'sorteio10', 'uf_d1', 'uf_d2', 'uf_d3', 'uf_d4', 'uf_d5', 'uf_d6', 'uf_d7', 'uf_d8', 'uf_d9', 'uf_d10', 'uf_d11', 'uf_d12', 'uf_d13', 'uf_d14', 'uf_d15', 'uf_d16', 'uf_d17', 'uf_d18', 'uf_d19', 'uf_d20', 'uf_d21', 'uf_d22', 'uf_d23', 'uf_d24', 'uf_d25', 'uf_d26']


In [11]:
# Define the treatment and outcome variables
treatment = "first"  
outcomes = ["pcorrupt", "ncorrupt_os", "valor_corrupt"] 

# Use all covariates
all_covariates = (
    mayor_covariates +
    municipal_covariates +
    political_judicial_covariates +
    dummy_covariates
)

# Constructing DoubleML dataset
required_columns = [treatment] + outcomes + all_covariates
double_ml_dataset = corruption_df[required_columns].dropna()

print("Double ML Dataset Columns:", double_ml_dataset.columns)
print("Number of rows in the Double ML Dataset:", double_ml_dataset.shape[0])

Double ML Dataset Columns: Index(['first', 'pcorrupt', 'ncorrupt_os', 'valor_corrupt', 'pref_idade_tse',
       'pref_masc', 'pref_escola', 'winmargin2000', 'exp_prefeito', 'party_d1',
       'party_d3', 'party_d4', 'party_d5', 'party_d6', 'party_d7', 'party_d8',
       'party_d9', 'party_d10', 'party_d11', 'party_d12', 'party_d13',
       'party_d14', 'party_d15', 'party_d16', 'party_d17', 'party_d18', 'lpop',
       'purb', 'p_secundario', 'mun_novo', 'lpib02', 'gini_ipea', 'ENEP2000',
       'ENLP2000', 'p_cad_pref', 'sorteio1', 'sorteio2', 'sorteio3',
       'sorteio4', 'sorteio5', 'sorteio6', 'sorteio7', 'sorteio8', 'sorteio9',
       'sorteio10', 'uf_d1', 'uf_d2', 'uf_d3', 'uf_d4', 'uf_d5', 'uf_d6',
       'uf_d7', 'uf_d8', 'uf_d9', 'uf_d10', 'uf_d11', 'uf_d12', 'uf_d13',
       'uf_d14', 'uf_d15', 'uf_d16', 'uf_d17', 'uf_d18', 'uf_d19', 'uf_d20',
       'uf_d21', 'uf_d22', 'uf_d23', 'uf_d24', 'uf_d25', 'uf_d26'],
      dtype='object')
Number of rows in the Double ML Dataset: 467

## Lasso 

In [14]:
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error

In [None]:
# Lasso on outcome: pcorrupt
outcome = outcomes[0]
X = double_ml_dataset[all_covariates]
D = double_ml_dataset[treatment]
Y = double_ml_dataset[outcome]

# Standardize the covariates
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Prepare 5-Fold Cross-Fitting
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Create train-test splits
folds = []
for train_index, test_index in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    D_train, D_test = D.iloc[train_index], D.iloc[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
    folds.append((X_train, X_test, D_train, D_test, Y_train, Y_test))

# Regularization Path for Lasso
alphas = np.logspace(-4, 0, 50)  # 50 values between 10^-4 and 10^0



# Initialize lists to store fold estimates and MSE
fold_estimates = []
fold_mse = []

# Regularization path
alphas = np.logspace(-4, 0, 50)  # Regularization strength values

# Loop through folds for cross-fitting
for fold_idx, (X_train, X_test, D_train, D_test, Y_train, Y_test) in enumerate(folds):
    # Train Lasso model using cross-validation to find the optimal alpha
    lasso = LassoCV(alphas=alphas, cv=5, random_state=42).fit(X_train, Y_train)
    optimal_alpha = lasso.alpha_
    
    # Predict on test data
    Y_pred = lasso.predict(X_test)
    
    # Theta
    treatment_effect = (
        np.mean(Y_pred[D_test.values == 1]) - np.mean(Y_pred[D_test.values == 0])
    )
    fold_estimates.append(treatment_effect)
    
    # MSE
    mse = mean_squared_error(Y_test, Y_pred)
    fold_mse.append(mse)
    
    print(f"Fold {fold_idx + 1}:")
    print(f"  Optimal Alpha: {optimal_alpha}")
    print(f"  Treatment Effect Estimate: {treatment_effect}")
    print(f"  MSE: {mse}")
    print()

# Ave theta/ mse
average_estimate = np.mean(fold_estimates)
average_mse = np.mean(fold_mse)

print("Final Results:")
print(f" Average Treatment Effect Estimate: {average_estimate}")
print(f" Average MSE: {average_mse}")

Fold 1:
  Optimal Alpha: 0.015998587196060572
  Treatment Effect Estimate: 0.001189984381198883
  MSE: 0.01379722636193037

Fold 2:
  Optimal Alpha: 0.010985411419875584
  Treatment Effect Estimate: 0.0006309449672698975
  MSE: 0.014642340131103992

Fold 3:
  Optimal Alpha: 0.009102981779915217
  Treatment Effect Estimate: -0.002018515020608902
  MSE: 0.006565865129232407

Fold 4:
  Optimal Alpha: 0.010985411419875584
  Treatment Effect Estimate: -0.0003968179225921631
  MSE: 0.0068343146704137325

Fold 5:
  Optimal Alpha: 0.013257113655901081
  Treatment Effect Estimate: 0.0015238262712955475
  MSE: 0.009493155404925346

Final Results:
 Average Treatment Effect Estimate: 0.0001858845353126526
 Average MSE: 0.010266579687595367


In [None]:
# Lasso on outcome: ncorrupt_os
outcome = outcomes[1]  
X = double_ml_dataset[all_covariates]
D = double_ml_dataset[treatment]
Y = double_ml_dataset[outcome]

# Standardize the covariates
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Prepare 5-Fold Cross-Fitting
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Create train-test splits
folds = []
for train_index, test_index in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    D_train, D_test = D.iloc[train_index], D.iloc[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
    folds.append((X_train, X_test, D_train, D_test, Y_train, Y_test))

# Regularization Path for Lasso
alphas = np.logspace(-4, 0, 50)  # 50 values between 10^-4 and 10^0



# Initialize lists to store fold estimates and MSE
fold_estimates = []
fold_mse = []

# Regularization path
alphas = np.logspace(-4, 0, 50)  # Regularization strength values

# Loop through folds for cross-fitting
for fold_idx, (X_train, X_test, D_train, D_test, Y_train, Y_test) in enumerate(folds):
    # Train Lasso model using cross-validation to find the optimal alpha
    lasso = LassoCV(alphas=alphas, cv=5, random_state=42).fit(X_train, Y_train)
    optimal_alpha = lasso.alpha_
    
    # Predict on test data
    Y_pred = lasso.predict(X_test)
    
    # Theta
    treatment_effect = (
        np.mean(Y_pred[D_test.values == 1]) - np.mean(Y_pred[D_test.values == 0])
    )
    fold_estimates.append(treatment_effect)
    
    # MSE
    mse = mean_squared_error(Y_test, Y_pred)
    fold_mse.append(mse)
    
    print(f"Fold {fold_idx + 1}:")
    print(f"  Optimal Alpha: {optimal_alpha}")
    print(f"  Treatment Effect Estimate: {treatment_effect}")
    print(f"  MSE: {mse}")
    print()

# Ave theta/ mse
average_estimate = np.mean(fold_estimates)
average_mse = np.mean(fold_mse)

print("Final Results:")
print(f" Average Treatment Effect Estimate: {average_estimate}")
print(f" Average MSE: {average_mse}")


Fold 1:
  Optimal Alpha: 0.0007906043210907702
  Treatment Effect Estimate: 0.003783721476793289
  MSE: 0.002381647238507867

Fold 2:
  Optimal Alpha: 0.0024420530945486497
  Treatment Effect Estimate: 0.0028401128947734833
  MSE: 0.001910327235236764

Fold 3:
  Optimal Alpha: 0.0016768329368110067
  Treatment Effect Estimate: -0.0018248558044433594
  MSE: 0.002253496553748846

Fold 4:
  Optimal Alpha: 0.0007906043210907702
  Treatment Effect Estimate: -0.005976390093564987
  MSE: 0.002003249479457736

Fold 5:
  Optimal Alpha: 0.0007906043210907702
  Treatment Effect Estimate: 0.0016842074692249298
  MSE: 0.0018415002850815654

Final Results:
 Average Treatment Effect Estimate: 0.00010135919001186267
 Average MSE: 0.0020780442282557487


In [20]:
# Lasso on outcome: valor_corrupt
outcome = outcomes[2]  
X = double_ml_dataset[all_covariates]
D = double_ml_dataset[treatment]
Y = double_ml_dataset[outcome]

# Standardize the covariates
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Prepare 5-Fold Cross-Fitting
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Create train-test splits
folds = []
for train_index, test_index in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    D_train, D_test = D.iloc[train_index], D.iloc[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
    folds.append((X_train, X_test, D_train, D_test, Y_train, Y_test))

# Regularization Path for Lasso
alphas = np.logspace(-4, 0, 50)  # 50 values between 10^-4 and 10^0



# Initialize lists to store fold estimates and MSE
fold_estimates = []
fold_mse = []

# Regularization path
alphas = np.logspace(-4, 0, 50)  # Regularization strength values

# Loop through folds for cross-fitting
for fold_idx, (X_train, X_test, D_train, D_test, Y_train, Y_test) in enumerate(folds):
    # Train Lasso model using cross-validation to find the optimal alpha
    lasso = LassoCV(alphas=alphas, cv=5, random_state=42).fit(X_train, Y_train)
    optimal_alpha = lasso.alpha_
    
    # Predict on test data
    Y_pred = lasso.predict(X_test)
    
    # Theta
    treatment_effect = (
        np.mean(Y_pred[D_test.values == 1]) - np.mean(Y_pred[D_test.values == 0])
    )
    fold_estimates.append(treatment_effect)
    
    # MSE
    mse = mean_squared_error(Y_test, Y_pred)
    fold_mse.append(mse)
    
    print(f"Fold {fold_idx + 1}:")
    print(f"  Optimal Alpha: {optimal_alpha}")
    print(f"  Treatment Effect Estimate: {treatment_effect}")
    print(f"  MSE: {mse}")
    print()

# Ave theta/ mse
average_estimate = np.mean(fold_estimates)
average_mse = np.mean(fold_mse)

print("Final Results:")
print(f" Average Treatment Effect Estimate: {average_estimate}")
print(f" Average MSE: {average_mse}")

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

Fold 1:
  Optimal Alpha: 0.07196856730011514
  Treatment Effect Estimate: 63934.328125
  MSE: 431124381696.0



  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

Fold 2:
  Optimal Alpha: 0.32374575428176433
  Treatment Effect Estimate: 46621.328125
  MSE: 366252818432.0



  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

Fold 3:
  Optimal Alpha: 0.07196856730011514
  Treatment Effect Estimate: 41338.71875
  MSE: 152824725504.0



  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

Fold 4:
  Optimal Alpha: 0.47148663634573895
  Treatment Effect Estimate: -51250.015625
  MSE: 272953720832.0



  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

Fold 5:
  Optimal Alpha: 0.033932217718953266
  Treatment Effect Estimate: 5530.78125
  MSE: 567256547328.0

Final Results:
 Average Treatment Effect Estimate: 21235.02734375
 Average MSE: 358082445312.0


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

## Random Forest

## Neural Network

## Gradient Boosting


## Analysis 

compare mse/theta,


## Conclusion