# __Econometric Game__: Effect of electoral accountability on corruption?
   
### Team 3: Katheryn Ding, Amber Wei, Max Ye



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.formula.api import ols
import matplotlib.ticker as ticker
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler


## Set up 

In [2]:
corruption_df = pd.read_stata("corruptiondata.dta")
corruption_df.columns

Index(['uf', 'nsorteio', 'totrecursos', 'tot_os', 'pop', 'purb',
       'p_secundario', 'cod_ibge6', 'pib_capita_02', 'op_01_04',
       ...
       'uf_d18', 'uf_d19', 'uf_d20', 'uf_d21', 'uf_d22', 'uf_d23', 'uf_d24',
       'uf_d25', 'uf_d26', 'esample2'],
      dtype='object', length=116)

In [3]:
# Define covariates for each category

# Mayor characteristics
mayor_covariates = [
    "pref_idade_tse",  # Age
    "pref_masc",       # Gender
    "pref_escola",     # Schooling
    "winmargin2000",   # Margin of victory in 2000
    "exp_prefeito"     # Was previously a mayor in a consecutive term
] + [col for col in corruption_df.columns if col.startswith("party_d")]

# Municipal characteristics
municipal_covariates = [
    "lpop",           # Log of population in 2000
    "purb",           # Percentage of population in urban sectors
    "p_secundario",   # Percentage with at least secondary education
    "mun_novo",       # New municipality indicator
    "lpib02",         # Log of GDP per capita in 2002
    "gini_ipea"       # Gini coefficient
]

# Political and Judicial characteristics
political_judicial_covariates = [
    "ENEP2000",  # Effective number of parties in 2000 mayor elections
    "ENLP2000",  # Effective number of parties in 2000 legislative elections
    "p_cad_pref" # Proportion of legislators from the same party as the mayor
]

# Dummies
dummy_covariates = [
    col for col in corruption_df.columns if col.startswith("uf_d") or col.startswith("sorteio")
]


print("Mayor Covariates:", mayor_covariates)
print("Municipal Covariates:", municipal_covariates)
print("Political and Judicial Covariates:", political_judicial_covariates)
print("Dummy Covariates:", dummy_covariates)

Mayor Covariates: ['pref_idade_tse', 'pref_masc', 'pref_escola', 'winmargin2000', 'exp_prefeito', 'party_d1', 'party_d3', 'party_d4', 'party_d5', 'party_d6', 'party_d7', 'party_d8', 'party_d9', 'party_d10', 'party_d11', 'party_d12', 'party_d13', 'party_d14', 'party_d15', 'party_d16', 'party_d17', 'party_d18']
Municipal Covariates: ['lpop', 'purb', 'p_secundario', 'mun_novo', 'lpib02', 'gini_ipea']
Political and Judicial Covariates: ['ENEP2000', 'ENLP2000', 'p_cad_pref']
Dummy Covariates: ['sorteio1', 'sorteio2', 'sorteio3', 'sorteio4', 'sorteio5', 'sorteio6', 'sorteio7', 'sorteio8', 'sorteio9', 'sorteio10', 'uf_d1', 'uf_d2', 'uf_d3', 'uf_d4', 'uf_d5', 'uf_d6', 'uf_d7', 'uf_d8', 'uf_d9', 'uf_d10', 'uf_d11', 'uf_d12', 'uf_d13', 'uf_d14', 'uf_d15', 'uf_d16', 'uf_d17', 'uf_d18', 'uf_d19', 'uf_d20', 'uf_d21', 'uf_d22', 'uf_d23', 'uf_d24', 'uf_d25', 'uf_d26']


In [4]:
# Define the treatment and outcome variables
treatment = "first"  
outcomes = ["pcorrupt", "ncorrupt_os", "valor_corrupt"] 

# Use all covariates
all_covariates = (
    mayor_covariates +
    municipal_covariates +
    political_judicial_covariates +
    dummy_covariates
)

# Constructing DoubleML dataset
required_columns = [treatment] + outcomes + all_covariates
double_ml_dataset = corruption_df[required_columns].dropna()

print("Double ML Dataset Columns:", double_ml_dataset.columns)
print("Number of rows in the Double ML Dataset:", double_ml_dataset.shape[0])

Double ML Dataset Columns: Index(['first', 'pcorrupt', 'ncorrupt_os', 'valor_corrupt', 'pref_idade_tse',
       'pref_masc', 'pref_escola', 'winmargin2000', 'exp_prefeito', 'party_d1',
       'party_d3', 'party_d4', 'party_d5', 'party_d6', 'party_d7', 'party_d8',
       'party_d9', 'party_d10', 'party_d11', 'party_d12', 'party_d13',
       'party_d14', 'party_d15', 'party_d16', 'party_d17', 'party_d18', 'lpop',
       'purb', 'p_secundario', 'mun_novo', 'lpib02', 'gini_ipea', 'ENEP2000',
       'ENLP2000', 'p_cad_pref', 'sorteio1', 'sorteio2', 'sorteio3',
       'sorteio4', 'sorteio5', 'sorteio6', 'sorteio7', 'sorteio8', 'sorteio9',
       'sorteio10', 'uf_d1', 'uf_d2', 'uf_d3', 'uf_d4', 'uf_d5', 'uf_d6',
       'uf_d7', 'uf_d8', 'uf_d9', 'uf_d10', 'uf_d11', 'uf_d12', 'uf_d13',
       'uf_d14', 'uf_d15', 'uf_d16', 'uf_d17', 'uf_d18', 'uf_d19', 'uf_d20',
       'uf_d21', 'uf_d22', 'uf_d23', 'uf_d24', 'uf_d25', 'uf_d26'],
      dtype='object')
Number of rows in the Double ML Dataset: 467

## Lasso 

In [5]:
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error

In [31]:
# Initialize a list to store results for Lasso
lasso_results = []

# Loop through each outcome in the list of outcomes
for outcome in outcomes:
    print(f"Processing outcome: {outcome}")
    
    # Define X, D, and Y for the current outcome
    X = double_ml_dataset[all_covariates]
    D = double_ml_dataset[treatment]
    Y = double_ml_dataset[outcome]
    
    # Standardize the covariates
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Prepare 5-Fold Cross-Fitting
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    
    # Create train-test splits
    folds = []
    for train_index, test_index in kf.split(X_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        D_train, D_test = D.iloc[train_index], D.iloc[test_index]
        Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
        folds.append((X_train, X_test, D_train, D_test, Y_train, Y_test))
    
    # Initialize lists to store fold estimates and MSE
    fold_estimates = []
    fold_mse = []
    
    # Regularization path
    alphas = np.logspace(-4, 0, 50)  # Regularization strength values
    
    # Loop through folds for cross-fitting
    for fold_idx, (X_train, X_test, D_train, D_test, Y_train, Y_test) in enumerate(folds):
        # Train Lasso model using cross-validation to find the optimal alpha
        lasso = LassoCV(alphas=alphas, cv=5, random_state=42).fit(X_train, Y_train)
        optimal_alpha = lasso.alpha_
        
        # Predict on test data
        Y_pred = lasso.predict(X_test)
        
        # Theta (Treatment Effect)
        treatment_effect = (
            np.mean(Y_pred[D_test.values == 1]) - np.mean(Y_pred[D_test.values == 0])
        )
        fold_estimates.append(treatment_effect)
        
        # MSE
        mse = mean_squared_error(Y_test, Y_pred)
        fold_mse.append(mse)
    
    # Compute overall average estimate and MSE for the current outcome
    average_estimate = np.mean(fold_estimates)
    average_mse = np.mean(fold_mse)
    
    # Store the results
    lasso_results.append({"Outcome": outcome, "Estimate": average_estimate, "MSE": average_mse})

# Convert the results to a DataFrame for better visualization
lasso_results_df = pd.DataFrame(lasso_results)

# Display the results DataFrame
print(lasso_results_df)

Processing outcome: pcorrupt
Processing outcome: ncorrupt_os
Processing outcome: valor_corrupt
         Outcome      Estimate           MSE
0       pcorrupt      0.000186  1.026658e-02
1    ncorrupt_os      0.000101  2.078044e-03
2  valor_corrupt  21235.027344  3.580824e+11


In [27]:
import warnings
warnings.filterwarnings("ignore")

## Random Forest

In [24]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

In [None]:
rf_results = []

# Loop through each outcome in the list of outcomes
for outcome in outcomes:
    print(f"Processing outcome: {outcome}")
    # Define X, D, and Y for the current outcome
    X = double_ml_dataset[all_covariates]
    D = double_ml_dataset[treatment]
    Y = double_ml_dataset[outcome]
    
    # Standardize the covariates
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # 5-Fold Cross-Fitting
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    
    # Create train-test splits
    folds = []
    for train_index, test_index in kf.split(X_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        D_train, D_test = D.iloc[train_index], D.iloc[test_index]
        Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
        folds.append((X_train, X_test, D_train, D_test, Y_train, Y_test))
    
    # Initialize lists to store fold estimates and MSE
    fold_estimates = []
    fold_mse = []
    
    # Loop through folds for cross-fitting
    for fold_idx, (X_train, X_test, D_train, D_test, Y_train, Y_test) in enumerate(folds):
        # Train Random Forest model
        rf_model = RandomForestRegressor(random_state=4140, n_estimators=100, max_depth=None).fit(X_train, Y_train)
        
        # Predict on test data
        Y_pred = rf_model.predict(X_test)
        
        # Treatment Effect
        treatment_effect = (
            np.mean(Y_pred[D_test.values == 1]) - np.mean(Y_pred[D_test.values == 0])
        )
        fold_estimates.append(treatment_effect)
        
        # MSE
        mse = mean_squared_error(Y_test, Y_pred)
        fold_mse.append(mse)
    
    # Compute overall average estimate and MSE for the current outcome
    average_estimate = np.mean(fold_estimates)
    average_mse = np.mean(fold_mse)
    
    # Store the results
    rf_results.append({"Outcome": outcome, "Estimate": average_estimate, "MSE": average_mse})

# Convert the results to a DataFrame for better visualization
results_df = pd.DataFrame(rf_results)

# Display the results DataFrame
print(results_df)

Processing outcome: pcorrupt
Processing outcome: ncorrupt_os
Processing outcome: valor_corrupt
         Outcome     Estimate           MSE
0       pcorrupt    -0.001203  1.127543e-02
1    ncorrupt_os    -0.002318  2.293435e-03
2  valor_corrupt  8737.629605  3.360745e+11


## Neural Network

## Gradient Boosting


In [10]:
from sklearn.ensemble import GradientBoostingRegressor


In [16]:
outcome = outcomes[0]  
X = double_ml_dataset[all_covariates]
D = double_ml_dataset[treatment]
Y = double_ml_dataset[outcome]

# Standardize the covariates
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Prepare 5-Fold Cross-Fitting
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Create train-test splits
folds = []
for train_index, test_index in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    D_train, D_test = D.iloc[train_index], D.iloc[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
    folds.append((X_train, X_test, D_train, D_test, Y_train, Y_test))

# Regularization Path for Lasso
alphas = np.logspace(-4, 0, 50)  # 50 values between 10^-4 and 10^0

In [17]:

# Initialize lists to store fold estimates and MSE
fold_estimates = []
fold_mse = []


# Loop through folds for cross-fitting
for fold_idx, (X_train, X_test, D_train, D_test, Y_train, Y_test) in enumerate(folds):
    # Train Gradient Boosting model
    gb_model = GradientBoostingRegressor(random_state=42).fit(X_train, Y_train)
    
    # Predict on test data
    Y_pred = gb_model.predict(X_test)
    
    # Calculate the treatment effect
    treatment_effect = (
        np.mean(Y_pred[D_test.values == 1]) - np.mean(Y_pred[D_test.values == 0])
    )
    fold_estimates.append(treatment_effect)
    
    # MSE
    mse = mean_squared_error(Y_test, Y_pred)
    fold_mse.append(mse)
    
    print(f"Fold {fold_idx + 1}:")
    print(f"  Treatment Effect Estimate: {treatment_effect}")
    print(f"  MSE: {mse}")
    print()

# Compute overall average estimate and MSE
average_estimate = np.mean(fold_estimates)
average_mse = np.mean(fold_mse)

print("Final Results:")
print(f" Average Treatment Effect Estimate: {average_estimate}")
print(f" Average MSE: {average_mse}")

Fold 1:
  Treatment Effect Estimate: 0.00016003802844498372
  MSE: 0.015647467677191204

Fold 2:
  Treatment Effect Estimate: -0.0006888164775919192
  MSE: 0.01649880032634493

Fold 3:
  Treatment Effect Estimate: -0.013099382864855375
  MSE: 0.009203978743665922

Fold 4:
  Treatment Effect Estimate: 0.0018513191815970748
  MSE: 0.008108130135200117

Fold 5:
  Treatment Effect Estimate: -0.006359663711762623
  MSE: 0.011820252893298648

Final Results:
 Average Treatment Effect Estimate: -0.0036273011688335716
 Average MSE: 0.012255725955140165


## Analysis 

compare mse/theta,


## Conclusion