In [1]:

import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
df_elected = pd.read_csv("C:\Users\An\Aalto University\Cao Duong - ECONOMETRICS-CAPSTONE\submission file\elected.csv", skiprows=1, header=None)

df_capstone = pd.read_csv("Data_Capstone_cvs.csv", skiprows=1, header=None)

num_cols_elected = df_elected.shape[1]
df_elected.columns = [f"v{i+1}" for i in range(num_cols_elected)]

num_cols_capstone = df_capstone.shape[1]
df_capstone.columns = [f"v{i+1}" for i in range(num_cols_capstone)]


df_merged = pd.merge(df_capstone, df_elected, on=["v1", "v2", "v3"], how="outer", suffixes=("", "_elected"))


df_merged['elected'] = np.where(df_merged["v4_elected"].notna(), 1, 0)



SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (2746920114.py, line 1)

In [None]:

if df_merged.index.max() >= 927:  
    df_merged.loc[927, "v40"] = "600"

for i in range(1, 43):  # 1 to 42
    col = f"v{i}"
    if col in df_merged.columns and df_merged[col].dtype == object:
        df_merged[col] = df_merged[col].astype(str).str.replace("'", "", regex=True)
        df_merged[col] = df_merged[col].astype(str).str.replace(",", ".", regex=True)

# Convert columns to numeric
cols_to_destring_1 = [f"v{i}" for i in range(10, 26)]  # v10 to v25
cols_to_destring_2 = ["v28", "v31", "v34", "v37", "v40"]

for col in (cols_to_destring_1 + cols_to_destring_2):
    if col in df_merged.columns:
        df_merged[col] = pd.to_numeric(df_merged[col], errors='coerce')


# ### Rename and drop columns

# Create a dictionary of old->new column names
rename_dict = {
    "v1": "candidate_number",
    "v2": "first_name",
    "v3": "last_name",
    "v4": "title",
    "v5": "municipality",
    "v6": "party",
    "v7": "arrival_day",
    "v8": "date_modified",
    "v9": "support_group",
    "v10": "total_expenses",
    "v11": "newspaper",
    "v12": "radio",
    "v13": "TV",
    "v14": "info_network",
    "v15": "other_platform",
    "v16": "outdoor_ad",
    "v17": "purchases",
    "v18": "ad_design",
    "v19": "event",
    "v20": "aquisition_cost",
    "v21": "other_charges",
    "v22": "total_funding",
    "v23": "own_resources",
    "v24": "loans",
    "v25": "individual_support",
    "v28": "aid",
    "v31": "party_support",
    "v34": "party_support_association",
    "v37": "other_sources_support",
    "v40": "intermediated_aid"
}
df_merged.rename(columns=rename_dict, inplace=True)

# Example of generating ex_indi_min1500 from v26 == "X"
if "v26" in df_merged.columns:
    df_merged["ex_indi_min1500"] = df_merged["v26"].astype(str).eq("X")

# Drop columns no longer needed
cols_to_drop = [
    "v26", "v27", "v29", "v30", "v32", "v33", "v35", "v36", "v38", "v39", "v41", "v42",
    "title", "arrival_day", "date_modified", "support_group"
]
df_merged.drop(columns=[c for c in cols_to_drop if c in df_merged.columns], inplace=True, errors='ignore')


In [None]:

# ### Column reorder and correlation checks

# Reorder columns so that total_funding appears before total_expenses
cols = list(df_merged.columns)
if "total_funding" in cols and "total_expenses" in cols:
    cols.remove("total_funding")
    idx = cols.index("total_expenses")
    cols.insert(idx, "total_funding")
df_merged = df_merged[cols]

# Correlation among selected variables
vars_for_corr = ["elected", "newspaper", "radio", "TV", "info_network", 
                 "other_platform", "outdoor_ad", "purchases", "ad_design", "event"]
corr_matrix = df_merged[vars_for_corr].corr()
print("Correlation Matrix:")
print(corr_matrix)


In [None]:

# ### Logistic regression on total_funding

df_model_1 = df_merged.dropna(subset=["elected", "total_funding"]).copy()

# Build design matrices
X = sm.add_constant(df_model_1["total_funding"])  # Add intercept
y = df_model_1["elected"]

# Fit logistic regression
logit_model_1 = sm.Logit(y, X).fit(disp=0)
print(logit_model_1.summary())

# Marginal (partial) effects for total_funding
# statsmodels allows get_margeff for logistic models:
mfx = logit_model_1.get_margeff()
print(mfx.summary())

# Predicted probabilities
df_model_1["p"] = logit_model_1.predict(X)

# Plot predicted probabilities against total_funding
plt.figure(figsize=(8, 5))
plt.scatter(df_model_1["total_funding"], df_model_1["p"], alpha=0.5, label="Predicted Probabilities")
sns.regplot(x="total_funding", y="p", data=df_model_1, 
            scatter=False, lowess=True, color="blue", label="Lowess")
plt.xlabel("Total Funding")
plt.ylabel("Predicted Probability of Elected")
plt.title("Linearity of Logit (Predicted Probability vs. Total Funding)")
plt.legend()
plt.show()

In [None]:
# ### Illustrative Stepwise by AIC

# %%
predictors = ["aid", "other_sources_support", "own_resources", 
              "party_support_association", "intermediated_aid",
              "loans", "individual_support"]

df_model_2 = df_merged.dropna(subset=["elected"] + predictors).copy()
y = df_model_2["elected"]

best_model = None
best_aic = np.inf
current_predictors = []

# Forward stepwise approach
for var in predictors:
    test_model_predictors = current_predictors + [var]
    X_test = sm.add_constant(df_model_2[test_model_predictors])
    try:
        fit_test = sm.Logit(y, X_test).fit(disp=0)
        if fit_test.aic < best_aic:
            best_aic = fit_test.aic
            best_model = fit_test
            current_predictors = test_model_predictors
    except:
        # If the model fails to converge for any reason, skip
        continue
    
    print(f"Current model predictors: {test_model_predictors}")
    print(f"AIC: {fit_test.aic}")
    print(f"Best model so far: {current_predictors}, AIC = {best_aic}")
    print("-"*50)

print("Final model predictors:", current_predictors)
print("Final model AIC:", best_aic)

if best_model is not None:
    print(best_model.summary())

In [None]:
# %% [markdown]
# ### Compare Probit and Logit with multiple predictors

# %%
predictors_ad = ["newspaper", "radio", "TV", "info_network", 
                 "other_platform", "outdoor_ad", "purchases", "event"]
df_model_3 = df_merged.dropna(subset=["elected"] + predictors_ad)

X_ad = sm.add_constant(df_model_3[predictors_ad])
y_ad = df_model_3["elected"]

# Probit
probit_model = sm.Probit(y_ad, X_ad).fit(disp=0)
print("Probit model:\n", probit_model.summary())

# Logit
logit_model_2 = sm.Logit(y_ad, X_ad).fit(disp=0)
print("\nLogit model:\n", logit_model_2.summary())

# Marginal effects for Logit
mfx_logit_2 = logit_model_2.get_margeff()
print("\nMarginal effects (Logit):\n", mfx_logit_2.summary())

# Pairwise correlation
corr_matrix_ad = df_model_3[["elected"] + predictors_ad].corr()
print("\nPairwise correlations:\n", corr_matrix_ad)

# Histogram example
df_model_3["newspaper"].hist(bins=20)
plt.title("Histogram of 'newspaper'")
plt.xlabel("Spending\Value for Newspaper")
plt.ylabel("Frequency")
plt.show()

# %% [markdown]
# ### Heatmap of correlation matrix

# %%
plt.figure(figsize=(8,6))
sns.heatmap(corr_matrix_ad, annot=True, cmap="coolwarm", fmt=".3f")
plt.title("Correlation Heatmap")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# %% [markdown]
# ### Residual diagnostics

# %%
df_model_3["resid"] = logit_model_2.resid_linear  # Linear predictors residual
plt.figure()
sns.histplot(df_model_3["resid"], kde=True)
plt.title("Histogram of Residuals")
plt.xlabel("Residual")
plt.show()

# Normal Q-Q plot
sm.qqplot(df_model_3["resid"], line='45', fit=True)
plt.title("Normal Q-Q of Residuals")
plt.show()

# Shapiro-Wilk test (like swilk)
from scipy.stats import shapiro
stat, p_value = shapiro(df_model_3["resid"].dropna())
print(f"Shapiro-Wilk Test: statistic={stat:.4f}, p-value={p_value:.4f}")


In [None]:
# %% [markdown]
# ### Subgroup analysis by region

# %%
df_merged["region"] = np.nan

# Example logic: "gen region = substr(municipality, 1, strpos(municipality, " ") - 1)"
# We'll try a Python version if municipality has spaces
def extract_region(x):
    if pd.isna(x):
        return np.nan
    parts = str(x).split()
    return parts[0] if len(parts) > 0 else np.nan

df_merged["region"] = df_merged["municipality"].apply(extract_region)

# Remove hyphens
df_merged["region"] = df_merged["region"].str.replace("-", "", regex=True)

# Drop municipality if desired
df_merged.drop(columns=["municipality"], inplace=True)

# Unique levels of region
regions = df_merged["region"].dropna().unique()

for reg in regions:
    df_sub = df_merged.loc[df_merged["region"] == reg].dropna(
        subset=["elected", "newspaper", "radio", "TV", 
                "info_network", "outdoor_ad", "purchases", "event"]
    )
    if df_sub.empty:
        continue


In [None]:
    
    X_sub = sm.add_constant(df_sub[["newspaper", "radio", "TV", 
                                   "info_network", "outdoor_ad", 
                                   "purchases", "event"]])
    y_sub = df_sub["elected"]
    
    try:
        fit_sub = sm.Logit(y_sub, X_sub).fit(disp=0)
        print(f"Region: {reg}")
        print(fit_sub.summary())
        mfx_sub = fit_sub.get_margeff()
        print("Marginal effects:\n", mfx_sub.summary())
        print("-"*60)
    except:
        print(f"Model failed for region: {reg}")
        print("-"*60)
