In [1]:
# Libraries for loading in data and building features
import numpy as np
import pandas as pd
# from docx import Document
from src.modules import * #contains functions used in common with processing election and IRS data
import os

In [2]:

### Libraries for  Gridsearch with Ridge and LASSO regression, as well as XGBoost and random forest modeling
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import PCA # There are a LOT of features, so using PCA to reduce them seems like a good idea
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import Ridge, Lasso
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score #using MSE at first, remember to try other error metrics with future analysis

In [3]:

# Load data, this file is the np.log of the files created in 'merge_State_IRS_data.py', see 'transform_with_log' file for details
house_IRS_diff = pd.read_csv('data/logarithm_of_joined_data/house_IRS_d_log.csv')

### Create columns for incumbant voteshare and challenger voteshare

# Combines votes for incumbant candidates regardless of party
house_IRS_diff['Inc'] = (house_IRS_diff['R1'] + house_IRS_diff['D1']) 
# if future data contains incumbants who are not Dem or Rep, INCLUDE THOSE

# Combines votes for challengers regardless of party
house_IRS_diff['Challenger'] = (house_IRS_diff['R0'] + house_IRS_diff['D0'] + house_IRS_diff['OTHER0']) 

In [10]:
# Assign X and y
X = house_IRS_diff.drop(['D0', 'D1', 'OTHER0', 'R0', 'R1','Inc','Challenger'],axis=1)
### Our dependent variables for this model will just be 'Inc' and 'Challenger'
y = house_IRS_diff[['Inc','Challenger']]

# prepare scaler
scaler_ss = StandardScaler()
### Reminder - if StandardScaler() is used, consider re-running with Min-Max
X_scaled = scaler_ss.fit_transform(X)

# PCA to reduce the number of features
pca = PCA(n_components=15)
X_pca = pca.fit_transform(X_scaled)

# Split into train and test sets  
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=12)


In [11]:
# List Models and their respective hyperparameter grids 
### THESE will likely require more tuning
models = {
    'XGBoost': (XGBRegressor(), {'learning_rate': [0.1, 0.01, 0.001], 'n_estimators': [25, 50, 75, 150], 'max_depth': [3, 5, 7]}),
    'Ridge': (Ridge(), {'alpha': [0.1, 1.0, 10.0, 25]}),
    'Lasso': (Lasso(), {'alpha': [0.1, 1.0, 10.0, 25]}),
    'RandomForest': (RandomForestRegressor(), {'n_estimators': [50, 100, 150], 'max_depth': [None, 10, 20], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]})
}


In [12]:
# Perform GridSearchCV for each model
for model_name, (model, param_grid) in models.items():
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Access best hyperparameters and model for each model
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_

    # Evaluate on the test set
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)

    print(f"Best hyperparameters for {model_name}: {best_params}")
    print(f"Best model for {model_name}: {best_model}")
    print(f"Mean Squared Error on Test Set: {mse}\n")

Best hyperparameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
Best model for XGBoost: XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.1, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=3, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=50, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)
Mean Squared Error on Test Set: 27.469678489881176

Best hyperparameters for Ridge: {'alpha': 25}
Best model for Ridge: Ridge(alpha=25)
Mean S

In [13]:
### Re-run with min-max scaler

# Assign X and y
X = house_IRS_diff.drop(['D0', 'D1', 'OTHER0', 'R0', 'R1','Inc','Challenger'],axis=1)
### Our dependent variables for this model will just be 'Inc' and 'Challenger'
y = house_IRS_diff[['Inc','Challenger']]

# prepare scaler
scaler_MM = MinMaxScaler()
### Reminder - if StandardScaler() is used, consider re-running with Min-Max
X_scaled = scaler_MM.fit_transform(X)

# PCA to reduce the number of features
pca = PCA(n_components=15)
X_pca = pca.fit_transform(X_scaled)

# Split into train and test sets  
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=12)


In [14]:
# List Models and their respective hyperparameter grids 
### THESE will likely require more tuning
models = {
    'XGBoost': (XGBRegressor(), {'learning_rate': [0.1, 0.01, 0.001], 'n_estimators': [25, 50, 75, 150], 'max_depth': [3, 5, 7]}),
    'Ridge': (Ridge(), {'alpha': [0.1, 1.0, 10.0, 25]}),
    'Lasso': (Lasso(), {'alpha': [0.1, 1.0, 10.0, 25]}),
    'RandomForest': (RandomForestRegressor(), {'n_estimators': [50, 100, 150], 'max_depth': [None, 10, 20], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]})
}


In [15]:
# Perform GridSearchCV for each model
for model_name, (model, param_grid) in models.items():
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Access best hyperparameters and model for each model
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_

    # Evaluate on the test set
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)

    print(f"Best hyperparameters for {model_name}: {best_params}")
    print(f"Best model for {model_name}: {best_model}")
    print(f"Mean Squared Error on Test Set: {mse}\n")

Best hyperparameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
Best model for XGBoost: XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.1, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=3, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=50, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)
Mean Squared Error on Test Set: 27.26498074846682

Best hyperparameters for Ridge: {'alpha': 10.0}
Best model for Ridge: Ridge(alpha=10.0)
Mea

In [16]:
# With min-max scaling, we get a slightly better performance from Random Forest, so let's use that model going forward:

In [None]:
# Resetting all values:
# prepare scaler
scaler_MM = MinMaxScaler()
X_scaled = scaler_MM.fit_transform(X)

# PCA to reduce the number of features
pca = PCA(n_components=15)
X_pca = pca.fit_transform(X_scaled)

# Split into train and test sets  
X_train, X_test, y_train, y_test = train_test_split(X_pca, y_min, test_size=0.2, random_state=12)

In [None]:
# Configure the model:
best_model = RandomForestRegressor(max_depth = 20, min_samples_leaf = 4, min_samples_split = 10, n_estimators = 100, random_state=12)