In [3]:
# Libraries for getting the data
import numpy as np
import pandas as pd
from src.modules import * #contains functions used in common with processing election and IRS data
import os

In [None]:
# Let's begin the process of determining a model to use. Since the data we have has a good number of outliers, we will be using the log of the data for this process

In [5]:
# Read the two log-transformed House/IRS tables. These are the np.log versions of the
# files created in 'merge_State_IRS_data.py'; see the 'transform_with_log' file for details.
house_IRS_f, house_IRS_d = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'data/logarithm_of_joined_data/house_IRS_f_log.csv',
        r'data/logarithm_of_joined_data/house_IRS_d_log.csv',
    )
)

In [None]:
# Quick sanity check: summary statistics for the numeric columns of house_IRS_f
house_IRS_f.describe()

In [None]:
### Create columns for incumbent voteshare and challenger voteshare
# Re-read the log-transformed 'd' table (np.log of a file created in 'merge_State_IRS_data.py',
# see the 'transform_with_log' file for details) and append both combined columns in one chain.
# NOTE(review): the inputs are already logged, so each sum below is a sum of logs rather than
# the log of a summed share -- confirm that this is the intended target.
house_IRS_diff = pd.read_csv('data/logarithm_of_joined_data/house_IRS_d_log.csv').assign(
    Inc=lambda frame: frame['R1'] + frame['D1'],
    Challenger=lambda frame: frame['R0'] + frame['D0'],
)

In [None]:
# Load Libraries for visualizing the dfs

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import IsolationForest

In [None]:
# Scale the data
# Opting for MinMax here, but run with StandardScaler() as well, see if it produces different results
scaler_MM = MinMaxScaler()
scaled_values_MM = scaler_MM.fit_transform(house_IRS_diff)
# fit_transform hands back a bare array, so re-attach the original column labels
data_scaled_MM = pd.DataFrame(scaled_values_MM, columns=house_IRS_diff.columns)


In [None]:
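# # Detect outliers using Isolation Forest
# ### This was recommended, but has not been deployed yet as I do not fully understand it
# ### TODO: `f2_data_scaled_MM` does not appear to be defined in the cells above;
# ###       `data_scaled_MM` is presumably the intended input -- confirm before enabling
# isolation_forest = IsolationForest(contamination=0.05, random_state=12)
# is_inlier = isolation_forest.fit_predict(f2_data_scaled_MM)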


In [None]:
# Create histograms for each feature
### NOTE, running this is... less helpful, because of the number of columns
# One figure per column of the MinMax-scaled frame, with a KDE curve drawn over the bars.
for c in data_scaled_MM.columns:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(data_scaled_MM[c], bins=30, kde=True, ax=ax)
    ax.set_title(f'Histogram for {c}')
    plt.show()

In [None]:
# Create box plots for each feature
# All MinMax-scaled columns share one wide figure; tick labels are turned vertical.
fig, ax = plt.subplots(figsize=(16, 8))
sns.boxplot(data=data_scaled_MM, ax=ax)
ax.set_title('Box Plot for Each Variable')
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()

In [None]:
# Scale the data
# Same frame as the MinMax pass, this time with StandardScaler(), to see if it produces different results
scaler_ss = StandardScaler()
scaled_values_S = scaler_ss.fit_transform(house_IRS_diff)
# fit_transform hands back a bare array, so re-attach the original column labels
data_scaled_S = pd.DataFrame(scaled_values_S, columns=house_IRS_diff.columns)


In [None]:
# Create histograms for each feature
### NOTE, running this is... less helpful, because of the number of columns
# One figure per column of the StandardScaler-scaled frame, with a KDE curve drawn over the bars.
for c in data_scaled_S.columns:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(data_scaled_S[c], bins=30, kde=True, ax=ax)
    ax.set_title(f'Histogram for {c}')
    plt.show()

In [None]:
# Create box plots for each feature
# All StandardScaler-scaled columns share one wide figure; tick labels are turned vertical.
fig, ax = plt.subplots(figsize=(16, 8))
sns.boxplot(data=data_scaled_S, ax=ax)
ax.set_title('Box Plot for Each Variable')
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()

In [None]:
### Sooooo that looks like a lot of outliers

In [None]:
### Let's at least try a basic Gridsearch and random forest modeling

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import PCA # There are a LOT of features, so using PCA to reduce them seems like a good idea
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score #using MSE at first, remember to try other error metrics

In [None]:
# Assign X and y
### Our dependent variables for this model will just be 'Inc' and 'Challenger'
target_columns = ['Inc', 'Challenger']
# These columns are removed from the predictors together with the two targets
non_feature_columns = ['D0', 'D1', 'OTHER0', 'R0', 'R1'] + target_columns
X = house_IRS_diff.drop(columns=non_feature_columns)
y = house_IRS_diff[target_columns]

In [None]:
# Random-forest baseline on StandardScaler + PCA features.
#
# The train/test split is done BEFORE the scaler and PCA are fitted. Fitting either one on
# the full data set lets statistics from the held-out rows leak into the training features
# and makes the reported test error optimistic.
# NOTE: inside GridSearchCV the five folds still share the scaler/PCA fitted on the whole
# training split; wrapping the steps in a sklearn Pipeline would remove that as well.

# Split into train and test sets
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

# Scale the data with StandardScaler, learning mean/std from the training rows only
X_train_scaled = scaler_ss.fit_transform(X_train_raw)
X_test_scaled = scaler_ss.transform(X_test_raw)

# Perform PCA to reduce features, again fitted on the training rows only
num_components = 15  # Consider adjusting this value if results are unsatisfactory
pca = PCA(n_components=num_components, random_state=12)
X_train = pca.fit_transform(X_train_scaled)
X_test = pca.transform(X_test_scaled)

# Assign model (seeded so repeated runs of this cell build the same forests)
model = RandomForestRegressor(random_state=12)

# first pass at guessing hyperparameters
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# build grid search: 5-fold CV, scored by negative MSE, using every available core
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the model
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(f"Best Hyperparameters: {best_params}")

# Evaluate the best estimator found by the search on the held-out rows
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)

print(f"Mean Squared Error on Test Set: {mse}")

In [None]:
### Check with min-max Scaling as well
#
# Same random-forest grid search as the StandardScaler run, with MinMaxScaler in its place.
# The train/test split is done BEFORE the scaler and PCA are fitted. Fitting either one on
# the full data set lets statistics from the held-out rows leak into the training features
# and makes the reported test error optimistic.
# NOTE: inside GridSearchCV the five folds still share the scaler/PCA fitted on the whole
# training split; wrapping the steps in a sklearn Pipeline would remove that as well.

# Split into train and test sets
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

# Scale data with Min-Max, learning min/max from the training rows only
X_train_scaled_MM = scaler_MM.fit_transform(X_train_raw)
X_test_scaled_MM = scaler_MM.transform(X_test_raw)

# Perform PCA to reduce features, again fitted on the training rows only
num_components = 15  # Consider adjusting this value if results are unsatisfactory
pca = PCA(n_components=num_components, random_state=12)
X_train = pca.fit_transform(X_train_scaled_MM)
X_test = pca.transform(X_test_scaled_MM)

# Assign model (seeded so repeated runs of this cell build the same forests)
model = RandomForestRegressor(random_state=12)

# first pass at guessing hyperparameters
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# build grid search: 5-fold CV, scored by negative MSE, using every available core
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the model
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(f"Best Hyperparameters: {best_params}")

# Evaluate the best estimator found by the search on the held-out rows
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)

print(f"Mean Squared Error on Test Set: {mse}")

In [None]:

### Let's build a more robust gridsearch

In [None]:
from sklearn.linear_model import Ridge, Lasso
from xgboost import XGBRegressor

In [None]:
# Compare several regressors (XGBoost, Ridge, Lasso, RandomForest) with one grid search each,
# on StandardScaler + PCA features.
#
# The train/test split is done BEFORE the scaler and PCA are fitted. Fitting either one on
# the full data set lets statistics from the held-out rows leak into the training features
# and makes the reported test error optimistic.
# NOTE: inside GridSearchCV the five folds still share the scaler/PCA fitted on the whole
# training split; wrapping the steps in a sklearn Pipeline would remove that as well.

# Assign X and y
X = house_IRS_diff.drop(['D0', 'D1', 'OTHER0', 'R0', 'R1','Inc','Challenger'],axis=1)
### Our dependent variables for this model will just be 'Inc' and 'Challenger'
y = house_IRS_diff[['Inc','Challenger']]

# Split into train and test sets
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

### Reminder - if StandardScaler() is used, consider re-running with Min-Max
# Scaler statistics are learned from the training rows only
X_train_scaled = scaler_ss.fit_transform(X_train_raw)
X_test_scaled = scaler_ss.transform(X_test_raw)

# PCA to reduce the number of features, again fitted on the training rows only
pca = PCA(n_components=15, random_state=12)
X_train = pca.fit_transform(X_train_scaled)
X_test = pca.transform(X_test_scaled)

# List Models and their respective hyperparameter grids
### THESE will likely require more tuning
# The stochastic estimators are seeded so repeated runs of this cell are comparable
models = {
    'XGBoost': (
        XGBRegressor(random_state=12),
        {'learning_rate': [0.1, 0.01, 0.001], 'n_estimators': [25, 50, 75, 150], 'max_depth': [3, 5, 7]},
    ),
    'Ridge': (Ridge(), {'alpha': [0.1, 1.0, 10.0, 25]}),
    'Lasso': (Lasso(), {'alpha': [0.1, 1.0, 10.0, 25]}),
    'RandomForest': (
        RandomForestRegressor(random_state=12),
        {'n_estimators': [50, 100, 150], 'max_depth': [None, 10, 20], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]},
    ),
}

# Perform GridSearchCV for each model
for model_name, (model, param_grid) in models.items():
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Access best hyperparameters and model for each model
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_

    # Evaluate on the test set
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)

    print(f"Best hyperparameters for {model_name}: {best_params}")
    print(f"Best model for {model_name}: {best_model}")
    print(f"Mean Squared Error on Test Set: {mse}\n")

In [None]:

### Re-run with Min-max
#
# Same multi-model grid search as the StandardScaler run, but scaled with the MinMaxScaler
# instance (scaler_MM); using scaler_ss here would only repeat the StandardScaler run.
#
# The train/test split is done BEFORE the scaler and PCA are fitted. Fitting either one on
# the full data set lets statistics from the held-out rows leak into the training features
# and makes the reported test error optimistic.
# NOTE: inside GridSearchCV the five folds still share the scaler/PCA fitted on the whole
# training split; wrapping the steps in a sklearn Pipeline would remove that as well.

# Split into train and test sets
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

# Min-max scaling, learning min/max from the training rows only
X_train_scaled = scaler_MM.fit_transform(X_train_raw)
X_test_scaled = scaler_MM.transform(X_test_raw)

# PCA to reduce the number of features, again fitted on the training rows only
pca = PCA(n_components=15, random_state=12)
X_train = pca.fit_transform(X_train_scaled)
X_test = pca.transform(X_test_scaled)

# List Models and their respective hyperparameter grids
### THESE will likely require more tuning
# The stochastic estimators are seeded so repeated runs of this cell are comparable
models = {
    'XGBoost': (
        XGBRegressor(random_state=12),
        {'learning_rate': [0.1, 0.01, 0.001], 'n_estimators': [25, 50, 75, 150], 'max_depth': [3, 5, 7]},
    ),
    'Ridge': (Ridge(), {'alpha': [0.1, 1.0, 10.0, 25]}),
    'Lasso': (Lasso(), {'alpha': [0.1, 1.0, 10.0, 25]}),
    'RandomForest': (
        RandomForestRegressor(random_state=12),
        {'n_estimators': [50, 100, 150], 'max_depth': [None, 10, 20], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]},
    ),
}

# Perform GridSearchCV for each model
for model_name, (model, param_grid) in models.items():
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Access best hyperparameters and model for each model
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_

    # Evaluate on the test set
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)

    print(f"Best hyperparameters for {model_name}: {best_params}")
    print(f"Best model for {model_name}: {best_model}")
    print(f"Mean Squared Error on Test Set: {mse}\n")

In [None]:
# Random forest with fixed hyperparameters, seeded for repeatability.
# NOTE(review): these values were presumably copied from a grid-search printout -- confirm
# they still match the latest search results.
rf_settings = {
    'max_depth': 20,
    'min_samples_leaf': 4,
    'min_samples_split': 10,
    'n_estimators': 100,
    'random_state': 12,
}
best_model = RandomForestRegressor(**rf_settings)

In [None]:
# Train the fixed-parameter forest on the X_train / y_train currently in the kernel,
# i.e. the split produced by whichever modelling cell was run most recently.
best_model.fit(X_train, y_train)

In [None]:
# Score the fitted forest on the held-out rows
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'r2 = {r2} \n and mse = {mse}')

# Actual vs. predicted scatter; y_test and y_pred are passed whole, so every target
# column lands in the same set of points.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.set(
    xlabel="Actual Values",
    ylabel="Predicted Values",
    title="Actual vs. Predicted Values",
)
plt.show()

In [None]:
# Soooo that's not the best