In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split

### The IowaHouseElections.csv file is a modified version of the IowaHouse.csv file - I kept only the 2012-2022 election data and created a Total (T) vote count column for each individual election per House district.

In [3]:
# Read in csv file
iowa_senate = pd.read_csv('Files/IowaSenateElections.csv')

In [4]:
iowa_senate.shape

(50, 101)

In [5]:
iowa_senate.head()

Unnamed: 0,District ID,G22AgD,G22AgR,G22AgO,G22AgT,G22AgrCD,G22AgrCR,G22AgrCO,G22AgrCT,G22TreD,...,G14GovO,G14GovT,G14SenR,G14SenD,G14SenO,G14SenT,G12PreR,G12PreD,G12PreO,G12PreT
0,1,6740,7213,0,13953,5583,8336,0,13919,6503,...,411,15958,8089,7087,898,16074,10396,13708,426,24530
1,2,4334,19724,0,24058,2805,21421,0,24226,4346,...,178,22926,19059,3375,396,22830,24272,6630,238,31140
2,3,6811,14137,0,20948,4000,17104,0,21104,6793,...,398,21866,14589,6315,914,21818,18670,11442,313,30425
3,4,8867,13560,0,22427,5650,16798,0,22448,8913,...,1064,23682,13919,8560,1070,23549,16852,15419,318,32589
4,5,9506,17438,0,26944,6039,21033,0,27072,9634,...,404,26088,15454,9319,1164,25937,19713,15557,323,35593


In [6]:
# Subsetting for all elections democratic(D) vote totals
iowa_senate_d = iowa_senate.loc[:, iowa_senate.columns.str.contains('District ID|D$')]

In [7]:
iowa_senate_d.head()

Unnamed: 0,District ID,G22AgD,G22AgrCD,G22TreD,G22AudD,G22SosD,G22GovD,G22SenD,G20PreD,G20SenD,...,G16PreD,G16SenD,G14AgD,G14AgrCD,G14TreD,G14AudD,G14SosD,G14GovD,G14SenD,G12PreD
0,1,6740,5583,6503,6840,5782,5356,6023,11404,11333,...,10220,8852,8209,5831,7166,6809,7453,5967,7087,13708
1,2,4334,2805,4346,4723,3233,2845,3624,6282,6165,...,5008,3874,3910,2272,3780,3035,3843,2500,3375,6630
2,3,6811,4000,6793,7000,4791,4384,5299,8937,9322,...,7954,6552,8521,3919,8004,5539,7142,4887,6315,11442
3,4,8867,5650,8913,8955,6157,5928,6840,10403,10818,...,9934,8347,13375,5885,12100,8641,10041,7286,8560,15419
4,5,9506,6039,9634,9390,7209,6784,8011,11846,12641,...,10574,8774,12530,5671,12268,8703,10168,7098,9319,15557


In [8]:
# Calculate yearly averages for all elections democratic vote totals
for year in range(12, 23, 2):
    iowa_senate_d['20{}'.format(year)] = iowa_senate_d.loc[:, iowa_senate_d.columns.str.contains('{}'.format(year))].mean(axis=1)

In [9]:
iowa_senate_d.head()

Unnamed: 0,District ID,G22AgD,G22AgrCD,G22TreD,G22AudD,G22SosD,G22GovD,G22SenD,G20PreD,G20SenD,...,G14SosD,G14GovD,G14SenD,G12PreD,2012,2014,2016,2018,2020,2022
0,1,6740,5583,6503,6840,5782,5356,6023,11404,11333,...,7453,5967,7087,13708,13708.0,6931.714286,9536.0,9967.166667,10479.980159,6118.142857
1,2,4334,2805,4346,4723,3233,2845,3624,6282,6165,...,3843,2500,3375,6630,6630.0,3245.0,4441.0,6000.666667,5460.611111,3701.428571
2,3,6811,4000,6793,7000,4791,4384,5299,8937,9322,...,7142,4887,6315,11442,11442.0,6332.428571,7253.0,9051.166667,8722.93254,5582.571429
3,4,8867,5650,8913,8955,6157,5928,6840,10403,10818,...,10041,7286,8560,15419,15419.0,9412.571429,9140.5,11481.666667,11112.456349,7330.0
4,5,9506,6039,9634,9390,7209,6784,8011,11846,12641,...,10168,7098,9319,15557,15557.0,9393.857143,9674.0,12073.333333,11864.198413,8081.857143


In [10]:
# Subsetting for yearly averages columns
iowa_senate_y = iowa_senate_d.drop(iowa_senate_d.columns[1:26], axis=1)

In [11]:
iowa_senate_y.shape

(50, 7)

In [12]:
iowa_senate_y.head()

Unnamed: 0,District ID,2012,2014,2016,2018,2020,2022
0,1,13708.0,6931.714286,9536.0,9967.166667,10479.980159,6118.142857
1,2,6630.0,3245.0,4441.0,6000.666667,5460.611111,3701.428571
2,3,11442.0,6332.428571,7253.0,9051.166667,8722.93254,5582.571429
3,4,15419.0,9412.571429,9140.5,11481.666667,11112.456349,7330.0
4,5,15557.0,9393.857143,9674.0,12073.333333,11864.198413,8081.857143


In [13]:
iowa_senate_y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   District ID  50 non-null     int64  
 1   2012         50 non-null     float64
 2   2014         50 non-null     float64
 3   2016         50 non-null     float64
 4   2018         50 non-null     float64
 5   2020         50 non-null     float64
 6   2022         50 non-null     float64
dtypes: float64(6), int64(1)
memory usage: 2.9 KB


In [14]:
# Create features and target variables
features = iowa_senate_y.drop(['District ID','2022'], axis=1)
target = iowa_senate_y['2022']

In [15]:
# Split data into 80% training set and 20% testing set
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=1)

### I selected the RandomForestRegressor algorithm for this predictive task due to its strong resistance to overfitting, versatility in handling various regression tasks, and the advantage of not requiring standardization and normalization.

In [16]:
# Instantiate RandomForestRegressor algorithm
rf = RandomForestRegressor(random_state=1)

In [17]:
# Fit model to the training set
rf.fit(X_train, y_train)

In [18]:
# R2 score using the testing set
rf.score(X_test, y_test)

0.8937402004804359

In [19]:
# Predict values using testing set features
y_pred = rf.predict(X_test)

In [20]:
# Root Mean Square Error (RMSE) model score
mse(y_test, y_pred, squared=False)

729.4583625524859

In [21]:
iowa_senate_y.head()

Unnamed: 0,District ID,2012,2014,2016,2018,2020,2022
0,1,13708.0,6931.714286,9536.0,9967.166667,10479.980159,6118.142857
1,2,6630.0,3245.0,4441.0,6000.666667,5460.611111,3701.428571
2,3,11442.0,6332.428571,7253.0,9051.166667,8722.93254,5582.571429
3,4,15419.0,9412.571429,9140.5,11481.666667,11112.456349,7330.0
4,5,15557.0,9393.857143,9674.0,12073.333333,11864.198413,8081.857143


In [22]:
# Create function to predict democratic vote totals per senate district
def array_pred(year):
    array = np.array(iowa_senate_y.drop(['District ID', year], axis=1).iloc[:,:])
    array_year = rf.predict(array)
    return array_year

In [23]:
# Average out democratic vote totals per senate district by dropping each year and predicting for 2024
array_avg_24 = (array_pred('2012')+array_pred('2014')+array_pred('2016')+array_pred('2018')+array_pred('2020')\
                +array_pred('2022'))/6

In [24]:
# Create new 2024 column with the averaged out prediction
iowa_senate_y['2024'] = array_avg_24

In [25]:
iowa_senate_y.head()

Unnamed: 0,District ID,2012,2014,2016,2018,2020,2022,2024
0,1,13708.0,6931.714286,9536.0,9967.166667,10479.980159,6118.142857,6432.035476
1,2,6630.0,3245.0,4441.0,6000.666667,5460.611111,3701.428571,5279.724286
2,3,11442.0,6332.428571,7253.0,9051.166667,8722.93254,5582.571429,6780.061667
3,4,15419.0,9412.571429,9140.5,11481.666667,11112.456349,7330.0,7592.820238
4,5,15557.0,9393.857143,9674.0,12073.333333,11864.198413,8081.857143,8185.600238


In [26]:
# Run new model to evaluate model performance using newly added predicted 2024 data
features = iowa_senate_y.drop(['District ID', '2024'], axis=1)
target = iowa_senate_y['2024']
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=1)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.9214971795857091

In [27]:
# Average out democratic vote totals per senate district by dropping each year and predicting for 2026
array_avg_26 = (array_pred('2012')+array_pred('2014')+array_pred('2016')+array_pred('2018')+array_pred('2020')\
                +array_pred('2022')+array_pred('2024'))/7

In [28]:
# Create new 2026 column with the averaged out prediction
iowa_senate_y['2026'] = array_avg_26

In [29]:
iowa_senate_y.head()

Unnamed: 0,District ID,2012,2014,2016,2018,2020,2022,2024,2026
0,1,13708.0,6931.714286,9536.0,9967.166667,10479.980159,6118.142857,6432.035476,6539.415388
1,2,6630.0,3245.0,4441.0,6000.666667,5460.611111,3701.428571,5279.724286,6005.204998
2,3,11442.0,6332.428571,7253.0,9051.166667,8722.93254,5582.571429,6780.061667,6538.189424
3,4,15419.0,9412.571429,9140.5,11481.666667,11112.456349,7330.0,7592.820238,7196.589869
4,5,15557.0,9393.857143,9674.0,12073.333333,11864.198413,8081.857143,8185.600238,7638.117541


In [30]:
# Run new model to evaluate model performance using newly added predicted 2026 data
features = iowa_senate_y.drop(['District ID', '2026'], axis=1)
target = iowa_senate_y['2026']
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=1)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.997216232967377

In [31]:
# Average out democratic vote totals per senate district by dropping each year and predicting for 2028
array_avg_28 = (array_pred('2012')+array_pred('2014')+array_pred('2016')+array_pred('2018')+array_pred('2020')\
                +array_pred('2022')+array_pred('2024')+array_pred('2026'))/8

In [32]:
# Create new 2028 column with the averaged out prediction
iowa_senate_y['2028'] = array_avg_28

In [33]:
iowa_senate_y.head()

Unnamed: 0,District ID,2012,2014,2016,2018,2020,2022,2024,2026,2028
0,1,13708.0,6931.714286,9536.0,9967.166667,10479.980159,6118.142857,6432.035476,6539.415388,6543.217271
1,2,6630.0,3245.0,4441.0,6000.666667,5460.611111,3701.428571,5279.724286,6005.204998,6294.263969
2,3,11442.0,6332.428571,7253.0,9051.166667,8722.93254,5582.571429,6780.061667,6538.189424,6520.389892
3,4,15419.0,9412.571429,9140.5,11481.666667,11112.456349,7330.0,7592.820238,7196.589869,7033.187673
4,5,15557.0,9393.857143,9674.0,12073.333333,11864.198413,8081.857143,8185.600238,7638.117541,7404.063202


In [34]:
# # Run new model to evaluate model performance using newly added predicted 2028 data
features = iowa_senate_y.drop(['District ID', '2028'], axis=1)
target = iowa_senate_y['2028']
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=1)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.9972253215064096

In [35]:
# Average out democratic vote totals per senate district by dropping each year and predicting for 2030
array_avg_30 = (array_pred('2012')+array_pred('2014')+array_pred('2016')+array_pred('2018')+array_pred('2020')\
                +array_pred('2022')+array_pred('2024')+array_pred('2026')+array_pred('2028'))/9

In [36]:
# Create new 2030 column with the averaged out prediction
iowa_senate_y['2030'] = array_avg_30

In [37]:
iowa_senate_y.head()

Unnamed: 0,District ID,2012,2014,2016,2018,2020,2022,2024,2026,2028,2030
0,1,13708.0,6931.714286,9536.0,9967.166667,10479.980159,6118.142857,6432.035476,6539.415388,6543.217271,6548.761588
1,2,6630.0,3245.0,4441.0,6000.666667,5460.611111,3701.428571,5279.724286,6005.204998,6294.263969,6429.779679
2,3,11442.0,6332.428571,7253.0,9051.166667,8722.93254,5582.571429,6780.061667,6538.189424,6520.389892,6529.501747
3,4,15419.0,9412.571429,9140.5,11481.666667,11112.456349,7330.0,7592.820238,7196.589869,7033.187673,6966.408443
4,5,15557.0,9393.857143,9674.0,12073.333333,11864.198413,8081.857143,8185.600238,7638.117541,7404.063202,7239.078741


In [38]:
# Run new model to evaluate model performance using newly added predicted 2030 data
features = iowa_senate_y.drop(['District ID', '2030'], axis=1)
target = iowa_senate_y['2030']
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=1)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.9984221976281834

In [39]:
## Save to csv
# iowa_senate_y.to_csv('iowa_uh_dem.csv', index=False)