In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split

### The IowaHouseElections.csv file is a modified version of the IowaHouse.csv file - I kept only the 2012-2022 election data and created a Total (T) vote count column for each individual election per House district.

In [3]:
# Read in csv file
iowa_house = pd.read_csv('Files/IowaHouseElections.csv')

In [4]:
iowa_house.shape

(100, 101)

In [5]:
iowa_house.head()

Unnamed: 0,District ID,G22AgD,G22AgR,G22AgO,G22AgT,G22AgrCD,G22AgrCR,G22AgrCO,G22AgrCT,G22TreD,...,G14GovO,G14GovT,G14SenR,G14SenD,G14SenO,G14SenT,G12PreR,G12PreD,G12PreO,G12PreT
0,1,3057,2704,0,5761,2661,3076,0,5737,2970,...,166,6861,3082,3447,400,6929,3859,7133,163,11155
1,2,3683,4509,0,8192,2922,5260,0,8182,3533,...,245,9097,5007,3640,498,9145,6537,6575,263,13375
2,3,2736,8889,0,11625,1742,9998,0,11740,2703,...,97,11378,8796,2257,267,11320,11156,4146,125,15427
3,4,1598,10835,0,12433,1063,11423,0,12486,1643,...,81,11548,10263,1118,129,11510,13116,2484,113,15713
4,5,3196,8434,0,11630,1817,9881,0,11698,3155,...,129,12016,8583,2939,452,11974,10804,5402,151,16357


In [6]:
# Subsetting for all elections democratic(D) vote totals
iowa_house_d = iowa_house.loc[:, iowa_house.columns.str.contains('District ID|D$')]

In [7]:
iowa_house_d.head()

Unnamed: 0,District ID,G22AgD,G22AgrCD,G22TreD,G22AudD,G22SosD,G22GovD,G22SenD,G20PreD,G20SenD,...,G16PreD,G16SenD,G14AgD,G14AgrCD,G14TreD,G14AudD,G14SosD,G14GovD,G14SenD,G12PreD
0,1,3057,2661,2970,3097,2731,2525,2805,5610,5543,...,5236,4578,3939,2981,3506,3399,3598,2961,3447,7133
1,2,3683,2922,3533,3743,3051,2831,3218,5794,5790,...,4984,4274,4270,2850,3660,3410,3855,3006,3640,6575
2,3,2736,1742,2703,3036,2015,1758,2200,3912,3928,...,3138,2395,2531,1522,2416,2006,2691,1669,2257,4146
3,4,1598,1063,1643,1687,1218,1087,1424,2370,2237,...,1870,1479,1379,750,1364,1029,1152,831,1118,2484
4,5,3196,1817,3155,3282,2219,1928,2459,4054,4265,...,3521,2817,4108,1786,3653,2597,3437,2251,2939,5402


In [8]:
# Calculate yearly averages for all elections democratic vote totals
for year in range(12, 23, 2):
    iowa_house_d['20{}'.format(year)] = iowa_house_d.loc[:, iowa_house_d.columns.str.contains('{}'.format(year))].mean(axis=1)

In [9]:
iowa_house_d.head()

Unnamed: 0,District ID,G22AgD,G22AgrCD,G22TreD,G22AudD,G22SosD,G22GovD,G22SenD,G20PreD,G20SenD,...,G14SosD,G14GovD,G14SenD,G12PreD,2012,2014,2016,2018,2020,2022
0,1,3057,2661,2970,3097,2731,2525,2805,5610,5543,...,3598,2961,3447,7133,7133.0,3404.428571,4907.0,4758.333333,5225.960317,2835.142857
1,2,3683,2922,3533,3743,3051,2831,3218,5794,5790,...,3855,3006,3640,6575,6575.0,3527.285714,4629.0,5208.833333,5254.019841,3283.0
2,3,2736,1742,2703,3036,2015,1758,2200,3912,3928,...,2691,1669,2257,4146,4146.0,2156.0,2766.5,3625.166667,3422.277778,2312.857143
3,4,1598,1063,1643,1687,1218,1087,1424,2370,2237,...,1152,831,1118,2484,2484.0,1089.0,1674.5,2375.5,2038.333333,1388.571429
4,5,3196,1817,3155,3282,2219,1928,2459,4054,4265,...,3437,2251,2939,5402,5402.0,2967.285714,3169.0,4243.0,4016.714286,2579.428571


In [10]:
# Subsetting for yearly averages columns
iowa_house_y = iowa_house_d.drop(iowa_house_d.columns[1:26], axis=1)

In [11]:
iowa_house_y.shape

(100, 7)

In [12]:
iowa_house_y.head()

Unnamed: 0,District ID,2012,2014,2016,2018,2020,2022
0,1,7133.0,3404.428571,4907.0,4758.333333,5225.960317,2835.142857
1,2,6575.0,3527.285714,4629.0,5208.833333,5254.019841,3283.0
2,3,4146.0,2156.0,2766.5,3625.166667,3422.277778,2312.857143
3,4,2484.0,1089.0,1674.5,2375.5,2038.333333,1388.571429
4,5,5402.0,2967.285714,3169.0,4243.0,4016.714286,2579.428571


In [13]:
iowa_house_y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   District ID  100 non-null    int64  
 1   2012         100 non-null    float64
 2   2014         100 non-null    float64
 3   2016         100 non-null    float64
 4   2018         100 non-null    float64
 5   2020         100 non-null    float64
 6   2022         100 non-null    float64
dtypes: float64(6), int64(1)
memory usage: 5.6 KB


In [14]:
# Create features and target variables
features = iowa_house_y.drop(['District ID','2022'], axis=1)
target = iowa_house_y['2022']

In [15]:
# Split data into a 80% training set and 20% testing set
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=1)

### I selected the RandomForestRegressor algorithm for this predictive task due to its strong resistance to overfitting, versatility in handling various regression tasks, and the advantage of not requiring standardization and normalization.

In [16]:
# Instantiate a RandomForestRegressor algorithm
rf = RandomForestRegressor(random_state=1)

In [17]:
# Fit the model to the training set
rf.fit(X_train, y_train)

In [18]:
# R2 score using the testing set
rf.score(X_test, y_test)

0.9122850991720006

In [19]:
# Predict values using testing set features
y_pred = rf.predict(X_test)

In [20]:
# Root Mean Square Error (RMSE) model score
mse(y_test, y_pred, squared=False)

430.71167019592144

In [21]:
iowa_house_y.head()

Unnamed: 0,District ID,2012,2014,2016,2018,2020,2022
0,1,7133.0,3404.428571,4907.0,4758.333333,5225.960317,2835.142857
1,2,6575.0,3527.285714,4629.0,5208.833333,5254.019841,3283.0
2,3,4146.0,2156.0,2766.5,3625.166667,3422.277778,2312.857143
3,4,2484.0,1089.0,1674.5,2375.5,2038.333333,1388.571429
4,5,5402.0,2967.285714,3169.0,4243.0,4016.714286,2579.428571


In [22]:
# Create function to predict democratic vote totals per house district
def array_pred(year):
    array = np.array(iowa_house_y.drop(['District ID', year], axis=1).iloc[:,:])
    array_year = rf.predict(array)
    return array_year

In [23]:
# Average out democratic vote totals per house district by dropping each year and predicting for 2024
array_avg_24 = (array_pred('2012')+array_pred('2014')+array_pred('2016')+array_pred('2018')+array_pred('2020')\
                +array_pred('2022'))/6

In [24]:
# Create new 2024 column with the averaged out prediction
iowa_house_y['2024'] = array_avg_24

In [25]:
iowa_house_y.head()

Unnamed: 0,District ID,2012,2014,2016,2018,2020,2022,2024
0,1,7133.0,3404.428571,4907.0,4758.333333,5225.960317,2835.142857,3089.948095
1,2,6575.0,3527.285714,4629.0,5208.833333,5254.019841,3283.0,3259.702619
2,3,4146.0,2156.0,2766.5,3625.166667,3422.277778,2312.857143,2333.152857
3,4,2484.0,1089.0,1674.5,2375.5,2038.333333,1388.571429,1800.150476
4,5,5402.0,2967.285714,3169.0,4243.0,4016.714286,2579.428571,2501.218095


In [26]:
# Run new model to evaluate model performance using newly added predicted 2024 data
features = iowa_house_y.drop(['District ID', '2024'], axis=1)
target = iowa_house_y['2024']
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=1)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.979148330662009

In [27]:
# Average out democratic vote totals per house district by dropping each year and predicting for 2026
array_avg_26 = (array_pred('2012')+array_pred('2014')+array_pred('2016')+array_pred('2018')+array_pred('2020')\
                +array_pred('2022')+array_pred('2024'))/7

In [28]:
# Create new 2026 column with the averaged out prediction
iowa_house_y['2026'] = array_avg_26

In [29]:
iowa_house_y.head()

Unnamed: 0,District ID,2012,2014,2016,2018,2020,2022,2024,2026
0,1,7133.0,3404.428571,4907.0,4758.333333,5225.960317,2835.142857,3089.948095,3042.785988
1,2,6575.0,3527.285714,4629.0,5208.833333,5254.019841,3283.0,3259.702619,3128.643495
2,3,4146.0,2156.0,2766.5,3625.166667,3422.277778,2312.857143,2333.152857,2380.840656
3,4,2484.0,1089.0,1674.5,2375.5,2038.333333,1388.571429,1800.150476,2019.186469
4,5,5402.0,2967.285714,3169.0,4243.0,4016.714286,2579.428571,2501.218095,2492.106735


In [30]:
# Run new model to evaluate model performance using newly added predicted 2026 data
features = iowa_house_y.drop(['District ID', '2026'], axis=1)
target = iowa_house_y['2026']
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=1)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.991877896607985

In [31]:
# Average out democratic vote totals per house district by dropping each year and predicting for 2028
array_avg_28 = (array_pred('2012')+array_pred('2014')+array_pred('2016')+array_pred('2018')+array_pred('2020')\
                +array_pred('2022')+array_pred('2024')+array_pred('2026'))/8

In [32]:
# Create new 2028 column with the averaged out prediction
iowa_house_y['2028'] = array_avg_28

In [33]:
iowa_house_y.head()

Unnamed: 0,District ID,2012,2014,2016,2018,2020,2022,2024,2026,2028
0,1,7133.0,3404.428571,4907.0,4758.333333,5225.960317,2835.142857,3089.948095,3042.785988,2910.982279
1,2,6575.0,3527.285714,4629.0,5208.833333,5254.019841,3283.0,3259.702619,3128.643495,2993.138549
2,3,4146.0,2156.0,2766.5,3625.166667,3422.277778,2312.857143,2333.152857,2380.840656,2411.034302
3,4,2484.0,1089.0,1674.5,2375.5,2038.333333,1388.571429,1800.150476,2019.186469,2168.220853
4,5,5402.0,2967.285714,3169.0,4243.0,4016.714286,2579.428571,2501.218095,2492.106735,2512.317636


In [34]:
# Run new model to evaluate model performance using newly added predicted 2028 data
features = iowa_house_y.drop(['District ID', '2028'], axis=1)
target = iowa_house_y['2028']
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=1)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.9943458933687702

In [35]:
# Average out democratic vote totals per house district by dropping each year and predicting for 2030
array_avg_30 = (array_pred('2012')+array_pred('2014')+array_pred('2016')+array_pred('2018')+array_pred('2020')\
                +array_pred('2022')+array_pred('2024')+array_pred('2026')+array_pred('2028'))/9

In [36]:
# Create new 2030 column with the averaged out prediction
iowa_house_y['2030'] = array_avg_30

In [37]:
iowa_house_y.head()

Unnamed: 0,District ID,2012,2014,2016,2018,2020,2022,2024,2026,2028,2030
0,1,7133.0,3404.428571,4907.0,4758.333333,5225.960317,2835.142857,3089.948095,3042.785988,2910.982279,2831.556577
1,2,6575.0,3527.285714,4629.0,5208.833333,5254.019841,3283.0,3259.702619,3128.643495,2993.138549,2879.224065
2,3,4146.0,2156.0,2766.5,3625.166667,3422.277778,2312.857143,2333.152857,2380.840656,2411.034302,2443.500267
3,4,2484.0,1089.0,1674.5,2375.5,2038.333333,1388.571429,1800.150476,2019.186469,2168.220853,2266.248062
4,5,5402.0,2967.285714,3169.0,4243.0,4016.714286,2579.428571,2501.218095,2492.106735,2512.317636,2511.066558


In [38]:
# Run new model to evaluate model performance using newly added predicted 2030 data
features = iowa_house_y.drop(['District ID', '2030'], axis=1)
target = iowa_house_y['2030']
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=1)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.9939225216789774

In [39]:
## Save to csv
# iowa_house_y.to_csv('iowa_lh_dem.csv', index=False)