# Predicting the 2020 Presidential Election

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from xgboost import XGBRegressor

from data_dicts import PERCENT_FEATURES, COUNT_FEATURES
from data_prep import (county_info_2012, county_info_2016, county_info_2018,
                       create_targets, results_info)
from model_prep import (lin_mod_func, run_model,
                        make_predictions, combine_predictions)
from visualizations import display_results, state_results_check, state_results_predict

## Data

The demographic information was obtained from the U.S. Cencus website. The 5 year study was used in order to obtain the most accurate information for smaller countues. The results data was obtained from the MIT Election Data and Science Lab.

Note: Due to issues with demographic info for Alaska, the state of Alaska is not incorporated into our analysis. These issues will be addressed at a later time.

In [2]:
features_2012 = county_info_2012()
features_2016 = county_info_2016()
features_2018 = county_info_2018()
results_2012 = results_info(2012)
results_2016 = results_info(2016)

In [3]:
df_2012 = pd.merge(features_2012, results_2012, on='County')
df_2012 = df_2012.drop(columns='State_y')
df_2012 = df_2012.rename(columns={'State_x': 'State'})
df_2012 = create_targets(df_2012)

In [4]:
df_2016 = pd.merge(features_2016, results_2016, on='County')
df_2016 = df_2016.drop(columns='State_y')
df_2016 = df_2016.rename(columns={'State_x': 'State'})
df_2016 = create_targets(df_2016)

## Methodology

The chosen target is the total number of votes each political party obtained in each county. Republican, Democrat, and Third were chosen as our political parties. After many tirals with different types of models, an XGBoost model was selected for our purposes.

2012 data was used to train the model while 2016 data was used to validate it. From here the 2012 and 2016 data were used to predict the 2020 results separately.

Both the count and percentage will be predicted on different models, meaning there will be a total of six models.

## Predict 2016 with 2012
### Count

In [5]:
X_train_c = df_2012[COUNT_FEATURES]
X_test_c = df_2016[COUNT_FEATURES]
scaler = StandardScaler()
mms = MinMaxScaler()
mms.fit(X_train_c)
#scaler.fit(X_train)

X_train_c_scale = mms.transform(X_train_c)
X_test_c_scale = mms.transform(X_test_c)
# X_train_scale = scaler.transform(X_train)
# X_test_scale = scaler.transform(X_test)

y_train_R_c = df_2012['Republican']
y_train_D_c = df_2012['Democrat']
y_train_T_c = df_2012['Third']

y_test_R_c = df_2016['Republican']
y_test_D_c = df_2016['Democrat']
y_test_T_c = df_2016['Third']

XGB = XGBRegressor(random_state=2020)

#parameters = {}

parameters = {'max_depth': [4, 5, 6, 7, 8],
              'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5]}

In [6]:
model_tuned_R_c = GridSearchCV(XGB, parameters, cv=5, n_jobs= -1)
model_tuned_R_c.fit(X_train_c_scale, y_train_R_c)
print(model_tuned_R_c.best_estimator_)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
             max_depth=4, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=2020,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=True, subsample=1)


In [7]:
model_tuned_D_c = GridSearchCV(XGB, parameters, cv=5, n_jobs= -1)
model_tuned_D_c.fit(X_train_c_scale, y_train_D_c)
print(model_tuned_D_c.best_estimator_)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
             max_depth=4, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=2020,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=True, subsample=1)


In [8]:
model_tuned_T_c = GridSearchCV(XGB, parameters, cv=5, n_jobs= -1)
model_tuned_T_c.fit(X_train_c_scale, y_train_T_c)
print(model_tuned_T_c.best_estimator_)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
             max_depth=4, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=2020,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=True, subsample=1)


In [9]:
predictions_R = make_predictions('Republican', model_tuned_R_c, X_test_c_scale, y_test_R_c, df_2016)
predictions_D = make_predictions('Democrat', model_tuned_D_c, X_test_c_scale, y_test_D_c, df_2016)
predictions_T = make_predictions('Third', model_tuned_T_c, X_test_c_scale, y_test_T_c, df_2016)

In [10]:
predictions_2016 = combine_predictions(predictions_R, predictions_D,
                                       predictions_T, df_2016)

predictions_2016 = predictions_2016.drop(columns=['Republican_y', 'Democrat_y', 'Third_y'])
predictions_2016 = predictions_2016.rename(columns={'Republican_x': 'Republican',
                                                    'Democrat_x': 'Democrat',
                                                    'Third_x': 'Third'})

In [11]:
predictions_2016.sample(5).round()

Unnamed: 0,State,County,Total population,Republican,Republican Predictions,Democrat,Democrat Predictions,Third,Third Predictions
878,Indiana,"White County, Indiana, 2016",17033,6893,13836.0,2590,14020.0,570,387.0
771,Louisiana,"Cameron County, Louisiana, 2016",16893,3256,14095.0,323,13789.0,113,436.0
665,Kentucky,"Clay County, Kentucky, 2016",16089,5861,22055.0,752,13095.0,154,1024.0
2718,Texas,"McLennan County, Texas, 2016",19585,48260,13974.0,27063,10544.0,3752,368.0
907,Iowa,"Delaware County, Iowa, 2016",27836,5694,19996.0,2957,23381.0,590,440.0


In [12]:
state_results = state_results_check(predictions_2016)

In [13]:
state_results

Unnamed: 0,State,Democrat,Democrat Predictions,Republican,Republican Predictions,Third,Third Predictions,Winner
0,Alabama,729547,1218410,1318250,1197831,75570,69012,20578
1,Arizona,1161167,374756,1252401,348848,190709,21195,25907
2,Arkansas,380494,1484418,684872,1739457,64530,84410,-255038
3,California,8753788,1590383,4483810,1671056,943997,87219,-80673
4,Colorado,1338870,2582649,1202484,1524017,238866,100110,1058632
5,Connecticut,897572,146484,673215,134644,74133,5250,11840
6,Delaware,235603,43084,185127,45527,22267,2010,-2442
7,District of Columbia,282830,187086,12723,23695,15715,12785,163391
8,Florida,4472151,1834295,4562488,1669547,294573,49128,164747
9,Georgia,1877963,3792742,2089104,3002667,147644,184551,790075


In [14]:
print(state_results['Democrat Predictions'].sum(),
      state_results['Republican Predictions'].sum(),
      state_results['Third Predictions'].sum())

print(state_results['Democrat'].sum(),
      state_results['Republican'].sum(),
      state_results['Third'].sum())

62892634 54733724 2973286
64242482 61988748 7492673


### Percent

## Predict 2020 with only 2012

In [15]:
features_2020 = features_2018[COUNT_FEATURES]
X = features_2020
mms.fit(X)
X_test_2020 = features_2020

X = mms.transform(X)
X_test_2020_scale = mms.transform(X_test_2020)
# X = scaler.transform(X)
# X_test_2020_scale = scaler.transform(X_test_2020)

y_hat_R = model_tuned_R_c.predict(X)
y_hat_D = model_tuned_D_c.predict(X)
y_hat_T = model_tuned_T_c.predict(X)

In [16]:
res_1220 = pd.DataFrame(df_2016['County']).merge(pd.DataFrame(y_hat_R),
                                                 left_index=True, right_index=True)
res_1220 = res_1220.rename(columns={0: 'Republican Predictions'})
res_1220['Republican Predictions'] = res_1220['Republican Predictions'].astype(int)
res_1220 = res_1220.merge(pd.DataFrame(y_hat_D), left_index=True, right_index=True)
res_1220 = res_1220.rename(columns={0: 'Democrat Predictions'})
res_1220['Democrat Predictions'] = res_1220['Democrat Predictions'].astype(int)
res_1220 = res_1220.merge(pd.DataFrame(y_hat_T), left_index=True, right_index=True)
res_1220 = res_1220.rename(columns={0: 'Third Predictions'})
res_1220['Third Predictions'] = res_1220['Third Predictions'].astype(int)

In [17]:
_ = res_1220['County'].str.split(',', expand=True)
_ = _.rename(columns={0: 'Cnt', 1: 'State'})
_['Cnt'] = _['Cnt'].apply(lambda x: x.strip())
_['State'] = _['State'].apply(lambda x: x.strip())
res_1220 = res_1220.merge(_['State'], left_index=True, right_index=True)

In [18]:
res_1220.sample(5)

Unnamed: 0,County,Republican Predictions,Democrat Predictions,Third Predictions,State
600,"Franklin County, Idaho, 2016",25113,22286,336,Idaho
1850,"Merrimack County, New Hampshire, 2016",26250,38765,2033,New Hampshire
731,"Ohio County, Kentucky, 2016",12896,17959,914,Kentucky
1908,"Allegany County, New York, 2016",18598,10329,269,New York
1611,"Pamlico County, North Carolina, 2016",12422,18504,3524,North Carolina


In [19]:
print(res_1220['Democrat Predictions'].sum(),
      res_1220['Republican Predictions'].sum(),
      res_1220['Third Predictions'].sum())

print(state_results['Democrat Predictions'].sum(),
      state_results['Republican Predictions'].sum(),
      state_results['Third Predictions'].sum())

47882220 50376781 2502215
62892634 54733724 2973286


In [20]:
state_results_1220 = state_results_predict(res_1220)

In [21]:
state_results_1220

Unnamed: 0,State,Democrat Predictions,Republican Predictions,Third Predictions,Winner
0,Alabama,834161,1106698,50870,-272537
1,Arizona,128313,181730,17682,-53417
2,Arkansas,2088279,3074603,111674,-986324
3,California,2128657,1548175,89085,580482
4,Colorado,1284648,1314187,47238,-29539
5,Connecticut,138610,314518,6994,-175908
6,Delaware,33513,36843,3365,-3330
7,District of Columbia,10079,15168,788,-5089
8,Florida,1558889,1480410,64558,78479
9,Georgia,2223265,2296180,180455,-72915


## Predict 2020 with only 2016

In [22]:
X_train = df_2016.drop(columns=drop)
mms.fit(X_train)

X_train_scale = mms.transform(X_train)
X_test_scale = mms.transform(X_test)
# X_train_scale = scaler.transform(X_train)
# X_test_scale = scaler.transform(X_test)

y_train_R = df_2016['Republican']
y_train_D = df_2016['Democrat']
y_train_O = df_2016['Other']

model_tuned_R.fit(X_train_scale, y_train_R)
model_tuned_D.fit(X_train_scale, y_train_D)
model_tuned_O.fit(X_train_scale, y_train_O)

NameError: name 'drop' is not defined

In [31]:
features_2020 = features_2018.drop(columns=['State', 'County', 'Total population', 'Size'])
X_test_2020 = features_2020
X_test_2020_scale = mms.transform(X_test_2020)
# X_test_2020_scale = scaler.transform(X_test_2020)

y_hat_R = model_tuned_R.predict(X_test_2020_scale)
y_hat_D = model_tuned_D.predict(X_test_2020_scale)
y_hat_O = model_tuned_O.predict(X_test_2020_scale)

In [32]:
res_1620 = pd.DataFrame(df_2016['County']).merge(pd.DataFrame(y_hat_R),
                                                 left_index=True, right_index=True)
res_1620 = res_1620.rename(columns={0: 'Republican Predictions'})
res_1620['Republican Predictions'] = res_1620['Republican Predictions'].astype(int)
res_1620 = res_1620.merge(pd.DataFrame(y_hat_D), left_index=True, right_index=True)
res_1620 = res_1620.rename(columns={0: 'Democrat Predictions'})
res_1620['Democrat Predictions'] = res_1620['Democrat Predictions'].astype(int)
res_1620 = res_1620.merge(pd.DataFrame(y_hat_O), left_index=True, right_index=True)
res_1620 = res_1620.rename(columns={0: 'Third Predictions'})
res_1620['Third Predictions'] = res_1620['Third Predictions'].astype(int)

In [33]:
res_1620

Unnamed: 0,County,Republican Predictions,Democrat Predictions,Third Predictions
0,"Wilcox County, Alabama, 2016",19526,7776,2785
1,"Winston County, Alabama, 2016",23346,19516,1848
2,"Apache County, Arizona, 2016",17058,40198,1193
3,"Cochise County, Arizona, 2016",17977,14347,2163
4,"Coconino County, Arizona, 2016",25330,27179,2269
...,...,...,...,...
3057,"Teton County, Wyoming, 2016",14818,13189,1453
3058,"Uinta County, Wyoming, 2016",32108,17575,2344
3059,"Washakie County, Wyoming, 2016",27335,26906,4417
3060,"Weston County, Wyoming, 2016",22611,27159,4503


In [34]:
print("2012 to 2020:",
      res_1220['Democrat Predictions'].sum(),
      res_1220['Republican Predictions'].sum(),
      res_1220['Other Predictions'].sum())

print("2016 to 2020:",
      res_1620['Democrat Predictions'].sum(),
      res_1620['Republican Predictions'].sum(),
      res_1620['Third Predictions'].sum())

print("2012 to 2016:",
      state_results['Democrat Predictions'].sum(),
      state_results['Republican Predictions'].sum(),
      state_results['Other Predictions'].sum())

print("Actual  2016:",
      state_results['Democrat'].sum(),
      state_results['Republican'].sum(),
      state_results['Other'].sum())

2012 to 2020: 218930267 138983722 13387610
2016 to 2020: 62285473 59858912 7358253
2012 to 2016: 447008434 439941372 30704915
Actual  2016: 64242482 61988748 7492673


## Predict 2020 with 2012 and 2016

In [35]:
# Combine 2012 and 2016
features_2012_2016 = features_2012.append(features_2016, sort=False)

In [36]:
df_2012_2016 = df_2012.append(df_2016, sort=False)

In [37]:
# Rio Arriba County, New Mexico does not have a reproted Household Median or Mean income 
Rio_2016_Median = features_2016[features_2016['County'] == 'Rio Arriba County, New Mexico, 2016']['Households Median income']
Rio_2016_Mean = features_2016[features_2016['County'] == 'Rio Arriba County, New Mexico, 2016']['Households Mean income']

In [38]:
# Rio Arriba County, New Mexico is missing mean and median income info. Replacing with 2016 info.
values = {'Households Median income': Rio_2016_Median.values[0],
          'Households Mean income': Rio_2016_Mean.values[0]}

features_2018 = features_2018.fillna(value=values)

In [39]:
features_2020 = features_2018.drop(columns=['State', 'County', 'Total population', 'Size'])

In [40]:
X_train = df_2012_2016.drop(columns=drop)
X_test = features_2020
# scaler.fit(X_train)
mms.fit(X_train)

X_train_scale = mms.transform(X_train)
X_test_scale = mms.transform(X_test)
# X_train_scale = scaler.transform(X_train)
# X_test_scale = scaler.transform(X_test)

y_train_R = df_2012_2016['Republican']
y_train_D = df_2012_2016['Democrat']
y_train_O = df_2012_2016['Other']

model_tuned_R.fit(X_train_scale, y_train_R)
model_tuned_D.fit(X_train_scale, y_train_D)
model_tuned_O.fit(X_train_scale, y_train_O)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bytree=1,
                                    gamma=0, learning_rate=0.1,
                                    max_delta_step=0, max_depth=3,
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=1, nthread=None,
                                    objective='reg:linear', random_state=1,
                                    reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, seed=None, silent=True,
                                    subsample=1),
             iid='warn', n_jobs=-1,
             param_grid={'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5],
                         'max_depth': [4, 5, 6, 7, 8]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             sco

In [41]:
y_hat_R_all = model_tuned_R.predict(X_test_scale)
y_hat_D_all = model_tuned_D.predict(X_test_scale)
y_hat_O_all = model_tuned_O.predict(X_test_scale)

In [42]:
res_121620 = pd.DataFrame(df_2016['County']).merge(pd.DataFrame(y_hat_R),
                                                   left_index=True, right_index=True)
res_121620 = res_121620.rename(columns={0: 'Republican Predictions'})
res_121620['Republican Predictions'] = res_121620['Republican Predictions'].astype(int)
res_121620 = res_121620.merge(pd.DataFrame(y_hat_D),
                              left_index=True, right_index=True)
res_121620 = res_121620.rename(columns={0: 'Democrat Predictions'})
res_121620['Democrat Predictions'] = res_121620['Democrat Predictions'].astype(int)
res_121620 = res_121620.merge(pd.DataFrame(y_hat_O),
                              left_index=True, right_index=True)
res_121620 = res_121620.rename(columns={0: 'Third Predictions'})
res_121620['Third Predictions'] = res_121620['Third Predictions'].astype(int)

In [43]:
res_121620.head()

Unnamed: 0,County,Republican Predictions,Democrat Predictions,Third Predictions
0,"Wilcox County, Alabama, 2016",19526,7776,2785
1,"Winston County, Alabama, 2016",23346,19516,1848
2,"Apache County, Arizona, 2016",17058,40198,1193
3,"Cochise County, Arizona, 2016",17977,14347,2163
4,"Coconino County, Arizona, 2016",25330,27179,2269


In [44]:
_ = res_121620['County'].str.split(',', expand=True)
_ = _.rename(columns={1: 'State'})
_['State'] = _['State'].str.strip()
_ = _.drop(columns={0, 2})

res_121620_state = res_121620.merge(_, left_index=True, right_index=True) 
res_121620_state

Unnamed: 0,County,Republican Predictions,Democrat Predictions,Third Predictions,State
0,"Wilcox County, Alabama, 2016",19526,7776,2785,Alabama
1,"Winston County, Alabama, 2016",23346,19516,1848,Alabama
2,"Apache County, Arizona, 2016",17058,40198,1193,Arizona
3,"Cochise County, Arizona, 2016",17977,14347,2163,Arizona
4,"Coconino County, Arizona, 2016",25330,27179,2269,Arizona
...,...,...,...,...,...
3057,"Teton County, Wyoming, 2016",14818,13189,1453,Wyoming
3058,"Uinta County, Wyoming, 2016",32108,17575,2344,Wyoming
3059,"Washakie County, Wyoming, 2016",27335,26906,4417,Wyoming
3060,"Weston County, Wyoming, 2016",22611,27159,4503,Wyoming


In [45]:
print("12 - 16 - 20:",
      res_121620['Democrat Predictions'].sum(),
      res_121620['Republican Predictions'].sum(),
      res_121620['Third Predictions'].sum())

print("2012 to 2020:",
      res_1220['Democrat Predictions'].sum(),
      res_1220['Republican Predictions'].sum(),
      res_1220['Other Predictions'].sum())

print("2016 to 2020:",
      res_1620['Democrat Predictions'].sum(),
      res_1620['Republican Predictions'].sum(),
      res_1620['Third Predictions'].sum())

print("2012 to 2016:",
      state_results['Democrat Predictions'].sum(),
      state_results['Republican Predictions'].sum(),
      state_results['Other Predictions'].sum())

print("Actual  2016:",
      state_results['Democrat'].sum(),
      state_results['Republican'].sum(),
      state_results['Other'].sum())

12 - 16 - 20: 62285473 59858912 7358253
2012 to 2020: 218930267 138983722 13387610
2016 to 2020: 62285473 59858912 7358253
2012 to 2016: 447008434 439941372 30704915
Actual  2016: 64242482 61988748 7492673


## Limitations

Vote counts are not independent even though the models treat them as if they were.

## Future Work

Error Analysis

Fix Alaska

Predict percentages instead of hard vote count