# Predicting the 2020 Presidential Election

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from xgboost import XGBRegressor

from data_dicts import drop_cols
from data_prep_values import county_info_2012, county_info_2016, county_info_2018, results_info
from model_prep import lin_mod_func, run_model, make_predictions, combine_predictions
from visualizations import display_results

## Data

The demographic information was obtained from the U.S. Cencus website. The 5 year study was used in order to obtain the most accurate information for smaller countues. The results data was obtained from the MIT Election Data and Science Lab.

Note: Due to issues with demographic info for Alaska, the state of Alaska is not incorporated into our analysis. These issues will be addressed at a later time.

In [2]:
features_2012 = county_info_2012()
features_2016 = county_info_2016()
features_2018 = county_info_2018()
results_2012 = results_info(2012)
results_2016 = results_info(2016)

In [3]:
df_2012 = pd.merge(features_2012, results_2012, on='County')
df_2012 = df_2012.drop(columns='State_y')
df_2012 = df_2012.rename(columns={'State_x': 'State'})

In [4]:
df_2016 = pd.merge(features_2016, results_2016, on='County')
df_2016 = df_2016.drop(columns='State_y')
df_2016 = df_2016.rename(columns={'State_x': 'State'})

## Methodology

The chosen target is the total number of votes each political party obtained in each county. Republican, Democrat, and Third were chosen as our political parties. After many tirals with different types of models, an XGBoost model was selected for our purposes.

In [11]:
drop = ['State', 'County', 'Size', 'Total population',
        'Republican', 'Democrat', 'Other']

In [12]:
X_train = df_2012.drop(columns=drop)
X_test = df_2016.drop(columns=drop)
scaler = StandardScaler()
mms = MinMaxScaler()
mms.fit(X_train)
#scaler.fit(X_train)

X_train_scale = mms.transform(X_train)
X_test_scale = mms.transform(X_test)
# X_train_scale = scaler.transform(X_train)
# X_test_scale = scaler.transform(X_test)

y_train_R = df_2012['Republican']
y_train_D = df_2012['Democrat']
y_train_O = df_2012['Other']

y_test_R = df_2016['Republican']
y_test_D = df_2016['Democrat']
y_test_O = df_2016['Other']

XGB = XGBRegressor(random_state=2020)

#parameters = {}

parameters = {'max_depth': [4, 5, 6, 7, 8],
               'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5]}

In [13]:
model_tuned_R = GridSearchCV(XGB, parameters, cv=5, n_jobs= -1)
model_tuned_R.fit(X_train_scale, y_train_R)
model_tuned_R.best_estimator_

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
             max_depth=4, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=1,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=True, subsample=1)

In [14]:
model_tuned_D = GridSearchCV(XGB, parameters, cv=5, n_jobs= -1)
model_tuned_D.fit(X_train_scale, y_train_D)
model_tuned_D.best_estimator_



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
             max_depth=4, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=1,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=True, subsample=1)

In [15]:
model_tuned_O = GridSearchCV(XGB, parameters, cv=5, n_jobs= -1)

#model_tuned_O = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
#             colsample_bytree=1, gamma=0, learning_rate=0.3, max_delta_step=0,
#             max_depth=6, min_child_weight=1, missing=None, n_estimators=100,
#             n_jobs=1, nthread=None, objective='reg:linear', random_state=2020,
#             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
#             silent=True, subsample=1)

model_tuned_O.fit(X_train_scale, y_train_O)
model_tuned_O.best_estimator_



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
             max_depth=4, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=1,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=True, subsample=1)

In [16]:
print(model_tuned_R.best_estimator_)
print(model_tuned_D.best_estimator_)
print(model_tuned_O.best_estimator_)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
             max_depth=4, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=1,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=True, subsample=1)
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
             max_depth=4, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=1,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=True, subsample=1)
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
             max_depth=4, min

In [17]:
predictions_R = make_predictions('Republican', model_tuned_R, X_test_scale, y_test_R, df_2016)
predictions_D = make_predictions('Democrat', model_tuned_D, X_test_scale, y_test_D, df_2016)
predictions_O = make_predictions('Other', model_tuned_O, X_test_scale, y_test_O, df_2016)

In [18]:
predictions_2016 = combine_predictions(predictions_R, predictions_D,
                                       predictions_O, df_2016)

predictions_2016 = predictions_2016.drop(columns=['Republican_y', 'Democrat_y', 'Other_y'])
predictions_2016 = predictions_2016.rename(columns={'Republican_x': 'Republican',
                                                    'Democrat_x': 'Democrat',
                                                    'Other_x': 'Other'})

In [19]:
states = predictions_2016.State.unique().tolist()

interests2 = ['County','Republican','Republican Predictions',
              'Democrat', 'Democrat Predictions', 'Other','Other Predictions']

In [20]:
predictions_2016[interests2].sample(10).round()

Unnamed: 0,County,Republican,Republican Predictions,Democrat,Democrat Predictions,Other,Other Predictions
193,"Mercer County, Illinois, 2016",4807,122053.0,3071,120713.0,647,7263.0
148,"Russell County, Alabama, 2016",9210,145462.0,9579,231874.0,467,7448.0
326,"Pitkin County, Colorado, 2016",2550,135692.0,7333,104327.0,640,17606.0
2163,"Cherokee County, Oklahoma, 2016",9994,138854.0,5456,118594.0,1040,6993.0
2579,"Pend Oreille County, Washington, 2016",4373,134178.0,1934,131584.0,495,17175.0
3032,"Washington County, Wisconsin, 2016",51729,154156.0,20855,148023.0,5419,5484.0
2507,"King George County, Virginia, 2016",7341,138899.0,4007,114882.0,711,6807.0
3050,"Natrona County, Wyoming, 2016",23552,110687.0,6577,105705.0,3219,6857.0
1281,"Gladwin County, Michigan, 2016",8124,146027.0,3794,267983.0,554,27900.0
1083,"Wyandotte County, Kansas, 2016",15806,140897.0,30146,196567.0,2829,6556.0


In [21]:
interests = ['Republican','Republican Predictions',
             'Democrat', 'Democrat Predictions',
             'Other','Other Predictions']

In [22]:
state_results = pd.DataFrame(states)
results = pd.DataFrame()
state_results = state_results.rename(columns={0: 'State'})
for state in states:
    results = results.append(predictions_2016[predictions_2016.State == state][interests].sum(),
                                     ignore_index=True)

state_results = state_results.merge(results, left_index=True, right_index=True)
state_results['Winner'] = state_results['Democrat Predictions'] - state_results['Republican Predictions']
state_results['Winner'] = state_results['Winner'].astype(int)
state_results['Democrat'] = state_results['Democrat'].astype(int)
state_results['Democrat Predictions'] = state_results['Democrat Predictions'].astype(int)
state_results['Other'] = state_results['Other'].astype(int)
state_results['Other Predictions'] = state_results['Other Predictions'].astype(int)
state_results['Republican'] = state_results['Republican'].astype(int)
state_results['Republican Predictions'] = state_results['Republican Predictions'].astype(int)

In [52]:
state_results.head()

Unnamed: 0,State,Democrat,Democrat Predictions,Other,Other Predictions,Republican,Republican Predictions,Winner
0,Alabama,729547,9559326,75570,553672,1318250,9932449,-373123
1,Arizona,1161167,2093676,190709,113717,1252401,3136717,-1043041
2,Arkansas,380494,10753154,64530,613384,684872,10784380,-31225
3,Illinois,3090729,14932456,321963,1126605,2146015,14818256,114199
4,California,8753788,8516185,943997,642992,4483810,8033985,482199


In [24]:
print(state_results['Democrat Predictions'].sum(),
      state_results['Republican Predictions'].sum(),
      state_results['Other Predictions'].sum())

print(state_results['Democrat'].sum(),
      state_results['Republican'].sum(),
      state_results['Other'].sum())

447008434 439941372 30704915
64242482 61988748 7492673


## Predict 2020 with only 2012

In [25]:
features_2020 = features_2018.drop(columns=['State', 'County', 'Total population', 'Size'])
X = features_2020
mms.fit(X)
X_test_2020 = features_2020

X = mms.transform(X)
X_test_2020_scale = mms.transform(X_test_2020)
# X = scaler.transform(X)
# X_test_2020_scale = scaler.transform(X_test_2020)

y_hat_R = model_tuned_R.predict(X)
y_hat_D = model_tuned_D.predict(X)
y_hat_O = model_tuned_O.predict(X)

In [26]:
res_1220 = pd.DataFrame(df_2016['County']).merge(pd.DataFrame(y_hat_R),
                                                 left_index=True, right_index=True)
res_1220 = res_1220.rename(columns={0: 'Republican Predictions'})
res_1220['Republican Predictions'] = res_1220['Republican Predictions'].astype(int)
res_1220 = res_1220.merge(pd.DataFrame(y_hat_D), left_index=True, right_index=True)
res_1220 = res_1220.rename(columns={0: 'Democrat Predictions'})
res_1220['Democrat Predictions'] = res_1220['Democrat Predictions'].astype(int)
res_1220 = res_1220.merge(pd.DataFrame(y_hat_O), left_index=True, right_index=True)
res_1220 = res_1220.rename(columns={0: 'Other Predictions'})
res_1220['Other Predictions'] = res_1220['Other Predictions'].astype(int)

In [27]:
res_1220

Unnamed: 0,County,Republican Predictions,Democrat Predictions,Other Predictions
0,"Wilcox County, Alabama, 2016",14511,23506,2568
1,"Winston County, Alabama, 2016",27044,20348,2672
2,"Apache County, Arizona, 2016",29489,46885,3374
3,"Cochise County, Arizona, 2016",31629,46342,4191
4,"Coconino County, Arizona, 2016",16552,-9472,1069
...,...,...,...,...
3057,"Teton County, Wyoming, 2016",21199,22719,3139
3058,"Uinta County, Wyoming, 2016",43755,37053,6544
3059,"Washakie County, Wyoming, 2016",18923,-3201,1723
3060,"Weston County, Wyoming, 2016",8238,48456,4740


In [28]:
res_1220.sample(5)

Unnamed: 0,County,Republican Predictions,Democrat Predictions,Other Predictions
1357,"Dakota County, Minnesota, 2016",30982,63975,3477
700,"Knox County, Kentucky, 2016",28507,28589,2314
2754,"Randall County, Texas, 2016",20160,25590,4200
1474,"Newton County, Mississippi, 2016",9804,34883,2471
1909,"Bronx County, New York, 2016",18012,25420,1883


In [29]:
print(res_1220['Democrat Predictions'].sum(),
      res_1220['Republican Predictions'].sum(),
      res_1220['Other Predictions'].sum())

print(state_results['Democrat Predictions'].sum(),
      state_results['Republican Predictions'].sum(),
      state_results['Other Predictions'].sum())

218930267 138983722 13387610
447008434 439941372 30704915


## Predict 2020 with only 2016

In [30]:
X_train = df_2016.drop(columns=drop)
mms.fit(X_train)

X_train_scale = mms.transform(X_train)
X_test_scale = mms.transform(X_test)
# X_train_scale = scaler.transform(X_train)
# X_test_scale = scaler.transform(X_test)

y_train_R = df_2016['Republican']
y_train_D = df_2016['Democrat']
y_train_O = df_2016['Other']

model_tuned_R.fit(X_train_scale, y_train_R)
model_tuned_D.fit(X_train_scale, y_train_D)
model_tuned_O.fit(X_train_scale, y_train_O)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bytree=1,
                                    gamma=0, learning_rate=0.1,
                                    max_delta_step=0, max_depth=3,
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=1, nthread=None,
                                    objective='reg:linear', random_state=1,
                                    reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, seed=None, silent=True,
                                    subsample=1),
             iid='warn', n_jobs=-1,
             param_grid={'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5],
                         'max_depth': [4, 5, 6, 7, 8]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             sco

In [31]:
features_2020 = features_2018.drop(columns=['State', 'County', 'Total population', 'Size'])
X_test_2020 = features_2020
X_test_2020_scale = mms.transform(X_test_2020)
# X_test_2020_scale = scaler.transform(X_test_2020)

y_hat_R = model_tuned_R.predict(X_test_2020_scale)
y_hat_D = model_tuned_D.predict(X_test_2020_scale)
y_hat_O = model_tuned_O.predict(X_test_2020_scale)

In [32]:
res_1620 = pd.DataFrame(df_2016['County']).merge(pd.DataFrame(y_hat_R),
                                                 left_index=True, right_index=True)
res_1620 = res_1620.rename(columns={0: 'Republican Predictions'})
res_1620['Republican Predictions'] = res_1620['Republican Predictions'].astype(int)
res_1620 = res_1620.merge(pd.DataFrame(y_hat_D), left_index=True, right_index=True)
res_1620 = res_1620.rename(columns={0: 'Democrat Predictions'})
res_1620['Democrat Predictions'] = res_1620['Democrat Predictions'].astype(int)
res_1620 = res_1620.merge(pd.DataFrame(y_hat_O), left_index=True, right_index=True)
res_1620 = res_1620.rename(columns={0: 'Third Predictions'})
res_1620['Third Predictions'] = res_1620['Third Predictions'].astype(int)

In [33]:
res_1620

Unnamed: 0,County,Republican Predictions,Democrat Predictions,Third Predictions
0,"Wilcox County, Alabama, 2016",19526,7776,2785
1,"Winston County, Alabama, 2016",23346,19516,1848
2,"Apache County, Arizona, 2016",17058,40198,1193
3,"Cochise County, Arizona, 2016",17977,14347,2163
4,"Coconino County, Arizona, 2016",25330,27179,2269
...,...,...,...,...
3057,"Teton County, Wyoming, 2016",14818,13189,1453
3058,"Uinta County, Wyoming, 2016",32108,17575,2344
3059,"Washakie County, Wyoming, 2016",27335,26906,4417
3060,"Weston County, Wyoming, 2016",22611,27159,4503


In [34]:
print("2012 to 2020:",
      res_1220['Democrat Predictions'].sum(),
      res_1220['Republican Predictions'].sum(),
      res_1220['Other Predictions'].sum())

print("2016 to 2020:",
      res_1620['Democrat Predictions'].sum(),
      res_1620['Republican Predictions'].sum(),
      res_1620['Third Predictions'].sum())

print("2012 to 2016:",
      state_results['Democrat Predictions'].sum(),
      state_results['Republican Predictions'].sum(),
      state_results['Other Predictions'].sum())

print("Actual  2016:",
      state_results['Democrat'].sum(),
      state_results['Republican'].sum(),
      state_results['Other'].sum())

2012 to 2020: 218930267 138983722 13387610
2016 to 2020: 62285473 59858912 7358253
2012 to 2016: 447008434 439941372 30704915
Actual  2016: 64242482 61988748 7492673


## Predict 2020 with 2012 and 2016

In [35]:
# Combine 2012 and 2016
features_2012_2016 = features_2012.append(features_2016, sort=False)

In [36]:
df_2012_2016 = df_2012.append(df_2016, sort=False)

In [37]:
# Rio Arriba County, New Mexico does not have a reproted Household Median or Mean income 
Rio_2016_Median = features_2016[features_2016['County'] == 'Rio Arriba County, New Mexico, 2016']['Households Median income']
Rio_2016_Mean = features_2016[features_2016['County'] == 'Rio Arriba County, New Mexico, 2016']['Households Mean income']

In [38]:
# Rio Arriba County, New Mexico is missing mean and median income info. Replacing with 2016 info.
values = {'Households Median income': Rio_2016_Median.values[0],
          'Households Mean income': Rio_2016_Mean.values[0]}

features_2018 = features_2018.fillna(value=values)

In [39]:
features_2020 = features_2018.drop(columns=['State', 'County', 'Total population', 'Size'])

In [40]:
X_train = df_2012_2016.drop(columns=drop)
X_test = features_2020
# scaler.fit(X_train)
mms.fit(X_train)

X_train_scale = mms.transform(X_train)
X_test_scale = mms.transform(X_test)
# X_train_scale = scaler.transform(X_train)
# X_test_scale = scaler.transform(X_test)

y_train_R = df_2012_2016['Republican']
y_train_D = df_2012_2016['Democrat']
y_train_O = df_2012_2016['Other']

model_tuned_R.fit(X_train_scale, y_train_R)
model_tuned_D.fit(X_train_scale, y_train_D)
model_tuned_O.fit(X_train_scale, y_train_O)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bytree=1,
                                    gamma=0, learning_rate=0.1,
                                    max_delta_step=0, max_depth=3,
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=1, nthread=None,
                                    objective='reg:linear', random_state=1,
                                    reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, seed=None, silent=True,
                                    subsample=1),
             iid='warn', n_jobs=-1,
             param_grid={'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5],
                         'max_depth': [4, 5, 6, 7, 8]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             sco

In [41]:
y_hat_R_all = model_tuned_R.predict(X_test_scale)
y_hat_D_all = model_tuned_D.predict(X_test_scale)
y_hat_O_all = model_tuned_O.predict(X_test_scale)

In [42]:
res_121620 = pd.DataFrame(df_2016['County']).merge(pd.DataFrame(y_hat_R),
                                                   left_index=True, right_index=True)
res_121620 = res_121620.rename(columns={0: 'Republican Predictions'})
res_121620['Republican Predictions'] = res_121620['Republican Predictions'].astype(int)
res_121620 = res_121620.merge(pd.DataFrame(y_hat_D),
                              left_index=True, right_index=True)
res_121620 = res_121620.rename(columns={0: 'Democrat Predictions'})
res_121620['Democrat Predictions'] = res_121620['Democrat Predictions'].astype(int)
res_121620 = res_121620.merge(pd.DataFrame(y_hat_O),
                              left_index=True, right_index=True)
res_121620 = res_121620.rename(columns={0: 'Third Predictions'})
res_121620['Third Predictions'] = res_121620['Third Predictions'].astype(int)

In [43]:
res_121620.head()

Unnamed: 0,County,Republican Predictions,Democrat Predictions,Third Predictions
0,"Wilcox County, Alabama, 2016",19526,7776,2785
1,"Winston County, Alabama, 2016",23346,19516,1848
2,"Apache County, Arizona, 2016",17058,40198,1193
3,"Cochise County, Arizona, 2016",17977,14347,2163
4,"Coconino County, Arizona, 2016",25330,27179,2269


In [44]:
_ = res_121620['County'].str.split(',', expand=True)
_ = _.rename(columns={1: 'State'})
_['State'] = _['State'].str.strip()
_ = _.drop(columns={0, 2})

res_121620_state = res_121620.merge(_, left_index=True, right_index=True) 
res_121620_state

Unnamed: 0,County,Republican Predictions,Democrat Predictions,Third Predictions,State
0,"Wilcox County, Alabama, 2016",19526,7776,2785,Alabama
1,"Winston County, Alabama, 2016",23346,19516,1848,Alabama
2,"Apache County, Arizona, 2016",17058,40198,1193,Arizona
3,"Cochise County, Arizona, 2016",17977,14347,2163,Arizona
4,"Coconino County, Arizona, 2016",25330,27179,2269,Arizona
...,...,...,...,...,...
3057,"Teton County, Wyoming, 2016",14818,13189,1453,Wyoming
3058,"Uinta County, Wyoming, 2016",32108,17575,2344,Wyoming
3059,"Washakie County, Wyoming, 2016",27335,26906,4417,Wyoming
3060,"Weston County, Wyoming, 2016",22611,27159,4503,Wyoming


In [45]:
print("12 - 16 - 20:",
      res_121620['Democrat Predictions'].sum(),
      res_121620['Republican Predictions'].sum(),
      res_121620['Third Predictions'].sum())

print("2012 to 2020:",
      res_1220['Democrat Predictions'].sum(),
      res_1220['Republican Predictions'].sum(),
      res_1220['Other Predictions'].sum())

print("2016 to 2020:",
      res_1620['Democrat Predictions'].sum(),
      res_1620['Republican Predictions'].sum(),
      res_1620['Third Predictions'].sum())

print("2012 to 2016:",
      state_results['Democrat Predictions'].sum(),
      state_results['Republican Predictions'].sum(),
      state_results['Other Predictions'].sum())

print("Actual  2016:",
      state_results['Democrat'].sum(),
      state_results['Republican'].sum(),
      state_results['Other'].sum())

12 - 16 - 20: 62285473 59858912 7358253
2012 to 2020: 218930267 138983722 13387610
2016 to 2020: 62285473 59858912 7358253
2012 to 2016: 447008434 439941372 30704915
Actual  2016: 64242482 61988748 7492673


## Future Work

Error Analysis