In [1]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from xgboost import XGBRegressor

from data_dicts_values import drop_cols
from data_prep_values import county_info_2012, county_info_2016, county_info_2018, results_info
from model_prep import lin_mod_func, run_model, make_predictions, combine_predictions
from visualizations import display_results

In [2]:
features_2012 = county_info_2012()
features_2016 = county_info_2016()
features_2018 = county_info_2018()
results_2012 = results_info(2012)
results_2016 = results_info(2016)

In [3]:
df_2012 = pd.merge(features_2012, results_2012, on='County')
df_2012 = df_2012.drop(columns='State_y')
df_2012 = df_2012.rename(columns={'State_x': 'State'})

In [4]:
df_2016 = pd.merge(features_2016, results_2016, on='County')
df_2016 = df_2016.drop(columns='State_y')
df_2016 = df_2016.rename(columns={'State_x': 'State'})

In [5]:
X_train = df_2012.drop(columns=drop_cols)
X_test = df_2016.drop(columns=drop_cols)
scaler = StandardScaler()
mms = MinMaxScaler()
mms.fit(X_train)
scaler.fit(X_train)

X_train_scale = mms.transform(X_train)
X_test_scale = mms.transform(X_test)

y_train_R = df_2012['Republican']
y_train_D = df_2012['Democrat']
y_train_O = df_2012['Other']

y_test_R = df_2016['Republican']
y_test_D = df_2016['Democrat']
y_test_O = df_2016['Other']

XGB = XGBRegressor(random_state=2020)

parameters = {}

In [6]:
X_train

Unnamed: 0,Median age,Sex ratio (males per 100 females),Male Median age,Female Median age,Total White,Total Black or African American,Total American Indian and Alaska Native,Total Asian,Total Hispanic or Latino,Households Median income,Households Mean income,Total Less than 9th grade,"Total 9th to 12th grade, no diploma",Total High school graduate,"Total Some college, no degree",Total Associate's degree,Total Bachelor's degree,Total Graduate or professional degree
0,42.1,99.0,40.7,43.1,43841,10126,523,672,54590,42738,55059,697.671,1176.669,3988.179,2405.403,499.824,676.845,333.216
1,37.3,88.1,34.4,39.7,160739,18164,2584,1985,183226,27219,41342,2142.954,3701.466,5238.332,4350.846,2034.724,1904.848,952.424
2,39.8,100.5,37.1,41.9,13816,12993,368,310,27469,32087,45426,731.965,1700.411,3355.778,2657.596,585.572,889.619,506.745
3,39.6,94.2,38.2,40.7,17661,5171,131,40,22769,31743,41421,2793.684,3847.442,10978.688,4362.068,1004.746,1470.360,710.674
4,43.4,95.8,42.3,44.7,55359,1023,710,191,57466,33479,46789,1191.030,2361.525,8583.630,5359.635,1416.915,1622.265,780.330
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3058,42.6,96.9,40.8,44.4,16349,1275,129,160,17747,45287,58885,1282.905,2230.281,9671.130,5407.938,1263.168,2151.333,809.217
3059,36.2,98.3,35.2,37.0,6111,566,15,10,6679,66422,91107,25666.984,39417.154,154460.243,112751.394,30708.713,96251.190,42167.188
3060,30.8,104.6,29.4,31.7,30934,9152,512,3222,43223,48489,57963,3852.405,3025.335,4178.880,4374.765,1197.075,2002.380,870.600
3061,43.0,92.5,41.1,44.5,144673,69450,2491,8839,223233,39122,50444,826.496,1536.766,3732.146,2879.822,1149.346,1201.002,723.184


In [7]:
model_tuned_R = GridSearchCV(XGB, parameters, cv=5, n_jobs= -1)
model_tuned_R.fit(X_train_scale, y_train_R)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bytree=1,
                                    gamma=0, learning_rate=0.1,
                                    max_delta_step=0, max_depth=3,
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=1, nthread=None,
                                    objective='reg:linear', random_state=2020,
                                    reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, seed=None, silent=True,
                                    subsample=1),
             iid='warn', n_jobs=-1, param_grid={}, pre_dispatch='2*n_jobs',
             refit=True, return_train_score=False, scoring=None, verbose=0)

In [8]:
model_tuned_D = GridSearchCV(XGB, parameters, cv=5, n_jobs= -1)
model_tuned_D.fit(X_train_scale, y_train_D)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bytree=1,
                                    gamma=0, learning_rate=0.1,
                                    max_delta_step=0, max_depth=3,
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=1, nthread=None,
                                    objective='reg:linear', random_state=2020,
                                    reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, seed=None, silent=True,
                                    subsample=1),
             iid='warn', n_jobs=-1, param_grid={}, pre_dispatch='2*n_jobs',
             refit=True, return_train_score=False, scoring=None, verbose=0)

In [9]:
model_tuned_O = GridSearchCV(XGB, parameters, cv=5, n_jobs= -1)
model_tuned_O.fit(X_train_scale, y_train_O)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bytree=1,
                                    gamma=0, learning_rate=0.1,
                                    max_delta_step=0, max_depth=3,
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=1, nthread=None,
                                    objective='reg:linear', random_state=2020,
                                    reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, seed=None, silent=True,
                                    subsample=1),
             iid='warn', n_jobs=-1, param_grid={}, pre_dispatch='2*n_jobs',
             refit=True, return_train_score=False, scoring=None, verbose=0)

In [10]:
predictions_R = make_predictions('Republican', model_tuned_R, X_test_scale, y_test_R, df_2016)
predictions_D = make_predictions('Democrat', model_tuned_D, X_test_scale, y_test_D, df_2016)
predictions_O = make_predictions('Other', model_tuned_O, X_test_scale, y_test_O, df_2016)

In [11]:
predictions_2016 = combine_predictions(predictions_R, predictions_D,
                                       predictions_O, df_2016)

In [12]:
predictions_2016 = predictions_2016.drop(columns=['Republican_y', 'Democrat_y', 'Other_y'])
predictions_2016 = predictions_2016.rename(columns={'Republican_x': 'Republican',
                                                    'Democrat_x': 'Democrat',
                                                    'Other_x': 'Other'})

In [13]:
states = predictions_2016.State.unique().tolist()

interests = ['Republican','Republican Predictions',
             'Democrat', 'Democrat Predictions',
             'Other','Other Predictions']

In [14]:
state_results = pd.DataFrame(states)
results = pd.DataFrame()
state_results = state_results.rename(columns={0: 'State'})
for state in states:
    results = results.append(predictions_2016[predictions_2016.State == state][interests].sum(),
                                     ignore_index=True)

state_results = state_results.merge(results, left_index=True, right_index=True)
state_results['Winner'] = state_results['Democrat Predictions'] - state_results['Republican Predictions']

In [15]:
print(state_results['Democrat Predictions'].sum(),
      state_results['Republican Predictions'].sum(),
      state_results['Other Predictions'].sum())

print(state_results['Democrat'].sum(),
      state_results['Republican'].sum(),
      state_results['Other'].sum())

65758561.623046875 62456192.8343811 -50822.77391350269
64274035.0 62037609.0 7495173.0


## Predict 2020 with only 2012

In [36]:
features_2020 = features_2018.drop(columns=['County','Total population'])
X = features_2020
mms.fit(X)
X = mms.transform(X)
X_test_2020 = features_2020
X_test_2020_scale = mms.transform(X_test_2020)

y_hat_R = model_tuned_R.predict(X)
y_hat_D = model_tuned_D.predict(X)
y_hat_O = model_tuned_O.predict(X)

In [37]:
res_1220 = pd.DataFrame(df_2016['County']).merge(pd.DataFrame(y_hat_R), left_index=True, right_index=True)
res_1220 = res_1220.rename(columns={0: 'Republican Predictions'})

res_1220 = res_1220.merge(pd.DataFrame(y_hat_D), left_index=True, right_index=True)
res_1220 = res_1220.rename(columns={0: 'Democrat Predictions'})

res_1220 = res_1220.merge(pd.DataFrame(y_hat_O), left_index=True, right_index=True)
res_1220 = res_1220.rename(columns={0: 'Other Predictions'})

In [38]:
res_1220

Unnamed: 0,County,Republican Predictions,Democrat Predictions,Other Predictions
0,"Wilcox County, Alabama, 2016",14615.018555,7875.178223,1611.814697
1,"Winston County, Alabama, 2016",52511.687500,45702.136719,5028.849121
2,"Apache County, Arizona, 2016",6326.541992,2644.563721,409.211090
3,"Cochise County, Arizona, 2016",6551.736328,2073.810547,336.385895
4,"Coconino County, Arizona, 2016",14831.523438,6832.583984,941.753113
...,...,...,...,...
3059,"Teton County, Wyoming, 2016",25184.111328,11953.192383,1613.312866
3060,"Uinta County, Wyoming, 2016",8648.686523,5865.178711,1084.655884
3061,"Washakie County, Wyoming, 2016",13040.608398,8292.007812,1411.221436
3062,"Weston County, Wyoming, 2016",11112.780273,6754.032227,1376.617798


In [39]:
print(res_1220['Democrat Predictions'].sum(),
      res_1220['Republican Predictions'].sum(),
      res_1220['Other Predictions'].sum())

print(state_results['Democrat Predictions'].sum(),
      state_results['Republican Predictions'].sum(),
      state_results['Other Predictions'].sum())

68857410.0 64478304.0 7835213.0
65758561.623046875 62456192.8343811 -50822.77391350269


## Predict 2020 with only 2016

In [40]:
X_train = df_2016.drop(columns=drop_cols)
mms.fit(X_train)

X_train_scale = mms.transform(X_train)
X_test_scale = mms.transform(X_test)

y_train_R = df_2016['Republican']
y_train_D = df_2016['Democrat']
y_train_O = df_2016['Other']

model_tuned_R.fit(X_train_scale, y_train_R)
model_tuned_D.fit(X_train_scale, y_train_D)
model_tuned_O.fit(X_train_scale, y_train_O)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bytree=1,
                                    gamma=0, learning_rate=0.1,
                                    max_delta_step=0, max_depth=3,
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=1, nthread=None,
                                    objective='reg:linear', random_state=2020,
                                    reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, seed=None, silent=True,
                                    subsample=1),
             iid='warn', n_jobs=-1, param_grid={}, pre_dispatch='2*n_jobs',
             refit=True, return_train_score=False, scoring=None, verbose=0)

In [41]:
features_2020 = features_2018.drop(columns=['County','Total population'])
X_test_2020 = features_2020
X_test_2020_scale = mms.transform(X_test_2020)

y_hat_R = model_tuned_R.predict(X_test_2020_scale)
y_hat_D = model_tuned_D.predict(X_test_2020_scale)
y_hat_O = model_tuned_O.predict(X_test_2020_scale)

In [42]:
res_1620 = pd.DataFrame(df_2016['County']).merge(pd.DataFrame(y_hat_R), left_index=True, right_index=True)
res_1620 = res_1620.rename(columns={0: 'Republican Predictions'})

res_1620 = res_1620.merge(pd.DataFrame(y_hat_D), left_index=True, right_index=True)
res_1620 = res_1620.rename(columns={0: 'Democrat Predictions'})

res_1620 = res_1620.merge(pd.DataFrame(y_hat_O), left_index=True, right_index=True)
res_1620 = res_1620.rename(columns={0: 'Other Predictions'})

In [43]:
res_1620

Unnamed: 0,County,Republican Predictions,Democrat Predictions,Other Predictions
0,"Wilcox County, Alabama, 2016",15005.354492,7909.238281,1611.814697
1,"Winston County, Alabama, 2016",55751.218750,40174.570312,5495.940918
2,"Apache County, Arizona, 2016",6326.541992,2443.563965,409.211090
3,"Cochise County, Arizona, 2016",6551.736328,2078.232910,336.385895
4,"Coconino County, Arizona, 2016",15536.689453,6382.310059,1059.162354
...,...,...,...,...
3059,"Teton County, Wyoming, 2016",27402.343750,11565.702148,1725.521729
3060,"Uinta County, Wyoming, 2016",9381.265625,5500.175293,1097.043213
3061,"Washakie County, Wyoming, 2016",12773.359375,8292.007812,1550.195435
3062,"Weston County, Wyoming, 2016",10942.172852,6403.193848,1554.838745


In [44]:
print("2012 to 2020:",
      res_1220['Democrat Predictions'].sum(),
      res_1220['Republican Predictions'].sum(),
      res_1220['Other Predictions'].sum())

print("2016 to 2020:",
      res_1620['Democrat Predictions'].sum(),
      res_1620['Republican Predictions'].sum(),
      res_1620['Other Predictions'].sum())

print("2012 to 2016:",
      state_results['Democrat Predictions'].sum(),
      state_results['Republican Predictions'].sum(),
      state_results['Other Predictions'].sum())

print("Actual  2016:",
      state_results['Democrat'].sum(),
      state_results['Republican'].sum(),
      state_results['Other'].sum())

2012 to 2020: 68857410.0 64478304.0 7835213.0
2016 to 2020: 65897176.0 64053492.0 7722065.0
2012 to 2016: 65758561.623046875 62456192.8343811 -50822.77391350269
Actual  2016: 64274035.0 62037609.0 7495173.0


## Predict 2020 with 2012 and 2016

In [48]:
# Combine 2012 and 2016
features_2012_2016 = features_2012.append(features_2016, sort=False)

In [58]:
df_2012_2016 = df_2012.append(df_2016, sort=False)

In [54]:
# Rio Arriba County, New Mexico does not have a reproted Household Median or Mean income 
Rio_2016_Median = features_2016[features_2016['County'] == 'Rio Arriba County, New Mexico, 2016']['Households Median income']
Rio_2016_Mean = features_2016[features_2016['County'] == 'Rio Arriba County, New Mexico, 2016']['Households Mean income']

In [55]:
# Rio Arriba County, New Mexico is missing mean and median income info. Replacing with 2016 info.
values = {'Households Median income': Rio_2016_Median.values[0],
          'Households Mean income': Rio_2016_Mean.values[0]}

features_2018 = features_2018.fillna(value=values)

In [56]:
features_2020 = features_2018.drop(columns=['County','Total population'])

In [61]:
X_train = df_2012_2016.drop(columns=drop_cols)
X_test = features_2020
mms.fit(X_train)

X_train_scale = mms.transform(X_train)
X_test_scale = mms.transform(X_test)

y_train_R = df_2012_2016['Republican']
y_train_D = df_2012_2016['Democrat']
y_train_O = df_2012_2016['Other']

model_tuned_R.fit(X_train_scale, y_train_R)
model_tuned_D.fit(X_train_scale, y_train_D)
model_tuned_O.fit(X_train_scale, y_train_O)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bytree=1,
                                    gamma=0, learning_rate=0.1,
                                    max_delta_step=0, max_depth=3,
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=1, nthread=None,
                                    objective='reg:linear', random_state=2020,
                                    reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, seed=None, silent=True,
                                    subsample=1),
             iid='warn', n_jobs=-1, param_grid={}, pre_dispatch='2*n_jobs',
             refit=True, return_train_score=False, scoring=None, verbose=0)

In [62]:
y_hat_R_all = model_tuned_R.predict(X_test_scale)
y_hat_D_all = model_tuned_D.predict(X_test_scale)
y_hat_O_all = model_tuned_O.predict(X_test_scale)

In [63]:
res_121620 = pd.DataFrame(df_2016['County']).merge(pd.DataFrame(y_hat_R), left_index=True, right_index=True)
res_121620 = res_121620.rename(columns={0: 'Republican Predictions'})

res_121620 = res_121620.merge(pd.DataFrame(y_hat_D), left_index=True, right_index=True)
res_121620 = res_121620.rename(columns={0: 'Democrat Predictions'})

res_121620 = res_121620.merge(pd.DataFrame(y_hat_O), left_index=True, right_index=True)
res_121620 = res_121620.rename(columns={0: 'Other Predictions'})

In [66]:
print("12 - 16 - 20:",
      res_121620['Democrat Predictions'].sum(),
      res_121620['Republican Predictions'].sum(),
      res_121620['Other Predictions'].sum())

print("2012 to 2020:",
      res_1220['Democrat Predictions'].sum(),
      res_1220['Republican Predictions'].sum(),
      res_1220['Other Predictions'].sum())

print("2016 to 2020:",
      res_1620['Democrat Predictions'].sum(),
      res_1620['Republican Predictions'].sum(),
      res_1620['Other Predictions'].sum())

print("2012 to 2016:",
      state_results['Democrat Predictions'].sum(),
      state_results['Republican Predictions'].sum(),
      state_results['Other Predictions'].sum())

print("Actual  2016:",
      state_results['Democrat'].sum(),
      state_results['Republican'].sum(),
      state_results['Other'].sum())

12 - 16 - 20: 65897176.0 64053492.0 7722065.0
2012 to 2020: 68857410.0 64478304.0 7835213.0
2016 to 2020: 65897176.0 64053492.0 7722065.0
2012 to 2016: 65758561.623046875 62456192.8343811 -50822.77391350269
Actual  2016: 64274035.0 62037609.0 7495173.0
