In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
import pandas as pd
from matplotlib import style
style.use("ggplot")
import plotly.express as px
from pandas import DataFrame

In [2]:
features = [
    "females",
    "males",
    "hispanic",
    "not hispanic",
    "White Alone",
    "Black or African American Alone",
    "American Indian or Alaska Native Alone",
    "Asian Alone",
    "Native Hawaiian and Other Pacific Islander Alone",
    "Two or more races",
    "Age: [0-10]",
    "Age: [11-20]",
    "Age: [21-30]",
    "Age: [31-40]",
    "Age: [41-50]",
    "Age: [51-60]",
    "Age: [61-70]",
    "Age: [71-80]",
    "Age: [81-84]",
    "Age: [85]+",
    "GDP Per Year (Normalized)",
    "Personal income per capita (Normalized)",
    "Personal Consumption expenditure per capita (Normalized)",
    "Presidential Approval Rating",
    "GOP/Total Senate Seats pre-election",
    "GOP/Total House Seats pre-election"
]

In [3]:
labels = [
    "GOP Votes/Total Votes for Presidential Election",
    "DNC Votes/Total Votes for Presidential Election",
    "1 = Voted GOP, 0 = Voted DNC"
]

In [4]:
states = ["Alaska",
          "Alabama",
          "Arkansas",
          "Arizona",
          "California",
          "Colorado",
          "Connecticut",
          "Delaware",
          "Florida",
          "Georgia",
          "Hawaii",
          "Iowa",
          "Idaho",
          "Illinois",
          "Indiana",
          "Kansas",
          "Kentucky",
          "Louisiana",
          "Massachusetts",
          "Maryland",
          "Maine",
          "Michigan",
          "Minnesota",
          "Missouri",
          "Mississippi",
          "Montana",
          "North Carolina",
          "North Dakota",
          "Nebraska",
          "New Hampshire",
          "New Jersey",
          "New Mexico",
          "Nevada",
          "New York",
          "Ohio",
          "Oklahoma",
          "Oregon",
          "Pennsylvania",
          "Rhode Island",
          "South Carolina",
          "South Dakota",
          "Tennessee",
          "Texas",
          "Utah",
          "Virginia",
          "Vermont",
          "Washington",
          "Wisconsin",
          "West Virginia",
          "Wyoming"
         ]

In [5]:
states_abbrevs = ["AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DE", "FL", "GA", 
          "HI", "IA", "ID", "IL", "IN", "KS", "KY", "LA", "MA", "MD", 
          "ME", "MI", "MN", "MO", "MS", "MT", "NC", "ND", "NE", "NH", 
          "NJ", "NM", "NV", "NY", "OH", "OK", "OR", "PA", "RI", "SC", 
          "SD", "TN", "TX", "UT", "VA", "VT", "WA", "WI", "WV", "WY"]

In [6]:
data_df_train = pd.read_csv("combined-data_no-states_train.csv")

In [7]:
data_df_test = pd.read_csv("combined-data_no-states_test.csv")

In [8]:
X_train = np.array(data_df_train[features].values)
y_gop_train = (data_df_train[labels[0]].values.tolist())
y_dnc_train = (data_df_train[labels[1]].values.tolist())
y_binary_train = (data_df_train[labels[2]].values.tolist())

In [9]:
X_test = np.array(data_df_test[features].values)
y_gop_test = (data_df_test[labels[0]].values.tolist())
y_dnc_test = (data_df_test[labels[1]].values.tolist())
y_binary_test = (data_df_test[labels[2]].values.tolist())

In [17]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()


In [19]:
from sklearn.model_selection import RandomizedSearchCV# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [20]:
rf_random = RandomizedSearchCV(estimator = clf, 
                               param_distributions = random_grid, 
                               n_iter = 100, 
                               cv = 3, 
                               verbose=2, 
                               random_state=42, 
                               n_jobs = -1)

rf_random.fit(X_train, y_binary_train)
print(rf_random.best_params_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:   16.8s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   41.9s finished


{'n_estimators': 1000, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 100, 'bootstrap': False}


In [52]:
param_grid = {
    'bootstrap': [False],
    'max_depth': [80, 90, 100, 110],
    'max_features': [3, 5, 7],
    'min_samples_leaf': [1, 2, 3, 4],
    'min_samples_split': [3, 5, 7],
    'n_estimators': [100, 300, 500, 1000, 1500, 2000]
}

In [54]:
from sklearn.model_selection import GridSearchCV

# Create a based model
rf = RandomForestClassifier()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

grid_search.fit(X_train, y_binary_train)

print(grid_search.best_params_)

Fitting 3 folds for each of 864 candidates, totalling 2592 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done 333 tasks      | elapsed:   30.4s
[Parallel(n_jobs=-1)]: Done 616 tasks      | elapsed:   59.6s
[Parallel(n_jobs=-1)]: Done 981 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 1426 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 1953 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 2560 tasks      | elapsed:  4.2min


{'bootstrap': False, 'max_depth': 110, 'max_features': 7, 'min_samples_leaf': 2, 'min_samples_split': 3, 'n_estimators': 100}


[Parallel(n_jobs=-1)]: Done 2592 out of 2592 | elapsed:  4.2min finished


In [55]:
optimized_clf = RandomForestClassifier(n_estimators = 100, 
                                       min_samples_split = 3, 
                                       min_samples_leaf = 2, 
                                       max_features = 7, 
                                       max_depth = 110, 
                                       bootstrap = False)

In [69]:
optimized_clf.fit(X_train, y_binary_train)
clf.fit(X_train, y_binary_train)

RandomForestClassifier()

In [70]:
y_optimized_pred = optimized_clf.predict(X_test)
y_pred = clf.predict(X_test)


In [71]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_binary_test,y_pred))

print(classification_report(y_binary_test,y_optimized_pred))

              precision    recall  f1-score   support

           0       0.89      0.64      0.74        25
           1       0.72      0.92      0.81        25

    accuracy                           0.78        50
   macro avg       0.80      0.78      0.78        50
weighted avg       0.80      0.78      0.78        50

              precision    recall  f1-score   support

           0       0.94      0.68      0.79        25
           1       0.75      0.96      0.84        25

    accuracy                           0.82        50
   macro avg       0.85      0.82      0.82        50
weighted avg       0.85      0.82      0.82        50



In [72]:
y_pred = y_optimized_pred

df_states = DataFrame(states,columns=['State'])
df_y_pred = DataFrame(y_pred, columns=['y_pred'])
df_y_binary_test = DataFrame(y_binary_test, columns=['y_binary_test'])
df_states_abbrevs = DataFrame(states_abbrevs, columns=['states_abbrevs'])

In [73]:
array5 = []
i = 0

while i < len(y_pred):
    if (int(df_y_pred._get_value(i, "y_pred")) != int(df_y_binary_test._get_value(i, "y_binary_test"))):
        array5.append(0)
    else:
        array5.append(1)
    i += 1 
    
correct = pd.DataFrame(data=array5, columns=["correct"])

In [74]:
result = pd.concat([df_states, df_states_abbrevs, df_y_pred, df_y_binary_test, correct], axis=1)
result.head()

Unnamed: 0,State,states_abbrevs,y_pred,y_binary_test,correct
0,Alaska,AK,1.0,1,1
1,Alabama,AL,1.0,1,1
2,Arkansas,AR,1.0,1,1
3,Arizona,AZ,1.0,0,0
4,California,CA,0.0,0,1


In [75]:
fig = px.choropleth(
                    result,
                    locations=result['states_abbrevs'],
                    locationmode="USA-states",
                    color='correct',
                    color_continuous_scale="YlGn",
                    range_color=[0,1],
                    scope="usa",
                    hover_name="State")
fig.update_geos(fitbounds='locations', visible=False)
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

In [30]:
from sklearn.ensemble import RandomForestRegressor

regr = RandomForestRegressor()

RandomForestRegressor(max_depth=10)

In [None]:
from sklearn.model_selection import RandomizedSearchCV# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
rf_random = RandomizedSearchCV(estimator = clf, 
                               param_distributions = random_grid, 
                               n_iter = 100, 
                               cv = 3, 
                               verbose=2, 
                               random_state=42, 
                               n_jobs = -1)

rf_random.fit(X_train, y_binary_train)
print(rf_random.best_params_)

In [None]:
regr.fit(X_train, y_gop_train)

In [46]:
y_gop_predict = regr.predict(X_test)

In [47]:
from sklearn.metrics import explained_variance_score, mean_squared_error, r2_score

y_gop_truth = list(np.array(y_gop_test, dtype='float') / 100)

print(y_gop_predict)
print(y_gop_truth)

print(explained_variance_score(y_gop_truth, y_gop_predict))
print(mean_squared_error(y_gop_truth, y_gop_predict))
print(r2_score(y_gop_truth, y_gop_predict))

[0.48149389 0.48109812 0.40858673 0.48605042 0.54384509 0.52592038
 0.50897167 0.48405543 0.50372927 0.1301884  0.51552096 0.4444788
 0.42197323 0.5056099  0.48107295 0.49189737 0.50099725 0.50664275
 0.52195163 0.50023565 0.50944826 0.50162635 0.40428942 0.50149498
 0.53164145 0.52073583 0.41038941 0.4989054  0.5261351  0.47281286
 0.4998942  0.52015523 0.47782578 0.48823484 0.46364395 0.50215524
 0.53652013 0.43491803 0.4946648  0.48253418 0.40533643 0.51578806
 0.52991938 0.50445821 0.55112387 0.33663864 0.52889366 0.47270498
 0.47806516 0.47351585]
[0.531, 0.621, 0.624, 0.491, 0.342, 0.419, 0.392, 0.39799999999999996, 0.512, 0.493, 0.34299999999999997, 0.532, 0.638, 0.406, 0.57, 0.565, 0.621, 0.585, 0.324, 0.324, 0.435, 0.478, 0.45399999999999996, 0.5670000000000001, 0.575, 0.5670000000000001, 0.499, 0.6509999999999999, 0.585, 0.455, 0.41200000000000003, 0.435, 0.47700000000000004, 0.418, 0.532, 0.654, 0.405, 0.488, 0.389, 0.551, 0.618, 0.607, 0.52, 0.5820000000000001, 0.44, 0.307,