This notebook prepares the 2020 election county data and builds a neural network on it

In [4]:
import pandas as pd 

# Load the county-level 2020 election results.
original_df = pd.read_csv('../data/data_election_2020.csv')

# Count how many counties each candidate won; summing a boolean mask counts
# the True entries, which is the class balance we care about later.
num_trump = int(original_df['majority'].eq('Trump').sum())
num_biden = int(original_df['majority'].eq('Biden').sum())

print('Number of Trump counties: ', num_trump)
print('Number of Biden counties: ', num_biden)

Number of Trump counties:  2524
Number of Biden counties:  503


Merge the two datasets

In [5]:
# Load the additional county features and attach them to the election data.
new_df = pd.read_csv('../data/county_to_be_merged.csv')

# The join is purely positional (row index to row index), not keyed on a
# county identifier.
# NOTE(review): this assumes both CSVs list the same counties in the same
# order — confirm before trusting the merged rows.
merged_df = original_df.merge(new_df, left_index=True, right_index=True)

# Persist the merged table so later cells can reload it from disk.
merged_df.to_csv('../data/merged_data_2020_election.csv', index=False)


Examining the data

In [6]:
# Quick look at the merged frame: first rows, column count, column names.
col_list = merged_df.columns.tolist()
print(merged_df.head())
print(f"Number of columns {len(col_list)}")
print(col_list)

  `"state"   county majority  trump16  clinton16  otherpres16  romney12   
0  Alabama  Autauga    Trump    18172       5936          865     17379  \
1  Alabama  Baldwin    Trump    72883      18458         3874     66016   
2  Alabama  Barbour    Trump     5454       4871          144      5550   
3  Alabama     Bibb    Trump     6738       1874          207      6132   
4  Alabama   Blount    Trump    22859       2156          573     20757   

   obama12  otherpres12  demsen16  ...  poverty_under_18_2019   
0     6363          190    6331.0  ...                   23.2  \
1    18424          898   19145.0  ...                   13.4   
2     5912           47    4777.0  ...                   50.1   
3     2202           86    2082.0  ...                    NaN   
4     2970          279    2980.0  ...                   18.4   

   two_plus_races_2019  unemployment_rate_2019  uninsured_2019   
0                  2.2                     3.5             7.1  \
1                  1.7    

In [7]:
# Reload the merged CSV from disk and inspect it the same way as the
# in-memory frame, to check the file round-trips as expected.
# Note: this rebinds `new_df`, which previously held the un-merged features.
new_df = pd.read_csv('../data/merged_data_2020_election.csv')
col_list = new_df.columns.tolist()
print(new_df.head())
print(f"Number of columns {len(col_list)}")
print(col_list)

  `"state"   county majority  trump16  clinton16  otherpres16  romney12   
0  Alabama  Autauga    Trump    18172       5936          865     17379  \
1  Alabama  Baldwin    Trump    72883      18458         3874     66016   
2  Alabama  Barbour    Trump     5454       4871          144      5550   
3  Alabama     Bibb    Trump     6738       1874          207      6132   
4  Alabama   Blount    Trump    22859       2156          573     20757   

   obama12  otherpres12  demsen16  ...  poverty_under_18_2019   
0     6363          190    6331.0  ...                   23.2  \
1    18424          898   19145.0  ...                   13.4   
2     5912           47    4777.0  ...                   50.1   
3     2202           86    2082.0  ...                    NaN   
4     2970          279    2980.0  ...                   18.4   

   two_plus_races_2019  unemployment_rate_2019  uninsured_2019   
0                  2.2                     3.5             7.1  \
1                  1.7    

We need to perform one-hot encoding

In [5]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
# Reload the merged dataset from disk so this cell does not depend on
# whatever `new_df` currently holds in the kernel.
new_df = pd.read_csv('../data/merged_data_2020_election.csv')

# Diagnostic list of the columns pandas did not parse as int64/float64.
non_numerical_cols = list(new_df.select_dtypes(exclude=['int64', 'float64']).columns)

# Fix the 'uninsured_age_under_6_2017' column being read as dtype 'object':
# it uses '-' as a placeholder, so swap that for 0 and cast to float.
# NOTE(review): 0 is treated as a real value downstream; if '-' means
# "missing", NaN (handled by the imputer) may be more appropriate — confirm.
uninsured_under_6 = new_df['uninsured_age_under_6_2017']
new_df['uninsured_age_under_6_2017'] = (
    uninsured_under_6.mask(uninsured_under_6 == '-', 0).astype('float64')
)

# Encode the categorical columns; 'county' is dropped rather than encoded.
columns_to_encode = ['state', 'majority']
new_df = new_df.drop(columns='county')

# Use the encoder's default sparse output and densify it explicitly. The
# `sparse=False` keyword was renamed to `sparse_output` in newer scikit-learn
# releases, so `.toarray()` avoids depending on either spelling.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
encoded_columns = one_hot_encoder.fit_transform(new_df[columns_to_encode]).toarray()
encoded_df = pd.DataFrame(
    encoded_columns,
    columns=one_hot_encoder.get_feature_names_out(columns_to_encode),
    index=new_df.index,  # keep rows aligned with new_df for the concat below
)

# Replace the raw categorical columns with their one-hot counterparts
# (remaining original columns first, encoded columns appended at the end).
new_df = pd.concat([new_df.drop(columns=columns_to_encode), encoded_df], axis=1)

# DataFrame.to_csv returns None when given a path, so the result is not bound
# to a variable.
new_df.to_csv('../data/merged_encoded_data_2020_election.csv', index=False)



Let's preprocess the data

In [24]:
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import pandas as pd

scaler = StandardScaler()

# NOTE(review): this reads from '../output/', while the encoding cell writes
# the encoded CSV to '../data/' — confirm which file is intended.
df = pd.read_csv('../output/merged_encoded_data_2020_election.csv')

# Drop identifier-like columns and 'majority_Trump'; 'majority_Biden' is kept
# as the single binary target.
df = df.drop(columns=['state.1', 'name', 'fips', 'majority_Trump'])

# Fill missing values with the column mean. The imputer is fitted exactly
# once; the earlier duplicate call whose result was discarded is removed.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
df_imputed = pd.DataFrame(imp.fit_transform(df), columns=df.columns)

# Separate features from the target and standardise the features.
# NOTE(review): the imputer and scaler are fitted on every row before the
# train/test split, so test-set statistics influence the training features.
X = df_imputed.drop(columns='majority_Biden')
X = pd.DataFrame(scaler.fit_transform(X))
y = df_imputed['majority_Biden']

        0         1         2         3         4         5         6    \
0 -0.050101 -0.187409 -0.211717 -0.049035 -0.197572 -0.220460 -0.229879   
1  1.205277 -0.033304  0.185099  1.032118 -0.036067  0.016656 -0.079023   
2 -0.341923 -0.200515 -0.306800 -0.311982 -0.203612 -0.268352 -0.248174   
3 -0.312461 -0.237399 -0.298492 -0.299045 -0.253291 -0.255290 -0.279901   
4  0.057446 -0.233928 -0.250225  0.026055 -0.243007 -0.190653 -0.269329   

        7         8         9    ...       151       152       153       154  \
0 -0.057110 -0.423608 -0.172514  ... -0.146968 -0.177048 -0.300696 -0.098352   
1  1.785669 -0.377874 -0.268999  ... -0.146968 -0.177048 -0.300696 -0.098352   
2 -0.479291 -0.434918 -0.201253  ... -0.146968 -0.177048 -0.300696 -0.098352   
3 -0.440454 -0.434672 -0.243791  ... -0.146968 -0.177048 -0.300696 -0.098352   
4  0.073302 -0.427050 -0.238432  ... -0.146968 -0.177048 -0.300696 -0.098352   

        155       156       157       158      159       160  
0 -0.

Let's split the data into train and test sets. Note that I dropped majority_Trump because majority_Biden alone encodes the label: 0 means Trump and 1 means Biden

In [25]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Designing the neural network

In [13]:
# Number of input features (columns of the training frame).
print(X_train.shape[1])

161


In [14]:
# Let's architect out the model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Baseline network: four ReLU hidden layers that narrow towards a single
# sigmoid unit for the binary target. No regularisation is applied here.
model = Sequential([
    Dense(161, input_dim=X_train.shape[1], activation='relu'),
    Dense(81, activation='relu'),
    Dense(40, activation='relu'),
    Dense(20, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train for a fixed 100 epochs with no validation data.
model.fit(X_train, y_train, epochs=100, batch_size=64)

# Report accuracy on the held-out test set.
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Accuracy: {accuracy}')


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

As you can see, the model has heavily overfitted the training data, so the next model adds L2 regularisation, dropout and early stopping

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping

# Same layer widths as the baseline network, but every hidden layer has an
# L2 weight penalty and is followed by 50% dropout to counter overfitting.
model = Sequential()
model.add(Dense(161, input_dim=len(X_train.columns), activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.5))
model.add(Dense(81, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.5))
model.add(Dense(40, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.5))
model.add(Dense(20, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop once validation loss has not improved for 10 epochs. Without
# restore_best_weights the model would keep the weights from the last epoch
# (10 epochs past the best one), and those are what would be evaluated below.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# 20% of the training rows are held out as the validation set that drives
# early stopping.
model.fit(X_train, y_train, epochs=100, batch_size=64, validation_split=0.2, callbacks=[early_stopping])

# Report accuracy on the held-out test set.
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Accuracy: {accuracy}')


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Accuracy: 0.9240924119949341


Performing a cross-validated grid search to find the best hyperparameters. Note: I have changed the evaluation metric to the F1 score

In [None]:
# First I want to oversample the minority class
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

# Recreate the same 80/20 split (same seed) so this cell can run on its own.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Randomly duplicate minority-class rows in the training set only; the test
# set is left untouched.
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)


In [1]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
# NOTE(review): the recorded best-params output uses `model__`-prefixed keys,
# which suggests that run used a different scikit-learn wrapper than this
# one — confirm which wrapper is installed.
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import make_scorer, f1_score

def create_model(dropout_rate=0.0, regularization_rate=0.0,
                 optimizer='adam', init_mode='glorot_uniform', activation='relu'):
    """Build and compile a one-hidden-layer binary classifier.

    Every hyperparameter searched by the grid must be a keyword argument
    here, otherwise the scikit-learn wrapper rejects it as an illegal
    parameter. The defaults reproduce the original fixed architecture.

    Args:
        dropout_rate: Dropout fraction applied after the hidden layer.
        regularization_rate: L2 penalty on the hidden layer's kernel.
        optimizer: Optimizer name passed to ``model.compile``.
        init_mode: Kernel initializer name for the Dense layers
            ('glorot_uniform' is the Keras default for Dense).
        activation: Activation function of the hidden layer.

    Returns:
        A compiled ``Sequential`` model with a single sigmoid output unit.
    """
    model = Sequential()
    # The input width is taken from the oversampled training frame, which
    # must already exist in the notebook namespace.
    model.add(Dense(161, input_dim=len(X_train_resampled.columns), activation=activation,
                    kernel_initializer=init_mode, kernel_regularizer=l2(regularization_rate)))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid', kernel_initializer=init_mode))
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model
# Wrap the Keras builder so scikit-learn can clone and fit it; epochs and
# batch_size given here are defaults that the grid below overrides.
model = KerasClassifier(build_fn=create_model, epochs=100, batch_size=64, verbose=0)

# Score each candidate by F1 rather than accuracy.
f1_scorer = make_scorer(f1_score)

# Hyperparameter values to try; every combination is evaluated.
param_grid = dict(
    dropout_rate=[0.2, 0.3, 0.5],
    regularization_rate=[0.001, 0.01],
    optimizer=['adam', 'sgd'],
    batch_size=[64, 128],
    epochs=[50, 100],
    init_mode=['uniform', 'normal'],
    activation=['relu', 'tanh'],
)

# 3-fold cross-validated search using all available cores.
# NOTE(review): the training data was oversampled before this search, so
# duplicated minority rows can land in both the fit and validation folds and
# inflate the F1 score — consider resampling inside each fold.
# NOTE(review): parallel workers with Keras models may be what stalled the
# recorded run (it ends in KeyboardInterrupt) — confirm, or try n_jobs=1.
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=f1_scorer,
    n_jobs=-1,
    cv=3,
)
grid_result = grid.fit(X_train_resampled, y_train_resampled)
print(f'Best F1 Score: {grid_result.best_score_} using {grid_result.best_params_}')


KeyboardInterrupt: 

Best F1 Score: 0.9748581014730419 using {'model__activation': 'relu', 'model__dropout_rate': 0.5, 'model__init_mode': 'normal', 'model__optimizer': 'adam', 'model__regularization_rate': 0.001}
This is the best model found by the grid search