This file is used to architect the data for a neural network

In [19]:
import pandas as pd 

original_df = pd.read_csv('../data/data_election_2020.csv')


num_trump = len(original_df[original_df['majority'] == 'Trump'])

num_biden = len(original_df[original_df['majority'] == 'Biden'])

print('Number of Trump counties: ', num_trump)
print('Number of Biden counties: ', num_biden)

Number of Trump counties:  2524
Number of Biden counties:  503


Merge the two datasets

In [20]:
new_df = pd.read_csv('../data/county_to_be_merged.csv')
merged_df = pd.merge(original_df, new_df, left_index=True, right_index=True)
merged_df.to_csv('../data/merged_data_2020_election.csv', index=False)


Examining the data

In [21]:
print(merged_df.head())
col_list = list(merged_df.columns)
print(f"Number of columns {len(col_list)}")
print(col_list)

   state_x   county majority  trump16  clinton16  otherpres16  romney12  \
0  Alabama  Autauga    Trump    18172       5936          865     17379   
1  Alabama  Baldwin    Trump    72883      18458         3874     66016   
2  Alabama  Barbour    Trump     5454       4871          144      5550   
3  Alabama     Bibb    Trump     6738       1874          207      6132   
4  Alabama   Blount    Trump    22859       2156          573     20757   

   obama12  otherpres12  demsen16  ...  poverty_under_18_2019  \
0     6363          190    6331.0  ...                   23.2   
1    18424          898   19145.0  ...                   13.4   
2     5912           47    4777.0  ...                   50.1   
3     2202           86    2082.0  ...                    NaN   
4     2970          279    2980.0  ...                   18.4   

   two_plus_races_2019  unemployment_rate_2019  uninsured_2019  \
0                  2.2                     3.5             7.1   
1                  1.7    

In [22]:
new_df = pd.read_csv('../data/merged_data_2020_election.csv')
print(new_df.head())
col_list = list(new_df.columns)
print(f"Number of columns {len(col_list)}")
print(col_list)

   state_x   county majority  trump16  clinton16  otherpres16  romney12  \
0  Alabama  Autauga    Trump    18172       5936          865     17379   
1  Alabama  Baldwin    Trump    72883      18458         3874     66016   
2  Alabama  Barbour    Trump     5454       4871          144      5550   
3  Alabama     Bibb    Trump     6738       1874          207      6132   
4  Alabama   Blount    Trump    22859       2156          573     20757   

   obama12  otherpres12  demsen16  ...  poverty_under_18_2019  \
0     6363          190    6331.0  ...                   23.2   
1    18424          898   19145.0  ...                   13.4   
2     5912           47    4777.0  ...                   50.1   
3     2202           86    2082.0  ...                    NaN   
4     2970          279    2980.0  ...                   18.4   

   two_plus_races_2019  unemployment_rate_2019  uninsured_2019  \
0                  2.2                     3.5             7.1   
1                  1.7    

We need to perform one-hot encoding

In [23]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
# Fix the 'uninsured' dtype rendering as type 'object'ArithmeticError
new_df = pd.read_csv('../data/merged_data_2020_election.csv')
non_numerical_cols = new_df.select_dtypes(exclude=['int64', 'float64']).columns
non_numerical_cols = list(non_numerical_cols)
new_df['uninsured_age_under_6_2017'] = new_df['uninsured_age_under_6_2017'].apply(lambda x: x if x != '-' else 0)
new_df['uninsured_age_under_6_2017'] = new_df['uninsured_age_under_6_2017'].astype('float64')
# Encode the categorical columns
columns_to_encode = ['state_x', 'majority']
new_df = new_df.drop('county', axis=1, inplace=False)
one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoded_columns = one_hot_encoder.fit_transform(new_df[columns_to_encode])
encoded_df = pd.DataFrame(encoded_columns, columns=one_hot_encoder.get_feature_names_out(columns_to_encode))
new_df = pd.concat([new_df, encoded_df], axis=1)
new_df.drop(columns=columns_to_encode, inplace=True)
merged_encoded_df = new_df.to_csv('../data/merged_encoded_data_2020_election.csv', index=False)



Let's preprocess the data

In [24]:
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import pandas as pd

scaler = StandardScaler()
df = pd.read_csv('../output/merged_encoded_data_2020_election.csv')
df = df.drop(columns=['state.1', 'name', 'fips', 'majority_Trump'], axis=1, inplace=False)
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit_transform(df)
df_imputed = pd.DataFrame(imp.fit_transform(df), columns=df.columns)
X = df_imputed.drop('majority_Biden', axis=1, inplace=False)
X = pd.DataFrame(scaler.fit_transform(X))
y = df_imputed['majority_Biden']

Let's split the data into train / test split - Note that I dropped majority_Trump since Biden will be 0 or 1 indicating Trump or Biden

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Architecting out the neural network

In [26]:
print(len(X_train.columns))

161


In [27]:
# Let's architect out the model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential()
model.add(Dense(161, input_dim=len(X_train.columns), activation='relu')) 
model.add(Dense(81, activation='relu'))
model.add(Dense(40, activation='relu'))
model.add(Dense(20, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train, y_train, epochs=100, batch_size=64)

loss, accuracy = model.evaluate(X_test, y_test)
print(f'Accuracy: {accuracy}')


Epoch 1/100
Epoch 2/100
Epoch 3/100

KeyboardInterrupt: 

As you can see we have heavily overfitted the data

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping

model = Sequential()
model.add(Dense(161, input_dim=len(X_train.columns), activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.5))
model.add(Dense(81, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.5))
model.add(Dense(40, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.5))
model.add(Dense(20, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_loss', patience=10)

model.fit(X_train, y_train, epochs=100, batch_size=64, validation_split=0.2, callbacks=[early_stopping])

loss, accuracy = model.evaluate(X_test, y_test)
print(f'Accuracy: {accuracy}')


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Accuracy: 0.9240924119949341


Performing a grid CV search to optimise the best hyperparameters. Note: I have changed the evaluation metric to F1 score

In [28]:
# First I want to oversample the minority class
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)


In [13]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.regularizers import l2
from sklearn.metrics import make_scorer, f1_score
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam, SGD

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

def create_model(dropout_rate=0.0, regularization_rate=0.0, optimizer='adam', init_mode='uniform', activation='relu', neurons=161, layers=3, learning_rate=0.001):
    model = Sequential()
    model.add(Dense(neurons, input_dim=len(X_train_resampled.columns), activation=activation, kernel_initializer=init_mode, kernel_regularizer=l2(regularization_rate)))
    model.add(Dropout(dropout_rate))
    for i in range(layers - 1):
        model.add(Dense(neurons // (2 ** (i + 1)), activation=activation, kernel_initializer=init_mode, kernel_regularizer=l2(regularization_rate)))
        model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))
    if optimizer == 'adam':
        opt = Adam(learning_rate=learning_rate)
    elif optimizer == 'sgd':
        opt = SGD(learning_rate=learning_rate)
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
    return model

model = KerasClassifier(model=create_model, epochs=100, batch_size=64, verbose=1, callbacks=[early_stopping])

param_grid = {
    'model__dropout_rate': [0.2, 0.3, 0.4, 0.5],
    'model__regularization_rate': [0.001, 0.01, 0.05],
    'model__optimizer': ['adam', 'sgd'],
    'model__init_mode': ['uniform', 'normal', 'he_normal'],
    'model__activation': ['relu', 'tanh', 'sigmoid'],
    'model__neurons': [50, 100, 150],
    'model__layers': [2, 3, 4],
    'batch_size': [32, 64, 128],
    'epochs': [50, 100, 150],
    'model__learning_rate': [0.001, 0.01, 0.1]
}

f1_scorer = make_scorer(f1_score)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=f1_scorer, n_jobs=-1, cv=3)
grid_result = grid.fit(X_train_resampled, y_train_resampled)
print(f'Best F1 Score: {grid_result.best_score_} using {grid_result.best_params_}')


In [1]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.regularizers import l2
from sklearn.metrics import make_scorer, f1_score

# Function to create model for KerasClassifier
def create_model(dropout_rate=0.0, regularization_rate=0.0):
    model = Sequential()
    model.add(Dense(161, input_dim=len(X_train_resampled.columns), activation='relu', kernel_regularizer=l2(regularization_rate)))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

model = KerasClassifier(build_fn=create_model, epochs=100, batch_size=64, verbose=0)
param_grid = {
    'dropout_rate': [0.2, 0.3, 0.5],
    'regularization_rate': [0.001, 0.01],
    'optimizer': ['adam', 'sgd'],
    'batch_size': [64, 128],
    'epochs': [50, 100],
    'init_mode': ['uniform', 'normal'],
    'activation': ['relu', 'tanh']
}

f1_scorer = make_scorer(f1_score)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=f1_scorer, n_jobs=-1, cv=3)
grid_result = grid.fit(X_train_resampled, y_train_resampled)
print(f'Best F1 Score: {grid_result.best_score_} using {grid_result.best_params_}')





ModuleNotFoundError: No module named 'tensorflow.keras.wrappers'

Best F1 Score: 0.9748581014730419 using {'model__activation': 'relu', 'model__dropout_rate': 0.5, 'model__init_mode': 'normal', 'model__optimizer': 'adam', 'model__regularization_rate': 0.001}
This is the best model

Let's recreate and save the best model using different imbalance techniques

In [14]:
# I want to test the different imbalance techniques with the best model

from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.callbacks import EarlyStopping
from imblearn.over_sampling import RandomOverSampler, SMOTE
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from joblib import dump


# Random Oversampling
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X_train, y_train)

# Random Undersampling
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X_train, y_train)

# SMOTE
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train, y_train)


# Define the model with the best hyperparameters
def create_best_model():
    model = Sequential()
    model.add(Dense(161, input_dim=len(X_train_resampled.columns), activation='relu', kernel_initializer='normal', kernel_regularizer=l2(0.001)))
    model.add(Dropout(0.5))
    for i in range(3 - 1):
        model.add(Dense(161 // (2 ** (i + 1)), activation='relu', kernel_initializer='normal', kernel_regularizer=l2(0.001)))
        model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
    return model

# Define early stopping
early_stopping = EarlyStopping(monitor='loss', patience=10, restore_best_weights=True)

# Train and evaluate the model with different resampling techniques
resampling_techniques = {
    'Random Oversampling': (X_ros, y_ros),
    'Random Undersampling': (X_rus, y_rus),
    'SMOTE': (X_smote, y_smote)
}

for technique_name, (X_sampled, y_sampled) in resampling_techniques.items():
    print(f"Training with {technique_name}...")

    # Create and train the model
    model = create_best_model()
    model.fit(X_sampled, y_sampled, epochs=100, batch_size=64, verbose=1, callbacks=[early_stopping])

    # Evaluate the model
    y_pred = model.predict(X_test)
    y_pred = (y_pred > 0.5).astype(int)

    print(f"\nResults for {technique_name}:")
    print(classification_report(y_test, y_pred))
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}\n")

    # Optionally save the model
    if technique_name == 'Random Oversampling':
        model_path = f"../models/best_model_{technique_name}.joblib"
        dump(model, model_path)
        print(f"Model saved at {model_path}\n")

Training with Random Oversampling...
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100

Results for Random Oversampling:
              p


Results for Random Oversampling:
              precision    recall  f1-score   support

         0.0       0.97      0.97      0.97       506
         1.0       0.84      0.85      0.85       100

    accuracy                           0.95       606
   macro avg       0.91      0.91      0.91       606
weighted avg       0.95      0.95      0.95       606

Accuracy: 0.9488448844884488


Results for Random Undersampling:
              precision    recall  f1-score   support

         0.0       0.99      0.90      0.94       506
         1.0       0.65      0.94      0.77       100

    accuracy                           0.91       606
   macro avg       0.82      0.92      0.85       606
weighted avg       0.93      0.91      0.91       606

Accuracy: 0.905940594059406

Results for SMOTE:
              precision    recall  f1-score   support

         0.0       0.96      0.97      0.97       506
         1.0       0.85      0.80      0.82       100

    accuracy                           0.94       606
   macro avg       0.91      0.89      0.90       606
weighted avg       0.94      0.94      0.94       606

Accuracy: 0.9438943894389439


In [34]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import Callback, EarlyStopping
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, make_scorer
from imblearn.over_sampling import RandomOverSampler

# Split data into training, testing, and validation sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Oversampling
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

# Define custom callback for F1 score
class F1ScoreCallback(Callback):
    def __init__(self, threshold=0.995):
        super().__init__()
        self.threshold = threshold

    def on_epoch_end(self, epoch, logs=None):
        val_predict = (np.asarray(self.model.predict(X_val))).round()
        _val_f1 = f1_score(y_val, val_predict)
        if _val_f1 > self.threshold:
            self.model.stop_training = True

# Function to create the Keras model
def create_model(dropout_rate=0.0, regularization_rate=0.0, optimizer='adam'):
    model = Sequential()
    model.add(Dense(161, input_dim=len(X_train_resampled.columns), activation='relu', kernel_regularizer=l2(regularization_rate)))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

# Early stopping callback
early_stopping = EarlyStopping(monitor='loss', patience=10)

# KerasClassifier wrapper
model = KerasClassifier(model=create_model, epochs=100, batch_size=64, verbose=0, callbacks=[F1ScoreCallback(), early_stopping])

# GridSearchCV parameters
param_grid = {
    'model__dropout_rate': [0.2, 0.3, 0.5],
    'model__regularization_rate': [0.001, 0.01],
    'model__optimizer': ['adam', 'sgd'],
    'batch_size': [64, 128],
    'epochs': [50, 100]
}

f1_scorer = make_scorer(f1_score)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=f1_scorer, n_jobs=1, cv=3)
grid_result = grid.fit(X_train_resampled, y_train_resampled)
print(f'Best F1 Score: {grid_result.best_score_} using {grid_result.best_params_}')


