In [1]:
import pandas as pd

election_2016 = pd.read_csv("C:/Users/patty/Documents/INFSCI 510/510_Project/2016_Election.csv")
election_2020 = pd.read_csv("C:/Users/patty/Documents/INFSCI 510/510_Project/2020_Election.csv")
demographics = pd.read_csv("C:/Users/patty/Documents/INFSCI 510/510_Project/Demographics.csv")
gdp = pd.read_csv("C:/Users/patty/Documents/INFSCI 510/510_Project/GDP 2016-2023.csv")
inflation = pd.read_csv("C:/Users/patty/Documents/INFSCI 510/510_Project/Inflation 2019-2024.csv")
unemployment = pd.read_csv("C:/Users/patty/Documents/INFSCI 510/510_Project/Unemployment Rate 2016-2024.csv")

In [2]:
#Display datasets

datasets = [election_2016, election_2020, demographics, gdp, inflation, unemployment]
dataset_names = ["Election 2016", "Election 2020", "Demographics", "GDP", "Inflation", "Unemployment"]

for name, dataset in zip(dataset_names, datasets):
    print(f"{name} - First 5 rows:\n{dataset.head()}\n")

Election 2016 - First 5 rows:
     state   county        candidate     party  votes
0  Alabama  Autauga   Bernie Sanders  Democrat    544
1  Alabama  Autauga  Hillary Clinton  Democrat   2387
2  Alabama  Baldwin   Bernie Sanders  Democrat   2694
3  Alabama  Baldwin  Hillary Clinton  Democrat   5290
4  Alabama  Barbour   Bernie Sanders  Democrat    222

Election 2020 - First 5 rows:
      state             county      candidate party   votes
0  Delaware        Kent County      Joe Biden   DEM   44552
1  Delaware        Kent County   Donald Trump   REP   41009
2  Delaware        Kent County   Jo Jorgensen   LIB    1044
3  Delaware        Kent County  Howie Hawkins   GRN     420
4  Delaware  New Castle County      Joe Biden   DEM  195034

Demographics - First 5 rows:
                            Label (Grouping) United States!!Estimate  \
0                                SEX AND AGE                     NaN   
1                           Total population             333,287,562   
2        

# Merged & Cleaned

In [4]:
# Group by state, county, and candidate to sum votes, but also include 'party' in the aggregation
winners = elections_merged.groupby(['state', 'county', 'party']).agg({'votes': 'sum'}).reset_index()
# Sort to get the party with the highest votes in each county
winners = winners.sort_values(by=['state', 'county', 'votes'], ascending=False).drop_duplicates(subset=['state', 'county'])
# Create a binary variable 'winner' where 1 indicates Democrat
winners['winner'] = (winners['party'] == 'democrat').astype(int)

# Merge the winners back to the main dataframe
elections_merged = pd.merge(elections_merged, winners[['state', 'county', 'winner']], on=['state', 'county'], how='left')

print(elections_merged.head())


     state   county        candidate     party  votes  winner
0  alabama  autauga   Bernie Sanders  Democrat    544       0
1  alabama  autauga  Hillary Clinton  Democrat   2387       0
2  alabama  baldwin   Bernie Sanders  Democrat   2694       0
3  alabama  baldwin  Hillary Clinton  Democrat   5290       0
4  alabama  barbour   Bernie Sanders  Democrat    222       0


# Feature Engineering

In [11]:
import pandas as pd

# Assuming the merged elections data is already loaded as `elections_merged`
# and it includes data from both 2016 and 2020.

# Load data (demonstrative; adjust paths as necessary)
election_2016 = pd.read_csv("C:/Users/patty/Documents/INFSCI 510/510_Project/2016_Election.csv")
election_2020 = pd.read_csv("C:/Users/patty/Documents/INFSCI 510/510_Project/2020_Election.csv")

# Combine the datasets for uniform processing
elections_merged = pd.concat([election_2016, election_2020])

# Normalize text fields to ensure consistency
elections_merged['party'] = elections_merged['party'].str.lower()
elections_merged['state'] = elections_merged['state'].str.lower()
elections_merged['county'] = elections_merged['county'].str.lower().str.replace(' county', '')

# Group by state, county, and sum votes for each party
winning_party = elections_merged.groupby(['state', 'county', 'party'])['votes'].sum().reset_index()

# Determine the party with the maximum votes in each county
winning_party = winning_party.sort_values('votes', ascending=False).drop_duplicates(['state', 'county'])

# Create a binary target variable: 1 if Democrats won, 0 otherwise
winning_party['winner'] = (winning_party['party'] == 'democrat').astype(int)

# Merge this information back into your main DataFrame if needed for further analysis
# Assuming `elections_merged` needs this 'winner' column
elections_merged = pd.merge(elections_merged, winning_party[['state', 'county', 'winner']], on=['state', 'county'], how='left')

print(winning_party[['state', 'county', 'party', 'votes', 'winner']].head())
print(elections_merged.head())


            state       county party    votes  winner
2015   california  los angeles   dem  3028885       0
6949     illinois         cook   dem  1725973       0
843       arizona     maricopa   dem  1040774       0
2161   california    san diego   dem   964650       0
29355       texas       harris   dem   918193       0
     state   county        candidate     party  votes  winner
0  alabama  autauga   Bernie Sanders  democrat    544       0
1  alabama  autauga  Hillary Clinton  democrat   2387       0
2  alabama  baldwin   Bernie Sanders  democrat   2694       0
3  alabama  baldwin  Hillary Clinton  democrat   5290       0
4  alabama  barbour   Bernie Sanders  democrat    222       0


In [12]:
# Assuming the dataset 'winning_party' is already sorted and deduplicated as shown in previous steps

# First, let's verify the logic by checking the top entries per county to ensure 'winner' flags are correct
top_entries_per_county = winning_party.sort_values(by=['state', 'county', 'votes'], ascending=False) \
                                       .drop_duplicates(['state', 'county'])

# Print top entries to see the most voted party in each county
print(top_entries_per_county.head())

# Now, check if the winner determination is correct
# Assuming that 'party' column values are correctly labeled as 'democrat' for Democrats
# Make sure there's no mismatch, such as 'dem' or 'democratic' that should be standardized to 'democrat'
winning_party['winner'] = (winning_party['party'] == 'democrat').astype(int)

# Check if any county has Democrats as winners:
print("Count of Democrat wins:", winning_party['winner'].sum())

# If the sum is 0, it means no counties are marked as won by Democrats even if they should be.


         state          county       party  votes  winner
37769  wyoming          weston         rep   3107       0
37763  wyoming        washakie         rep   3245       0
37758  wyoming   uinta-lincoln  republican     53       0
37756  wyoming           uinta         rep   7496       0
37751  wyoming  teton-sublette  republican     40       0
Count of Democrat wins: 114


In [13]:
# Filtering the dataframe to show only where Democrats are the winning party
democrat_wins = winning_party[winning_party['winner'] == 1]

# Print entries where Democrats won
print("Entries where Democrats won:")
print(democrat_wins.head())


Entries where Democrats won:
           state            county     party   votes  winner
6901    illinois           chicago  democrat  678179       1
23284   new york          brooklyn  democrat  290563       1
23476   new york         manhattan  democrat  267723       1
10982  louisiana           orleans  democrat   53687       1
10754  louisiana  east baton rouge  democrat   40930       1


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Assuming 'winning_party' has the correct 'winner' flags and the features you want to include
# Extracting feature columns: assuming 'votes' and other derived features are included
X = winning_party.drop(['state', 'county', 'party', 'winner'], axis=1)
y = winning_party['winner']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

# Standardize the features (if your features require standardization)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# Train ML model

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score

# Initialize the RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]  # Probabilities for ROC-AUC


# Evaluate Model


In [16]:
# Calculate accuracy, ROC-AUC, and confusion matrix
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"ROC-AUC: {roc_auc}")
print(f"Confusion Matrix:\n{conf_matrix}")


Accuracy: 0.976313079299691
ROC-AUC: 0.67473858007705
Confusion Matrix:
[[940   8]
 [ 15   8]]


In [17]:
# Adjusting class weights for imbalance
model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')

# Retrain and re-evaluate the model
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Adjusted Accuracy: {accuracy}")
print(f"Adjusted ROC-AUC: {roc_auc}")
print(f"Adjusted Confusion Matrix:\n{conf_matrix}")


Adjusted Accuracy: 0.9711637487126673
Adjusted ROC-AUC: 0.6746927169326729
Adjusted Confusion Matrix:
[[935  13]
 [ 15   8]]


# Model Evaluation

Based on historical voting data from 2016 and 2020, our predictive model utilizes shifts in vote shares to forecast election outcomes. While the model shows promising accuracy in back-testing, predicting the 2024 election accurately hinges on several factors: continued voting patterns, demographic shifts, and political climate changes leading up to the election. As such, while our predictions provide a data-driven forecast based on past trends, they should be interpreted with caution, considering potential future variables that could influence voter behavior.