# Federal Election Results 2020

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=Warning)
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
import cartopy.crs as ccrs
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

import cartopy.io.shapereader as shpreader
from matplotlib.patches import Polygon
from matplotlib.collections import PatchCollection
import matplotlib.patches as mpatches
import geopandas as gpd

import psycopg2
from sqlalchemy import create_engine

## 2020 Election Data

In [2]:
"""
Extract multiple sheets from the Excel file and load them into Pandas data frames. 
A dictionary allows you to store each data frame under a unique key (the sheet name), 
making it easy to access and work with each data frame later.
"""
# Define the file path to the Excel file
file_path = 'Election_Results/federalelections2020.xlsx'

# Define a list of sheet names to extract
sheet_names = ['9. 2020 Pres General Results', 
               '12. US Senate Results by State', 
               '13. US House Results by State', 
               '16. Appendix 1A']

# Create an empty dictionary to store the data frames
data_frames = {}

# Loop over the sheet names and read them into data frames
for name in sheet_names:
    data_frames[name] = pd.read_excel(file_path, sheet_name=name)

# Print the data frames to verify that they were loaded correctly
# for name, df in data_frames.items():
#     print(f"{name}:")
#     print(df.head())


10. 2020 Pres Primary Results
13. US House Results by State
10. 2020 Pres Primary Results
13. US House Results by State
10. 2020 Pres Primary Results
13. US House Results by State
10. 2020 Pres Primary Results
13. US House Results by State


## 2020 Presidential Data

In [3]:
df_pres_2020 = data_frames['9. 2020 Pres General Results']
df_pres_2020.head()
# 681 rows 18 columns

Unnamed: 0,1,FEC ID,STATE,STATE ABBREVIATION,GENERAL ELECTION DATE,FIRST NAME,LAST NAME,"LAST NAME, FIRST",TOTAL VOTES,PARTY,GENERAL RESULTS,GENERAL %,TOTAL VOTES #,COMBINED GE PARTY TOTALS (NY),COMBINED % (NY),WINNER INDICATOR,ELECTORAL VOTES,FOOTNOTES
0,2,,,,NaT,,,,,,,,,,,,,
1,3,P80001571,Alabama,AL,2020-11-03,Donald J.,Trump,"Trump, Donald J.",,R,1441170.0,0.620316,,,,W,9.0,
2,4,P80000722,Alabama,AL,2020-11-03,Joseph R.,Biden,"Biden, Joseph R.",,D,849624.0,0.3657,,,,,,
3,5,P00013524,Alabama,AL,2020-11-03,Jo,Jorgensen,"Jorgensen, Jo",,IND,25176.0,0.010836,,,,,,
4,6,,Alabama,AL,2020-11-03,,Scattered,Scattered,,W,7312.0,0.003147,,,,,,


In [4]:
# select the columns wanted and rename them
df_selection = df_pres_2020[['STATE ABBREVIATION', 'LAST NAME', 'PARTY', 'GENERAL %', 'GENERAL RESULTS']]
df_pres_2020 = df_selection.rename(columns={
    'STATE ABBREVIATION': 'State',
    'LAST NAME': 'Last Name',
    'PARTY': 'Party',
    'GENERAL %': 'Vote %',
    'GENERAL RESULTS': 'Vote count'
})
df_pres_2020.head()
# gives 681 rows

Unnamed: 0,State,Last Name,Party,Vote %,Vote count
0,,,,,
1,AL,Trump,R,0.620316,1441170.0
2,AL,Biden,D,0.3657,849624.0
3,AL,Jorgensen,IND,0.010836,25176.0
4,AL,Scattered,W,0.003147,7312.0


In [5]:
# add year and office columns
df_pres_2020['Year'] = 2020
df_pres_2020['Office'] = 'Pres'

# Arrange columns: Move the Year and Office columns to the front
new_cols = ['Year', 'Office', 'State', 'Last Name', 'Party', 'Vote %', 'Vote count']
df_pres_2020 = df_pres_2020.reindex(columns=new_cols)

# Include only Candidates R, D, IND (independent)
df_pres_2020 = df_pres_2020[df_pres_2020['Party'].isin(['R', 'D', 'IND'])]

df_pres_2020.head()
# gives 135 rows

Unnamed: 0,Year,Office,State,Last Name,Party,Vote %,Vote count
1,2020,Pres,AL,Trump,R,0.620316,1441170.0
2,2020,Pres,AL,Biden,D,0.3657,849624.0
3,2020,Pres,AL,Jorgensen,IND,0.010836,25176.0
7,2020,Pres,AK,Trump,R,0.528331,189951.0
8,2020,Pres,AK,Biden,D,0.42772,153778.0


In [6]:
# Remove any unseen NaN's from num rows.
df_p_2020 = df_pres_2020.dropna(subset=['Vote %'])
df_p_2020 = df_p_2020.dropna(subset=['Vote count'])
# confirms NaN's are gone
print(df_p_2020[df_p_2020['Vote %'].isna()])
print(df_p_2020[df_p_2020['Vote count'].isna()])

Empty DataFrame
Columns: [Year, Office, State, Last Name, Party, Vote %, Vote count]
Index: []
Empty DataFrame
Columns: [Year, Office, State, Last Name, Party, Vote %, Vote count]
Index: []


In [7]:
# convert Vote count to float for math
df_p_2020['Vote count'] = df_p_2020['Vote count'].astype(int)
df_p_2020.dtypes

Year            int64
Office         object
State          object
Last Name      object
Party          object
Vote %        float64
Vote count      int32
dtype: object

In [8]:
df_p_2020.shape
# gives 135 rows.

(135, 7)

## 2020 Senate data


In [9]:
df_senate_2020 = data_frames['12. US Senate Results by State']
df_senate_2020.head()

Unnamed: 0,1,STATE ABBREVIATION,STATE,DISTRICT,FEC ID,(I) Incumbent Indicator,CANDIDATE NAME (First),CANDIDATE NAME (Last),CANDIDATE NAME,TOTAL VOTES,...,RUNOFF %,GENERAL VOTES,GENERAL %,"GE RUNOFF ELECTION VOTES (GA, LA)","GE RUNOFF ELECTION % (GA, LA)",COMBINED GE PARTY TOTALS (when applicable),COMBINED % (when applicable),PE WINNER INDICATOR,GE WINNER INDICATOR,FOOTNOTES
0,2,,,,,,,,,,...,,,,,,,,,,
1,3,AL,Alabama,S,S0AL00230,,Tommy,Tuberville,"Tuberville, Tommy",,...,0.607256,1392076.0,0.600954,,,,,W,W,
2,4,AL,Alabama,S,S6AL00195,,Jeff,Sessions,"Sessions, Jeff",,...,0.392744,,,,,,,,,
3,5,AL,Alabama,S,S0AL00206,,Bradley,Byrne,"Byrne, Bradley",,...,,,,,,,,,,
4,6,AL,Alabama,S,S0AL00297,,Roy,Moore,"Moore, Roy",,...,,,,,,,,,,


In [10]:
# select the columns wanted and rename them
df_sel_senate = df_senate_2020[['STATE ABBREVIATION', 
                                'CANDIDATE NAME (Last)', 
                                'PARTY', 'GENERAL %', 
                                'GENERAL VOTES ']]

df_sen_2020 = df_sel_senate.rename(columns={
    'STATE ABBREVIATION': 'State',
    'CANDIDATE NAME (Last)': 'Last Name',
    'PARTY': 'Party',
    'GENERAL %': 'Vote %',
    'GENERAL VOTES ': 'Vote count'
})
df_sen_2020.head()
# gives 541 rows.

Unnamed: 0,State,Last Name,Party,Vote %,Vote count
0,,,,,
1,AL,Tuberville,R,0.600954,1392076.0
2,AL,Sessions,R,,
3,AL,Byrne,R,,
4,AL,Moore,R,,


In [11]:
# add year and office columns
df_sen_2020['Year'] = 2020
df_sen_2020['Office'] = 'Senate'

# Move the Year and Office columns to the front
new_cols = ['Year', 'Office', 'State', 'Last Name', 'Party', 'Vote %', 'Vote count']
df_sen_2020 = df_sen_2020.reindex(columns=new_cols)

# Include only Candidates R, D, IND (independent)
df_sen_2020 = df_sen_2020[df_sen_2020['Party'].isin(['R', 'D', 'IND'])]

df_sen_2020.head()
# gives 319 rows

Unnamed: 0,Year,Office,State,Last Name,Party,Vote %,Vote count
1,2020,Senate,AL,Tuberville,R,0.600954,1392076.0
2,2020,Senate,AL,Sessions,R,,
3,2020,Senate,AL,Byrne,R,,
4,2020,Senate,AL,Moore,R,,
5,2020,Senate,AL,Nelson,R,,


In [12]:
# Remove any unseen NaN's from num rows.
df_s_2020 = df_sen_2020.dropna(subset=['Vote %'])
df_s_2020 = df_s_2020.dropna(subset=['Vote count'])
# confirms NaN's are gone
print(df_s_2020[df_s_2020['Vote %'].isna()])
print(df_s_2020[df_s_2020['Vote count'].isna()])

Empty DataFrame
Columns: [Year, Office, State, Last Name, Party, Vote %, Vote count]
Index: []
Empty DataFrame
Columns: [Year, Office, State, Last Name, Party, Vote %, Vote count]
Index: []


In [13]:
df_s_2020.dtypes

Year            int64
Office         object
State          object
Last Name      object
Party          object
Vote %        float64
Vote count     object
dtype: object

In [14]:
# convert Vote count to float for math
df_s_2020['Vote count'] = df_s_2020['Vote count'].astype(int)
df_s_2020.dtypes

Year            int64
Office         object
State          object
Last Name      object
Party          object
Vote %        float64
Vote count      int32
dtype: object

In [15]:
df_s_2020.head()
# gives 319 rows

Unnamed: 0,Year,Office,State,Last Name,Party,Vote %,Vote count
1,2020,Senate,AL,Tuberville,R,0.600954,1392076
9,2020,Senate,AL,Jones,D,0.397367,920478
13,2020,Senate,AK,Sullivan,R,0.538971,191112
24,2020,Senate,AZ,Kelly,D,0.511566,1716467
27,2020,Senate,AZ,McSally,R,0.488079,1637661


## 2020 House Election Data

In [16]:
df_house_2020 = data_frames['13. US House Results by State']
df_house_2020.head()

Unnamed: 0,1,STATE ABBREVIATION,STATE,DISTRICT,FEC ID,(I) Incumbent Indicator,CANDIDATE NAME (First),CANDIDATE NAME (Last),CANDIDATE NAME,TOTAL VOTES,...,RUNOFF %,GENERAL VOTES,GENERAL %,"GE RUNOFF ELECTION VOTES (GA, GU, LA)","GE RUNOFF ELECTION % (GA, GU, LA)","COMBINED GE PARTY TOTALS (CT, NY)","COMBINED % (CT, NY)",PE WINNER INDICATOR,GE WINNER INDICATOR,FOOTNOTES
0,2,,,,,,,,,,...,,,,,,,,,,
1,3,AL,Alabama,1.0,H0AL01055,,Jerry,Carl,"Carl, Jerry",,...,0.522766,211825.0,0.643698,,,,,W,W,
2,4,AL,Alabama,1.0,H0AL01089,,Bill,Hightower,"Hightower, Bill",,...,0.477234,,,,,,,,,
3,5,AL,Alabama,1.0,H0AL01071,,Chris,Pringle,"Pringle, Chris",,...,,,,,,,,,,
4,6,AL,Alabama,1.0,H0AL01063,,Wes,Lambert,"Lambert, Wes",,...,,,,,,,,,,


In [17]:
df_sel_house = df_house_2020[['STATE ABBREVIATION', 
                                'CANDIDATE NAME (Last)', 
                                'PARTY', 'GENERAL %', 
                                'GENERAL VOTES ']]

df_hou_2020 = df_sel_house.rename(columns={
    'STATE ABBREVIATION': 'State',
    'CANDIDATE NAME (Last)': 'Last Name',
    'PARTY': 'Party',
    'GENERAL %': 'Vote %',
    'GENERAL VOTES ': 'Vote count'
})
df_hou_2020.shape
# gives 4042 rows

(4042, 5)

In [18]:
# add year and office columns
df_hou_2020['Year'] = 2020
df_hou_2020['Office'] = 'House'

# Move the Year and Office columns to the front
new_cols = ['Year', 'Office', 'State', 'Last Name', 'Party', 'Vote %', 'Vote count']
df_hou_2020 = df_hou_2020.reindex(columns=new_cols)

# Include only Candidates R, D, IND (independent)
df_hou_2020 = df_hou_2020[df_hou_2020['Party'].isin(['R', 'D', 'IND'])]

df_hou_2020.head()

Unnamed: 0,Year,Office,State,Last Name,Party,Vote %,Vote count
1,2020,House,AL,Carl,R,0.643698,211825.0
2,2020,House,AL,Hightower,R,,
3,2020,House,AL,Pringle,R,,
4,2020,House,AL,Lambert,R,,
5,2020,House,AL,Castorani,R,,


In [19]:
# Remove any unseen NaN's from num rows.
df_h_2020 = df_hou_2020.dropna(subset=['Vote %'])
df_h_2020 = df_h_2020.dropna(subset=['Vote count'])
# confirms NaN's are gone
print(df_h_2020[df_h_2020['Vote %'].isna()])
print(df_h_2020[df_h_2020['Vote count'].isna()])

Empty DataFrame
Columns: [Year, Office, State, Last Name, Party, Vote %, Vote count]
Index: []
Empty DataFrame
Columns: [Year, Office, State, Last Name, Party, Vote %, Vote count]
Index: []


In [20]:
# convert Vote count to float for math
df_h_2020['Vote count'] = df_h_2020['Vote count'].astype(int)
df_h_2020.dtypes

Year            int64
Office         object
State          object
Last Name      object
Party          object
Vote %        float64
Vote count      int32
dtype: object

### Extra work to check data 

In [101]:
print(df_h_2020['Vote count'].unique())

[2020]


In [105]:
print(df_h_2020['Vote count'].isna().sum())
df_h_2020['Vote count'].apply(lambda x: str(x).isnumeric()).all()

0


True

In [106]:
# Get the rows where the sum of Vote % is not equal to 1.0
sum_not_100 = df_state_2020.groupby(['Year', 'Office', 'State'])['Vote %'].sum() != 1.0
print(sum_not_100[sum_not_100 == True])

# Get the rows where Vote count is NaN
print(df_state_2020[df_state_2020['Vote count'].isna()])


Year  Office  State
2020  House   AK       True
              AL       True
              AR       True
              AZ       True
              CA       True
                       ... 
      Senate  TN       True
              TX       True
              VA       True
              WV       True
              WY       True
Name: Vote %, Length: 135, dtype: bool
Empty DataFrame
Columns: [Year, Office, State, Party, Vote %, Vote count, Vote count_total, Final Vote %]
Index: []


In [51]:
df_h_2020.head()
# gives 871 rows

Unnamed: 0,Year,Office,State,Last Name,Party,Vote %,Vote count
1,2020,House,AL,Carl,R,0.643698,211825
7,2020,House,AL,Averhart,D,0.355387,116949
13,2020,House,AL,Moore,R,0.652227,197996
21,2020,House,AL,Harvey-Hall,D,0.346827,105286
26,2020,House,AL,Rogers,R,0.674615,217384


# Add data to a database

In [21]:
# import psycopg2
# from sqlalchemy import create_engine

# Replace the values below with your database credentials
DATABASE = "electiondb"
USER = "postgres"
PASSWORD = "Spanques&Gum6"
HOST = "localhost"
PORT = "5432"

# Connect to your PostgreSQL database
conn = psycopg2.connect(
    database=DATABASE,
    user=USER,
    password=PASSWORD,
    host=HOST,
    port=PORT
)

# Create a sqlalchemy engine
engine = create_engine(f"postgresql://{USER}:{PASSWORD}@{HOST}:{PORT}/{DATABASE}")

# Upload the data frames to the database using the results table
df_p_2020.to_sql("results", engine, if_exists="replace", index=False)
print("Presidential table loaded successfully")
df_s_2020.to_sql("results", engine, if_exists="append", index=False)
df_h_2020.to_sql("results", engine, if_exists="append", index=False)

# Commit the changes to the database
conn.commit()

# Close the database connection
conn.close()


Presidential table loaded successfully


In [22]:
# import psycopg2
# import pandas as pd

# Replace the values below with your database credentials
DATABASE = "electiondb"
USER = "postgres"
PASSWORD = "Spanques&Gum6"
HOST = "localhost"
PORT = "5432"

# Connect to the PostgreSQL database
conn = psycopg2.connect(database=DATABASE, user=USER, password=PASSWORD, host=HOST, port=PORT)

# Query the table and store the results in a Pandas dataframe
df = pd.read_sql_query("SELECT * FROM results", conn)

# Print the dataframe to verify that it contains data
print(df)

# Close the database connection
conn.close()
# gives 1128 rows for 2020

      Year Office State  Last Name Party    Vote %  Vote count
0     2020   Pres    AL      Trump     R  0.620316     1441170
1     2020   Pres    AL      Biden     D  0.365700      849624
2     2020   Pres    AL  Jorgensen   IND  0.010836       25176
3     2020   Pres    AK      Trump     R  0.528331      189951
4     2020   Pres    AK      Biden     D  0.427720      153778
...    ...    ...   ...        ...   ...       ...         ...
1123  2020  House    WI     Zunker     D  0.392140      162741
1124  2020  House    WI  Gallagher     R  0.641811      268173
1125  2020  House    WI      Stuck     D  0.357933      149558
1126  2020  House    WY     Cheney     R  0.685631      185732
1127  2020  House    WY  Grey Bull     D  0.245766       66576

[1128 rows x 7 columns]


In [None]:
# in postgreSQL created database called: electiondb
# create table in the postgresSQL first

# better table the generates its own primary key
# CREATE TABLE results (
#     year INTEGER NOT NULL,
#     office VARCHAR(50) NOT NULL,
#     state VARCHAR(50) NOT NULL,
#     last_name VARCHAR(50) NOT NULL,
#     party VARCHAR(50) NOT NULL,
#     vote_percentage NUMERIC(5, 2) NOT NULL,
#     vote_count BIGINT NOT NULL,
#     PRIMARY KEY (year, office, state, last_name, party)
# );


In [None]:
# query the data base
# SELECT * FROM results;
# SELECT * FROM results LIMIT 5;
# SELECT column_name
#     FROM information_schema.columns
#     WHERE table_name = 'results';
# SELECT 'Last Name' FROM results WHERE 'State'='CA';


In [31]:
# putting all the df together
# Concatenate the three data frames vertically
df_combo_2020 = pd.concat([df_p_2020, df_s_2020, df_h_2020], axis=0)

# Reset the index of the combined data frame
df_combo_2020.reset_index(drop=True, inplace=True)
df_combo_2020
# gives 1128 rows

Unnamed: 0,Year,Office,State,Last Name,Party,Vote %,Vote count
0,2020,Pres,AL,Trump,R,0.620316,1441170
1,2020,Pres,AL,Biden,D,0.365700,849624
2,2020,Pres,AL,Jorgensen,IND,0.010836,25176
3,2020,Pres,AK,Trump,R,0.528331,189951
4,2020,Pres,AK,Biden,D,0.427720,153778
...,...,...,...,...,...,...,...
1123,2020,House,WI,Zunker,D,0.392140,162741
1124,2020,House,WI,Gallagher,R,0.641811,268173
1125,2020,House,WI,Stuck,D,0.357933,149558
1126,2020,House,WY,Cheney,R,0.685631,185732


In [25]:
df_combo_2020['Year'] = 2020
df_combo_2020

Unnamed: 0,Year,Office,State,Last Name,Party,Vote %,Vote count
0,2020,Pres,AL,Trump,R,0.620316,1441170
1,2020,Pres,AL,Biden,D,0.365700,849624
2,2020,Pres,AL,Jorgensen,IND,0.010836,25176
3,2020,Pres,AK,Trump,R,0.528331,189951
4,2020,Pres,AK,Biden,D,0.427720,153778
...,...,...,...,...,...,...,...
1123,2020,House,WI,Zunker,D,0.392140,162741
1124,2020,House,WI,Gallagher,R,0.641811,268173
1125,2020,House,WI,Stuck,D,0.357933,149558
1126,2020,House,WY,Cheney,R,0.685631,185732


In [49]:
# map that defines the party wins by state for each office.
# Define colors for each party
colors = {'R': 'orchid', 'D': 'purple', 'IND': 'grey'}

# Define color labels
color_labels = {'R': 'Republican', 'D': 'Democrat', 'IND': 'Independent'}

# Aggregate the data by year, state, and office, and calculate the party with the highest vote percentage
df_state_2020_agg_house = df_combo_2020[df_combo_2020['Office']=='House'].groupby(['Year', 'State', 'Office']).apply(lambda x: x.loc[x['Vote %'].idxmax()])[['Vote %', 'Party']].reset_index()
df_state_2020_agg_senate = df_combo_2020[df_combo_2020['Office']=='Senate'].groupby(['Year', 'State', 'Office']).apply(lambda x: x.loc[x['Vote %'].idxmax()])[['Vote %', 'Party']].reset_index()
df_state_2020_agg_pres = df_combo_2020[df_combo_2020['Office']=='Pres'].groupby(['Year', 'State', 'Office']).apply(lambda x: x.loc[x['Vote %'].idxmax()])[['Vote %', 'Party']].reset_index()

# Map the winner of each state to colors
df_state_2020_agg_house['color'] = df_state_2020_agg_house['Party'].map(colors)
df_state_2020_agg_senate['color'] = df_state_2020_agg_senate['Party'].map(colors)
df_state_2020_agg_pres['color'] = df_state_2020_agg_pres['Party'].map(colors)

# Create choropleth maps with custom color labels for each office
fig_house = px.choropleth(df_state_2020_agg_house, 
                    locations='State',
                    locationmode='USA-states',
                    color='Party',
                    scope='usa',
                    color_discrete_map=colors,
                    hover_data={'State': True, 'Office': True, 'Vote %': True},
                    title='2020 US House Election Results')

fig_senate = px.choropleth(df_state_2020_agg_senate, 
                    locations='State',
                    locationmode='USA-states',
                    color='Party',
                    scope='usa',
                    color_discrete_map=colors,
                    hover_data={'State': True, 'Office': True, 'Vote %': True},
                    title='2020 US Senate Election Results')

fig_pres = px.choropleth(df_state_2020_agg_pres, 
                    locations='State',
                    locationmode='USA-states',
                    color='Party',
                    scope='usa',
                    color_discrete_map=colors,
                    hover_data={'State': True, 'Office': True, 'Vote %': True},
                    title='2020 US Presidential Election Results')

# Update the color scale labels for all three figures
for fig in [fig_house, fig_senate, fig_pres]:
    fig.update_layout(coloraxis_colorbar=dict(title='Party', tickvals=['R', 'D', 'IND'], 
                                          ticktext=[color_labels[x] for x in ['R', 'D', 'IND']]))

# Display all three figures
display(fig_house)
display(fig_senate)
display(fig_pres)

In [50]:
# map that defines the party wins by state for each office.
# Define colors for each party
colors = {'R': 'orchid', 'D': 'purple', 'IND': 'grey'}

# Define color labels
color_labels = {'R': 'Republican', 'D': 'Democrat', 'IND': 'Independent'}

# Aggregate the data by year, state, and office, and calculate the party with the highest vote percentage
df_state_2020_agg = df_combo_2020.groupby(['Year', 'State']).agg({'Vote %': 'mean', 'Party': 'first', 'Office': lambda x: ', '.join(sorted(x))}).reset_index()

# Map the winner of each state to colors
df_state_2020_agg['color'] = df_state_2020_agg['Party'].apply(lambda x: colors[x])

# Create choropleth map with custom color labels
fig = px.choropleth(df_state_2020_agg, 
                    locations='State',
                    locationmode='USA-states',
                    color='Party',
                    scope='usa',
                    color_discrete_map=colors,
                    hover_data={'State': True, 'Office': True, 'Vote %': True},
                    title='2020 US Election Results (Average of All Offices)')

# Update the color scale labels
fig.update_layout(coloraxis_colorbar=dict(title='Party', tickvals=['R', 'D', 'IND'], 
                                          ticktext=[color_labels[x] for x in ['R', 'D', 'IND']]))

# Show map
fig.show()


In [28]:
# map that defines the house party wins by state.
# Define colors for each party
colors = {'R': 'red', 'D': 'dodgerblue', 'IND': 'grey'}

# Define color labels
color_labels = {'R': 'Republican', 'D': 'Democrat', 'IND': 'Independent'}

# Aggregate the data by year, state, and office, and calculate the party with the highest vote percentage
df_state_2020_agg = df_combo_2020.groupby(['Year', 'State', 'Office']).apply(lambda x: x.loc[x['Vote %'].idxmax()])[['Vote %', 'Party']].reset_index()

# Map the winner of each state to colors
df_state_2020_agg['color'] = df_state_2020_agg['Party'].map(colors)

# Map the winner of each state to colors
df_state_2020_agg['color'] = df_state_2020_agg['Party'].apply(lambda x: colors[x])

# Create choropleth map with custom color labels
fig = px.choropleth(df_state_2020_agg, 
                    locations='State',
                    locationmode='USA-states',
                    color='Party',
                    scope='usa',
                    color_discrete_map=colors,
                    hover_data={'State': True, 'Office': True, 'Vote %': True},
                    title='2020 US House Election Results')

# Update the color scale labels
fig.update_layout(coloraxis_colorbar=dict(title='Party', tickvals=['R', 'D', 'IND'], 
                                          ticktext=[color_labels[x] for x in ['R', 'D', 'IND']]))

# Show map
fig.show()

In [29]:
# Group the data by state and find the winner
df_state_2020 = df_combo_2020.groupby('State').agg({'Vote %': 'idxmax', 'Party': 'first'})

# Define colors for each party
colors = {'R': 'red', 'D': 'dodgerblue', 'IND': 'grey'}

# Map the winner of each state to colors
df_state_2020['color'] = df_state_2020['Party'].map(colors)

# Add the state names to the DataFrame
df_state_2020['State'] = df_state_2020.index

# Define color labels
color_labels = {'R': 'Republican', 'D': 'Democrat', 'IND': 'Independent'}

# Create choropleth map with custom color labels
fig = px.choropleth(df_state_2020, 
                    locations='State',
                    locationmode='USA-states',
                    color='Party',
                    scope='usa',
                    color_discrete_map=colors,
                    title='2020 US Election Results')

# Update the color scale labels
fig.update_layout(coloraxis_colorbar=dict(title='Party', tickvals=['R', 'D', 'IND'], 
                                          ticktext=[color_labels[x] for x in ['R', 'D', 'IND']]))

# Show map
fig.show()


### Just trying something not a graph to use

In [30]:
# import plotly.express as px
# import pandas as pd

# Filter the data by selecting only the relevant columns
# df_selection = df_combo_2020[['LAST NAME', 'GENERAL %', 'GENERAL RESULTS', 'STATE ABBREVIATION']]

# Create map
fig = px.choropleth(df_combo_2020, locations='State', locationmode='USA-states',
                    scope='usa', color='Vote %',
                    color_continuous_scale='reds',
                    hover_name='Vote count',
                    title='Election Results 2020')

# Show map
fig.show()


In [4]:
# Filter the data by selecting only the relevant columns
df_selection = df_pres_2020[['LAST NAME', 'GENERAL %', 'GENERAL RESULTS', 'STATE ABBREVIATION'.'PARTY']]

# Create a new column that identifies the candidate as Trump, Biden, or "All others"
df_selection['Candidate'] = np.where(df_selection['LAST NAME'] == 'Trump', 'Trump', np.where(df_selection['LAST NAME'] == 'Biden', 'Biden', 'Others'))

# Calculate the percentage of the vote for each candidate
df_selection['Percentage of Vote'] = df_selection['GENERAL RESULTS'] / df_selection.groupby('STATE ABBREVIATION')['GENERAL RESULTS'].transform('sum')

# Define the desired order of columns
column_order = ['Biden', 'Trump', 'Others']

# Pivot the data frame
df_pivot = df_selection.pivot_table(values='Percentage of Vote', index='STATE ABBREVIATION', columns='Candidate')
# Reorder the columns
df_pivot = df_pivot.reindex(columns=column_order)
# Print the results
print(df_pivot)


Candidate              Biden     Trump    Others
STATE ABBREVIATION                              
AK                  0.427720  0.528331  0.007325
AL                  0.365700  0.620316  0.006992
AR                  0.347751  0.623957  0.002572
AZ                  0.493647  0.490560  0.002632
CA                  0.634844  0.343203  0.002439
CO                  0.553995  0.418979  0.001287
CT                  0.592607  0.391871  0.001411
DC                  0.921497  0.053973  0.004906
DE                  0.587430  0.397749  0.000780
FL                  0.478615  0.512198  0.000835
GA                  0.494731  0.492375  0.000759
HI                  0.637336  0.342689  0.004994
IA                  0.448917  0.530893  0.002524
ID                  0.330694  0.638434  0.002375
IL                  0.575416  0.405534  0.001003
IN                  0.409631  0.570306  0.001672
KS                  0.415086  0.561437  0.001956
KY                  0.361515  0.620866  0.001355
LA                  

In [None]:
# don't use this one
# loads a table called 'pres_2020' into electiondb in postgresSQL
df_p_2020.to_sql("pres_2020", engine, if_exists="replace", index=False, columns=["Year", "Office", "State", "Last Name", "Party", "Vote %", "Vote count"])

SELECT has_database_privilege('{postgres}', '{electiondb}', 'CONNECT');


In [124]:
df_state_2020['Vote count'] = pd.to_numeric(df_state_2020['Vote count'])
df_state_2020['Vote count_total'] = pd.to_numeric(df_state_2020['Vote count_total'])
df_state_2020['Final Vote %'] = df_state_2020['Final Vote %'].astype(float)

df_state_2020.dtypes

Year                 object
Office               object
State                object
Party                object
Vote %              float64
Vote count            int64
Vote count_total      int64
Final Vote %        float64
dtype: object

In [30]:
# Group the data by state and sum the general results
state_totals = df_pres_2020.groupby('STATE').sum()['GENERAL RESULTS']

# Sort the results by total votes in descending order
sorted_state_totals = state_totals.sort_values(ascending=False)

# Print the sorted results
print(sorted_state_totals)

STATE
California              17501380.0
Texas                   11315516.0
Florida                 11067456.0
New York                 8616861.0
Pennsylvania             6936976.0
Illinois                 6033744.0
Ohio                     5922202.0
Michigan                 5539302.0
North Carolina           5524804.0
Georgia                  4999960.0
New Jersey               4549457.0
Virginia                 4460524.0
Washington               4087631.0
Massachusetts            3631402.0
Arizona                  3387326.0
Wisconsin                3298041.0
Minnesota                3277171.0
Colorado                 3256980.0
Tennessee                3053851.0
Maryland                 3037030.0
Indiana                  3033210.0
Missouri                 3025962.0
South Carolina           2513329.0
Oregon                   2374321.0
Alabama                  2323282.0
Louisiana                2148062.0
Kentucky                 2136768.0
Connecticut              1823857.0
Iowa          

  state_totals = df_pres_2020.groupby('STATE').sum()['GENERAL RESULTS']
