## 2020 US election forecasting using polling data for simulations

### 1. Understanding our Dataset

In [1]:
import pandas as pd
import numpy as np
#  Read the csv file
df = pd.read_csv("us_poll_2020.csv")
df.head()

Unnamed: 0,question_id,poll_id,cycle,state,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,...,created_at,notes,url,stage,race_id,answer,candidate_id,candidate_name,candidate_party,pct
0,133816,71450,2020,Ohio,399,Rasmussen Reports/Pulse Opinion Research,,,Rasmussen Reports/Pulse Opinion Research,277.0,...,10/20/20 10:30,,https://www.rasmussenreports.com/public_conten...,general,6246,Biden,13256,Joseph R. Biden Jr.,DEM,48.0
1,133816,71450,2020,Ohio,399,Rasmussen Reports/Pulse Opinion Research,,,Rasmussen Reports/Pulse Opinion Research,277.0,...,10/20/20 10:30,,https://www.rasmussenreports.com/public_conten...,general,6246,Trump,13254,Donald Trump,REP,47.0
2,133838,71457,2020,Pennsylvania,399,Rasmussen Reports/Pulse Opinion Research,,,Rasmussen Reports/Pulse Opinion Research,277.0,...,10/20/20 13:21,,https://www.rasmussenreports.com/public_conten...,general,6249,Biden,13256,Joseph R. Biden Jr.,DEM,50.0
3,133838,71457,2020,Pennsylvania,399,Rasmussen Reports/Pulse Opinion Research,,,Rasmussen Reports/Pulse Opinion Research,277.0,...,10/20/20 13:21,,https://www.rasmussenreports.com/public_conten...,general,6249,Trump,13254,Donald Trump,REP,47.0
4,133777,71435,2020,Georgia,1102,Emerson College,1446.0,NewsNation,Emerson College,88.0,...,10/19/20 20:06,,https://emersonpolling.reportablenews.com/pr/g...,general,6221,Biden,13256,Joseph R. Biden Jr.,DEM,47.0


In [2]:
print("Number of rows = {} and number of columns = {}".format(df.shape[0], df.shape[1]))
print(df.columns)

Number of rows = 11909 and number of columns = 38
Index(['question_id', 'poll_id', 'cycle', 'state', 'pollster_id', 'pollster',
       'sponsor_ids', 'sponsors', 'display_name', 'pollster_rating_id',
       'pollster_rating_name', 'fte_grade', 'sample_size', 'population',
       'population_full', 'methodology', 'office_type', 'seat_number',
       'seat_name', 'start_date', 'end_date', 'election_date',
       'sponsor_candidate', 'internal', 'partisan', 'tracking',
       'nationwide_batch', 'ranked_choice_reallocated', 'created_at', 'notes',
       'url', 'stage', 'race_id', 'answer', 'candidate_id', 'candidate_name',
       'candidate_party', 'pct'],
      dtype='object')


| Colum|Description|  
|---|---|
| candidate_name	  |   The candidate for this answer choice| 
| start_date  |  The first day interviews were conducted for this poll | 
|end_date   | The last day interviews were conducted for this poll  |
|pollster|The name of the pollster|
|samplesize|The size of the sample|
|population|Whether the population interviewed was adults, registered voters, or likely voters|
|weight|A relative weight that describes how much this poll factors into the forecast relative to other polls|
|influence|A relative weight that describes how much this poll factors into today's the forecast (similar to "weight", but also takes into account how old the poll is)|
|pct|Voteshare for this candidate in this poll|
|house_adjusted_pct|	Voteshare in this poll after applying the house adjustment|
|trend_and_house_adjusted_pct|Voteshare in this poll after applying both house and trendline adjustments|
|tracking|Whether or not the poll sample overlaps with other polls in our database|
|poll_id|Unique identifier for a poll|
|question_id|Unique identifier for a question|



Although there are many useful  variables, we do not need all of them for our analysis.

In [3]:
# Select the required columns
df = df.loc[:, ['answer', 'start_date', 'end_date', 'state', 'pct', 'pollster', 'question_id', 'poll_id']]
df.head()

Unnamed: 0,answer,start_date,end_date,state,pct,pollster,question_id,poll_id
0,Biden,10/18/20,10/19/20,Ohio,48.0,Rasmussen Reports/Pulse Opinion Research,133816,71450
1,Trump,10/18/20,10/19/20,Ohio,47.0,Rasmussen Reports/Pulse Opinion Research,133816,71450
2,Biden,10/18/20,10/19/20,Pennsylvania,50.0,Rasmussen Reports/Pulse Opinion Research,133838,71457
3,Trump,10/18/20,10/19/20,Pennsylvania,47.0,Rasmussen Reports/Pulse Opinion Research,133838,71457
4,Biden,10/17/20,10/19/20,Georgia,47.0,Emerson College,133777,71435


Next, we will examine the data type of each column

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11909 entries, 0 to 11908
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   answer       11909 non-null  object 
 1   start_date   11909 non-null  object 
 2   end_date     11909 non-null  object 
 3   state        7363 non-null   object 
 4   pct          11909 non-null  float64
 5   pollster     11909 non-null  object 
 6   question_id  11909 non-null  int64  
 7   poll_id      11909 non-null  int64  
dtypes: float64(1), int64(2), object(5)
memory usage: 744.4+ KB


Based on the above information, we can conclude that:
1. The data type of the start and end date columns needs to be changed
2. There are many missing values in the state column

In [5]:
# Converting the data type of the start and end date columns
df = (df
      .assign(start_date = pd.to_datetime(df["start_date"]),
              end_date = pd.to_datetime(df['end_date']))
    )
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11909 entries, 0 to 11908
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   answer       11909 non-null  object        
 1   start_date   11909 non-null  datetime64[ns]
 2   end_date     11909 non-null  datetime64[ns]
 3   state        7363 non-null   object        
 4   pct          11909 non-null  float64       
 5   pollster     11909 non-null  object        
 6   question_id  11909 non-null  int64         
 7   poll_id      11909 non-null  int64         
dtypes: datetime64[ns](2), float64(1), int64(2), object(3)
memory usage: 744.4+ KB


### Data Cleaning and Exploring.
We will begin our data exploration by describing the numerical variables and examining the categorical variables. Additionally, we will review the date time columns and filter for the required dates.

In [6]:
df.pct.describe()

count    11909.000000
mean        43.335947
std         12.748563
min          0.000000
25%         41.000000
50%         45.000000
75%         50.000000
max         90.530000
Name: pct, dtype: float64

In [7]:
print("The total number of candidate in 2020 = {}".format(len(df.answer.unique())))
df.answer.value_counts()

The total number of candidate in 2020 = 50


Trump            5520
Biden            4007
Sanders           435
Warren            348
Jorgensen         347
Hawkins           253
Buttigieg         233
Harris            158
Bloomberg         103
Klobuchar          76
O'Rourke           59
Pence              57
West               41
Booker             34
Schultz            30
Haley              20
Castro             17
Yang               16
Amash              13
Blankenship        13
Steyer             11
Gillibrand         11
Gabbard            10
Williamson          8
De La Fuente        8
Delaney             7
Hickenlooper        6
Inslee              6
Obama               6
de Blasio           5
Messam              5
Bullock             4
Moulton             4
Clinton             4
Gravel              4
Pierce              4
Bennet              3
Winfrey             3
Swalwell            3
Brown               3
Simmons             2
La Riva             2
Ryan                2
Pelosi              2
Hornberger          1
Cuomo     

There are a total of 50 candidates, but only Trump and Biden are considered major candidates. Therefore, for the purpose of our analysis, we will focus on Trump and Biden. 


In [8]:
df = (df
      .query("answer in ['Trump', 'Biden']"))
df.answer.value_counts()

Trump    5520
Biden    4007
Name: answer, dtype: int64

In [9]:
print(" The minimum and maximum poll start dates are {} and {} respectively".format(df.start_date.min(), df.start_date.max()))
print(" The minimum and maximum poll end dates are {} and {} respectively".format(df.end_date.min(), df.end_date.max()))

 The minimum and maximum poll start dates are 2018-11-12 00:00:00 and 2020-10-18 00:00:00 respectively
 The minimum and maximum poll end dates are 2018-11-13 00:00:00 and 2020-10-19 00:00:00 respectively


For the purpose of our analysis, we will consider poll data that starts or ends between August 1st. 2020 and November 2nd, 2020

In [10]:
start_date = '2020-08-01'
end_date = '2020-11-02'
df = (df
      .query("start_date>=@start_date & end_date<=@end_date"))
print(" The minimum and maximum poll start dates are {} and {} respectively".format(df.start_date.min(), df.start_date.max()))
print(" The minimum and maximum poll end dates are {} and {} respectively".format(df.end_date.min(), df.end_date.max()))

 The minimum and maximum poll start dates are 2020-08-01 00:00:00 and 2020-10-18 00:00:00 respectively
 The minimum and maximum poll end dates are 2020-08-03 00:00:00 and 2020-10-19 00:00:00 respectively


In [11]:
print("Now the total number of rows and columns are {} and {} respectively".format(df.shape[0], df.shape[1]))

Now the total number of rows and columns are 4648 and 8 respectively


Let's examine the null values in the state column. 

In [12]:
print("The total number of states in our data are = {}".format(len(df.state.unique())))
df.state.value_counts()

The total number of states in our data are = 55


Wisconsin               228
Pennsylvania            186
North Carolina          164
Florida                 152
Arizona                 148
Michigan                138
Georgia                 102
Texas                    92
Ohio                     80
Minnesota                76
Iowa                     66
Colorado                 62
New Hampshire            58
Maine                    58
South Carolina           56
Nevada                   56
California               52
Virginia                 50
Missouri                 50
Montana                  50
Kansas                   48
New Jersey               48
Louisiana                44
Kentucky                 44
Alaska                   44
Maryland                 44
Washington               42
Alabama                  42
Delaware                 40
Oklahoma                 40
New York                 40
Utah                     40
North Dakota             40
West Virginia            40
Oregon                   40
Indiana             

Based on the above analysis, it appears that there are more states in our dataset than there are actual states in the United states. To address this, we will remove the CD-1 and CD-2 states and treat the poll as if it covers the entire state. 

In [13]:
df['state'] = (df['state']
                .str
                .replace(r' CD-1', '')
                .str
                .replace(r' CD-2', ''))
print("The total number of states in our data are = {}".format(len(df.state.unique())))
df.state.value_counts().sort_index()

The total number of states in our data are = 52


Alabama                  42
Alaska                   44
Arizona                 148
Arkansas                 38
California               52
Colorado                 62
Connecticut              36
Delaware                 40
District of Columbia     36
Florida                 152
Georgia                 102
Hawaii                   38
Idaho                    38
Illinois                 38
Indiana                  40
Iowa                     66
Kansas                   48
Kentucky                 44
Louisiana                44
Maine                    94
Maryland                 44
Massachusetts            40
Michigan                138
Minnesota                76
Mississippi              38
Missouri                 50
Montana                  50
Nebraska                 42
Nevada                   56
New Hampshire            58
New Jersey               48
New Mexico               40
New York                 40
North Carolina          164
North Dakota             40
Ohio                

In [14]:
# We need to remove all rows with null values in the state column.
df = df.dropna()

In [15]:
print("The dimensions of the DataFrame after cleaning = {}".format(df.shape))
df.info()

The dimensions of the DataFrame after cleaning = (3202, 8)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3202 entries, 0 to 5276
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   answer       3202 non-null   object        
 1   start_date   3202 non-null   datetime64[ns]
 2   end_date     3202 non-null   datetime64[ns]
 3   state        3202 non-null   object        
 4   pct          3202 non-null   float64       
 5   pollster     3202 non-null   object        
 6   question_id  3202 non-null   int64         
 7   poll_id      3202 non-null   int64         
dtypes: datetime64[ns](2), float64(1), int64(2), object(3)
memory usage: 225.1+ KB


In [16]:
df.head()

Unnamed: 0,answer,start_date,end_date,state,pct,pollster,question_id,poll_id
0,Biden,2020-10-18,2020-10-19,Ohio,48.0,Rasmussen Reports/Pulse Opinion Research,133816,71450
1,Trump,2020-10-18,2020-10-19,Ohio,47.0,Rasmussen Reports/Pulse Opinion Research,133816,71450
2,Biden,2020-10-18,2020-10-19,Pennsylvania,50.0,Rasmussen Reports/Pulse Opinion Research,133838,71457
3,Trump,2020-10-18,2020-10-19,Pennsylvania,47.0,Rasmussen Reports/Pulse Opinion Research,133838,71457
4,Biden,2020-10-17,2020-10-19,Georgia,47.0,Emerson College,133777,71435


We will reformat the data into the required format. The data currently contains two rows for each poll, one for Biden and one for Trump. For out analysis, we will combine these two rows into a single row.

In [17]:
df = (df
     .pivot_table(index=['start_date', 'end_date', 'state', 'poll_id', 'pollster', 'question_id'], columns=['answer'], values='pct')
     .reset_index())
df.head()

answer,start_date,end_date,state,poll_id,pollster,question_id,Biden,Trump
0,2020-08-01,2020-08-05,Texas,67958,Trafalgar Group,127253,43.4,49.4
1,2020-08-01,2020-08-10,Wisconsin,68437,Morning Consult,128123,50.0,43.0
2,2020-08-01,2020-08-31,Alabama,70395,SurveyMonkey,131146,33.0,65.0
3,2020-08-01,2020-08-31,Alabama,70395,SurveyMonkey,131197,36.69,60.1
4,2020-08-01,2020-08-31,Alaska,70396,SurveyMonkey,131147,42.0,57.0


### Let's Understand the US Electoral process
The USA Electoral Map
<br>
There are 538 seats and 270 to win
<br>
![alt text](image.jpg)

In [18]:
states = {
    "Alabama": 9,
    "Alaska": 3,
    "Arizona": 11,
    "Arkansas": 6,
    "California": 55,
    "Colorado": 9,
    "Connecticut": 7,
    "Delaware": 3,
    "District of Columbia": 3,
    "Florida": 29,
    "Georgia": 16,
    "Hawaii": 4,
    "Idaho": 4,
    "Illinois": 20,
    "Indiana": 11,
    "Iowa": 6,
    "Kansas": 6,
    "Kentucky": 8,
    "Louisiana": 8,
    "Maine": 4,
    "Maryland": 10,
    "Massachusetts": 11,
    "Michigan": 16,
    "Minnesota": 10,
    "Mississippi": 6,
    "Missouri": 10,
    "Montana": 3,
    "Nebraska": 5,
    "Nevada": 6,
    "New Hampshire": 4,
    "New Jersey": 14,
    "New Mexico": 5,
    "New York": 29,
    "North Carolina": 15,
    "North Dakota": 3,
    "Ohio": 18,
    "Oklahoma": 7,
    "Oregon": 7,
    "Pennsylvania": 20,
    "Rhode Island": 4,
    "South Carolina": 9,
    "South Dakota": 3,
    "Tennessee": 11,
    "Texas": 38,
    "Utah": 6,
    "Vermont": 3,
    "Virginia": 13,
    "Washington": 12,
    "West Virginia": 5,
    "Wisconsin": 10,
    "Wyoming": 3,
}

In [19]:
df = df.assign(seats = lambda row: row['state'].map(states))
df.head()

answer,start_date,end_date,state,poll_id,pollster,question_id,Biden,Trump,seats
0,2020-08-01,2020-08-05,Texas,67958,Trafalgar Group,127253,43.4,49.4,38
1,2020-08-01,2020-08-10,Wisconsin,68437,Morning Consult,128123,50.0,43.0,10
2,2020-08-01,2020-08-31,Alabama,70395,SurveyMonkey,131146,33.0,65.0,9
3,2020-08-01,2020-08-31,Alabama,70395,SurveyMonkey,131197,36.69,60.1,9
4,2020-08-01,2020-08-31,Alaska,70396,SurveyMonkey,131147,42.0,57.0,3


### Let's visualise data

In [20]:
import plotly.express as px
df_diff =  (df
            .groupby('start_date')
            .agg({'Biden':'mean',
                  'Trump':'mean'})
            .reset_index()
            .assign(diff =  lambda x: x['Biden'] - x['Trump'])
            )
df_diff.head()


ModuleNotFoundError: No module named 'plotly'

In [None]:
import plotly.express as px
fig = px.line(df_diff,
              x='start_date',
              y='diff',
              title='Biden vs Trump Difference'
             )
fig.update_layout(
    xaxis_title="Start date of the poll",
    yaxis_title='',
    plot_bgcolor='white'
)
fig.show()

We add center and standard deviation to our averaged polling data

In [None]:
def get_center(row):
    return row["Biden"] + (100 - row["Trump"] - row["Biden"]) / 2

def get_deviations(row):
    return (abs(row['Trump'] - row["Center"]) / 2) + 3

df = (df
      .assign(Center = lambda x: get_center(x),
              Stddev = lambda x: get_deviations(x)))
df.head()

answer,start_date,end_date,state,poll_id,pollster,question_id,Biden,Trump,seats,Center,Stddev
0,2020-08-01,2020-08-05,Texas,67958,Trafalgar Group,127253,43.4,49.4,38,47.0,4.2
1,2020-08-01,2020-08-10,Wisconsin,68437,Morning Consult,128123,50.0,43.0,10,53.5,8.25
2,2020-08-01,2020-08-31,Alabama,70395,SurveyMonkey,131146,33.0,65.0,9,34.0,18.5
3,2020-08-01,2020-08-31,Alabama,70395,SurveyMonkey,131197,36.69,60.1,9,38.295,13.9025
4,2020-08-01,2020-08-31,Alaska,70396,SurveyMonkey,131147,42.0,57.0,3,42.5,10.25


#### Now we're going average all polls
This isn't the best method as it disregards any weighting to better polls and assumes all area equal. In a more advanced study we can use metrics like fte_grade, sample_size and methodology to weight polls.

In [None]:
poll_state_summary = (df
                      .groupby('state')
                      .mean()
                      .reset_index())
poll_state_summary.head()

answer,state,poll_id,question_id,Biden,Trump,seats,Center,Stddev
0,Alabama,70950.428571,132664.047619,39.769048,57.161429,9.0,41.30381,10.92881
1,Alaska,70976.818182,132717.727273,44.02,52.478636,3.0,45.770682,6.558523
2,Arizona,70060.324324,130928.810811,49.248514,45.155946,11.0,52.046284,6.552061
3,Arkansas,71027.052632,132858.421053,38.753684,58.801579,6.0,39.976053,12.412763
4,California,70459.692308,131683.423077,62.081923,33.621154,55.0,64.230385,18.304615


### Simulation Time!
We're going to generate a random number from the distribution.

If it's above 50, we'll say Biden won the state.

If it's below 50, we'll say Biden lost the state.

If Biden wins the state, he gets the number of electoral seats that state represents.

If he doesn't win he get's nothing.....just the way it is, live with it.

This will give us a nice little series with the numbers representing the electoral seats she won from each state.

The output table below shows for each state (in a each row), the number of seats Biden one. The column represents the results of one simulation.

In [None]:
from numpy.random import normal
import ipykernel
ipykernel.__version__
def run_election_simulation(times, polls):
    # create a blank data frame
    result = pd.DataFrame()
    
    def get_seats(row):
        # using normal distribution for given mean and std
        # if the normal distribution value is greater than 50 then that indicates a win for Biden
        sample = normal(row['Center'], row['Stddev'])
        return row["seats"] if sample > 50 else 0
    for i in range(times):
        result[i] = polls.apply(get_seats, axis=1)
    
    return result

# given an example of the output
res = run_election_simulation(10, poll_state_summary)
res.loc['Total',:] = res.sum(axis=0)

# Adding State Name to Results DF
res = pd.concat([poll_state_summary['state'], res], axis=1, sort=False)

# A 0.0 means Biden didn't win that seat, or that Trump won that seat
res

Unnamed: 0,state,0,1,2,3,4,5,6,7,8,9
0,Alabama,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Alaska,0.0,3.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0
2,Arizona,11.0,0.0,11.0,11.0,11.0,11.0,11.0,0.0,0.0,11.0
3,Arkansas,0.0,0.0,0.0,0.0,0.0,6.0,0.0,6.0,0.0,0.0
4,California,55.0,55.0,0.0,55.0,0.0,55.0,0.0,55.0,55.0,55.0
5,Colorado,9.0,9.0,0.0,9.0,0.0,9.0,9.0,9.0,9.0,9.0
6,Connecticut,0.0,0.0,7.0,7.0,7.0,0.0,7.0,0.0,7.0,0.0
7,Delaware,0.0,3.0,3.0,3.0,0.0,3.0,3.0,3.0,3.0,0.0
8,District of Columbia,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
9,Florida,29.0,29.0,29.0,29.0,29.0,0.0,0.0,29.0,29.0,29.0


### Let's analyze the above simulated results on a per state level

In [None]:

# Convert values to intergers
summary = pd.DataFrame((res == 0).astype(int).sum(axis=1))

# Add back states to leftmost column
res_summary = pd.concat([poll_state_summary['state'], summary], axis=1, sort=False)
res_summary.columns = ['state', 'trump_wins']

# Get a percentage of times Trump Won in the simulations
res_summary['percent_trump_wins'] = (res_summary['trump_wins'] / 10) * 100
res_summary.head()
     

Unnamed: 0,state,trump_wins,percent_trump_wins
0,Alabama,9,90.0
1,Alaska,7,70.0
2,Arizona,3,30.0
3,Arkansas,8,80.0
4,California,3,30.0


### THIS WHERE THE SIMULATIONS HAPPEN (10K)

In [None]:
number_of_simulations = 10000

election_results = run_election_simulation(number_of_simulations, poll_state_summary)
election_results.loc['Total',:] = election_results.sum(axis=0)

biden_wins = election_results.loc['Total'][election_results.loc['Total'] > 270.0].count()

# Print output
f"Joe Biden wins {biden_wins} times out of {number_of_simulations} simulations"


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`



'Joe Biden wins 8039 times out of 10000 simulations'

#### Let's now summarize it per state

In [None]:

summary = pd.DataFrame((election_results == 0).astype(int).sum(axis=1))
res_summary = pd.concat([poll_state_summary['state'], summary], axis=1, sort=False)
res_summary.columns = ['state', 'trump_wins']
res_summary['percent_trump_wins'] = (res_summary['trump_wins'] / number_of_simulations) * 100
res_summary['percent_biden_wins'] = 100 - res_summary['percent_trump_wins']
res_summary = res_summary.dropna()
res_summary.head()#.sort_values(by="trump_wins")
     

Unnamed: 0,state,trump_wins,percent_trump_wins,percent_biden_wins
0,Alabama,7824,78.24,21.76
1,Alaska,7422,74.22,25.78
2,Arizona,3739,37.39,62.61
3,Arkansas,7793,77.93,22.07
4,California,2186,21.86,78.14


In [None]:

def getSeats(row):
    return states[row["state"]]

res_summary["seats"] = res_summary.apply(getSeats, axis=1)

res_summary['seats_trump_wins'] = res_summary['percent_trump_wins'].apply(lambda x: 1 if x > 50 else 0)
res_summary['seats_trump_wins'] = res_summary['seats_trump_wins'] * res_summary['seats']

res_summary['seats_biden_wins'] = res_summary['percent_biden_wins'].apply(lambda x: 1 if x > 50 else 0)
res_summary['seats_biden_wins'] = res_summary['seats_biden_wins'] * res_summary['seats']
seats_trump_wins = res_summary['seats_trump_wins'].sum()
seats_biden_wins = res_summary['seats_biden_wins'].sum()
print(seats_biden_wins)
print(seats_trump_wins)
res_summary.head()
     

350
188


Unnamed: 0,state,trump_wins,percent_trump_wins,percent_biden_wins,seats,seats_trump_wins,seats_biden_wins
0,Alabama,7824,78.24,21.76,9,9,0
1,Alaska,7422,74.22,25.78,3,3,0
2,Arizona,3739,37.39,62.61,11,0,11
3,Arkansas,7793,77.93,22.07,6,6,0
4,California,2186,21.86,78.14,55,0,55


In [None]:
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands':'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'
}

res_summary = res_summary.replace({"state": us_state_abbrev}) 
res_summary.head()
     


Unnamed: 0,state,trump_wins,percent_trump_wins,percent_biden_wins,seats,seats_trump_wins,seats_biden_wins
0,AL,7824,78.24,21.76,9,9,0
1,AK,7422,74.22,25.78,3,3,0
2,AZ,3739,37.39,62.61,11,0,11
3,AR,7793,77.93,22.07,6,6,0
4,CA,2186,21.86,78.14,55,0,55


In [None]:

import plotly.graph_objects as go

fig = go.Figure(data=go.Choropleth(
    locations = res_summary['state'], # Spatial coordinates
    z = res_summary['percent_trump_wins'].astype(float), # Data to be color-coded
    locationmode = 'USA-states', # set of locations match entries in `locations`
    colorscale = 'Bluered',
    colorbar_title = "Trump",
))

title = f'{number_of_simulations} Simulations of the 2020 US Election show Biden winning {seats_biden_wins} and Trump winning {seats_trump_wins} Seats'
fig.update_layout(
    title_text = title, #'2020 Simulation of The US Presidential Election',
    geo_scope='usa', # limite map scope to USA
)

fig.show()