# ISOM 3400 Group Assignment
### Data Visualization and Analysis on *U.S. 2022 House election*
Programmed & Presented by Regan Yin


# Part I: Web Scraping

## Step 1: Import Needed Libraries

In [1]:
# Import packages
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import ElementClickInterceptedException

## Step 2: Prepare for Driver

In [2]:
s = Service(ChromeDriverManager().install())
option = webdriver.ChromeOptions()
prefs = {'profile.default_content_setting_values': {'cookies': 1, 'images': 2, 'javascript': 1,
                            'plugins': 2, 'popups': 2, 'geolocation': 2,
                            'notifications': 2, 'auto_select_certificate': 2, 'fullscreen': 2,
                            'mouselock': 2, 'mixed_script': 2, 'media_stream': 2,
                            'media_stream_mic': 2, 'media_stream_camera': 2, 'protocol_handlers': 2,
                            'ppapi_broker': 2, 'automatic_downloads': 2, 'midi_sysex': 2,
                            'push_messaging': 2, 'ssl_cert_decisions': 1, 'metro_switch_to_desktop': 2,
                            'protected_media_identifier': 2, 'app_banner': 2, 'site_engagement': 2,
                            'durable_storage': 2}}
option.add_experimental_option("prefs", prefs)
option.add_argument("start-maximized")
option.add_argument("disable-infobars")
option.add_argument("disable-extensions")
# fast mode
driver = webdriver.Chrome(service=s, options=option)
# no options
# driver = webdriver.Chrome(service=s)

## Step 3: Create a function for getting election data

In [3]:
# Create a dictioary of postal code for each state

state_dict = { 'Alabama': 'AL',
        'Alaska': 'AK',
        'Arizona': 'AZ',
        'Arkansas': 'AR',
        'California': 'CA',
        'Colorado': 'CO',
        'Connecticut': 'CT',
        'Delaware': 'DE',
        'Florida': 'FL',
        'Georgia': 'GA',
        'Hawaii': 'HI',
        'Idaho': 'ID',
        'Illinois': 'IL',
        'Indiana': 'IN',
        'Iowa': 'IA',
        'Kansas': 'KS',
        'Kentucky': 'KY',
        'Louisiana': 'LA',
        'Maine': 'ME',
        'Maryland': 'MD',
        'Massachusetts': 'MA',
        'Michigan': 'MI',
        'Minnesota': 'MN',
        'Mississippi': 'MS',
        'Missouri': 'MO',
        'Montana': 'MT',
        'Nebraska': 'NE',
        'Nevada': 'NV',
        'New Hampshire': 'NH',
        'New Jersey': 'NJ',
        'New Mexico': 'NM',
        'New York': 'NY',
        'North Carolina': 'NC',
        'North Dakota': 'ND',
        'Ohio': 'OH',
        'Oklahoma': 'OK',
        'Oregon': 'OR',
        'Pennsylvania': 'PA',
        'Rhode Island': 'RI',
        'South Carolina': 'SC',
        'South Dakota': 'SD',
        'Tennessee': 'TN',
        'Texas': 'TX',
        'Utah': 'UT',
        'Vermont': 'VT',
        'Virginia': 'VA',
        'Washington': 'WA',
        'West Virginia': 'WV',
        'Wisconsin': 'WI',
        'Wyoming': 'WY'}


In [4]:
# function: get election data from given diver 
def get_election_data(driver,state):

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    result_list = []

    # 1.get portal code
    portal_result = soup.select_one('div.styles_container__T4_VX')
    postal_code = '/'
    try:
        postal_code = portal_result.find('p', {'class': 'styles_district-number__5i_rg'}).get_text(strip=True)[
                  0:2]  # not every state show the postal code
    except:
        print('no portalcode')

    # 2.get every district group
    district_list = soup.select('div.styles_left-title__rNUfI')
    if len (district_list) != 0:
        district_list = soup.select('div.styles_left-title__rNUfI')
        
    # loop, the first tag is not data
        for district in district_list[1:]:
            # 3.get district rank and
            rank = district.find('h5', {'class': 'styles_is-table__D_lzx'}).get_text(strip=True)
            row_list = district.find_all('tr')
            # 4.get every people row of district group
            for index, row in enumerate(row_list[1:]):
                # print(index)
                # print('rowget_text: ' + row.get_text())

                # got name: Rogers* (R), its not a standard data
                name = row.find('span').get_text(strip=True)
                if "Hide other candidates" in name:
                    continue  # Skip rows with "Hide other candidates"
                # get party and incumbent data ,calculate the length we del from name
                party = row.find('span', {'class': 'styles_tag__5jkDh'})
                no_vote = row.find('td', {'class': 'styles_container__MY5SI'})
                percentage = row.find('td', {'class': 'styles_container__vzwvV'})
                del_str = 0
                # Some people don't have
                if party:
                    party = party.get_text()
                    if party == ("(R)"):
                        party = "Republican"
                        del_str = -3
                    elif party == ("(D)"):
                        party = "Democratic"
                        del_str = -3
                    elif party == ("(Ind.)"):
                        party = "Independent"
                        del_str = -6
                    elif party == ("(Green Party)"):
                        party = "Green Party"
                        del_str = -13
                    elif party == ("(Libertarian Party)"):
                        party = "Libertarian Party"
                        del_str = -19
                    else:
                        party = party
                        del_str = 0 - len(party)
                else:
                    print('no party')
                    party = "/"

                # the first row is incumbent
                if index == 0:
                    incum = "incumbent"
                    del_str -= 1
                else:
                    incum = "/"

                if del_str != 0:
                        name = name[:del_str]

                if no_vote:
                    no_vote = no_vote.get_text(strip=True)
                else:
                    no_vote = '/'

                if percentage:
                    percentage = percentage.get_text(strip=True)
                else:
                    percentage = '/'
                # print('result: ' + state, postal_code, rank, name, incum, party, no_vote, percentage)
                # result
                mapping = state_dict

                postal_code = mapping[state]
                result_list.append([state, postal_code, rank, party, name, incum, no_vote, percentage])

        return result_list
    
    
    else:
        district_list = soup.select_one('div.styles_is-desktop__oAx3v')
    
        rank = '1st'
        row_list = district_list.find_all('tr')
            # 4.get every people row of district group
        for index, row in enumerate(row_list[1:]):
            # print(index)
            # print('rowget_text: ' + row.get_text())

            # got name: Rogers* (R), its not a standard data
            name = row.find('span').get_text(strip=True)
            if "Hide other candidates" in name:
                continue  # Skip rows with "Hide other candidates"
            # get party and incumbent data ,calculate the length we del from name
            party = row.find('span', {'class': 'styles_tag__5jkDh'})
            no_vote = row.find('td', {'class': 'styles_container__MY5SI'})
            percentage = row.find('td', {'class': 'styles_container__vzwvV'})
            del_str = 0
            # Some people don't have
            if party:
                party = party.get_text()
                if party == ("(R)"):
                    party = "Republican"
                    del_str = -3
                elif party == ("(D)"):
                    party = "Democratic"
                    del_str = -3
                elif party == ("(Ind.)"):
                    party = "Independent"
                    del_str = -6
                elif party == ("(Green Party)"):
                    party = "Green Party"
                    del_str = -13
                elif party == ("(Libertarian Party)"):
                    party = "Libertarian Party"
                    del_str = -19
                else:
                    party = party
                    del_str = 0 - len(party)
            else:
                print('no party')
                party = "/"

            # the first row is incumbent
            if index == 0:
                incum = "incumbent"
                del_str -= 1
            else:
                incum = "/"

            if del_str != 0:
                name = name[:del_str]

            if no_vote:
                no_vote = no_vote.get_text(strip=True)
            else:
                no_vote = '/'

            if percentage:
                percentage = percentage.get_text(strip=True)
            else:
                percentage = '/'
            # print('result: ' + state, postal_code, rank, name, incum, party, no_vote, percentage)
            # result
            mapping = state_dict

            postal_code = mapping[state]
            result_list.append([state, postal_code, rank, party, name, incum, no_vote, percentage])

    return result_list

In [5]:
def wait_for_loading_to_disappear(driver, timeout=10):
    try:
        WebDriverWait(driver, timeout).until(
            EC.invisibility_of_element((By.CLASS_NAME, "styles_loading-screen__mEgJy"))
        )
    except TimeoutException:
        print("Loading screen did not disappear within the given time")

## Step 4: Web Scrapping

In [6]:
def to_dataframe(nested_ls):
    election_df = pd.DataFrame(nested_ls,
        columns=['State', 'State Code', 'District', 'Party', 'Candidate', 'Incumbent', 'Vote', 'Pct%'])
    return election_df

In [7]:
# params
state_list = []
election_info = []


In [8]:
# get state list
page_link = 'https://www.politico.com/2022-election/results/'  # a sample that the following code is correct
driver.get(page_link)
driver.implicitly_wait(8)
soup = BeautifulSoup(driver.page_source, 'html.parser')
data = soup.select('#__next > div > div.styles_container__IP0HW.styles_is-desktop__eeJ39 > div.styles_content__cDWDJ.styles_is-desktop__eeJ39 > div > div:nth-child(17) > div > div > div > div > div > div > div.styles_columns-container__Kq5vS > div > a')
for item in data:
    state_list.append(item.get_text())
print(state_list)


['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']


In [9]:
# loop, get data from every state
for state in state_list:
    lo = state.lower()
    lo = lo.replace(" ", "-")
    url = "https://www.politico.com/2022-election/results/{}/house/".format(lo)
    driver.get(url)
    driver.implicitly_wait(3)
    # select expand button
    try:
        # Wait until the expand button is clickable
        expand_button = WebDriverWait(driver, 2).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button.styles_container__C5E_O"))
        )
        expand_button.click()
    except ElementClickInterceptedException:
        print("Element is not clickable yet, retrying...")
    except Exception as e:
        print('no button')
        # No expand button found, so just get the election data for the single district
        election_info += get_election_data(driver, state)
    else:
        if ("more races" in expand_button.text):
            # click the button to expand
            expand_button.click()
            election_info += get_election_data(driver, state)
        if ('Other candidates' in expand_button.text):
            # some district have more than 4 candidate and need to expend the list inside each district
            expand_button.click()
            election_info += get_election_data(driver, state)
        else:
            # Expand button not needed, get election data for the single district
            election_info += get_election_data(driver, state)


no party
no portalcode
no party
no button
no party
no party
no party
no party
no button
no button
no portalcode
no party
no party
no party
no party
no party
no party
no party
no party
no party
no button
no button
no party
no button
no party
no button
no button
no party
no party
no party
no party
no party
no party
no party
no party
no party
no party
no party
no party
no party
no party
no party
no button
no button
no button
no party
no button
no party
no button
no party
no button
no party
no party
no party
no party
no button
no portalcode
no button
no party
no party
no party
no party
no button
no party
no button
no portalcode
no party
no party
no party
no party
no party
no portalcode
no button
no button
no portalcode
no party


In [None]:

file_data = to_dataframe(election_info)
file_data.to_csv('house.csv')
print(file_data)

         State State Code District              Party          Candidate  \
0      Alabama         AL      1st         Republican               Carl   
1      Alabama         AL      1st  Libertarian Party             Remrey   
2      Alabama         AL      2nd         Republican              Moore   
3      Alabama         AL      2nd         Democratic        Harvey-Hall   
4      Alabama         AL      2nd  Libertarian Party              Realz   
..         ...        ...      ...                ...                ...   
957  Wisconsin         WI      8th  Libertarian Party         VandenPlas   
958    Wyoming         WY      1st         Republican     Harriet Hagema   
959    Wyoming         WY      1st         Democratic  Lynnette GreyBull   
960    Wyoming         WY      1st  Libertarian Party   Richard Brubaker   
961    Wyoming         WY      1st                  /     Marissa Selvig   

     Incumbent     Vote   Pct%  
0    incumbent  139,854  84.2%  
1            /   26,1

# Part II: Data Visualization


## Step 1: Import Needed Libraries

In [11]:
from jupyter_dash import JupyterDash
from dash import html, dcc
from dash.dependencies import Input, Output
import pandas as pd
import plotly.express as px
import numpy as np

## Step 2: Preprocessing Data

In [12]:
# Read and preprocess the data
house = pd.read_csv('house.csv')

# Convert Vote column to numeric and remove commas
house['Vote'] = house['Vote'].str.replace(',', '')

# Convert non-numeric values to NaN and remove rows with NaN in the 'Vote' column
house['Vote'] = pd.to_numeric(house['Vote'], errors='coerce')
house = house.dropna(subset=['Vote'])

# Convert cleaned 'Vote' column to integer
house['Vote'] = house['Vote'].astype('int')

# Convert Pct% column to numeric
house['Pct%'] = house['Pct%'].str.rstrip('%').astype('float') / 100.0

# Group by State, State Code, and District
grouped = house.groupby(['State', 'State Code', 'District'])

# Create a new DataFrame for the seat_won dataset
seat_won = pd.DataFrame(columns=['State', 'State Code', 'Total Seats', 'Total Votes', 'Won Seats', 'Republican Votes', 'Democratic Votes', 'Libertarian Votes', 'Rep. Won Seats %'])

# Iterate over each state and state code
for (state, state_code), state_data in house.groupby(['State', 'State Code']):
    total_seats = state_data['District'].nunique()
    total_votes = state_data['Vote'].sum()
    
    # Count the won seats for Republicans
    won_seats = 0
    for district, district_data in state_data.groupby('District'):
        max_pct_idx = district_data['Pct%'].idxmax()
        if house.loc[max_pct_idx, 'Party'] == 'Republican':
            won_seats += 1
    
    republican_votes = state_data.loc[state_data['Party'] == 'Republican', 'Vote'].sum()
    democratic_votes = state_data.loc[state_data['Party'] == 'Democratic', 'Vote'].sum()
    libertarian_votes = state_data.loc[state_data['Party'] == 'Libertarian Party', 'Vote'].sum()
    rep_won_seats_pct = round((won_seats / total_seats) * 100, 2)
    won_party = 'Republican' if won_seats > (total_seats/2) else 'Democratic'
    new_row = pd.DataFrame({'State': [state],
                            'State Code': [state_code],
                            'Total Seats': [total_seats],
                            'Total Votes': [total_votes],
                            'Won Seats': [won_seats],
                            'Republican Votes': [republican_votes],
                            'Democratic Votes': [democratic_votes],
                            'Libertarian Votes': [libertarian_votes],
                            'Rep. Won Seats %': [rep_won_seats_pct],
                            'Won Party': [won_party]})
    
    seat_won = pd.concat([seat_won, new_row], ignore_index=True)
    seat_won['Rep. Won Seats %'] = seat_won['Rep. Won Seats %'].astype(float)

# Display the seat_won DataFrame
print(seat_won)

             State State Code Total Seats Total Votes Won Seats  \
0          Alabama         AL           7     1336452         6   
1           Alaska         AK           1      249148         0   
2          Arizona         AZ           7     1957922         4   
3         Arkansas         AR           4      892785         4   
4       California         CA          52    10656359        12   
5         Colorado         CO           4     1313101         2   
6      Connecticut         CT           5     1261811         0   
7         Delaware         DE           1      321649         0   
8          Florida         FL          27     7330726        19   
9          Georgia         GA          14     3904930         9   
10          Hawaii         HI           2      399133         0   
11           Idaho         ID           2      583517         2   
12        Illinois         IL          16     3774487         3   
13         Indiana         IN           9     2035635         

## Step 3: Create a Dash App for Visualization

In [13]:
from jupyter_dash import JupyterDash
import dash
from dash import html, dcc
from dash.dependencies import Input, Output
import plotly.express as px

app = JupyterDash(__name__)

app.layout = html.Div([
    html.Div([
        html.H1(children='US House Election 2022'),
        html.Hr()]),
    html.Div([
        html.Div([
            html.H3(children='Total Seat: 435 | Republican won: 222 | Democratic won: 213'),
            dcc.Graph(id='choropleth_map')],
            style={'width': '60%', 'float': 'left', 'display': 'inline-block'}),
        html.Div([
            html.H3(children='Winning Party: Republican'),
            html.Div([
                dcc.Graph(id='bar_chart')],
                style={'width': '100%', 'display': 'block'}),
            html.Div([
                dcc.Graph(id='district_bar_chart')],
                style={'width': '100%', 'display': 'block'})
            ],
            style={'width': '40%', 'float': 'left', 'display': 'flex', 'flex-direction': 'column'})
        ],
        style={'width': '100%', 'display': 'flex'}),
    html.Div([
        html.H3(children='Top/Bottom 25 States by Republican Won Seat %'),
        dcc.RadioItems(
            id='toggle-chart',
            options=[
                {'label': 'Top 25', 'value': 'top'},
                {'label': 'Bottom 25', 'value': 'bottom'},
            ],
            value='top',
            labelStyle={'display': 'inline-block'}
        ),
        dcc.Graph(id='top-bottom-25-bar-chart'),
    ],
    style={'width': '100%', 'display': 'block'})
])


@app.callback([Output(component_id='choropleth_map', component_property='figure'),
               Output(component_id='bar_chart', component_property='figure'),
               Output(component_id='district_bar_chart', component_property='figure')],
              Input(component_id='choropleth_map', component_property='hoverData'))
def plot_choropleth_and_bar_charts(hover_data):

    custom_color_scale = px.colors.diverging.RdBu_r # type: ignore

    fig = px.choropleth(seat_won, scope="usa", locationmode='USA-states', locations='State Code',
                        color='Rep. Won Seats %',
                        color_continuous_scale=custom_color_scale,
                        range_color=(0, seat_won['Rep. Won Seats %'].max()),
                        hover_name='State',
                        hover_data={
                            #'State': False,
                            #'Total Votes': False,
                            'State Code': True,
                            'Won Party': True,
                            'Won Seats': True,
                            'Total Seats': True,
                            'Republican Votes': True,
                            'Democratic Votes': True,
                            'Libertarian Votes': True,
                            'Rep. Won Seats %': True,
                        },
                        width=900, height=600)

    if hover_data:
        state_code = hover_data['points'][0]['location']
        state_data = seat_won.loc[seat_won['State Code'] == state_code]

        bar_data = pd.DataFrame({
            'Party': ['Republican', 'Democratic', 'Libertarian'],
            'Votes': [
                state_data['Republican Votes'].values[0],
                state_data['Democratic Votes'].values[0],
                state_data['Libertarian Votes'].values[0]
            ]
        })
        winning_party = state_data['Won Party'].values[0]
        bar_chart = px.bar(bar_data, x='Party', y='Votes',
                           color='Party',
                                    color_discrete_map={
                                        'Republican': '#980000',
                                        'Democratic': '#196499',
                                        'Libertarian': 'pink'},
                           hover_data={'Votes': True},
                           labels={'x': 'Party', 'y': 'Votes'},
                           title=f"State: {state_data['State'].values[0]} - {winning_party} Won",
                           width=500, height=300)
        #bar_chart.update_traces(marker_color=['#980000', '#196499', 'pink'])
    else:
        bar_chart = px.bar(title="Hover over a state on the map to display the votes by party",
                                    width=500, height=300)

    if hover_data:
        state_code = hover_data['points'][0]['location']
        state_data = seat_won.loc[seat_won['State Code'] == state_code]

        # District bar chart (votes by winner in each district)
        district_data = house[house['State Code'] == state_code].groupby('District').apply(lambda x: x.nlargest(1, 'Vote')).reset_index(drop=True)
        district_bar_chart = px.bar(district_data,
                                    x='District',
                                    y='Vote',
                                    color='Party',
                                    color_discrete_map={
                                        'Republican': '#980000',
                                        'Democratic': '#196499'},
                                    hover_data={'Party': True,
                                                'Vote': True,
                                                'Candidate': True},
                                    labels={'x': 'Votes',
                                            'y': 'District'},
                                    title=f"No. of Districts: {state_data['Total Seats'].values[0]}",
                                    width=500, height=300)
    else:
        district_bar_chart = px.bar(title="Hover over a state on the map to display the votes by winner in each district",
                                    width=500, height=300)
    return fig, bar_chart, district_bar_chart

@app.callback(
    Output(component_id='top-bottom-25-bar-chart', component_property='figure'),
    Input(component_id='toggle-chart', component_property='value')
)
def plot_top_bottom_25_bar_chart(selection):
    top_states = seat_won.nlargest(25, 'Rep. Won Seats %')
    bottom_states = seat_won.nsmallest(25, 'Rep. Won Seats %')

    if selection == 'top':
        data = top_states
    else:
        data = bottom_states

    data = data.sort_values(by='Rep. Won Seats %', ascending= True)

    bar_chart = px.bar(data, x='Rep. Won Seats %', y='State',
                       text='Rep. Won Seats %',
                       labels={'x': 'Rep. Won Seats %', 'y': 'State'},
                       title=f"{selection.capitalize()} 25 States",
                       width=1000, height=800)

    bar_chart.update_traces(marker_color='#980000', textposition='outside')
    bar_chart.update_xaxes(range=[0, data['Rep. Won Seats %'].max() * 1.2]) # Add some padding for the text labels
    bar_chart.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')

    return bar_chart

app.run_server(debug=True, width=1000, port=8052)

Dash is running on http://127.0.0.1:8052/

Dash app running on http://127.0.0.1:8052/
