# Theme: The Global Impact of Pandemics, Specifically COVID-19

## Data Collection and Web Scraping

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Suppress pandas' chained-assignment warnings for the rest of the notebook
pd.options.mode.chained_assignment = None

# URL of the Wikipedia page containing the table
url = "https://en.wikipedia.org/wiki/List_of_epidemics_and_pandemics"

# Download the page explicitly instead of handing the URL to pandas:
# pandas fetches with the generic urllib User-Agent (which Wikimedia may
# reject) and offers no timeout or HTTP status check.
from io import StringIO

response = requests.get(
    url,
    headers={"User-Agent": "epidemics-notebook/1.0 (educational data analysis; python-requests)"},
    timeout=30,
)
response.raise_for_status()  # fail loudly here rather than parsing an error page

# Parse every HTML table found in the downloaded page
tables = pd.read_html(StringIO(response.text))

# Select the table I want from the list of tables
# NOTE: positional lookup; the hard-coded row corrections in the cleaning
# cell assume this exact table, so re-check both if the page layout changes.
epidemics_table = tables[1]  # The table I need is the second table on the page

# Keep only the required columns. `.copy()` makes this an independent frame so
# later column assignments modify it directly rather than a slice of `tables[1]`.
epidemics_table = epidemics_table[['Event', 'Years', 'Location', 'Disease', 'Death toll (estimate)']].copy()


# Set the maximum number of rows to display
pd.set_option('display.max_rows', 300)

epidemics_table

In [None]:
import numpy as np
import re
# Blank out the literal 'Unknown' placeholder so it is treated as a missing value
epidemics_table['Death toll (estimate)'] = (
    epidemics_table['Death toll (estimate)'].replace({'Unknown': np.nan})
)

# Function to remove anything inside parentheses
def remove_inside_parentheses(string):
    """Return `string` with every parenthesised segment removed.

    Each "(...)" group is deleted together with any whitespace directly in
    front of it. Values that are not `str` (e.g. NaN) are returned unchanged.
    """
    if not isinstance(string, str):
        return string
    return re.sub(r'\s*\([^)]*\)', '', string)

# Apply the function to the 'Death toll (estimate)' column
epidemics_table['Death toll (estimate)'] = epidemics_table['Death toll (estimate)'].apply(remove_inside_parentheses)


# Remove '+' signs from the 'Death toll (estimate)' column.
# regex=False: '+' is a regex quantifier, and the default for `regex` differs
# between pandas versions, so request a literal match explicitly.
epidemics_table['Death toll (estimate)'] = epidemics_table['Death toll (estimate)'].str.replace('+', '', regex=False)

# Manual corrections, keyed by the row label of the scraped table.
# NOTE(review): these labels are tied to the row order of the live Wikipedia
# table at the time of writing; `.at` with a label that no longer exists
# appends a new row instead of failing, so re-verify them if the page changes.
death_toll_overrides = {
    2: '875,000',
    4: '7,500,000',
    5: '2,000,000',
    7: '57,500,000',
    14: '2,000,000',
    16: '137,500,000',
    19: np.nan,  # no usable estimate: store a real missing value, not the text 'NaN'
    20: '6,500,000',
    21: '10,000,000',
    22: '3,750,000',
    23: '135,000',
    25: '2,500,000',
    26: '7,000',
    30: '650,000',
    33: '2,000,000',
    34: '1,000,000',
    37: '20,000',
    48: '875',
    66: '9,700',
    68: '45,000',
    70: '835',
    75: '2,000,000',
    81: '150,301',
    85: '42,000',
    93: '12,500',
    107: '1,000,000',
    112: '13,500,000',
    121: '19,850',
    130: '1,000,000',
    138: '250,000',
    139: '2,850',
    144: '67,000,000',
    145: '2,500,000',
    158: '2,500,000',
    160: '36,000',
    161: '2,500,000',
    169: '42,000,000',
    173: '39',
    174: '8921',
    215: '18,449',
    216: '284,000',
    222: '941',
    234: '4004',
    248: '7,009,000',
}
for row_label, corrected_toll in death_toll_overrides.items():
    epidemics_table.at[row_label, 'Death toll (estimate)'] = corrected_toll

# Replace the '[c]' with an empty string
epidemics_table.at[248, 'Years'] = epidemics_table.at[248, 'Years'].replace('[c]', '')

# Remove thousands separators, then convert to a nullable integer column.
# errors='coerce' turns anything that is still not a number into a missing
# value, so missing entries need no separate fill step before conversion.
epidemics_table['Death toll (estimate)'] = epidemics_table['Death toll (estimate)'].str.replace(',', '', regex=False)
epidemics_table['Death toll (estimate)'] = pd.to_numeric(epidemics_table['Death toll (estimate)'], errors='coerce').astype('Int64')
epidemics_table

## Examining the Data (The Events That Resulted in the Highest and Lowest Death Tolls)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px

import matplotlib as mpl
# Rank events from the highest to the lowest estimated death toll and keep the first ten
top_10_events = (
    epidemics_table
    .sort_values(by='Death toll (estimate)', ascending=False)
    .iloc[:10]
)

# Interactive bar chart: one bar per event
fig = px.bar(
    data_frame=top_10_events,
    x='Event',
    y='Death toll (estimate)',
    title='Top 10 Death Toll by Event',
)

# Keep a standalone HTML copy of the chart, then render it inline
fig.write_html("top_10_death_toll.html")
fig.show()

At a glance, we observe that the Black Death had the highest death toll, approximately 137.5 million people. Notably, the COVID-19 pandemic ranks among the top 10, underscoring its historic significance in recent times with an approximate death toll of 7 million. Additionally, the ongoing HIV/AIDS epidemic has a profound impact, with an approximate death toll of 42 million, highlighting its enduring significance.

In [None]:
import pandas as pd
import plotly.express as px

# Rank events from the lowest to the highest estimated death toll and keep the first ten
lowest_10_events = (
    epidemics_table
    .sort_values(by='Death toll (estimate)', ascending=True)
    .iloc[:10]
)

# Interactive bar chart drawn in a single custom colour
fig = px.bar(
    data_frame=lowest_10_events,
    x='Event',
    y='Death toll (estimate)',
    title='Lowest 10 Death Toll by Event',
    color_discrete_sequence=['hotpink'],
)

# Keep a standalone HTML copy of the chart, then render it inline
fig.write_html("lowest_10_death_toll.html")
fig.show()

The Lowest Death Toll by Event chart provides an overview of the events with the lowest recorded death tolls. For example, the Queensland 2009 dengue outbreak resulted in only 1 fatality, while the 2006 India malaria outbreak saw an estimated 17 deaths. These instances may indicate diseases that are less contagious, occurred in sparsely populated areas, or where effective preventive measures were implemented.

In [None]:
import pandas as pd
import plotly.express as px

# Take the first four-digit number in 'Years' as the event's year.
# Entries without a four-digit number get a missing 'Year'.
epidemics_table['Year'] = epidemics_table['Years'].str.extract(r'(\d{4})')


def _aggregate_period(table, start_year, end_year=None):
    """Sum death tolls per (Year, Event) for events with start_year <= year < end_year.

    Parameters
    ----------
    table : DataFrame with 'Year', 'Event' and 'Death toll (estimate)' columns.
    start_year : int, inclusive lower bound.
    end_year : int or None, exclusive upper bound; None means no upper bound.

    Returns
    -------
    DataFrame with columns 'Year', 'Event', 'Death toll (estimate)'.
    """
    # Compare years numerically rather than as text; missing years compare
    # False and are therefore excluded from every period.
    years = pd.to_numeric(table['Year'], errors='coerce')
    in_period = years >= start_year
    if end_year is not None:
        in_period &= years < end_year

    # Drop events whose toll is unknown: summing a group that holds only
    # missing values yields 0, which would be drawn as a zero-death event.
    known = table[in_period].dropna(subset=['Death toll (estimate)'])
    return (
        known
        .groupby(['Year', 'Event'])
        .agg({'Death toll (estimate)': 'sum'})
        .reset_index()
    )


def _plot_period(period_agg, title, marker_color, html_path):
    """Draw one period's line chart, save it to `html_path`, display it, and return the figure."""
    period_fig = px.line(
        period_agg,
        x='Year',
        y='Death toll (estimate)',
        labels={'Year': 'Year', 'Death toll (estimate)': 'Total Death Toll'},
        title=title,
        hover_name='Event',
    )
    period_fig.update_traces(mode='lines+markers', marker=dict(color=marker_color))
    period_fig.update_layout(xaxis_tickangle=90)

    # Save the plot as HTML file, then display the interactive plot
    period_fig.write_html(html_path)
    period_fig.show()
    return period_fig


# Aggregate data by year and event for hover text
group1_agg = _aggregate_period(epidemics_table, 1350, 1800)
group2_agg = _aggregate_period(epidemics_table, 1800, 1900)
group3_agg = _aggregate_period(epidemics_table, 1900)

# One chart per period
fig1 = _plot_period(group1_agg, 'Events by Death Toll Between 1350-1800', 'blue', "1350-1800_death_toll.html")
fig2 = _plot_period(group2_agg, 'Events by Death Toll Between 1800-1900', 'green', "1800-1900_death_toll.html")
fig3 = _plot_period(group3_agg, 'Events by Death Toll Between 1900-Present', 'red', "1900-Present_death_toll.html")

Examining significant events and their respective death tolls between 1350 and 1800 reveals distinctive spikes in mortality. The first major spike occurred during the Cocoliztli epidemic of 1545-1548, which ravaged Central Mexico and led to approximately 10 million deaths. Following this devastating event, another surge in fatalities was observed during the Cocoliztli epidemic of 1576, resulting in approximately 2.5 million deaths.

In 1616, the New England infections epidemic emerged, claiming the lives of an estimated 2 million individuals. The graph further depicts a notable increase in mortality during the 1772 North American measles epidemic, coinciding with the 1772-1773 Persian Plague, collectively resulting in just over 2 million deaths.

The second graph reveals a notable spike in 1855. Two events in the table begin that year, the Norfolk yellow fever epidemic and the Third Plague pandemic, and the spike is driven by the latter, whose estimated death toll is approximately 13.5 million lives. The Third Plague pandemic, originating in China in the mid-19th century, spread globally, causing widespread mortality.

Additionally, throughout the 19th century, several cholera pandemics inflicted a significant death toll. The first notable surge occurred in 1846, claiming approximately 1 million lives worldwide. Following this, the Fourth cholera pandemic in 1863 led to the deaths of around 600,000 individuals. Towards the end of the century, in 1899, the Sixth cholera pandemic caused an estimated 800,000 deaths, marking the end of the century's battle against infectious disease. These pandemics underscore the profound impact of infectious diseases on human populations during the 19th century, shaping public health policies and practices for generations to come.

Finally, examining events from the 20th century to the present day, three significant spikes emerge. The first, and most impactful, occurred in 1918 with the influenza pandemic, commonly known as the Spanish flu. The Spanish flu was caused by the H1N1 influenza virus and is considered one of the deadliest pandemics in history. It spread rapidly across the globe, affecting millions of people and resulting in a staggering death toll of approximately 67 million individuals worldwide. The Spanish flu disproportionately affected young, healthy adults, highlighting the unpredictability and severity of influenza viruses.

The next spike reflects the ongoing HIV/AIDS epidemic. HIV (Human Immunodeficiency Virus) attacks the body's immune system, specifically targeting CD4 cells, which are crucial for fighting off infections. AIDS (Acquired Immunodeficiency Syndrome) is the most advanced stage of HIV infection when the immune system is severely damaged. While the spike is marked in 1981 with the recognition of the first cases, the HIV/AIDS epidemic continues to claim lives daily. Since its emergence, HIV/AIDS has led to a devastating death toll of approximately 42 million individuals worldwide. Despite significant advancements in treatment and prevention, HIV/AIDS remains a significant global health challenge, particularly in sub-Saharan Africa and other regions heavily affected by the epidemic.

Most recently, we observe a spike from the COVID-19 pandemic, which is caused by the novel coronavirus SARS-CoV-2. COVID-19 was first identified in Wuhan, China, in December 2019 and quickly spread globally, leading the World Health Organization to declare it a pandemic in March 2020. COVID-19 is primarily transmitted through respiratory droplets and close contact with infected individuals. The disease manifests with a wide range of symptoms, from mild respiratory illness to severe pneumonia and acute respiratory distress syndrome (ARDS). The COVID-19 pandemic has resulted in just over 7 million deaths globally, making it one of the most significant public health crises in recent times. Its rapid spread, high transmission rate, and severe impact on healthcare systems underscore the urgent need for global cooperation and robust public health measures to control and mitigate its effects.


## Investigating by geographical location

In [None]:
import pandas as pd
import geopy
from geopy.geocoders import Nominatim
import plotly.express as px

# Create a geolocator instance.
# geopy's default request timeout is 1 second; a slow response then raises a
# timeout error and that location ends up without coordinates. Allow 10 seconds.
geolocator = Nominatim(user_agent="my_geocoder", timeout=10)

# Geocode locations to get latitude and longitude
def geocode_location(location):
    """Look up `location` with the module-level `geolocator`.

    Parameters
    ----------
    location : str
        Free-text place name. Non-string values (e.g. NaN) and blank strings
        are not sent to the geocoder.

    Returns
    -------
    tuple
        (latitude, longitude) of the geocoder's result, or (None, None) when
        the input is missing/blank, nothing is found, or the lookup fails.
    """
    import warnings

    # A missing value would otherwise be sent to the service as the text
    # "nan" and could come back with coordinates of an unrelated place.
    if not isinstance(location, str) or not location.strip():
        return None, None

    try:
        match = geolocator.geocode(location)
        if match:
            return match.latitude, match.longitude
    except Exception as exc:
        # Best effort: one failed lookup must not abort the whole run, but it
        # should be visible. Catching `Exception` (not a bare `except`) lets
        # KeyboardInterrupt / SystemExit stop a long geocoding loop.
        warnings.warn(f"Geocoding failed for {location!r}: {exc}")
    return None, None

# Geocode the 'Location' column.
# - Each distinct location is looked up once; repeated locations reuse the result.
# - Lookups are throttled to at most one per second so the public geocoding
#   service is not flooded with back-to-back requests.
from geopy.extra.rate_limiter import RateLimiter

throttled_geocode = RateLimiter(geocode_location, min_delay_seconds=1)

unique_locations = epidemics_table['Location'].dropna().unique()
coordinates_by_location = {
    # `or (None, None)`: the rate limiter returns None if it swallowed an error
    place: throttled_geocode(place) or (None, None)
    for place in unique_locations
}

# Map the per-location results back onto every row (rows with a missing
# location get no coordinates) and store them as floats, with NaN for "not found".
coordinates = pd.DataFrame(
    [coordinates_by_location.get(place, (None, None)) for place in epidemics_table['Location']],
    columns=['Latitude', 'Longitude'],
    index=epidemics_table.index,
)
epidemics_table[['Latitude', 'Longitude']] = coordinates.astype(float)

# Keep only rows where latitude and longitude are available
filtered_data = epidemics_table.dropna(subset=['Latitude', 'Longitude'])

In [None]:
import plotly.express as px

# Scatter every geocoded event on an OpenStreetMap base layer
fig = px.scatter_mapbox(
    data_frame=filtered_data,
    lat='Latitude',
    lon='Longitude',
    hover_name='Location',
    # Event and death toll are carried along for the hover box
    hover_data={'Event': True, 'Death toll (estimate)': True},
    zoom=0,  # initial zoom level
    title='Geographical Distribution of Epidemics',
    mapbox_style="open-street-map",
)

# Marker appearance: hot-pink, slightly transparent dots
marker_style = dict(size=8, color='hotpink', opacity=0.8)

# Hover box: location in bold, then event name and death toll
hover_template = (
    '<b>%{hovertext}</b><br><br>'
    'Event: %{customdata[0]}<br>'
    'Death toll: %{customdata[1]}'
)
fig.update_traces(marker=marker_style, hovertemplate=hover_template)

# No legend is shown
fig.update_layout(showlegend=False)

# Show the plot
fig.show()

In [None]:
import plotly.express as px

# Same events as the previous map, drawn over raster tile layers
fig = px.scatter_mapbox(
    data_frame=filtered_data,
    lat='Latitude',
    lon='Longitude',
    hover_name='Location',
    hover_data={'Event': True, 'Death toll (estimate)': True},
    color_discrete_sequence=["fuchsia"],
    zoom=1,
    height=300,
)

# Base imagery tiles, placed underneath the data traces
usgs_imagery_layer = {
    "below": 'traces',
    "sourcetype": "raster",
    "sourceattribution": "United States Geological Survey",
    "source": [
        "https://basemap.nationalmap.gov/arcgis/rest/services/USGSImageryOnly/MapServer/tile/{z}/{y}/{x}"
    ],
}

# NOTE(review): the layer name RADAR_1KM_RDBR suggests a weather-radar
# overlay, which looks unrelated to the epidemic data; confirm it is intended.
canada_radar_layer = {
    "sourcetype": "raster",
    "sourceattribution": "Government of Canada",
    "source": ["https://geo.weather.gc.ca/geomet/?"
               "SERVICE=WMS&VERSION=1.3.0&REQUEST=GetMap&BBOX={bbox-epsg-3857}&CRS=EPSG:3857"
               "&WIDTH=1000&HEIGHT=1000&LAYERS=RADAR_1KM_RDBR&TILED=true&FORMAT=image/png"],
}

# Blank background style with the two raster layers, and no outer margins
fig.update_layout(
    mapbox_style="white-bg",
    mapbox_layers=[usgs_imagery_layer, canada_radar_layer],
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
)

# Hover box without latitude/longitude: location, event and death toll only
hover_template = (
    '<b>%{hovertext}</b><br><br>'
    'Event: %{customdata[0]}<br>'
    'Death toll: %{customdata[1]}'
)
fig.update_traces(hovertemplate=hover_template)

fig.show()



The geographic plot showcasing pandemics and epidemics around the world provides us with invaluable insights into the spatial distribution and spread of infectious diseases. Upon observation, we notice distinct clusters, particularly around Europe, indicating a concentration of outbreaks in this region. This clustering can be attributed to the close proximity of countries and dense populations, facilitating the rapid transmission of diseases. Urban centers and bustling transportation networks within Europe likely contribute to the heightened vulnerability to pandemics and epidemics.

Conversely, regions such as Antarctica and Greenland exhibit fewer instances of epidemics. These areas, characterized by their remoteness and sparse populations, present less favorable conditions for disease transmission. The limited human presence and isolation from major transportation routes likely contribute to the lower incidence of outbreaks in these remote locales.

This geographical analysis underscores the significant role of population density, connectivity, and environmental factors in shaping the distribution and spread of pandemics and epidemics worldwide.