In [None]:
# imports 
import math
import numpy as np 
import pandas as pd 
import plotly.express as ex
import plotly.graph_objects as go
import plotly.offline as pyo
from datetime import datetime
pyo.init_notebook_mode()

In [None]:
# Read the three input CSVs: per-country vaccination progress, plus the
# Worldometer coronavirus summary and daily tables.
vacc_df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/covid-world-vaccination-progress/country_vaccinations.csv"
)
summary_df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/covid19-global-dataset/worldometer_coronavirus_summary_data.csv"
)
daily_df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/covid19-global-dataset/worldometer_coronavirus_daily_data.csv"
)

# Data Cleaning
Before we start generating any kind of information from these data sources, we must first clean the data and make sure that the datasets are compatible with each other. Since most of the data is divided on a country-by-country basis, we must make sure that all the country names have the same formatting.

For instance, "United States of America" may be written as "USA", "United Kingdom" as "UK" and so on. We will explore this problem and others in this section. We will prioritize only those countries for which vaccination data is available since most countries have either not yet started vaccinations or do not yet have sufficiently available public data.

In [None]:
# Identify the differences
# Build the lookup set once so each membership test is a hash lookup instead of
# recomputing and rescanning the summary's unique countries for every name.
known_countries = set(summary_df.country.unique())

print("Countries in the Vaccination Data not in Summary Data")
print([x for x in vacc_df.country.unique() if x not in known_countries])

Replace

'Czechia' == "Czech Republic"
'Isle of Man' == "Isle Of Man"
'United Kingdom' == "UK"
'United States' == "USA"
'Republic of Ireland' == "Ireland"
'Northern Cyprus' == "Cyprus"
Drop

England
Wales
Scotland
Northern Ireland
(since they are a part of the UK)

In [None]:
# Implement the above
# Rename the vaccination dataset's country names to the spellings used by the
# summary dataset so that the two tables can later be joined on country.
vacc_df.country = vacc_df.country.replace({
    "Czechia": "Czech Republic",
    "United States": "USA",
    "United Kingdom": "UK",
    "Isle of Man": "Isle Of Man",
    "Republic of Ireland": "Ireland",
    "Northern Cyprus": "Cyprus",
})

# drop the four constituent countries of the UK since they are included in UK
vacc_df = vacc_df[~vacc_df.country.isin(['England', 'Scotland', 'Wales', 'Northern Ireland'])]

In [None]:
# function to easily aggregate columns
def aggregate(df: pd.DataFrame, agg_col: str) -> pd.DataFrame:
    """Return the per-country maximum of one column as a single-column frame.

    Parameters
    ----------
    df : pd.DataFrame
        Table containing a "country" column and the column named by ``agg_col``.
    agg_col : str
        Name of the column to aggregate.

    Returns
    -------
    pd.DataFrame
        Indexed by country, with one column named ``agg_col`` holding the
        largest value seen for that country (missing values are skipped by
        ``max``).
    """
    # NOTE(review): taking the max presumably picks the latest value of a
    # cumulative column -- confirm the aggregated columns are cumulative.
    per_country_max = df.groupby("country")[agg_col].max()
    return pd.DataFrame(per_country_max)

In [None]:
# define the columns we want to summarize
cols_to_summarize = ['people_vaccinated',
                     'people_vaccinated_per_hundred',
                     'people_fully_vaccinated',
                     'people_fully_vaccinated_per_hundred',
                     'total_vaccinations_per_hundred',
                     'total_vaccinations']

summary = summary_df.set_index("country")

# Keep exactly one vaccines entry per country (the first one encountered).
# A country can carry more than one distinct vaccines string, e.g. once two
# source names have been merged into one during cleaning; a duplicated index
# here would make the join below emit duplicate rows for that country.
vaccines = (
    vacc_df[['country', 'vaccines']]
    .drop_duplicates(subset='country')
    .set_index('country')
)
summary = summary.join(vaccines)

# Attach the per-country maximum of every vaccination column.
for col in cols_to_summarize:
    summary = summary.join(aggregate(vacc_df, col))

# Share of the population that received at least one dose. This uses the count
# of people vaccinated rather than the count of doses administered, which
# counts people with several doses more than once.
summary['percentage_vaccinated'] = summary.people_vaccinated / summary.population * 100
# Confirmed cases as a percentage of tests performed.
summary['tested_positive'] = summary.total_confirmed / summary.total_tests * 100

In [None]:
# Preview the first five rows of the merged per-country summary
summary.head(n=5)

# Visualizations
Great! So now we have our data ready. But plain numbers are boring and hard to grasp clearly. So let's turn them into colors and patterns!

In [None]:
# helper functions 
def get_multi_line_title(title:str, subtitle:str):
    """Build a two-line HTML title: the title, a line break, then the subtitle in <sub> tags."""
    template = "{}<br><sub>{}</sub>"
    return template.format(title, subtitle)

def visualize_column(data: pd.DataFrame, xcolumn: str, ycolumn:str, title:str, colors:str, ylabel="Count", n=None):
    """Show a bar chart of ``ycolumn`` against ``xcolumn``, largest values first.

    Parameters
    ----------
    data : pd.DataFrame
        Table containing both ``xcolumn`` and ``ycolumn``.
    xcolumn : str
        Column used for the bar categories (x axis).
    ycolumn : str
        Column used for the bar heights and the bar colors; rows where it is
        missing are dropped.
    title : str
        Figure title.
    colors : str
        Colorscale passed to the bar markers.
    ylabel : str, default "Count"
        Label for the y axis and for the value in the hover template.
    n : int or None, default None
        If given, only the first ``n`` rows after sorting are plotted;
        otherwise every remaining row is plotted.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    hovertemplate = '<br><b>%{x}</b>' + f'<br><b>{ylabel}: </b>' + '%{y}<br><extra></extra>'
    data = data.sort_values(ycolumn, ascending=False).dropna(subset=[ycolumn])

    # Only advertise a "Top n" axis label when the data was actually truncated;
    # with no limit the label is just the column name.
    if n is not None:
        data = data.iloc[:n]
        xaxis_title = f"Top {n} {xcolumn.title()}"
    else:
        xaxis_title = xcolumn.title()

    fig = go.Figure(
        go.Bar(
            # NOTE(review): hoverinfo='skip' is set together with a
            # hovertemplate; confirm which hover behavior is intended.
            hoverinfo='skip',
            x=data[xcolumn],
            y=data[ycolumn],
            hovertemplate=hovertemplate,
            marker=dict(
                color=data[ycolumn],
                colorscale=colors,
            ),
        ),
    )

    fig.update_layout(
        title=title,
        xaxis_title=xaxis_title,
        yaxis_title=ylabel,
        plot_bgcolor='rgba(0,0,0,0)',
        hovermode="x"
    )

    fig.show()


#  Visualizing Summaries
Let us plot some bar graphs using the helper functions we defined above, in tandem with the data we so meticulously processed.

In [None]:
# Plot the number of people with at least one dose, matching the title.
# total_vaccinations counts doses, so it is not the right column for this chart.
title = get_multi_line_title("People Vaccinated", "Individuals who received the first dose of the vaccine")
visualize_column(summary.reset_index(), 'country', "people_vaccinated", title, "Blugrn", n=20)

Visualizing Summaries
Let us plot some bar graphs using the helper functions we defined above, in tandem with the data we so meticulously processed.

In [None]:
# Top 20 countries by the percentage_vaccinated column
title = get_multi_line_title(
    title="Percentage Vaccinated",
    subtitle="Percentage of the total population that have received the first dose",
)
visualize_column(
    data=summary.reset_index(),
    xcolumn='country',
    ycolumn="percentage_vaccinated",
    title=title,
    colors="Purp",
    ylabel="Percentage(%)",
    n=20,
)

In [None]:
# Top 20 countries by the people_fully_vaccinated column
title = get_multi_line_title(
    title="People Fully Vaccinated",
    subtitle="Individuals who have received all doses of the vaccine",
)
visualize_column(
    data=summary.reset_index(),
    xcolumn='country',
    ycolumn="people_fully_vaccinated",
    title=title,
    colors="Pinkyl",
    n=20,
)

In [None]:
# Top 20 countries by the tested_positive column
title = get_multi_line_title(
    title="Tested Positive ",
    subtitle="Fraction of  people that tested positive among those that were tested",
)
visualize_column(
    data=summary.reset_index(),
    xcolumn='country',
    ycolumn="tested_positive",
    title=title,
    colors="Reds",
    ylabel='Percentage',
    n=20,
)

In [None]:
# Total doses administered per vaccine combination. The grouping is chained
# onto the filtered frame so that the dropna step is actually used.
data = (
    summary
    .dropna(subset=['vaccines'])
    .groupby('vaccines')['total_vaccinations']
    .sum()
)
data = pd.DataFrame(data).reset_index()

title = get_multi_line_title("Vaccines In Use", "Popular Vaccine Combinations that are used around the globe")
visualize_column(data, 'vaccines', "total_vaccinations", title, "RdBu")

In [None]:
# Keep only countries with a serious_or_critical value, then expose the
# country index as a regular column for plotting.
data = summary.dropna(subset=['serious_or_critical']).reset_index()

title = get_multi_line_title(
    title="Serious or Critical Cases",
    subtitle="Number of people who are currently critically ill due to Covid-19",
)
visualize_column(
    data=data,
    xcolumn='country',
    ycolumn="serious_or_critical",
    title=title,
    colors="turbid",
    n=20,
)

In [None]:
# World map colored by the vaccine combination recorded for each country;
# countries without a vaccines entry are left out.
title = get_multi_line_title(
    title="Popular Vaccines",
    subtitle="Vaccines being admisitered around the world",
)
data = summary.reset_index()
data = data.dropna(subset=['vaccines'])

fig = ex.choropleth(
    data,
    locations="country",
    locationmode='country names',
    color="vaccines",
    hover_name="country",
)

# Center the title horizontally.
fig.update_layout(title=title, title_x=0.5)
fig.show()

# Summary


* Data Cleaning : Preprocessing and standardization.
* Data Summarization : Combine the daily vaccine information with the existing virus summary.
* Summary Visualizations : Generate attractive bar plots for various summaries.
* Global Statistics Visualization : Compare the daily new cases, active cases and deaths.
* Vaccine Preferences : Visualize the vaccine combinations in use around the globe.

Thanks for taking the time to read this notebook. If you liked it, an UPVOTE is massively encouraging! I will try to keep this notebook updated and add in more visualizations in the future so be sure to check back soon!