# Hey Everyone! 
# Welcome to my Covid Vaccination Visualization Project! 

### This Project and the results can also be found on Kaggle itself
https://www.kaggle.com/sebastiantenberge/covid-vaccination-visualized

In this project I am working with data provided by the following three Kaggle datasets:
- [Covid vaccination data](https://www.kaggle.com/gpreda/covid-world-vaccination-progress)
- [Country and Population Data](http://www.kaggle.com/erikbruin/countries-of-the-world-iso-codes-and-population)
- [GDP Data](https://www.kaggle.com/ahmdfatihin/world-population-incomegdp-and-life-expectancy)

I hope you enjoy the insights gained from the visualizations. 

If you have any questions or feedback, I am happy to hear from you in the comment section 😊

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
!pip install plotly-geo
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#### In the following section the data is prepared to visualize the current state of the covid vaccination in different countries

In [None]:
# Load the country vaccination records and preview the first rows.
vaccination_csv_path = '/kaggle/input/covid-world-vaccination-progress/country_vaccinations.csv'
covid_df = pd.read_csv(vaccination_csv_path)
covid_df.head()

In [None]:
# Parse the dates so the rows can be sorted chronologically (then by country),
# and turn them back into '%m-%d-%Y' strings so the column works with the
# choropleth map animations.
parsed_dates = pd.to_datetime(covid_df['date'])
covid_df = covid_df.assign(date=parsed_dates).sort_values(by=['date', 'country'])
covid_df['date'] = covid_df['date'].dt.strftime('%m-%d-%Y')

covid_df.info()

In [None]:
# Forward-fill every empty field with the value from the previous row of the
# same country, then set whatever is still missing (no earlier value existed
# for that country) to 0.
#
# GroupBy.ffill() replaces groupby(...).apply(fillna(method='ffill')): in
# pandas >= 2.0 the apply() form prepends 'country' as an index level, which
# makes the following groupby('country') ambiguous, and fillna(method=...) is
# deprecated.
forward_filled = covid_df.groupby('country').ffill()
covid_df[forward_filled.columns] = forward_filled
# Filling with a constant does not depend on the grouping, so no groupby here.
covid_df = covid_df.fillna(0)

In [None]:
# Load the supporting country-level tables. Each table is reduced to the
# columns used later and renamed so the join keys are called 'country' and
# 'iso_code' throughout.

#adding the country area and density table
country_area_df = pd.read_csv('../input/countries-of-the-world-iso-codes-and-population/countries_by_population_2019.csv')
country_area_df = country_area_df[['name', 'area', 'Density']]
country_area_df = country_area_df.rename(columns={'name': 'country'})

#adding the country code table to later join with other tables
country_code = pd.read_csv('../input/countries-of-the-world-iso-codes-and-population/country_codes_2020.csv')
country_code_clean = country_code[['name', 'cca3']]
country_code_clean = country_code_clean.rename(columns={'name': 'country', 'cca3': 'iso_code'})

#adding the region table to be able to join the gdp table
country_region = pd.read_csv('../input/world-population-incomegdp-and-life-expectancy/countries_total.csv')
country_region_df = country_region[['name', 'alpha-3', 'region', 'sub-region']]
country_region_df = country_region_df.rename(columns={'alpha-3': 'iso_code', 'name': 'country'})

#adding the gdp table
gdp_per_capita = pd.read_csv('../input/world-population-incomegdp-and-life-expectancy/income_per_person.csv')
gdp_per_capita_df = gdp_per_capita[['country', '2020']]
# Only '2020' needs renaming: 'alpha-3' is not among the selected columns, so
# the former 'alpha-3' -> 'iso_code' mapping never had any effect.
gdp_per_capita_df = gdp_per_capita_df.rename(columns={'2020': 'gdp_per_capita'})

#adding the population table
population = pd.read_csv('../input/world-population-incomegdp-and-life-expectancy/population_total.csv')
population_df = population[['country', '2020']]
population_df = population_df.rename(columns={'2020': 'population'})

In [None]:
# Select the most recent row for each country.
#
# groupby('country').max() took a column-wise maximum instead: it mixed values
# from different days, reported the peak (not the latest) daily_vaccinations,
# and compared the 'date' strings lexicographically. The 'date' column holds
# '%m-%d-%Y' strings at this point, so it is parsed before looking up the
# label of each country's newest row.
row_dates = pd.to_datetime(covid_df['date'], format='%m-%d-%Y')
latest_row_labels = row_dates.groupby(covid_df['country']).idxmax()
latest_covid_df = covid_df.loc[latest_row_labels].reset_index(drop=True)

# Mean of daily_vaccinations per country, indexed by country.
# The column is selected before aggregating; mean() does not take a column
# name (its first positional parameter is numeric_only).
avg_vaccination_rate_df = (
    covid_df.groupby('country')[['daily_vaccinations']]
    .mean()
    .rename(columns={'daily_vaccinations': 'avg_daily_vaccinations'})
)

In [None]:
### Merging data sets
# Every join is an inner join; clashing column names get '_1' / '_2' suffixes.
join_options = {'how': 'inner', 'suffixes': ('_1', '_2')}

country = pd.merge(country_area_df, country_code_clean, on='country', **join_options)
gdp = pd.merge(gdp_per_capita_df, population_df, on='country', **join_options)
gdp_region = pd.merge(gdp, country_region_df, on='country', **join_options)
country_data = pd.merge(gdp_region, country, on='iso_code', **join_options)

expanded_df = pd.merge(latest_covid_df, country_data, on='iso_code', **join_options)
# Total GDP is the per-capita figure multiplied by the population.
expanded_df["gdp"] = expanded_df["gdp_per_capita"].mul(expanded_df["population"])
expanded_df = pd.merge(expanded_df, avg_vaccination_rate_df, on='country', **join_options)

In [None]:
# Keep only the columns needed further down and preview the result.
master_columns = [
    'country',
    'iso_code',
    'region',
    'daily_vaccinations_per_million',
    'total_vaccinations',
    'people_fully_vaccinated',
    'daily_vaccinations',
    'people_fully_vaccinated_per_hundred',
    'people_vaccinated_per_hundred',
    'avg_daily_vaccinations',
    'area',
    'population',
    'Density',
    'gdp',
    'gdp_per_capita',
    'vaccines',
]
expanded_master_df = expanded_df[master_columns]
expanded_master_df.head()

In [None]:
# One grouped bar chart per region: for every country, the percentage of
# people vaccinated next to the percentage fully vaccinated. Countries are
# ordered by ascending people_vaccinated_per_hundred.
unique_regions = expanded_master_df.region.unique()
expanded_master_df = expanded_master_df.sort_values(by='people_vaccinated_per_hundred')

for region_name in unique_regions:
    # Rows of the current region, shared by both bar traces.
    region_rows = expanded_master_df[expanded_master_df["region"] == region_name]

    vaccinated_bar = go.Bar(
        name="% VACCINATED",
        x=region_rows["country"],
        y=region_rows["people_vaccinated_per_hundred"],
        offsetgroup=0,
    )
    fully_vaccinated_bar = go.Bar(
        name="% FULLY VACCINATED",
        x=region_rows["country"],
        y=region_rows["people_fully_vaccinated_per_hundred"],
        offsetgroup=1,
    )
    chart_layout = go.Layout(
        title=region_name,
        template='plotly_dark',
        yaxis_title="Percent of Population",
    )

    fig2 = go.Figure(data=[vaccinated_bar, fully_vaccinated_bar], layout=chart_layout)
    fig2.show()

#### The following heatmap shows the correlation between several factors and covid vaccination

In [None]:
# Correlation heatmap of the numeric columns.
# Non-numeric columns are excluded explicitly: DataFrame.corr() raises on them
# in pandas >= 2.0 instead of silently dropping them as older versions did.
plt.subplots(figsize=(10,7.5))
numeric_columns_df = expanded_master_df.select_dtypes(include='number')
sns.heatmap(numeric_columns_df.corr(), annot= True);

In [None]:
# For every distinct value of the 'vaccines' column, print the countries that
# carry that value.
vaccines = expanded_master_df.vaccines.unique()
for v in vaccines:
    countries = expanded_master_df.loc[expanded_master_df.vaccines==v, 'country'].values
    # Label typo fixed: the output previously read "caccines".
    print(f"vaccines: {v}: \ncountries: {list(countries)}\n")

In [None]:
# World map in which each country is coloured by its 'vaccines' value.
country_names = expanded_master_df['country']
vaccine_schemes = expanded_master_df['vaccines']

fig = px.choropleth(
    locations=country_names,
    color=vaccine_schemes,
    locationmode="country names",
    title="Countries using each vaccine (different colors for each vaccine)",
    height=800,
)
# Vertical legend with a descriptive title.
fig.update_layout(legend_orientation='v', legend_title='Vaccine scheme')
fig.show()

In [None]:
# Create an independent frame for the animations. The explicit copy() is what
# makes it independent: this frame's 'date' column is reassigned further down,
# and without the copy that assignment acts on a slice of covid_df and
# triggers pandas' SettingWithCopyWarning.
animation_columns = [
    'country',
    'iso_code',
    'date',
    'total_vaccinations',
    'daily_vaccinations_per_million',
    'people_vaccinated_per_hundred',
    'people_fully_vaccinated_per_hundred',
    'daily_vaccinations',
]
animation_covid_df = covid_df[animation_columns].copy()

animation_covid_df.info()

In [None]:
# Build one row for every (country, date) combination, so that every country
# has a row for every date present in the data.

# All unique country / iso_code pairs and all unique dates of the source frame.
countries = animation_covid_df.loc[:, ['country', 'iso_code']].drop_duplicates()
dates_df = animation_covid_df.loc[:, ['date']].drop_duplicates()

# 0-based position of each date in order of first appearance.
dates_df['row_number'] = range(len(dates_df))

number_of_dates = len(dates_df)  # number of distinct dates (rows of dates_df)

# Cross join countries with dates through a constant helper key.
# This replaces the DataFrame.append() replication (append() was removed in
# pandas 2.0). That version also numbered each country's copies 1..N while the
# dates were numbered 0..N-1, so the first date was never matched and every
# country gained one row without a date.
indexed_country_date_df = (
    countries.assign(_cross_key=1)
    .merge(dates_df.assign(_cross_key=1), on='_cross_key', how='inner')
    .drop(columns='_cross_key')
    # Keep each country's rows together and in date order for the later fill.
    .sort_values(['country', 'iso_code', 'row_number'], ascending=True)
    .reset_index(drop=True)
)

In [None]:
#setting the 'date' column in both tables to datetime so they can be merged on
# assign() returns a new frame, so nothing is written into a slice of covid_df
# (the direct column assignment raised SettingWithCopyWarning).
animation_covid_df = animation_covid_df.assign(date=pd.to_datetime(animation_covid_df['date']))
indexed_country_date_df['date'] = pd.to_datetime(indexed_country_date_df['date'])

#merging the indexed_country_date_df with the original data from the animation_covid_df
Animation_df2 = indexed_country_date_df.merge(animation_covid_df, on=['iso_code', 'date'], how='left', suffixes=('_1', '_2'))
#adding a region column
Animation_df3 = Animation_df2.merge(country_region_df, on=['iso_code'], how='left', suffixes=('_1', '_2'))

#selecting the columns that the final df shall have
# 'country_1' is the country name from the country/date grid (left side of the
# first merge); it is renamed back to 'country'.
final_columns = [
    'country_1',
    'iso_code',
    'region',
    'row_number',
    'date',
    'total_vaccinations',
    'daily_vaccinations_per_million',
    'people_vaccinated_per_hundred',
    'people_fully_vaccinated_per_hundred',
    'daily_vaccinations',
]
final_Animation_df = Animation_df3[final_columns].rename(columns={'country_1': 'country'})

#populating the missing values
# Forward-fill within each country, then set the remaining gaps to 0.
# GroupBy.ffill() replaces groupby(...).apply(fillna(method='ffill')): in
# pandas >= 2.0 the apply() form adds 'country' as an index level, which makes
# a second groupby('country') ambiguous, and fillna(method=...) is deprecated.
forward_filled = final_Animation_df.groupby('country').ffill()
final_Animation_df[forward_filled.columns] = forward_filled
final_Animation_df = final_Animation_df.fillna(0)

#dropping Pitcairn (iso_code 'PCN') because it is an outlier in the daily vaccination animation
final_Animation_df = final_Animation_df[final_Animation_df['iso_code'] != 'PCN']
final_Animation_df

In [None]:
# Prepare the frame for plotly express: order the rows chronologically (then
# by country) and store the dates as '%m-%d-%Y' strings.
frame_dates = pd.to_datetime(final_Animation_df['date'])
final_Animation_df = final_Animation_df.assign(date=frame_dates).sort_values(by=['date', 'country'])
final_Animation_df['date'] = final_Animation_df['date'].dt.strftime('%m-%d-%Y')

In [None]:
# Animated world map for each metric in columnlist, one frame per date.
columnlist = ('total_vaccinations', 'people_vaccinated_per_hundred', 'people_fully_vaccinated_per_hundred')

for column_x in columnlist:
    fig = px.choropleth(final_Animation_df,                 # input dataframe
                        locations="iso_code",               # country code column
                        color=column_x,                     # column that drives the colour
                        hover_name="country",               # label shown on hover
                        animation_frame="date",             # column that defines the frames
                        projection="natural earth",         # map projection
                        color_continuous_scale='RdBu',      # colour scale
                        # fixed colour range: 0 up to the column's maximum in expanded_master_df
                        range_color=[0, expanded_master_df[column_x].max()],
                        title=column_x)
    fig.show()
    # One file per metric: a single shared file name was overwritten on every
    # iteration, so only the last map survived.
    fig.write_html(f"example_map_{column_x}.html")

# Still produce the original output file, which holds the last map drawn
# (the same content it ended up with before).
fig.write_html("example_map.html")

In [None]:
# Animated scatter plot per region: daily vaccinations per million (x) against
# people vaccinated per hundred (y), one frame per date. Both axes run from 0
# to the largest value found for the region.
unique_regions_2 = expanded_master_df.region.unique()

for region_x in unique_regions_2:
    # Rows of the current region; used for the axis limits and for the plot.
    region_frame = final_Animation_df[final_Animation_df['region'] == region_x]
    x_upper, y_upper = region_frame[
        ['daily_vaccinations_per_million', 'people_vaccinated_per_hundred']
    ].max().tolist()

    fig = px.scatter(
        region_frame,
        x="daily_vaccinations_per_million",
        y="people_vaccinated_per_hundred",
        animation_frame="date",
        animation_group="country",
        hover_name="country",
        text='iso_code',
        range_x=[0, x_upper],
        range_y=[0, y_upper],
        title=region_x,
    )

    fig.update_traces(marker={'size': 40, 'color': 'DarkSlateGrey'})

    fig.show()