In [21]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))
        
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [22]:
# Path of the country-level vaccination CSV inside the Kaggle input directory.
VACCINATIONS_CSV = "/kaggle/input/covid-world-vaccination-progress/country_vaccinations.csv"
df = pd.read_csv(VACCINATIONS_CSV)

In [23]:
# Preview the first four rows of the loaded table.
df.head(4)

In [24]:
# Keep only the rows that report a `people_vaccinated` value, and print the
# row count on both sides of the filter so the amount removed is visible.
print(f"Length before dropping nans: {len(df)}")
df = df.dropna(subset=['people_vaccinated'])
# This count is taken after the drop, so the label must say "after".
print(f"Length after dropping nans: {len(df)}")

In [25]:
# Convert the 'date' column to datetime values so rows can be sorted chronologically.
df = df.assign(date=pd.to_datetime(df['date']))

## Types of vaccines used

In [None]:
def get_vaccines(cdf: pd.DataFrame) -> list:
    """Return the distinct vaccine names found in ``cdf['vaccines']``.

    Each cell of the column is treated as a comma-separated list of names;
    surrounding whitespace is stripped from every name.

    Args:
        cdf: Frame with a ``vaccines`` column of comma-separated strings.
            Missing cells (NaN/None) are ignored.

    Returns:
        The distinct names, in order of first appearance.
    """
    # A dict preserves insertion order and gives constant-time membership,
    # avoiding a full list scan for every name encountered.
    distinct_vaccines = {}

    # dropna(): a missing cell is NaN/None, which has no .split().
    for vaccine in cdf['vaccines'].dropna().unique():
        for v in vaccine.split(','):
            distinct_vaccines.setdefault(v.strip(), None)
    return list(distinct_vaccines)

In [None]:
# Collect every distinct vaccine name that appears in the dataset and show the list.
vaccines = get_vaccines(cdf=df)
print(vaccines)

## Which vaccines are used in which countries?

In [32]:
# Every distinct country present in the table.
countries = df['country'].unique()

# vaccine name -> list of countries using it (filled in by a later cell).
vaccines_country = {name: [] for name in vaccines}

# country -> people vaccinated, starting at zero for every country.
people_vaccinated = dict.fromkeys(countries, 0)

In [33]:
# Fill the two lookup tables:
#   people_vaccinated[country] -> number of people vaccinated in that country
#   vaccines_country[vaccine]  -> countries that use that vaccine
for country in countries:
    country_df = df.loc[df['country'] == country]

    # `people_vaccinated` is reported as a running total (see the dataset's
    # column description), so the largest value is the country's headcount.
    # Summing the daily rows would count the same people once per reporting day.
    people_vaccinated[country] = country_df['people_vaccinated'].max()

    for vaccine in country_df['vaccines'].unique():
        for v in vaccine.split(','):
            v = v.strip()
            # A country may list the same vaccine in several distinct
            # combination strings, and this cell may be re-run; in both
            # cases the country must be recorded only once.
            if country not in vaccines_country[v]:
                vaccines_country[v].append(country)

In [44]:
# For each vaccine, report how many countries use it and which ones,
# followed by a 200-dash rule to separate the entries.
rule = "-" * 200
for name, users in vaccines_country.items():
    print(f"Vaccine: {name}:")
    print(f"Number of countries using it: {len(users)}")
    print(f"List of countries using it: \n{users}")
    print(rule, "\n")

## Bar chart of vaccines and the number of countries using each

In [35]:
# Number of countries using each vaccine.
vaccine_counts = {name: len(users) for name, users in vaccines_country.items()}

# Set on rcParams (not per figure) so later figures in the notebook keep the same size.
plt.rcParams["figure.figsize"] = (20,7)
plt.bar(list(vaccine_counts.keys()), list(vaccine_counts.values()))

# Title and axis labels so the figure can be read on its own.
plt.title("Number of countries using each vaccine")
plt.xlabel("Vaccine")
plt.ylabel("Number of countries")

# Vaccine names are long; rotate them so they do not overlap.
plt.xticks(rotation=60)
plt.show()

## Country with highest vaccinations

In [40]:
# Pick the country whose recorded value in `people_vaccinated` is the largest
# (the first such country wins on ties) and show its name.
max_v_country = max(people_vaccinated.items(), key=lambda entry: entry[1])[0]
print(max_v_country)

## Number of people vaccinated over time in the country with the highest vaccinations

In [41]:
# Rows belonging to the top country, ordered chronologically for plotting.
is_top_country = df['country'].eq(max_v_country)
max_v_cntry_df = df[is_top_country].sort_values('date')

In [43]:
# Line plot of the `people_vaccinated` column against date for the top country.
fig, ax = plt.subplots()
ax.plot(max_v_cntry_df['date'], max_v_cntry_df['people_vaccinated'])

ax.set_title(f"Vaccinations over time in {max_v_country}")
ax.set_xlabel("Time")
ax.set_ylabel("People Vaccinated (in 100 millions)")

plt.show()