In [1]:
# import statements

import pandas as pd

In [2]:
# Extract data from CSV file

df = pd.read_csv('../data/country_vaccinations.csv')
df.head(10)

Unnamed: 0,country,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million,vaccines,source_name,source_website
0,Albania,ALB,2021-01-10,0.0,0.0,,,,0.0,0.0,,,Pfizer/BioNTech,Ministry of Health,https://shendetesia.gov.al/vaksinimi-anticovid...
1,Albania,ALB,2021-01-11,,,,,64.0,,,,22.0,Pfizer/BioNTech,Ministry of Health,https://shendetesia.gov.al/vaksinimi-anticovid...
2,Albania,ALB,2021-01-12,128.0,128.0,,,64.0,0.0,0.0,,22.0,Pfizer/BioNTech,Ministry of Health,https://shendetesia.gov.al/vaksinimi-anticovid...
3,Albania,ALB,2021-01-13,188.0,188.0,,60.0,63.0,0.01,0.01,,22.0,Pfizer/BioNTech,Ministry of Health,https://shendetesia.gov.al/vaksinimi-anticovid...
4,Albania,ALB,2021-01-14,266.0,266.0,,78.0,66.0,0.01,0.01,,23.0,Pfizer/BioNTech,Ministry of Health,https://shendetesia.gov.al/vaksinimi-anticovid...
5,Albania,ALB,2021-01-15,308.0,308.0,,42.0,62.0,0.01,0.01,,22.0,Pfizer/BioNTech,Ministry of Health,https://shendetesia.gov.al/vaksinimi-anticovid...
6,Albania,ALB,2021-01-16,369.0,369.0,,61.0,62.0,0.01,0.01,,22.0,Pfizer/BioNTech,Ministry of Health,https://shendetesia.gov.al/vaksinimi-anticovid...
7,Albania,ALB,2021-01-17,405.0,405.0,,36.0,58.0,0.01,0.01,,20.0,Pfizer/BioNTech,Ministry of Health,https://shendetesia.gov.al/vaksinimi-anticovid...
8,Albania,ALB,2021-01-18,447.0,447.0,,42.0,55.0,0.02,0.02,,19.0,Pfizer/BioNTech,Ministry of Health,https://shendetesia.gov.al/vaksinimi-anticovid...
9,Albania,ALB,2021-01-19,483.0,483.0,,36.0,51.0,0.02,0.02,,18.0,Pfizer/BioNTech,Ministry of Health,https://shendetesia.gov.al/vaksinimi-anticovid...


In [3]:
# Explore Data

print('The data frame has: \n -{} rows and {} columns.'.format(df.shape[0], df.shape[1]))
print(' -{} duplicates.'.format(len(df[df.duplicated()])))
print(' -{} countries.'.format(df['country'].nunique()))

print('\nThe list of columns with the number of missing values:\n\n', df.isnull().sum())

The data frame has: 
 -2916 rows and 15 columns.
 -0 duplicates.
 -85 countries.

The list of columns with the number of missing values:

 country                                   0
iso_code                                240
date                                      0
total_vaccinations                     1048
people_vaccinated                      1386
people_fully_vaccinated                1978
daily_vaccinations_raw                 1377
daily_vaccinations                      115
total_vaccinations_per_hundred         1048
people_vaccinated_per_hundred          1386
people_fully_vaccinated_per_hundred    1978
daily_vaccinations_per_million          115
vaccines                                  0
source_name                               0
source_website                            0
dtype: int64


In [4]:
# Explore Countries

print(*df['country'].unique().tolist(), sep='\n')

Albania
Algeria
Andorra
Argentina
Austria
Bahrain
Bangladesh
Belgium
Bermuda
Bolivia
Brazil
Bulgaria
Canada
Cayman Islands
Chile
China
Costa Rica
Croatia
Cyprus
Czechia
Denmark
Ecuador
Egypt
England
Estonia
Faeroe Islands
Finland
France
Germany
Gibraltar
Greece
Greenland
Guernsey
Hungary
Iceland
India
Indonesia
Iran
Ireland
Isle of Man
Israel
Italy
Jersey
Kuwait
Latvia
Liechtenstein
Lithuania
Luxembourg
Maldives
Malta
Mauritius
Mexico
Monaco
Morocco
Myanmar
Nepal
Netherlands
Northern Cyprus
Northern Ireland
Norway
Oman
Panama
Peru
Poland
Portugal
Qatar
Romania
Russia
Saint Helena
Saudi Arabia
Scotland
Serbia
Seychelles
Singapore
Slovakia
Slovenia
Spain
Sri Lanka
Sweden
Switzerland
Turkey
United Arab Emirates
United Kingdom
United States
Wales


## Questions

1. Does 'Northern Cyprus' include data that should be added to 'Cyprus' data? Or is it duplicate data that should be removed?

2. Do 'England', 'Ireland', 'Northern Ireland', 'Scotland', and 'Wales' include data that should be added to 'United Kingdom' data? Or is it duplicate data that should be removed?

## Assumptions

For simplicity, we are going to assume that the aforementioned rows are duplicates. Therefore, we are going to remove (drop) them.

In [5]:
# Clean Countries

"""
Drop

   England
   Ireland
   Northern Ireland
   Scotland
   Wales

Since they are part of the United Kingdom

Drop

   Northern Cyprus
   
Since it is part of the Cyprus

"""
countries_to_del = ['England', 'Ireland' , 'Northern Ireland', 'Scotland', 'Wales', 'Northern Cyprus']

for country_name in countries_to_del:
    df.drop(df[df['country'] == country_name].index, inplace=True)

print(*df['country'].unique().tolist(), sep='\n')

Albania
Algeria
Andorra
Argentina
Austria
Bahrain
Bangladesh
Belgium
Bermuda
Bolivia
Brazil
Bulgaria
Canada
Cayman Islands
Chile
China
Costa Rica
Croatia
Cyprus
Czechia
Denmark
Ecuador
Egypt
Estonia
Faeroe Islands
Finland
France
Germany
Gibraltar
Greece
Greenland
Guernsey
Hungary
Iceland
India
Indonesia
Iran
Isle of Man
Israel
Italy
Jersey
Kuwait
Latvia
Liechtenstein
Lithuania
Luxembourg
Maldives
Malta
Mauritius
Mexico
Monaco
Morocco
Myanmar
Nepal
Netherlands
Norway
Oman
Panama
Peru
Poland
Portugal
Qatar
Romania
Russia
Saint Helena
Saudi Arabia
Serbia
Seychelles
Singapore
Slovakia
Slovenia
Spain
Sri Lanka
Sweden
Switzerland
Turkey
United Arab Emirates
United Kingdom
United States


In [6]:
print('Number of countries after cleaning is: {} countries.'.format(df['country'].nunique()))

Number of countries after cleaning is: 79 countries.


## Questions to answer with visualization:

1. What vaccines are used and in which countries?
2. What country is vaccinated more people?
3. What country is vaccinated a larger percent from its population?


In [7]:
# Transform Data
#   Clean Data

# Remove 'source_website' column
df.drop('source_website', axis=1, inplace=True)

In [8]:
df.groupby(['country'])['people_vaccinated'].sum()

country
Albania                      4900.0
Algeria                         0.0
Andorra                      2903.0
Argentina                 5737736.0
Austria                   4344250.0
Bahrain                   5529534.0
Bangladesh                 548840.0
Belgium                   7717279.0
Bermuda                     25398.0
Bolivia                         0.0
Brazil                   47089049.0
Bulgaria                   940297.0
Canada                    2551474.0
Cayman Islands              25562.0
Chile                     7609302.0
China                           0.0
Costa Rica                 192381.0
Croatia                    620634.0
Cyprus                      57752.0
Czechia                   6592059.0
Denmark                   6421014.0
Ecuador                      9318.0
Egypt                           0.0
Estonia                    448594.0
Faeroe Islands              16276.0
Finland                   2558209.0
France                   35350112.0
Germany             

In [9]:
# Visualization

df.groupby(['country'])['people_vaccinated'].sum().plot(kind='bar');

In [None]:
# Countries with most people vaccinated

most_people_vaccinated = df.sort_values(['people_vaccinated'],ascending=[False]).head(100)

In [None]:
# Visualization of Top 10 Countries with most people vaccinated

most_people_vaccinated.groupby(['country'])['people_vaccinated'].sum().plot(kind='bar');