# Covid-19 World Vaccination Progress

### Import libraries and dataset

In [1]:
# Import libraries 

import pandas as pd
import numpy as np
import os

In [2]:
# Load the country-level vaccination records from the local CSV file into df_new

df_new = pd.read_csv(
    filepath_or_buffer=r'C:\Users\Owner\Desktop\Data Analyst\Achievement 6\country_vaccinations.csv'
)

In [3]:
# Check shape: (number of rows, number of columns) of the freshly loaded frame

df_new.shape

(86512, 15)

In [4]:
# Preview the first five rows of the data as loaded
df_new.head(n=5)

Unnamed: 0,country,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million,vaccines,source_name,source_website
0,Afghanistan,AFG,2021-02-22,0.0,0.0,,,,0.0,0.0,,,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi...",World Health Organization,https://covid19.who.int/
1,Afghanistan,AFG,2021-02-23,,,,,1367.0,,,,34.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi...",World Health Organization,https://covid19.who.int/
2,Afghanistan,AFG,2021-02-24,,,,,1367.0,,,,34.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi...",World Health Organization,https://covid19.who.int/
3,Afghanistan,AFG,2021-02-25,,,,,1367.0,,,,34.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi...",World Health Organization,https://covid19.who.int/
4,Afghanistan,AFG,2021-02-26,,,,,1367.0,,,,34.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi...",World Health Organization,https://covid19.who.int/


### Data Wrangling and Consistency Checks

In [5]:
# Remove the 'iso_code', 'source_name' and 'source_website' columns from df_new

df_new = df_new.drop(
    ['iso_code', 'source_name', 'source_website'],
    axis='columns',
)

In [6]:
# Preview the first five rows again to confirm the three columns are gone
df_new.head(n=5)

Unnamed: 0,country,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million,vaccines
0,Afghanistan,2021-02-22,0.0,0.0,,,,0.0,0.0,,,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."
1,Afghanistan,2021-02-23,,,,,1367.0,,,,34.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."
2,Afghanistan,2021-02-24,,,,,1367.0,,,,34.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."
3,Afghanistan,2021-02-25,,,,,1367.0,,,,34.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."
4,Afghanistan,2021-02-26,,,,,1367.0,,,,34.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."


In [7]:
# Check for mixed types
#
# A column is "mixed" when its cells hold more than one Python type
# (for example str next to float). The name of every such column is printed;
# no output means each column holds a single type.

for col in df_new.columns:
    # Series.map(type) gives the Python type of each cell. More than one
    # distinct type means the column is mixed. This avoids the deprecated
    # DataFrame.applymap and also works when the frame has no rows.
    if df_new[col].map(type).nunique() > 1:
        print(col)

No mixed type data

In [8]:
# Count the missing values in each column

df_new.isna().sum()

country                                    0
date                                       0
total_vaccinations                     42905
people_vaccinated                      45218
people_fully_vaccinated                47710
daily_vaccinations_raw                 51150
daily_vaccinations                       299
total_vaccinations_per_hundred         42905
people_vaccinated_per_hundred          45218
people_fully_vaccinated_per_hundred    47710
daily_vaccinations_per_million           299
vaccines                                   0
dtype: int64

There are a lot of missing values, and I can't risk removing them because that could impact the analysis. The values may be missing because different countries report their data in different ways. For now I will impute the missing values with 0 until I investigate the reason.

In [9]:
# Replace every missing value with the number 0; fillna returns a new frame,
# which is bound back to df_new.
# NOTE(review): after this step a reporting gap is indistinguishable from a
# genuine zero - confirm that is acceptable before relying on the totals.

df_new = df_new.fillna(0)

In [10]:
# Re-count the missing values per column; every count should now be 0

df_new.isna().sum()

country                                0
date                                   0
total_vaccinations                     0
people_vaccinated                      0
people_fully_vaccinated                0
daily_vaccinations_raw                 0
daily_vaccinations                     0
total_vaccinations_per_hundred         0
people_vaccinated_per_hundred          0
people_fully_vaccinated_per_hundred    0
daily_vaccinations_per_million         0
vaccines                               0
dtype: int64

In [11]:
# Collect every row that exactly repeats an earlier row
# (the first occurrence of a repeated row is not flagged)

df_dups = df_new.loc[df_new.duplicated(keep='first')]

In [12]:
# Display the duplicated rows; an empty frame means there are none
df_dups

Unnamed: 0,country,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million,vaccines


There are no duplicates in the dataset.

In [13]:
# Inspect the data type pandas assigned to each column
df_new.dtypes

country                                 object
date                                    object
total_vaccinations                     float64
people_vaccinated                      float64
people_fully_vaccinated                float64
daily_vaccinations_raw                 float64
daily_vaccinations                     float64
total_vaccinations_per_hundred         float64
people_vaccinated_per_hundred          float64
people_fully_vaccinated_per_hundred    float64
daily_vaccinations_per_million         float64
vaccines                                object
dtype: object

In [14]:
# Store the 'date' column as strings.
# NOTE(review): the column was already reported as `object` before this cell,
# so this presumably leaves the values unchanged - confirm whether a real
# datetime conversion is wanted instead.

df_new = df_new.assign(date=df_new['date'].astype('str'))

In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df_new.describe()

Unnamed: 0,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million
count,86512.0,86512.0,86512.0,86512.0,86512.0,86512.0,86512.0,86512.0,86512.0
mean,23151170.0,8451007.0,6341251.0,110608.3,130851.7,40.419616,19.535471,15.932736,3245.792248
std,161103700.0,49698670.0,38907290.0,786475.6,766948.7,62.707869,28.764846,25.947621,3932.156455
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,877.0,0.0,0.0,0.0,629.0
50%,1008.0,0.0,0.0,0.0,7245.0,0.01,0.0,0.0,2036.0
75%,3697554.0,1843103.0,1137869.0,12806.25,43704.5,68.75,38.27,25.22,4667.0
max,3263129000.0,1275541000.0,1240777000.0,24741000.0,22424290.0,345.37,124.76,122.37,117497.0


### Export Dataset

In [16]:
# Create a path
# Folder the cleaned dataset is exported to; the export cell joins it with the file name

path = r'C:\Users\Owner\Desktop\Data Analyst\Achievement 6'

In [17]:
# Write the cleaned frame to 'df_new.csv' inside `path`, leaving out the row index
df_new.to_csv(
    path_or_buf=os.path.join(path, 'df_new.csv'),
    index=False,
)