### Relevant Libaries for web scraping

In [34]:
import pandas as pd
import numpy as np
import requests 
from bs4 import BeautifulSoup as bs

### Load Webpage and Convert to Beautiful Soup Object

In [35]:
# Load webpage content
rv = requests.get('https://github.com/owid/covid-19-data/blob/master/public/data/vaccinations/locations.csv')


# Convert to a beautiful soup object
webpage_v = bs(rv.content)

# print out html content
print(webpage_l.prettify())

<html>
 <body>
  <p>
   location,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million
Afghanistan,AFG,2021-02-22,0,0,,,,0.0,0.0,,
Afghanistan,AFG,2021-02-23,,,,,1367,,,,35
Afghanistan,AFG,2021-02-24,,,,,1367,,,,35
Afghanistan,AFG,2021-02-25,,,,,1367,,,,35
Afghanistan,AFG,2021-02-26,,,,,1367,,,,35
Afghanistan,AFG,2021-02-27,,,,,1367,,,,35
Afghanistan,AFG,2021-02-28,8200,8200,,,1367,0.02,0.02,,35
Afghanistan,AFG,2021-03-01,,,,,1580,,,,41
Afghanistan,AFG,2021-03-02,,,,,1794,,,,46
Afghanistan,AFG,2021-03-03,,,,,2008,,,,52
Afghanistan,AFG,2021-03-04,,,,,2221,,,,57
Afghanistan,AFG,2021-03-05,,,,,2435,,,,63
Afghanistan,AFG,2021-03-06,,,,,2649,,,,68
Afghanistan,AFG,2021-03-07,,,,,2862,,,,74
Afghanistan,AFG,2021-03-08,,,,,2862,,,,74
Afghanistan,AFG,2021-03-09,,,,,2862,,,,74
Afghanistan,AFG,2021-03-10,,,,,2

### Scrape Table Data And Select Based on CSS Selector

In [36]:
table_v = webpage_v.select("table.js-csv-data")[0]
table_v

<table class="js-csv-data csv-data js-file-line-container">
<thead>
<tr class="js-file-line" id="LC1">
<td class="blob-num js-line-number" data-line-number="1" id="L1"></td>
<th>location</th>
<th>iso_code</th>
<th>vaccines</th>
<th>last_observation_date</th>
<th>source_name</th>
<th>source_website</th>
</tr>
</thead>
<tbody>
<tr class="js-file-line" id="LC2">
<td class="blob-num js-line-number" data-line-number="2" id="L2"></td>
<td>Afghanistan</td>
<td>AFG</td>
<td>Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm/Beijing</td>
<td>2021-05-26</td>
<td>World Health Organization</td>
<td>https://covid19.who.int/</td>
</tr>
<tr class="js-file-line" id="LC3">
<td class="blob-num js-line-number" data-line-number="3" id="L3"></td>
<td>Albania</td>
<td>ALB</td>
<td>Oxford/AstraZeneca, Pfizer/BioNTech, Sinovac, Sputnik V</td>
<td>2021-05-27</td>
<td>Ministry of Health</td>
<td>https://shendetesia.gov.al/vaksinimi-anticovid-kryhen-759043-vaksinime/</td>
</tr>
<tr class="js-file-line" id="LC4">
<td

### Find All Columns and Put into List

In [37]:
columns_v = table_v.find('thead').find_all('th')
columns_v

[<th>location</th>,
 <th>iso_code</th>,
 <th>vaccines</th>,
 <th>last_observation_date</th>,
 <th>source_name</th>,
 <th>source_website</th>]

### Find All Column Names from Columns List

In [26]:
column_names_v = [c.string for c in columns_v]
column_names_v

['location',
 'iso_code',
 'vaccines',
 'last_observation_date',
 'source_name',
 'source_website']

### Sepearate Data Needed into Cells for the Table

In [38]:
# Define where rows are in html text
table_rows_v = table_v.find('tbody').find_all('tr')

# Iterate through each row and take data from everything after 'td'
v = []
for tr in table_rows_v:
    td = tr.find_all('td')
    row = [tr.string for tr in td]
    v.append(row)

# This removes first column of every row (index)
for x in v:
    del x[0]

# Take data and put into columns
df_v = pd.DataFrame(v, columns=column_names_v)

### Remove blank values from total vaccinations and save as csv

In [39]:
# Replace empty cells with NaN
df_v = df_v.replace('',np.nan)

# Removing blank cells in total vaccinations column
df_v = df_v.dropna(subset = ['vaccines'])

# Saving to csv
df_v.to_csv('Vaccinations_type_updated.csv')

### Direct Download of CSV from UK Government Vaccination Data

In [29]:
df_gov = pd.read_csv('https://coronavirus.data.gov.uk/api/v1/data?filters=areaType=overview&structure=%7B%22areaType%22:%22areaType%22,%22areaName%22:%22areaName%22,%22areaCode%22:%22areaCode%22,%22date%22:%22date%22,%22newPeopleVaccinatedFirstDoseByPublishDate%22:%22newPeopleVaccinatedFirstDoseByPublishDate%22,%22newPeopleVaccinatedSecondDoseByPublishDate%22:%22newPeopleVaccinatedSecondDoseByPublishDate%22,%22cumPeopleVaccinatedFirstDoseByPublishDate%22:%22cumPeopleVaccinatedFirstDoseByPublishDate%22,%22cumPeopleVaccinatedSecondDoseByPublishDate%22:%22cumPeopleVaccinatedSecondDoseByPublishDate%22%7D&format=csv')

### Remove Irrelevant Rows and Save to CSV file

In [30]:
df_gov = df_gov.drop(columns = ['areaType','areaName','areaCode'])
df_gov.to_csv('vaccinations_gov_updated.csv')

### Direct Download of Vaccination Data

In [33]:
df_v2 = pd.read_csv('https://github.com/owid/covid-19-data/raw/master/public/data/vaccinations/vaccinations.csv')
df_v2.to_csv('Vaccinations_updated.csv')