Import necessary libraries.


In [88]:
from datetime import datetime, timedelta
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup


In [89]:
# ignore
# proxy settings passed to requests; the same endpoint handles both schemes
proxy_address = 'http://26.26.26.1:10809'
proxies = {scheme: proxy_address for scheme in ('http', 'https')}


Data collection

We will use the `requests` library to download the page, the `BeautifulSoup` library to parse it, and the `pandas` library to store the extracted data in a DataFrame.


In [90]:
# url of the webpage to scrape
url = 'https://www.worldometers.info/coronavirus/'

# download the page through the proxy; to connect directly instead,
# call requests.get(url, timeout=30) without the proxies argument.
# the timeout keeps the cell from hanging forever on a dead proxy or server
webpage = requests.get(url, proxies=proxies, timeout=30)

# stop here on an HTTP error (4xx/5xx) instead of parsing an error page
webpage.raise_for_status()

# using BeautifulSoup to parse the webpage
soup = BeautifulSoup(webpage.text, 'lxml')


Choose the table we want to scrape.

In [91]:
# selects which table to scrape: the absolute value of date_delta is used
# as the index into the list of <table> elements found on the page and as
# the number of days subtracted from today's date
# date_delta = 0 is today
# date_delta = -1 is yesterday
# date_delta = -2 is 2 days ago
date_delta = 0


Get the header of the table.


In [92]:
# collect every <table> element on the page
tables = soup.find_all('table')

# header cells (<th>) of the table selected by date_delta
table_header = tables[abs(date_delta)].find_all('th')

# text of the first 16 header cells, with newlines and non-breaking
# spaces stripped out
header = [
    table_header[i].text.replace('\n', '').replace('\xa0', '')
    for i in range(16)
]

# drop the leading "#" entry, which is not needed
header.pop(0)


'#'

In [93]:
# display the cleaned column names
header


['Country,Other',
 'TotalCases',
 'NewCases',
 'TotalDeaths',
 'NewDeaths',
 'TotalRecovered',
 'NewRecovered',
 'ActiveCases',
 'Serious,Critical',
 'TotCases/1M pop',
 'Deaths/1M pop',
 'TotalTests',
 'Tests/1M pop',
 'Population',
 'Continent']

Get the data from the table.

In [94]:
# Collect one dict per table row and build the DataFrame in a single step.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0,
# and growing a frame row by row copies the whole frame on every iteration.
records = []

# find all rows in the table
for row in tables[abs(date_delta)].tbody.find_all('tr'):
    # find all columns in the row
    col = row.find_all('td')
    # rows without any <td> cell carry no data
    if not col:
        continue
    # col[0] is the index, which is not needed; col[1]..col[15] line up with
    # the 15 names in `header` (country, total cases, ..., continent).
    # A row with fewer cells leaves its missing columns empty (NaN).
    values = [cell.text.strip() for cell in col[1:16]]
    records.append(dict(zip(header, values)))

# columns=header fixes the column order and keeps the columns even when
# no row was found
covid_data = pd.DataFrame(records, columns=header)


Get the date of the table.

In [95]:
# date the scraped table refers to: the current local date shifted back by
# |date_delta| days (0 -> today, -1 -> yesterday, -2 -> 2 days ago)
now = datetime.now() - timedelta(days=abs(date_delta))

# render as YYYY-MM-DD
date = f"{now:%Y-%m-%d}"


In [96]:
# display the date string that is used in the output file name
date


'2023-03-06'

Convert the data we just scraped into a CSV file.


In [97]:
# make sure the output folder exists: to_csv does not create missing
# directories and would fail on a fresh checkout without data/
output_dir = Path("data")
output_dir.mkdir(parents=True, exist_ok=True)

# save the data to CSV file, one file per date
covid_data.to_csv(output_dir / f"{date}.csv", index=False)


References: 

https://github.com/Shaikhmohddanish/Covid-data-webscraping
