## Import the necessary libraries

In [1]:
import re
from datetime import date

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

---

## Crawling population data from web
The dataset used for this analysis is collected from [Worldometer](https://www.worldometers.info/coronavirus/).

- Get chrome driver to browse in `Chrome`

In [3]:
# Start Chrome through an explicit Service object. Passing `executable_path`
# directly to webdriver.Chrome() is deprecated in Selenium 4 (this cell used to
# emit that DeprecationWarning), so the driver location is handed to Service.
browser = webdriver.Chrome(
    service=Service(executable_path="./../chromedriver_win32/chromedriver.exe")
)
# browser = webdriver.Chrome()

  browser = webdriver.Chrome(executable_path="./../chromedriver_win32/chromedriver.exe")


- Build a function to get the HTML document from the website

In [4]:
def get_html_from_link(link):
    """Open ``link`` in the shared browser session and return the parsed page.

    Uses the module-level ``browser`` WebDriver created earlier in the
    notebook; the window is minimised once the page has been requested.

    Parameters
    ----------
    link : str
        URL to load.

    Returns
    -------
    BeautifulSoup
        ``browser.page_source`` parsed with the ``"html.parser"`` backend.
    """
    browser.get(link)
    # NOTE(review): the implicit wait is configured after navigation; presumably
    # meant to give the page time to render - confirm it has the intended effect.
    browser.implicitly_wait(20)
    browser.minimize_window()

    page_source = browser.page_source
    soup = BeautifulSoup(page_source, "html.parser")
    return soup

In [5]:
# Download and parse the Worldometer coronavirus page once; later cells work on this parsed copy.
html_text = get_html_from_link(link="https://www.worldometers.info/coronavirus/")

In [6]:
# Close the Chrome browser: the page has already been captured in `html_text`,
# so the WebDriver session is no longer needed.
browser.quit()

In [7]:
def get_final_columns(columns):
    """Turn the raw header tokens into one name per table column.

    The first three tokens and the last token are discarded. Within what
    remains, an empty string marks the boundary between two columns, and the
    consecutive non-empty tokens between boundaries are the pieces of a single
    header, which are joined with a space
    (``'Total', 'Cases'`` -> ``'Total Cases'``).

    The tokens are grouped directly rather than by rewriting the list's
    ``repr``, so tokens containing apostrophes, backslashes or ``|`` are kept
    intact, and empty input yields an empty list.

    Parameters
    ----------
    columns : list of str
        Header tokens in document order.

    Returns
    -------
    list of str
        The column names, in order.
    """
    names = []
    pieces = []
    for token in columns[3:-1]:
        if token == '':
            # Boundary between two header cells: flush the pieces gathered so far.
            if pieces:
                names.append(' '.join(pieces))
                pieces = []
        else:
            pieces.append(token)
    if pieces:
        names.append(' '.join(pieces))
    return names

In [8]:
def get_string(row):
    """Reduce one table row of parsed cells to plain Python values, in place.

    The first cell is always removed. Rows that are then 17 cells long lose
    their third-from-last cell, and rows that are (or have thereby become)
    16 cells long lose the cell at index 1.
    NOTE(review): presumably these are rank / link-duplicate cells that do not
    correspond to a header column - confirm against the page markup.

    Every remaining cell is replaced by its ``.string`` value; where that text
    is an integer once thousands separators (``,``) are stripped, it is stored
    as ``int``. Cells whose ``.string`` is ``None`` or not an integer keep the
    ``.string`` value unchanged.

    Parameters
    ----------
    row : list
        Cell objects exposing a ``.string`` attribute (e.g. BeautifulSoup tags).
        The list is modified in place.

    Returns
    -------
    list
        The same ``row`` list, now holding ``int``, text or ``None`` values.
    """
    row.pop(0)
    if len(row) == 17:
        row.pop(-3)
    if len(row) == 16:
        row.pop(1)
    for i, cell in enumerate(row):
        text = cell.string
        try:
            row[i] = int(text.replace(',', ''))
        except (AttributeError, ValueError):
            # AttributeError: the cell has no text (``.string`` is None).
            # ValueError: the text is not an integer (names, decimals, "N/A").
            # Anything else is unexpected and is allowed to propagate.
            row[i] = text

    return row

In [9]:
def get_rows(html_text, max_rows=233):
    """Extract the header names and the data rows from the parsed page.

    Selects every ``<tr>`` that is not hidden with ``style='display: none'``
    and keeps at most ``max_rows`` of them. The first selected row is treated
    as the header; its text fragments are pulled out with a regex and turned
    into column names by ``get_final_columns``. The row at index 1 is skipped
    (NOTE(review): presumably an aggregate/total row - confirm), and each
    remaining row's ``<td>``/``<a>`` elements are converted to plain values by
    ``get_string``.

    Parameters
    ----------
    html_text : BeautifulSoup
        Parsed page, as returned by ``get_html_from_link``.
    max_rows : int, optional
        Upper bound on the number of ``<tr>`` elements considered, header
        included. Defaults to 233, the cutoff this notebook originally
        hard-coded.

    Returns
    -------
    tuple
        ``(columns, rows)`` where ``columns`` is a list of header names and
        ``rows`` is a list with one list of cell values per data row.
    """
    visible_rows = html_text.select("tr:not([style='display: none'])")[:max_rows]

    # Raw string: `\w` / `\d` in a normal literal are invalid escape sequences.
    header_tokens = re.findall(r'>([-\w\d,.+/# ]*)<', str(visible_rows[0]))
    columns = get_final_columns(header_tokens)

    rows = [get_string(line.find_all(['td', 'a'])) for line in visible_rows[2:]]

    return columns, rows

Now, it's time to combine:

- Get data rows from the site.

- Finally, write the rows to file.

In [10]:
# Parse the downloaded page into header names and one list of cell values per table row.
columns, rows = get_rows(html_text)
# Show the header names together with how many there are, as a quick sanity check.
print(columns, len(columns))

['Country, Other', 'Total Cases', 'New Cases', 'Total Deaths', 'New Deaths', 'Total Recovered', 'New Recovered', 'Active Cases', 'Serious, Critical', '1M pop', 'Deaths/ 1M pop', 'Total Tests', 'Tests/ 1M pop', 'Population', 'Continent'] 15


In [11]:
# Assemble the scraped rows into a DataFrame labelled with the scraped header names.
df = pd.DataFrame(data=rows, columns=columns)
df

Unnamed: 0,"Country, Other",Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,New Recovered,Active Cases,"Serious, Critical",1M pop,Deaths/ 1M pop,Total Tests,Tests/ 1M pop,Population,Continent
0,USA,104872877,,1141220,,102163862,,1567795,2919,313235.0,3409.0,1.162615e+09,3472512.0,334805269,North America
1,India,44684376,,530756,,44151797,,1823,,31767.0,377.0,9.169433e+08,651872.0,1406631776,Asia
2,France,39574447,,164657,,39338816,,70974,869,603411.0,2511.0,2.714902e+08,4139547.0,65584518,Europe
3,Germany,37970357,,167124,,37558400,,244833,,452655.0,1992.0,1.223324e+08,1458359.0,83883596,Europe
4,Brazil,36960888,,697894,,36097919,,165075,,171629.0,3241.0,6.377617e+07,296146.0,215353593,South America
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226,Diamond Princess,712,,13,,699,,0,,,,,,,
227,Vatican City,29,,,,29,,0,,36295.0,,,,799,Europe
228,Western Sahara,10,,1,,9,,0,,16.0,2.0,,,626161,Africa
229,MS Zaandam,9,,2,,7,,0,,,,,,,


In [17]:
# df.to_csv("data.csv")
# Save the snapshot under today's date (ISO format, e.g. "2023-02-16.csv").
# The row index is written as the first column (pandas' default), which is
# why the loading cell below discards column 0 after reading the file back.
df.to_csv(f"{date.today()}.csv")

---

### Read data from csv file into Pandas dataframe

In [3]:
# Load a previously saved snapshot. Alternative sources kept for reference:
#   pd.read_csv("data.csv")
#   pd.read_csv(f"{date.today()}.csv")
# The first column is discarded - presumably the row index that was written
# out when the snapshot was saved.
df = (
    pd.read_csv("2023-02-16.csv")
    .pipe(lambda frame: frame.drop(columns=frame.columns[0]))
)
df

Unnamed: 0,"Country, Other",Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,New Recovered,Active Cases,"Serious, Critical",1M pop,Deaths/ 1M pop,Total Tests,Tests/ 1M pop,Population,Continent
0,USA,104872877,,1141220,,102163862.0,,1567795.0,2919.0,313235.0,3409.0,1.162615e+09,3472512.0,334805269,North America
1,India,44684376,,530756,,44151797.0,,1823.0,,31767.0,377.0,9.169433e+08,651872.0,1406631776,Asia
2,France,39574447,,164657,,39338816.0,,70974.0,869.0,603411.0,2511.0,2.714902e+08,4139547.0,65584518,Europe
3,Germany,37970357,,167124,,37558400.0,,244833.0,,452655.0,1992.0,1.223324e+08,1458359.0,83883596,Europe
4,Brazil,36960888,,697894,,36097919.0,,165075.0,,171629.0,3241.0,6.377617e+07,296146.0,215353593,South America
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226,Diamond Princess,712,,13,,699.0,,0.0,,,,,,,
227,Vatican City,29,,,,29.0,,0.0,,36295.0,,,,799,Europe
228,Western Sahara,10,,1,,9.0,,0.0,,16.0,2.0,,,626161,Africa
229,MS Zaandam,9,,2,,7.0,,0.0,,,,,,,
