## Import the necessary libraries

In [1]:
import re
from datetime import date

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

---

## Crawling data from web
The dataset used for the analysis is collected from [Worldometer](https://www.worldometers.info/coronavirus/)

- Get chrome driver to browse in `Chrome`

In [2]:
# Start a Chrome session driven by the bundled chromedriver binary.
# The driver location is handed over through a Service object because the
# `executable_path=` keyword of `webdriver.Chrome` is deprecated in Selenium 4.
chrome_service = Service(executable_path="./../chromedriver_win32/chromedriver.exe")
browser = webdriver.Chrome(service=chrome_service)
# Alternative when chromedriver is discoverable on PATH:
# browser = webdriver.Chrome()

  browser = webdriver.Chrome(executable_path="./../chromedriver_win32/chromedriver.exe")


In [3]:
# Load the Worldometer page and snapshot its rendered HTML into BeautifulSoup.
# The try/finally guarantees the Chrome process is shut down even when the
# page load or the parsing step raises, so no orphaned browser is left behind.
try:
    browser.get("https://www.worldometers.info/coronavirus/")
    browser.implicitly_wait(20)
    browser.minimize_window()

    html_text = BeautifulSoup(browser.page_source, "html.parser")
finally:
    browser.quit()  # Close the Chrome browser

In [4]:
def get_string(row):
    """Turn one table row of parsed cells into plain Python values.

    The list is modified in place and also returned.

    Steps:
      1. Drop the first cell (the row id).
      2. If 17 cells remain, drop the third-from-last one (redundant empty cell).
      3. If 16 cells remain, drop the cell at index 1 (duplicated country name).
      4. Replace every remaining cell by its ``.string`` value, mapping a
         single-space string to ``np.nan`` and comma-grouped integers
         (e.g. ``'1,234'``) to ``int``. Anything that is not an integer
         (text, ``None``) is kept unchanged.

    Parameters
    ----------
    row : list
        Objects exposing a ``.string`` attribute.

    Returns
    -------
    list
        The same list object, now holding ``int``, ``str``-like, ``None`` or
        ``np.nan`` entries.
    """
    row.pop(0)  # remove id value
    if len(row) == 17:
        row.pop(-3)  # remove redundant empty value
    if len(row) == 16:
        row.pop(1)  # remove duplicated country name

    for i, cell in enumerate(row):
        text = cell.string  # pure get data
        if text == ' ':
            # A lone space marks an empty cell; there is nothing to convert.
            row[i] = np.nan
            continue
        try:
            row[i] = int(text.replace(',', ''))  # astype numeric values
        except (AttributeError, ValueError):
            # AttributeError: `.string` was None (no single text child).
            # ValueError: the text is not an integer (e.g. a country name).
            # Only these two are expected; anything else should surface.
            row[i] = text

    return row

In [5]:
# Select the <tr> elements that are not hidden via style='display: none'
# and keep the first 233 of them.
rows = html_text.select("tr:not([style='display: none'])")[:233]  # get 233 rows of the table

# Pull the text fragments that sit between '>' and '<' in the header row.
# The pattern is a raw string so that `\w` and `\d` are passed to the regex
# engine verbatim instead of being treated as (invalid) string escapes.
columns = re.findall(r'>([\w\d,/ ]*)<', str(rows[0]))  # get columns name from html
# Work on the list's repr: an empty fragment between two names becomes the '|'
# separator, directly adjacent fragments are joined with a space, then split.
columns = str(columns[2:-1])[2:-2].replace("', '', '",'|').replace("', '",' ').split('|')  # get columns name from list
print(columns, len(columns))

# get data of each table's cell, except 2 first rows
rows = [line.find_all(['td','a']) for line in rows[2:]]
rows = [get_string(cells) for cells in rows]

['Country, Other', 'Total Cases', 'New Cases', 'Total Deaths', 'New Deaths', 'Total Recovered', 'New Recovered', 'Active Cases', 'Serious, Critical', '1M pop', 'Deaths/ 1M pop', 'Total Tests', 'Tests/ 1M pop', 'Population', 'Continent'] 15


In [6]:
# Assemble the cleaned rows into a table labelled with the scraped header names.
df = pd.DataFrame(data=rows, columns=columns)

## Pre-processing

- Missing ratio of attributes

In [7]:
# Share of missing values per column: isna() yields a boolean frame and the
# column-wise mean of booleans is the missing fraction.
df.isna().mean()

Country, Other       0.000000
Total Cases          0.000000
New Cases            0.779221
Total Deaths         0.025974
New Deaths           0.887446
Total Recovered      0.008658
New Recovered        0.701299
Active Cases         0.000000
Serious, Critical    0.402597
1M pop               0.008658
Deaths/ 1M pop       0.034632
Total Tests          0.077922
Tests/ 1M pop        0.077922
Population           0.008658
Continent            0.008658
dtype: float64

`Comment:` Because the COVID-19 situation is no longer serious, the attributes 'New Cases', 'New Deaths' and 'New Recovered' are not updated regularly (they have a high missing ratio). They will be removed.

In [8]:
# Drop the sparsely populated "New ..." columns.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution no longer raises KeyError
# because the columns are already gone.
df = df.drop(columns=['New Cases','New Deaths','New Recovered'], errors='ignore')
df

Unnamed: 0,"Country, Other",Total Cases,Total Deaths,Total Recovered,Active Cases,"Serious, Critical",1M pop,Deaths/ 1M pop,Total Tests,Tests/ 1M pop,Population,Continent
0,USA,105820390,1151253.0,103485147,1183990,2261,316065.0,3439.0,1.169977e+09,3494499.0,3.348053e+08,North America
1,India,44693506,530795.0,44157685,5026,,31773.0,377.0,9.200129e+08,654054.0,1.406632e+09,Asia
2,France,39690610,165314.0,39447509,77787,869,605183.0,2521.0,2.714902e+08,4139547.0,6.558452e+07,Europe
3,Germany,38291497,169579.0,37931300,190618,,456484.0,2022.0,1.223324e+08,1458359.0,8.388360e+07,Europe
4,Brazil,37145514,699634.0,36249161,196719,,172486.0,3249.0,6.377617e+07,296146.0,2.153536e+08,South America
...,...,...,...,...,...,...,...,...,...,...,...,...
226,Diamond Princess,712,13.0,699,0,,,,,,,
227,Vatican City,29,,29,0,,36295.0,,,,7.990000e+02,Europe
228,Western Sahara,10,1.0,9,0,,16.0,2.0,,,6.261610e+05,Africa
229,MS Zaandam,9,2.0,7,0,,,,,,,


In [9]:
# Save the table to a CSV file named after today's date, e.g. "2023-02-20.csv".
snapshot_name = f"{date.today()}.csv"
df.to_csv(snapshot_name)