<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [139]:
# Requests
import urllib.request #
import requests
from requests import get
from requests.exceptions import RequestException
from contextlib import closing

# for xml & html scrapping 
import lxml.html as lh
from bs4 import BeautifulSoup

# for table analysis
import pandas as pd

# write to csv
import csv

# Time
import time

import re


# Visualisation libraries
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()






# Disable warnings 
import warnings
warnings.filterwarnings('ignore')



In [140]:
# urls
w_url = "https://en.wikipedia.org/wiki/2019%E2%80%9320_coronavirus_pandemic"
c_url = "https://en.wikipedia.org/wiki/2020_coronavirus_pandemic_in_Canada"
o_url = "https://en.wikipedia.org/wiki/2020_coronavirus_pandemic_in_Ontario"
oak_url = "https://www.halton.ca/For-Residents/Immunizations-Preventable-Disease/Diseases-Infections/New-Coronavirus"

In [141]:
# responses
w_resp = requests.get(w_url, timeout=10)
c_resp = requests.get(c_url, timeout=10)
o_resp = requests.get(o_url, timeout=10)

In [142]:
# parse response content to html
w_soup = BeautifulSoup(w_resp.content, 'html.parser')
c_soup = BeautifulSoup(c_resp.content, 'html.parser')
o_soup = BeautifulSoup(o_resp.content, 'html.parser')

In [143]:
# title
print(w_soup.title.string)
print(c_soup.title.string)
print(o_soup.title.string)

2019–20 coronavirus pandemic - Wikipedia
2020 coronavirus pandemic in Canada - Wikipedia
2020 coronavirus pandemic in Ontario - Wikipedia


In [144]:
# find data tables
w_table=w_soup.find('table', {"class":"wikitable plainrowheaders sortable"})

In [145]:
# get rows
w_rows = w_table.findAll("tr")

In [146]:
# get header attributes of the table
w_cols = [th.text.rstrip() for th in w_rows[0].find_all('th')]

In [147]:
# get data - countries
w_ctry_lst = []
for row in w_rows[2:]:
            data = [h.text.rstrip() for h in row.find_all('th')]
            if data :
                w_ctry_lst.append(re.sub(r" ?\([^)]+\)", "", data[1]))  

w_ctry_dict = {w_cols[0]:w_ctry_lst}
w_ctry_df = pd.DataFrame(w_ctry_dict)
print(w_ctry_df.head(3))
print('----------------')
print(w_ctry_df.count())

       Locations[b]
0  United States[e]
1          Italy[f]
2          China[g]
----------------
Locations[b]    183
dtype: int64


In [148]:
# get data - Covid Stats
w_data_lst = []
for row in w_rows[2:]:
            data = [d.text.rstrip() for d in row.find_all('td')]
            w_data_lst.append(data)

w_df_lst = []
for d in w_data_lst:
    lst_df = pd.DataFrame(d)
    w_df_lst.append(lst_df.T)

w_data_df = pd.concat(w_df_lst,axis=0)
w_data_df = w_data_df.dropna(how='any')
w_data_df.head(3)   

Unnamed: 0,0,1,2,3
0,119938,1991,3229,[38][39]
0,92472,10023,12384,[42]
0,81394,3295,74971,[43]


In [149]:
w_ctry_df.reset_index(drop=True, inplace=True)
w_data_df.reset_index(drop=True, inplace=True)
w_df = pd.concat([w_ctry_df, w_data_df], axis=1)

In [150]:
w_df.columns = w_cols

In [151]:
# drop 'comm'column 
w_df = w_df.drop(w_df[['Ref.']],axis=1)
w_df = w_df.replace(',','', regex=True)
w_cols1 = ["Country", "Confirmed","Deaths","Recovered" ]
w_df.columns = w_cols1
w_df.head(5)

Unnamed: 0,Country,Confirmed,Deaths,Recovered
0,United States[e],119938,1991,3229
1,Italy[f],92472,10023,12384
2,China[g],81394,3295,74971
3,Spain[h],72251,5812,12285
4,Germany,56493,421,4107


In [152]:
# convert just columns "a" and "b"
w_df[w_cols1[1:]] = w_df[w_cols1[1:]].apply(pd.to_numeric,errors='coerce').fillna(0).astype(int)

In [153]:
w_df.dtypes

Country      object
Confirmed     int32
Deaths        int32
Recovered     int32
dtype: object

In [154]:
w_df

Unnamed: 0,Country,Confirmed,Deaths,Recovered
0,United States[e],119938,1991,3229
1,Italy[f],92472,10023,12384
2,China[g],81394,3295,74971
3,Spain[h],72251,5812,12285
4,Germany,56493,421,4107
5,France[i],37611,2314,5700
6,Iran[j],35408,2517,11679
7,United Kingdom[k],17089,1028,140
8,Switzerland,14108,271,1595
9,Netherlands[l],9806,640,0


In [155]:
# find data tables
c_table=c_soup.find_all('table', {"class":"wikitable sortable"})
c_table1 = c_table[1]

In [156]:
# get rows
c_rows = c_table1.findAll("tr")

In [157]:
# get header attributes of the table
c_cols = [th.text.rstrip() for th in c_rows[1].find_all('th')]
print(c_cols)

['Province', 'Tests', 'Conf.', 'Pres.', 'Total', 'Population', 'Per m', 'Recov.', 'Deaths', 'Active', 'Ref.']


In [158]:
# get data - Covid Stats
c_data_lst = []
for row in c_rows[3:]:
            data = [d.text.rstrip() for d in row.find_all('td')]
            c_data_lst.append(data)

In [159]:
c_df_lst = []
for d in c_data_lst:
    lst_df = pd.DataFrame(d)
    c_df_lst.append(lst_df.T)

c_data_df = pd.concat(c_df_lst,axis=0)
c_data_df = c_data_df.dropna(how='any')

In [160]:
c_data_df.columns = c_cols

# drop 'comm'column 
c_data_df = c_data_df.drop(c_data_df[['Ref.']],axis=1)
c_data_df

Unnamed: 0,Province,Tests,Conf.,Pres.,Total,Population,Per m,Recov.,Deaths,Active
0,Alberta,38215.0,542,0,542,4413146.0,110.1,33,2,457
0,Saskatchewan,6915.0,95,0,95,1181666.0,80.4,3,0,92
0,Manitoba,6205.0,25,14,39,1377517.0,28.3,0,1,38
0,Ontario,41032.0,993,0,993,14711827.0,67.5,8,18,967
0,Quebec,43589.0,2498,0,2498,8537674.0,236.7,29,22,2477
0,New Brunswick,1828.0,45,0,45,779993.0,57.7,0,0,45
0,Prince Edward Island,556.0,11,0,11,158158.0,69.6,1,0,10
0,Nova Scotia,3739.0,90,0,90,977457.0,92.1,2,0,88
0,Newfoundland and Labrador,1491.0,120,0,102,521365.0,195.6,0,0,102
0,Yukon,517.0,3,0,3,41078.0,73.0,0,0,3


In [47]:
# find data tables
c_table2=c_soup.find('table',{"class":"wikitable"})

In [168]:
# find data tables
o_table=o_soup.find('table', {"class":"wikitable sortable"})

In [169]:
# get rows
o_rows = o_table.findAll("tr")

In [170]:
# get header attributes of the table
o_cols = [th.text.rstrip() for th in o_rows[0].find_all('th')]
print(o_cols)

['Public Health Unit', 'Cases', 'Per m', 'Deaths', 'Ref']


In [171]:
# get data - Covid Stats
o_data_lst = []
for row in o_rows[1:]:
            data = [d.text.rstrip() for d in row.find_all('td')]
            o_data_lst.append(data)

In [172]:
o_df_lst = []
for d in o_data_lst:
    lst_df = pd.DataFrame(d)
    o_df_lst.append(lst_df.T)

o_data_df = pd.concat(o_df_lst,axis=0)
o_data_df = o_data_df.dropna(how='any')

In [173]:
o_data_df.columns = o_cols
# drop 'comm'column 
o_data_df = o_data_df.drop(o_data_df[['Ref']],axis=1)
o_data_df

Unnamed: 0,Public Health Unit,Cases,Per m,Deaths
0,Algoma,1,8.85,
0,Brant County (including Brantford),7,51.85,
0,Chatham-Kent,4,39.22,0.0
0,Durham Region,86,132.31,3.0
0,"Eastern Ontario (Stormont, Dundas and Glengarr...",4,19.7,0.0
0,Grey Bruce,7,43.21,
0,Haldimand-Norfolk,5,45.45,1.0
0,"Haliburton, Kawartha, Pine Ridge District",33,184.36,3.0
0,Halton Region,20,36.36,1.0
0,Hamilton,51,94.44,1.0


In [174]:
o_data_df.dtypes

Public Health Unit    object
Cases                 object
Per m                 object
Deaths                object
dtype: object

In [175]:
o_cols = list(o_data_df.columns)
o_cols[1:]

['Cases', 'Per m', 'Deaths']

In [179]:
# convert just columns "a" and "b"
o_data_df[o_cols[1:]] = o_data_df[o_cols[1:]].apply(pd.to_numeric).fillna(0)
o_data_df.dtypes

Public Health Unit     object
Cases                   int64
Per m                 float64
Deaths                float64
dtype: object

In [180]:
def highlight_max(s):
    is_max = s == s.max()
    return ['background-color: pink' if v else '' for v in is_max]

o_data_df.reset_index(drop=True).style.apply(highlight_max,subset=['Per m'])

Unnamed: 0,Public Health Unit,Cases,Per m,Deaths
0,Algoma,1,8.85,0
1,Brant County (including Brantford),7,51.85,0
2,Chatham-Kent,4,39.22,0
3,Durham Region,86,132.31,3
4,"Eastern Ontario (Stormont, Dundas and Glengarry-Prescott and Russell)",4,19.7,0
5,Grey Bruce,7,43.21,0
6,Haldimand-Norfolk,5,45.45,1
7,"Haliburton, Kawartha, Pine Ridge District",33,184.36,3
8,Halton Region,20,36.36,1
9,Hamilton,51,94.44,1


In [None]:
# find all the tables in the html
all_tables=c_soup.find_all('table')

In [None]:
rows = right_table.findAll("tr")
len(rows)

In [None]:
# header attributes of the table
header = [th.text.rstrip() for th in rows[0].find_all('th')]
print(header[0])
print('------------')
print(len(header))

In [None]:
# header attributes of the table
cols1 = [th.text.rstrip() for th in rows[2].find_all('th')]
print(cols1)
print('------------')
print(len(cols1))