In [54]:
import re
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
from datetime import datetime

In [55]:
def extract_data (web, date, time):
    # The name of the website is built, ready to be put in the browser:
    web_archive = "https://web.archive.org/web/"
    url = web_archive + str(date) + str(time) + "/" + web

    # Open website:
    html = urlopen(url)
    soup = BeautifulSoup(html, features="html.parser")
    
    # We find the table that contains the word "China"
    tables = soup.findAll('table')
    #print(len(tables))
    for idx, i in enumerate(tables):
        ctr = i.text.find('China')
        if ctr != -1:
            index = idx
    
    # Extract the rows of tha table and store in a list:
    list_rows = []
    rows = tables[index].find_all('tr')
    for row in rows:
        column = []
        for col in row.find_all('td'):
            column.append(col.text.strip().replace(',','').replace('\n','').replace(' ','_'))
        list_rows.append(column)

    # Transform the list into a pandas dataset:
    df = pd.DataFrame(list_rows)
    df.drop(df.index[0], inplace=True) #Remove 1st row
    
    # Dealing with the column names:
    list_cols = []
    cols = soup.find_all('th')
    if len(cols) > 9:
        cols = cols[0:9]
    for col in cols:
        col = col.text.strip()
        col = re.sub('^.*ountry.*', 'Country', col)
        col = re.sub('^.*otal.*ases.*', 'Total_Cases', col)
        col = re.sub('^.*otal.*eath.*', 'Total_Deaths', col)
        col = re.sub('^.*otal.*ecover.*', 'Total_Recovered', col)
        list_cols.append(col.replace(' ','_'))

    df.columns = list_cols
   
    # Only interested in some columns.
    # The index will be the country:
    df1 = df[["Country", "Total_Cases", "Total_Deaths", "Total_Recovered"]].set_index("Country")

    return df1

In [56]:
web = "https://www.worldometers.info/coronavirus"

# Set the desired date and time in the appropriate format that the function "extract_data" needs:
datestr = "11/3/2020"

date = datetime.strptime(datestr, '%d/%m/%Y').date().strftime("%Y%m%d")
time = datetime.strptime("23:00", '%H:%M').time().strftime("%H%M%S")

print(date)
print(time)

df = extract_data (web, date, time)

20200311
230000


In [57]:
# There are over 100 countries listed:
df.index

Index(['China', 'Italy', 'Iran', 'S._Korea', 'France', 'Spain', 'Germany',
       'USA', 'Diamond_Princess', 'Switzerland',
       ...
       'DRC', 'Gibraltar', 'Vatican_City', 'Liechtenstein', 'Mongolia',
       'Réunion', 'St._Barth', 'Togo', 'Turkey', 'Total:'],
      dtype='object', name='Country', length=123)

In [6]:
# List the countries that start with a "U":
r = re.compile("^U")
l = list(filter(r.match, df.index))
print(l)

['USA', 'UK', 'UAE', 'Ukraine']


In [7]:
# Examine the numerical values for a specific country:
df.loc["USA"].to_list()

['1283', '37', '15']

In [10]:
# Check if both Koreas are included:
r = re.compile(".*orea.*")
l = list(filter(r.match, df.index))
print(l)

['S._Korea']


# Send email

In [61]:
import smtplib
fromaddr = 'from@gmail.com'
toaddr   = 'to@gmail.com'
username = fromaddr
password = 'xxx'

server = smtplib.SMTP('smtp.gmail.com:587')
server.ehlo()
server.starttls()
server.login(username,password)

# An email will be sent to "toaddr" with the analyzed date in the Subject and 
# a table with the first 10 countries more affected by the virus. 

msg = "<html><body>" + str(df.head(10).to_html()) + "</body></html>"
#.encode('utf-8')

message = """From: Ricuit <ricuit@gilsingstrasse.com>
To: Januit <janetboram@gmail.com>
MIME-Version: 1.0
Content-type: text/html
Subject: Corona information at """ + datestr + """

""" + msg + """"""

##+ title.get_text() + """


server.sendmail(fromaddr, toaddr, message)
server.quit()

(221, b'2.0.0 closing connection g61sm13446764ede.96 - gsmtp')