In [None]:
import io
import os
import re
import urllib.request

import bs4 as bs
import numpy as np
import pandas as pd
import requests

In [None]:
# Landing page that hosts the world summary table
url = 'https://www.worldometers.info/coronavirus/'

# Browser-like request headers; also reused for the per-country requests
header = {
  "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
  "X-Requested-With": "XMLHttpRequest"
}

# Fail fast on a hung connection or an HTTP error instead of parsing an error page
r = requests.get(url, headers=header, timeout=30)
r.raise_for_status()

# Read every HTML table into pandas. The markup is wrapped in StringIO because
# passing a literal HTML string to read_html is deprecated since pandas 2.1.
dfs = pd.read_html(io.StringIO(r.text))
# Look for the first table
df = dfs[0]
# Fill missing values with 0
df = df.fillna(0)

# to_csv does not create missing directories, so make sure the target exists
os.makedirs('./Data', exist_ok=True)
df.to_csv('./Data/world_info.csv')

In [None]:
# Parse the homepage markup that was downloaded above
homepage_soup = bs.BeautifulSoup(r.text, 'lxml')
# In the main table, only the countries whose name is a link (<a> tag) have detailed historical data.
country_elements = homepage_soup.select('table[id="main_table_countries_today"] > tbody > tr > td > a')
# Map each linked country name (first child of the <a> tag) to the link's href
countries_with_detailed_data = {
    anchor.contents[0]: anchor['href']
    for anchor in country_elements
}

# Iterating a dict yields its keys, i.e. the country names
print("Can get detailed data for %s" % ", ".join(countries_with_detailed_data))

In [None]:
# Looks for script elements that contain the JS stats by day, and extracts the x and y axis values
def extractDataFromGraph(soup, chart_id):
    """Extract the x-axis labels and y-axis values of one chart embedded in a page.

    Every ``<script type="text/javascript">`` element of ``soup`` whose text
    contains ``chart_id`` is searched for its first ``categories: [...]`` and
    ``data: [...]`` arrays. Scripts that mention the id but lack either array
    are skipped. If several scripts qualify, the last one wins.

    Args:
        soup: Parsed page exposing ``select()`` (e.g. a BeautifulSoup object).
        chart_id: Chart identifier as it appears in the script text.

    Returns:
        A tuple ``(x_values, y_values)`` of lists of strings. Quoted labels are
        returned without their quotes and may themselves contain commas;
        y values are the raw comma-separated tokens (digits or ``null``).

    Raises:
        ValueError: If no script contains ``chart_id`` together with both arrays.
    """
    categories_pattern = re.compile(r'categories: \[([^\]]+)\]')
    data_pattern = re.compile(r'data: \[((null|[0-9, ])+)]')

    result = None
    for script in soup.select('script[type="text/javascript"]'):
        script_text = script.get_text(strip=True)
        if chart_id not in script_text:
            continue

        x_match = categories_pattern.search(script_text)
        y_match = data_pattern.search(script_text)
        if x_match is None or y_match is None:
            # The id is mentioned, but this script does not carry the chart data
            continue

        x_text = x_match.group(1)
        if '"' in x_text:
            # Take whole quoted labels so a label such as "Feb 15, 2020" is not
            # cut in two at its inner comma
            x_values = re.findall(r'"([^"]*)"', x_text)
        else:
            x_values = x_text.split(",")
        y_values = y_match.group(1).split(",")
        result = (x_values, y_values)

    if result is None:
        raise ValueError("No chart data found for chart id %r" % (chart_id,))
    return result

# Download one country's page, extract its chart series and save them to CSV
def getDetailedDataForCountry(country, url_part):
    """Scrape the historical chart data of one country and write it to a CSV file.

    Requests the country page (the base coronavirus URL plus ``url_part``) using
    the module-level ``header``, reads five charts from it through
    ``extractDataFromGraph`` and stores them, together with a computed case
    fatality rate (deaths / cases), in ``./Data/Countries/<country>.csv``.
    Missing values are written as 0.

    Args:
        country: Country name; used as the CSV file name.
        url_part: Path of the country page, appended to the base URL.

    Raises:
        requests.RequestException: If the request times out or returns an
            HTTP error status.
        ValueError: If the charts do not all contain the same number of points.
    """
    url = 'https://www.worldometers.info/coronavirus/' + url_part
    response = requests.get(url, headers=header, timeout=30)
    # Do not try to parse an error page as if it were chart data
    response.raise_for_status()
    soup = bs.BeautifulSoup(response.text, 'lxml')

    def chart_values(chart_id):
        # errors='coerce' makes anything that's not a number (eg. null), NaN instead.
        return pd.to_numeric(extractDataFromGraph(soup, chart_id)[1], errors='coerce')

    # Dates are taken from the cumulative-cases chart only; the other charts are
    # assumed to cover the same days, which is verified by length below.
    dates, raw_cases = extractDataFromGraph(soup, "coronavirus-cases-linear")
    series = {
        'Cases': pd.to_numeric(raw_cases, errors='coerce'),
        'Deaths': chart_values("coronavirus-deaths-linear"),
        'Active Cases': chart_values("graph-active-cases-total"),
        'Daily Cases': chart_values("graph-cases-daily"),
        'Daily Deaths': chart_values("graph-deaths-daily"),
    }

    lengths = {name: len(values) for name, values in series.items()}
    lengths['Dates'] = len(dates)
    if len(set(lengths.values())) != 1:
        raise ValueError(
            "Charts for %s have different numbers of points: %s" % (country, lengths)
        )

    # Calculate death rate; entries with zero cases stay 0 instead of producing
    # inf/NaN and a divide-by-zero warning
    cases = np.asarray(series['Cases'], dtype=float)
    deaths = np.asarray(series['Deaths'], dtype=float)
    cfr = np.divide(deaths, cases, out=np.zeros_like(cases), where=cases != 0)

    df_country = pd.DataFrame({'Dates': dates, **series, 'CFR': cfr})
    df_country = df_country.fillna(0)

    # to_csv does not create missing directories
    os.makedirs('./Data/Countries', exist_ok=True)
    file_name = './Data/Countries/' + country + '.csv'
    df_country.to_csv(file_name)

# Scrape and save the detailed history of every country that has a detail page
for country_name, country_url_part in countries_with_detailed_data.items():
    getDetailedDataForCountry(country=country_name, url_part=country_url_part)

In [None]:
# From the per-country CSVs, create one table per statistic with a column per country
def createStatDfs(countries):
    """Combine the per-country CSV files into one date-indexed table per statistic.

    Reads ``./Data/Countries/<country>.csv`` for every entry of ``countries``
    and, for each statistic, lines the countries up on the ``Dates`` column.
    Dates a country has no row for are left empty (NaN). The tables are written
    to ``./Data/Stats/`` as ``Cases.csv``, ``Deaths.csv``, ``Daily cases.csv``,
    ``Daily deaths.csv`` and ``Active cases.csv``.

    Args:
        countries: Iterable of country names matching the per-country file names.

    Returns:
        Dict mapping the output file stem (e.g. ``'Daily cases'``) to the
        DataFrame written to that file.

    Raises:
        FileNotFoundError: If the CSV file of a country does not exist.
        KeyError: If a per-country file lacks ``Dates`` or a statistic column.
    """
    # Output file stem -> column name in the per-country CSV files
    stat_columns = {
        'Cases': 'Cases',
        'Deaths': 'Deaths',
        'Daily cases': 'Daily Cases',
        'Daily deaths': 'Daily Deaths',
        'Active cases': 'Active Cases',
    }

    series_by_stat = {stat: {} for stat in stat_columns}
    for country in countries:
        file_name = './Data/Countries/' + country + '.csv'
        df_country = pd.read_csv(file_name).set_index('Dates')
        for stat, column in stat_columns.items():
            series_by_stat[stat][country] = df_country[column]

    # to_csv does not create missing directories
    os.makedirs('./Data/Stats', exist_ok=True)

    stat_dfs = {}
    for stat, series_by_country in series_by_stat.items():
        if series_by_country:
            # Outer-join on the date labels; sort=False so the labels are not
            # re-sorted alphabetically.
            # NOTE(review): assumes date labels are unique within each country
            # file; pandas cannot align duplicated labels - confirm for long
            # time ranges.
            df_stat = pd.concat(series_by_country, axis=1, sort=False)
        else:
            # Nothing to combine: still write an (empty) table
            df_stat = pd.DataFrame()
        df_stat.index.name = 'Dates'
        df_stat.to_csv('./Data/Stats/' + stat + '.csv')
        stat_dfs[stat] = df_stat

    return stat_dfs
    
# Country names are the keys of the name -> link mapping built from the homepage
countries = list(countries_with_detailed_data.keys())

# Build the combined per-statistic tables from the per-country CSV files
createStatDfs(countries)