# Пиявка - Python-based scraping

This notebook is kludgier than we expected because we had to hack together a somewhat reliable data source.

---
## Setup

In [None]:
from bs4 import BeautifulSoup

import csv
import json
import datetime

---
## Make CSV from HTML file

These are not well-formed HTML files; they're the table snippets scraped off the BNO spreadsheet.  Each `table-XX.html` file contains only the data table and additional headers.

In [None]:
# Parse the first saved table snippet.  The context manager closes the file
# handle as soon as its contents are read instead of leaving it to the
# garbage collector.
with open('./site-data/table-01.html') as inputFile:
    soup = BeautifulSoup(inputFile.read(), 'html.parser')

In [None]:
# Flatten the first <table> of the parsed snippet into outputRows: one list of
# cell strings per kept <tr>.
table = soup.find('table')

outputRows  = list()
IGNORE_ROWS = 5   # number of leading <tr> rows skipped before collecting
# A row is dropped when any of its cells is exactly one of these labels.
nixed       = ('Diamond Princess', 'Queue', 'TOTAL',)

for rowCount, tableRow in enumerate(table.find_all('tr'), start = 1):
    if rowCount <= IGNORE_ROWS:
        continue

    # Commas are removed from every cell (presumably thousands separators in
    # the counts -- confirm against the source table).
    row = [column.text.replace(',', '') for column in tableRow.find_all('td')]

    # A <tr> without any <td> yields an empty list; check for that before
    # indexing row[0] so such rows are skipped instead of raising IndexError.
    # Rows whose first cell is blank are skipped as well.
    if not row or not row[0]:
        continue

    if any(s in row for s in nixed):
        continue

    outputRows.append(row)

outputRows

In [None]:
# Persist the scraped rows as tab-separated values.  The csv module documents
# that files passed to csv.writer should be opened with newline=''; without
# it, platforms that translate '\n' on write end every row with '\r\r\n'.
with open('./site-data/scraped-world.tsv', 'w', newline = '') as outputFile:
    writer = csv.writer(outputFile, delimiter = '\t')
    writer.writerows(outputRows)

---
## World

In [None]:
# Build updateWorld: { place name: { today: case count } } from the scraped
# world TSV, then normalise the place names.
rawSource = 'site-data/scraped-world.tsv'

updateWorld = dict()
today  = datetime.date.today().strftime("%m-%d-%Y")
# newline='' is the csv module's documented way to open files for a reader.
with open(rawSource, 'r', newline = '') as inputFile:
    rawData = csv.DictReader(inputFile, delimiter = '\t')
    for row in rawData:
        # Any place whose name mentions 'Diamond' is left out of the update.
        if 'Diamond' not in row['OTHER PLACES']:
            updateWorld[row['OTHER PLACES']] = { today: float(row['Cases'])}

# 'Queue' is not a place; drop it if the scrape let it through.
updateWorld.pop('Queue', None)

# Scraped label -> name used from here on (presumably the keys of the
# confirmed-cases JSON -- verify against that file).  Each rename is applied
# only when the scraped label is present, so a label that changes upstream
# (e.g. 'Denmark*' losing its asterisk) no longer aborts the cell with KeyError.
worldRenames = {
    'United States':  'US',
    'United Kingdom': 'UK',
    'Denmark*':       'Denmark',
    'UAE':            'United Arab Emirates',
    'Bosnia':         'Bosnia and Herzegovina',
}
for scrapedName, finalName in worldRenames.items():
    if scrapedName in updateWorld:
        updateWorld[finalName] = updateWorld.pop(scrapedName)

# Sanity check: show today's US entry (raises KeyError if the US row is missing).
updateWorld['US']

In [None]:
# Load the existing per-country confirmed-case series.
with open('site-data/confirmed.json', 'r') as inputFile:
    dataWorld = json.load(inputFile)

# Turkey was not in the list when this was written.  Seed it only when it is
# absent: an unconditional assignment would discard Turkey's existing history
# once the JSON file contains it.
dataWorld.setdefault('Turkey', { today: 0.0, })

In [None]:
# Copy today's scraped count for each country into the loaded series.
# A country that is not already a key of dataWorld raises KeyError.
for country, update in updateWorld.items():
    dataWorld[country][today] = update[today]

dataWorld

---
## United States

In [None]:
# Build updateUS: { state name: { today: case count } } from the scraped US TSV.
rawSource = 'site-data/scraped-US.tsv'

with open(rawSource, 'r') as inputFile:
    rawData  = csv.DictReader(inputFile, delimiter = "\t")
    updateUS = {
        row['UNITED STATES']: { today: float(row['Cases'])}
        for row in rawData
    }

# Re-key the District of Columbia entry as 'Washington D.C.'.
updateUS['Washington D.C.'] = updateUS.pop('District of Columbia')

In [None]:
# Bare expression: evaluates updateUS so the notebook cell shows the scraped
# per-state values for inspection.
updateUS

In [None]:
# Load the existing per-state confirmed-case series.
with open('site-data/confirmed-US.json', 'r') as inputFile:
    dataUS = json.load(inputFile)

# Make sure a 'Grand Princess' entry exists.  Seed it only when absent: an
# unconditional assignment would discard any history already stored for it
# in the JSON file.
dataUS.setdefault('Grand Princess', { today: 0.0, })

In [None]:
# Copy today's scraped count for each state into the loaded series.
# A state that is not already a key of dataUS raises KeyError.
for state, update in updateUS.items():
    dataUS[state][today] = update[today]

# Spot check: show the California series.
dataUS['California']

---
## US Regions


In [None]:
from covidvu.vujson import US_REGIONS_LONG

In [None]:
# NOTE(review): this reads the same file as the per-state data and
# dataUSRegions is not used in the cells visible here -- confirm whether a
# regions-specific file was intended.
with open('site-data/confirmed-US.json', 'r') as inputFile:
    dataUSRegions = json.load(inputFile)

# Sum today's per-state counts into per-region totals, using US_REGIONS_LONG
# to map each state name to its region.  A state missing from that mapping
# raises KeyError.
updateUSRegions = dict()

for state, update in updateUS.items():
    region       = US_REGIONS_LONG[state]
    regionTotals = updateUSRegions.setdefault(region, { today: 0.0 })
    regionTotals[today] += float(update[today])

updateUSRegions

---
## Calculate total US cases from JH CSSE data

In [None]:
from covidvu.vujson import JH_CSSE_FILE_CONFIRMED
import pandas as pd

# Load the JH CSSE confirmed-cases CSV, sum its rows per 'Country/Region',
# and show the last five entries of the resulting US series.
cases = pd.read_csv(JH_CSSE_FILE_CONFIRMED)
totalsByCountry = cases.groupby('Country/Region').sum()
totalsByCountry.T['US'].tail()

In [None]:
# Re-read the JH CSSE confirmed-cases CSV (the previous cell loads the same
# file) and display the raw frame for inspection.
cases = pd.read_csv(JH_CSSE_FILE_CONFIRMED)
cases

---
&#169; the COVIDvu Contributors.  All rights reserved