In [2]:
# from subprocess import call
from subprocess import Popen, PIPE
import pandas as pd
from datetime import datetime

# Start the Node scraper as a child process with stdin, stdout and stderr all
# piped. `output` is the child's stdout as a binary stream; the cells below
# consume it one line at a time.
p = Popen(
    args=["node", "js_scraping/scraper.js"],
    stdin=PIPE,
    stdout=PIPE,
    stderr=PIPE,
)
output = p.stdout


def next_line():
    """Return the next line of scraper output as text, without its line ending.

    Reads one line of bytes from the module-level `output` stream and decodes
    it as UTF-8. A trailing "\\n" or "\\r\\n" is removed so that comparisons
    against sentinel lines such as "###" work regardless of line-ending style.

    Returns:
        str: the decoded line; "" for a blank line.

    Raises:
        EOFError: when the stream is exhausted. `readline()` signals
            end-of-file with empty bytes, which would otherwise look like a
            blank line and let a caller that waits for a terminator line loop
            forever.
    """
    raw_line = output.readline()
    if not raw_line:
        raise EOFError("scraper output ended before the expected line was read")
    return raw_line.decode("utf-8").rstrip("\r\n")


# Gen Data
# The first three lines of scraper output are bound, in order, to `source`,
# `heading` and `data_date`; the fourth line is read and discarded.
source, heading, data_date = (next_line() for _ in range(3))
next_line()

print(f"{source} {heading} {data_date}")

# def input_to_df():


def input_to_df(print_dict=True, headings=None):
    """Read one "###"-terminated section of scraper output into a DataFrame.

    The first line of the section is always consumed as the header row. Every
    following line, up to but not including a line equal to "###", is split on
    commas into one data row. Column i of those rows is paired with heading i;
    pairing stops at whichever runs out first, so surplus headings or surplus
    data columns are dropped, and columns are cut to the shortest row.

    Args:
        print_dict: when true, print the heading -> list-of-values dict before
            converting it.
        headings: optional column names to use instead of the section's own
            header row. None or an empty sequence means "use the header row".

    Returns:
        pandas.DataFrame with one column of strings per paired heading.
    """
    # The header line is consumed even when `headings` overrides it, so the
    # reader stays aligned with the first data row.
    header_line = next_line()
    curr_headings = headings if headings else header_line.split(",")

    # iter(callable, sentinel) keeps calling next_line() until it returns "###".
    curr_data = [curr_line.split(",") for curr_line in iter(next_line, "###")]

    # zip(*rows) transposes the rows into columns.
    columns = [list(column) for column in zip(*curr_data)]
    curr_data_dict = dict(zip(curr_headings, columns))

    if print_dict:
        print(curr_data_dict)

    return pd.DataFrame.from_dict(curr_data_dict)


# Scraped row label -> column name used in the provincial CSV files.
prov_map = dict([
    ("Eastern Cape", "EC"),
    ("Free State", "FS"),
    ("Gauteng", "GP"),
    ("KwaZulu-Natal", "KZN"),
    ("Limpopo", "LP"),
    ("Mpumalanga", "MP"),
    ("North West", "NW"),
    ("Northern Cape", "NC"),
    ("Western Cape", "WC"),
    ("Unknown", "UNKNOWN"),
    ("Total", "total"),
])


# The label column defaults to 'Province' and the label map to `prov_map`;
# both can be overridden for other tables.
def transform_df(df, value_name, key_name='Province', key_map=None):
    """Pivot a long (label, value) table into a single wide row.

    Each label in `key_name` is translated through `key_map` and becomes a
    column of the result; the matching `value_name` entry, converted to a
    number and rounded, becomes that column's value.

    Args:
        df: DataFrame containing at least the `key_name` and `value_name`
            columns. It is not modified.
        value_name: name of the column holding the values.
        key_name: name of the column holding the row labels.
        key_map: dict translating row labels to output column names. When
            None, the module-level `prov_map` is used.

    Returns:
        DataFrame with a single row (index 0) and one column per mapped label.
        Index and column axes are unnamed.

    Raises:
        ValueError: if a label has no entry in `key_map`. Without this check
            such labels would silently become NaN column names.
    """
    if key_map is None:
        key_map = prov_map

    tmp_df = df[[key_name, value_name]].copy()
    mapped_keys = tmp_df[key_name].map(key_map)

    unmapped = tmp_df.loc[mapped_keys.isna().to_numpy(), key_name]
    if not unmapped.empty:
        missing = sorted(set(unmapped.astype(str)))
        raise ValueError(f"No mapping for {key_name} value(s): {missing}")

    tmp_df[key_name] = mapped_keys
    tmp_df[value_name] = pd.to_numeric(tmp_df[value_name]).round()
    # Constant row key so that the pivot collapses everything into one row.
    tmp_df['i'] = 0

    tmp_df_piv = tmp_df.pivot(index='i', columns=key_name, values=[value_name])
    tmp_df_piv.index.name = None
    # values=[...] yields two-level columns (value_name, label); keep the label.
    tmp_df_piv = tmp_df_piv.droplevel(0, axis=1)
    tmp_df_piv.columns.name = None

    return tmp_df_piv


# Note this is done by reference
def add_date(df, given_date, date_format):
    """Add 'date' and 'YYYYMMDD' string columns to `df` in place.

    `given_date` is parsed with `date_format` and written to every row as
    'DD-MM-YYYY' in the 'date' column and as 'YYYYMMDD' in the 'YYYYMMDD'
    column. Nothing is returned; the caller's frame is mutated.
    """
    parsed = datetime.strptime(given_date, date_format)
    for column, out_format in (('date', '%d-%m-%Y'), ('YYYYMMDD', '%Y%m%d')):
        df[column] = parsed.strftime(out_format)


# Format of the date line emitted by the scraper, e.g. "11 JAN 2021".
input_date_format = '%d %b %Y'

# ------------------
#       CASES
# ------------------
# Read the cases section, pivot it into one wide row, then stamp it with the
# report date and source before indexing it by date.
cases_data_df = input_to_df(headings=['Province', 'Cases'], print_dict=True)
cases_data_df_piv = transform_df(cases_data_df, 'Cases')

add_date(cases_data_df_piv, data_date, input_date_format)
cases_data_df_piv = cases_data_df_piv.assign(source=source)
cases_date = cases_data_df_piv['date'].iat[-1]
cases_data_df_piv = cases_data_df_piv.set_index('date')

# Existing cumulative table that the new row will be merged into.
prov_cum_cases = pd.read_csv(
    'data/scraped/provincial_cum_cases.csv',
    index_col='date',
)

https://www.nicd.ac.za/latest-confirmed-cases-of-covid-19-in-south-africa-11-jan-2021/ LATEST CONFIRMED CASES OF COVID-19 IN SOUTH AFRICA (11 JAN 2021) 11 JAN 2021
{'Province': ['Eastern Cape', 'Free State', 'Gauteng', 'KwaZulu-Natal', 'Limpopo', 'Mpumalanga', 'North West', 'Northern Cape', 'Western Cape', 'Unknown', 'Total'], 'Cases': ['181532', '67337', '338071', '255819', '38878', '46830', '47021', '27973', '243182', '0', '1246643']}


In [55]:
# Display the pivoted cases frame built from the latest scrape.
cases_data_df_piv

Unnamed: 0_level_0,EC,FS,GP,KZN,LP,MP,NC,NW,UNKNOWN,WC,total,YYYYMMDD,source
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
11-01-2021,181532,67337,338071,255819,38878,46830,27973,47021,0,243182,1246643,20210111,https://www.nicd.ac.za/latest-confirmed-cases-...


In [56]:
# Upsert the scraped day into the cumulative table: if the date already has a
# row, overwrite its values (published figures may have been revised);
# otherwise append the new row.
if cases_date in prov_cum_cases.index:
    # Assign only the columns the scrape provides. Assigning a whole row from
    # a Series would set every column missing from that Series to NaN.
    shared_cols = cases_data_df_piv.columns.intersection(prov_cum_cases.columns)
    prov_cum_cases.loc[cases_date, shared_cols] = cases_data_df_piv.loc[cases_date, shared_cols]
else:
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    prov_cum_cases = pd.concat([prov_cum_cases, cases_data_df_piv])

prov_cum_cases.tail()

Unnamed: 0_level_0,YYYYMMDD,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN,total,source
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
05-01-2021,20210105,174853.0,63981.0,304420.0,223487.0,29265.0,39528.0,26095.0,42244.0,223886.0,0.0,1127759,https://www.nicd.ac.za/latest-confirmed-cases-...
06-01-2021,20210106,175941.0,64409.0,310201.0,230283.0,30840.0,40751.0,26353.0,43159.0,227654.0,0.0,1149591,https://www.nicd.ac.za/latest-confirmed-cases-...
07-01-2021,20210107,177314.0,64925.0,315738.0,236177.0,32493.0,41927.0,26701.0,44255.0,231060.0,0.0,1170590,https://twitter.com/DrZweliMkhize/status/13472...
08-01-2021,20210108,178525.0,65516.0,322707.0,241740.0,34040.0,43364.0,27059.0,45075.0,234544.0,0.0,1192570,https://www.nicd.ac.za/latest-confirmed-cases-...
11-01-2021,20210111,181532.0,67337.0,338071.0,255819.0,38878.0,46830.0,27973.0,47021.0,243182.0,0.0,1246643,https://www.nicd.ac.za/latest-confirmed-cases-...


In [57]:
# Write the cumulative cases table to CSV, including its date index.
prov_cum_cases.to_csv("data/scraped/provincial_cum_cases.csv", index=True)

In [None]:
# Display the pivoted cases frame again.
cases_data_df_piv

In [10]:
# Show the last two rows of the cumulative cases table.
prov_cum_cases.tail(2)

Unnamed: 0_level_0,YYYYMMDD,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN,total,source
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
08-01-2021,20210108,178525.0,65516.0,322707.0,241740.0,34040.0,43364.0,27059.0,45075.0,234544.0,0.0,1192570,https://www.nicd.ac.za/latest-confirmed-cases-...
09-01-2021,20210109,179713.0,66133.0,328925.0,247647.0,35813.0,44818.0,27383.0,45879.0,237865.0,0.0,1214176,https://www.nicd.ac.za/latest-confirmed-cases-...


In [11]:
# True when the scraped date already has a row in the cumulative cases table.
cases_date in prov_cum_cases.index

False

In [43]:
# Load the "_orig" copy of the cumulative cases CSV, indexed by date, and show
# its last rows. NOTE(review): presumably a backup of the working file - confirm.
tmp_df = pd.read_csv('data/scraped/provincial_cum_cases_orig.csv', index_col ='date')
tmp_df.tail()

Unnamed: 0_level_0,YYYYMMDD,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN,total,source
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
04-01-2021,20210104,174176.0,63740.0,301212.0,217432.0,28333.0,39045.0,26019.0,41735.0,221657.0,0.0,1113349,https://www.nicd.ac.za/latest-confirmed-cases-...
05-01-2021,20210105,174853.0,63981.0,304420.0,223487.0,29265.0,39528.0,26095.0,42244.0,223886.0,0.0,1127759,https://www.nicd.ac.za/latest-confirmed-cases-...
06-01-2021,20210106,175941.0,64409.0,310201.0,230283.0,30840.0,40751.0,26353.0,43159.0,227654.0,0.0,1149591,https://www.nicd.ac.za/latest-confirmed-cases-...
07-01-2021,20210107,177314.0,64925.0,315738.0,236177.0,32493.0,41927.0,26701.0,44255.0,231060.0,0.0,1170590,https://twitter.com/DrZweliMkhize/status/13472...
08-01-2021,20210108,178525.0,65516.0,322707.0,241740.0,34040.0,43364.0,27059.0,45075.0,234544.0,0.0,1192570,https://www.nicd.ac.za/latest-confirmed-cases-...


In [44]:
# Overwrite the working cumulative cases CSV with the contents of tmp_df.
tmp_df.to_csv('data/scraped/provincial_cum_cases.csv')

In [45]:
# Re-read the working CSV (without setting an index column) and show its last rows.
pd.read_csv('data/scraped/provincial_cum_cases.csv').tail()

Unnamed: 0,date,YYYYMMDD,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN,total,source
303,04-01-2021,20210104,174176.0,63740.0,301212.0,217432.0,28333.0,39045.0,26019.0,41735.0,221657.0,0.0,1113349,https://www.nicd.ac.za/latest-confirmed-cases-...
304,05-01-2021,20210105,174853.0,63981.0,304420.0,223487.0,29265.0,39528.0,26095.0,42244.0,223886.0,0.0,1127759,https://www.nicd.ac.za/latest-confirmed-cases-...
305,06-01-2021,20210106,175941.0,64409.0,310201.0,230283.0,30840.0,40751.0,26353.0,43159.0,227654.0,0.0,1149591,https://www.nicd.ac.za/latest-confirmed-cases-...
306,07-01-2021,20210107,177314.0,64925.0,315738.0,236177.0,32493.0,41927.0,26701.0,44255.0,231060.0,0.0,1170590,https://twitter.com/DrZweliMkhize/status/13472...
307,08-01-2021,20210108,178525.0,65516.0,322707.0,241740.0,34040.0,43364.0,27059.0,45075.0,234544.0,0.0,1192570,https://www.nicd.ac.za/latest-confirmed-cases-...


In [3]:
# Read the next "###"-terminated section of scraper output, labelling its
# columns 'Sector' and 'Tests' in place of the section's own header line,
# then display the resulting frame.
tests_data_df = input_to_df(print_dict=True, headings=['Sector', 'Tests'])
tests_data_df

{'Sector': ['PRIVATE', 'PUBLIC', 'Total'], 'Tests': ['4196584', '3039805', '7236389']}


Unnamed: 0,Sector,Tests
0,PRIVATE,4196584
1,PUBLIC,3039805
2,Total,7236389


In [18]:
# Scraped sector label -> column name used in timeline_cum_tests.csv.
sector_map = {
    "PRIVATE":"cumulative_tests_private",
    "PUBLIC":"cumulative_tests_public",
    "Total":"cumulative_tests"
}

# Pivot the long (Sector, Tests) table into a single wide row.
tests_df = tests_data_df[['Sector', 'Tests']].copy()
tests_df['Sector'] = tests_df['Sector'].map(sector_map)
tests_df['Tests'] = pd.to_numeric(tests_df['Tests']).round()
tests_df['i'] = 0  # constant row key so the pivot yields exactly one row

tests_df_piv = tests_df.pivot(index='i', columns='Sector', values=['Tests'])
tests_df_piv.index.name = None
tests_df_piv = tests_df_piv.droplevel(0, axis=1)  # drop the 'Tests' column level
tests_df_piv.columns.name = None

# Stamp the row with the report date and source, then index it by date.
add_date(tests_df_piv, data_date, input_date_format)
tests_df_piv['source'] = source
tests_date = tests_df_piv['date'].iloc[-1]
tests_df_piv.set_index('date', inplace=True)

cum_tests = pd.read_csv('data/scraped/timeline_cum_tests.csv', index_col='date')

# Upsert the scraped day: if the date already has a row, update it (published
# figures may have been revised); otherwise append the new row.
if tests_date in cum_tests.index:
    # Update only the columns this scrape provides. Assigning the whole row
    # from a Series would reset every other column of the timeline (e.g.
    # recovered, deaths) to NaN for that date.
    shared_cols = tests_df_piv.columns.intersection(cum_tests.columns)
    cum_tests.loc[tests_date, shared_cols] = tests_df_piv.loc[tests_date, shared_cols]
else:
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    cum_tests = pd.concat([cum_tests, tests_df_piv])

cum_tests.to_csv('data/scraped/timeline_cum_tests.csv', index=True)

In [17]:
# Re-read the tests timeline CSV, indexed by date, and show its last rows.
pd.read_csv('data/scraped/timeline_cum_tests.csv', index_col='date').tail()

Unnamed: 0_level_0,YYYYMMDD,cumulative_tests,cumulative_tests_private,cumulative_tests_public,recovered,hospitalisation,critical_icu,ventilation,deaths,contacts_identified,contacts_traced,scanned_travellers,passengers_elevated_temperature,covid_suspected_criteria,source
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
07-01-2021,20210107,6967478.0,4043320.0,2924158.0,938216.0,,,,31809.0,,,,,,https://twitter.com/DrZweliMkhize/status/13472...
08-01-2021,20210108,7043680.0,4090066.0,2953614.0,947919.0,,,,32425.0,,,,,,https://www.nicd.ac.za/latest-confirmed-cases-...
09-01-2021,20210109,7120847.0,4138584.0,2982263.0,956712.0,,,,32824.0,,,,,,https://www.nicd.ac.za/latest-confirmed-cases-...
10-01-2021,20210110,7183893.0,4174664.0,3009229.0,966368.0,,,,33163.0,,,,,,https://www.nicd.ac.za/latest-confirmed-cases-...
11-01-2021,20210111,7236389.0,4196584.0,3039805.0,,,,,,,,,,,https://www.nicd.ac.za/latest-confirmed-cases-...


In [1]:
import pandas as pd

date,EC,FS,GP,KZN,LP,MP,NC,NW,UNKNOWN,Unnamed: 14,WC,YYYYMMDD,source,total,Unnamed: 15

In [16]:
# Load the provincial cumulative confirmed-cases timeline and show its last rows.
prov_timeline_cases = pd.read_csv('data/scraped/covid19za_provincial_cumulative_timeline_confirmed.csv')
prov_timeline_cases.tail()

Unnamed: 0,date,YYYYMMDD,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN,total,source
316,17-01-2021,20210117,186771.0,70511.0,361881.0,279974.0,47207.0,53409.0,29558.0,51380.0,257235.0,0.0,1337926.0,https://www.nicd.ac.za/latest-confirmed-cases-...
317,18-01-2021,20210118,187171.0,70891.0,364269.0,283176.0,48158.0,53843.0,29757.0,51547.0,258124.0,0.0,1346936.0,https://www.nicd.ac.za/latest-confirmed-cases-...
318,19-01-2021,20210119,187552.0,71237.0,366432.0,287042.0,49012.0,54678.0,29894.0,51771.0,259098.0,0.0,1356716.0,https://www.nicd.ac.za/latest-confirmed-cases-...
319,20-01-2021,20210120,188259.0,71734.0,370264.0,290356.0,49823.0,56063.0,30119.0,52316.0,260492.0,0.0,1369426.0,https://www.nicd.ac.za/latest-confirmed-cases-...
320,22-01-2021,20210122,189559.0,72711.0,375761.0,295949.0,52847.0,58059.0,30568.0,53728.0,263386.0,0.0,1392568.0,https://www.nicd.ac.za/latest-confirmed-cases-...


In [15]:
# Keep only the listed columns, in this order, and rewrite the confirmed-cases
# CSV without an index column.
cols_to_use = 'date,YYYYMMDD,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN,total,source'.split(',')
new_df = prov_timeline_cases.loc[:, cols_to_use].copy()
new_df.to_csv(
    'data/scraped/covid19za_provincial_cumulative_timeline_confirmed.csv',
    index=False,
)

In [21]:
# Load the provincial cumulative deaths timeline and show its last three rows.
prov_timeline_deaths = pd.read_csv('data/scraped/covid19za_provincial_cumulative_timeline_deaths.csv')
prov_timeline_deaths.tail(3)

Unnamed: 0,date,YYYYMMDD,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN,total,source
294,19-01-2021,20210119,9612,2493,7057,6672,977,808,511,808,9350,,38288,https://www.nicd.ac.za/latest-confirmed-cases-...
295,20-01-2021,20210120,9716,2539,7137,6890,977,814,519,808,9454,,38854,https://www.nicd.ac.za/latest-confirmed-cases-...
296,22-01-2021,20210122,9984,2593,7396,7177,1004,849,542,863,9668,,40076,https://www.nicd.ac.za/latest-confirmed-cases-...


In [20]:
# Keep only the listed columns, in this order, and rewrite the deaths CSV
# without an index column.
cols_to_use = 'date,YYYYMMDD,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN,total,source'.split(',')
new_df = prov_timeline_deaths.loc[:, cols_to_use].copy()
new_df.to_csv(
    'data/scraped/covid19za_provincial_cumulative_timeline_deaths.csv',
    index=False,
)

In [24]:
# Load the provincial cumulative recoveries timeline and show its last three rows.
prov_timeline_recoveries = pd.read_csv('data/scraped/covid19za_provincial_cumulative_timeline_recoveries.csv')
prov_timeline_recoveries.tail(3)

Unnamed: 0,date,YYYYMMDD,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN,total,source
271,19-01-2021,20210119,173575,59268,322434.0,226342,40232,46394,24734,37418,214460,,1144857.0,https://www.nicd.ac.za/latest-confirmed-cases-...
272,20-01-2021,20210120,173575,59757,327569.0,230024,40892,46935,25348,37640,218672,,1160412.0,https://www.nicd.ac.za/latest-confirmed-cases-...
273,22-01-2021,20210122,175708,60694,336054.0,242404,44700,50507,25599,40165,225453,,1201284.0,https://www.nicd.ac.za/latest-confirmed-cases-...


In [22]:
# Keep only the listed columns, in this order, and rewrite the recoveries CSV
# without an index column.
cols_to_use = 'date,YYYYMMDD,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN,total,source'.split(',')
new_df = prov_timeline_recoveries.loc[:, cols_to_use].copy()
new_df.to_csv(
    'data/scraped/covid19za_provincial_cumulative_timeline_recoveries.csv',
    index=False,
)