# Data Preprocessing - Neater Implementation
___

In [1]:
# Enable Intellisense
%config IPCompleter.greedy=True

import pandas as pd
import numpy as np
import io
import requests
import seaborn as sns
from matplotlib import pyplot, dates
import plotly.express as px 
import datetime
import plotly.graph_objects as go
import plotly.io as pio
sns.set()

# Annoying warning
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
# ['EC', 'FS', 'GP', 'KZN', 'LP', 'MP', 'NC', 'NW', 'WC']

## General Helper Functions

In [2]:
def df_from_url(df_url, pd_kwargs=None):
    """Download a CSV file over HTTP and parse it into a DataFrame.

    Parameters
    ----------
    df_url : str
        URL of the CSV file to fetch.
    pd_kwargs : dict, optional
        Extra keyword arguments forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV content (decoded as UTF-8).

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    # A ``None`` sentinel avoids sharing one mutable dict between calls.
    if pd_kwargs is None:
        pd_kwargs = {}
    response = requests.get(df_url)
    # Fail loudly instead of trying to parse an error page as CSV.
    response.raise_for_status()
    df = pd.read_csv(io.StringIO(response.content.decode('utf-8')), **pd_kwargs)
    return df

In [4]:
# Quick check of the download helper: read only the `date` and `total`
# columns and preview the first rows.
tmp_kwargs = dict(usecols=['date', 'total'])
df_from_url(
    "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_confirmed.csv",
    tmp_kwargs,
).head()

Unnamed: 0,date,total
0,05-03-2020,1
1,07-03-2020,2
2,08-03-2020,3
3,09-03-2020,7
4,11-03-2020,13


# Cumulative/Daily Totals Per Day
Works for any cumulative column; used below for confirmed cases, deaths, tests and recoveries.

In [5]:
def get_cum_daily(data_url, cum_col='total', index_col='date'):
    """Build a date-indexed frame of cumulative and daily counts.

    Parameters
    ----------
    data_url : str
        URL of a CSV file that contains ``cum_col`` and ``index_col``.
    cum_col : str
        Name of the column holding the cumulative count.
    index_col : str
        Name of the column holding dates written as ``dd-mm-YYYY``.

    Returns
    -------
    pandas.DataFrame
        Indexed by the parsed dates, with two ``int32`` columns:
        ``cum_no`` (the cumulative count, gaps forward-filled) and
        ``daily_no`` (difference to the previous row; the first row keeps
        its cumulative value).
    """
    data = df_from_url(data_url, {"usecols": [cum_col, index_col]})
    # Use the caller-supplied column name rather than a hard-coded 'date'.
    data[index_col] = pd.to_datetime(data[index_col], format='%d-%m-%Y')
    data = data.set_index(index_col).rename({cum_col: "cum_no"}, axis=1)
    # A missing cumulative value means "no new figure": carry the last one on.
    data = data.ffill()

    # diff() leaves NaN in the first row; the first daily value should equal
    # the starting cumulative value. Assign on the fresh Series (not through
    # chained indexing) so it also works with pandas Copy-on-Write.
    daily = data['cum_no'].diff()
    if len(daily):
        daily.iloc[0] = data['cum_no'].iloc[0]
    data['daily_no'] = daily

    # Cast both columns to integer.
    return data.astype('int32')

## Confirmed Cases

In [6]:
# Cumulative and daily confirmed cases, built from the `total` column.
confirmed_cases_url = (
    "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_confirmed.csv"
)
confirmed_data = get_cum_daily(data_url=confirmed_cases_url)
confirmed_data

Unnamed: 0_level_0,cum_no,daily_no
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-05,1,1
2020-03-07,2,1
2020-03-08,3,1
2020-03-09,7,4
2020-03-11,13,6
...,...,...
2020-05-24,22583,1240
2020-05-25,23615,1032
2020-05-26,24264,649
2020-05-27,25937,1673


## Deaths

In [7]:
# Cumulative and daily deaths, built from the `total` column.
deaths_url = (
    "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_deaths.csv"
)
deaths_data = get_cum_daily(data_url=deaths_url)
deaths_data

Unnamed: 0_level_0,cum_no,daily_no
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-27,1,1
2020-03-28,2,1
2020-03-30,3,1
2020-03-31,5,2
2020-04-03,9,4
2020-04-05,11,2
2020-04-06,12,1
2020-04-07,13,1
2020-04-08,18,5
2020-04-09,18,0


## Tests

In [8]:
# Cumulative and daily tests, built from the `cumulative_tests` column.
tests_url = (
    "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_timeline_testing.csv"
)
tests_data = get_cum_daily(tests_url, cum_col='cumulative_tests', index_col='date')
tests_data

Unnamed: 0_level_0,cum_no,daily_no
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-02-11,61,61
2020-02-13,67,6
2020-02-14,71,4
2020-02-19,95,24
2020-02-20,106,11
...,...,...
2020-05-24,583855,19485
2020-05-25,596777,12922
2020-05-26,605991,9214
2020-05-27,634996,29005


## Recoveries

In [9]:
# Recoveries are published in the testing timeline file (`recovered` column).
tests_url = (
    "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_timeline_testing.csv"
)
recovered_data = get_cum_daily(tests_url, cum_col='recovered', index_col='date')
recovered_data

Unnamed: 0_level_0,cum_no,daily_no
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-02-11,0,0
2020-02-13,0,0
2020-02-14,0,0
2020-02-19,0,0
2020-02-20,0,0
...,...,...
2020-05-24,11100,996
2020-05-25,11917,817
2020-05-26,12741,824
2020-05-27,13451,710


## Active Cases
Cases that have not yet had an outcome, i.e. confirmed cases − recovered − deaths.

In [10]:
# Put the three cumulative series side by side on the union of their dates.
# sort=True makes the chronological order explicit, which both the date
# slice and the forward-fill below rely on.
active_data = pd.concat(
    [
        confirmed_data[['cum_no']].rename({"cum_no": "confirmed"}, axis=1),
        recovered_data[['cum_no']].rename({"cum_no": "recovered"}, axis=1),
        deaths_data[['cum_no']].rename({"cum_no": "deaths"}, axis=1),
    ],
    axis=1,
    sort=True,
)
# Drop the dates before the first confirmed case. Slicing by date instead of
# a fixed row offset keeps this correct if the source files gain or lose
# earlier rows.
active_data = active_data.loc[confirmed_data.index.min():]
# Carry the last known totals over missing dates; anything still missing
# (no recoveries/deaths reported yet) counts as 0.
active_data = active_data.ffill().fillna(0)
active_data

Unnamed: 0_level_0,confirmed,recovered,deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-03-05,1.0,0.0,0.0
2020-03-06,1.0,0.0,0.0
2020-03-07,2.0,0.0,0.0
2020-03-08,3.0,0.0,0.0
2020-03-09,7.0,0.0,0.0
...,...,...,...
2020-05-24,22583.0,11100.0,429.0
2020-05-25,23615.0,11917.0,481.0
2020-05-26,24264.0,12741.0,524.0
2020-05-27,25937.0,13451.0,552.0


In [11]:
# Active = confirmed - recovered - deaths (all cumulative).
active_data['cum_no'] = active_data['confirmed'] - active_data['recovered'] - active_data['deaths']
active_data.drop(['confirmed', 'recovered', 'deaths'], axis=1, inplace=True)
# Daily change is the row-to-row difference; diff() leaves NaN in the first
# row, which is filled with the starting cumulative value. This is one direct
# column assignment rather than a chained `['daily_no'].iloc[1:] = ...`,
# which does not write through under pandas Copy-on-Write.
active_data['daily_no'] = active_data['cum_no'].diff().fillna(active_data['cum_no'])
active_data = active_data.astype('int32')
active_data

Unnamed: 0_level_0,cum_no,daily_no
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-05,1,1
2020-03-06,1,0
2020-03-07,2,1
2020-03-08,3,1
2020-03-09,7,4
...,...,...
2020-05-24,11054,222
2020-05-25,11217,163
2020-05-26,10999,-218
2020-05-27,11934,935


## All Cumulative/Totals Per Day

In [12]:
# One column per cumulative series, aligned on the union of all dates.
all_cum_data = pd.concat(
    [
        confirmed_data[['cum_no']].rename(columns={"cum_no": "confirmed"}),
        tests_data[['cum_no']].rename(columns={"cum_no": "tests"}),
        deaths_data[['cum_no']].rename(columns={"cum_no": "deaths"}),
        recovered_data[['cum_no']].rename(columns={"cum_no": "recovered"}),
        active_data[['cum_no']].rename(columns={"cum_no": "active"}),
    ],
    axis=1,
)
# Running totals: carry the last value forward, start from 0 before the
# first report, then store as integers.
all_cum_data = all_cum_data.ffill()
all_cum_data = all_cum_data.fillna(0)
all_cum_data = all_cum_data.astype('int32')
all_cum_data

Unnamed: 0_level_0,confirmed,tests,deaths,recovered,active
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-02-11,0,61,0,0,0
2020-02-13,0,67,0,0,0
2020-02-14,0,71,0,0,0
2020-02-19,0,95,0,0,0
2020-02-20,0,106,0,0,0
...,...,...,...,...,...
2020-05-24,22583,583855,429,11100,11054
2020-05-25,23615,596777,481,11917,11217
2020-05-26,24264,605991,524,12741,10999
2020-05-27,25937,634996,552,13451,11934


## Derived Stats
Added to cumulative/totals per day `all_cum_data`
### Confirmed divided by Tests

In [13]:
# Share of cumulative tests that are confirmed cases, to 3 decimals.
all_cum_data['confirmed_div_by_tests'] = (
    all_cum_data['confirmed'] / all_cum_data['tests']
).round(3)
all_cum_data

Unnamed: 0_level_0,confirmed,tests,deaths,recovered,active,confirmed_div_by_tests
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-02-11,0,61,0,0,0,0.000
2020-02-13,0,67,0,0,0,0.000
2020-02-14,0,71,0,0,0,0.000
2020-02-19,0,95,0,0,0,0.000
2020-02-20,0,106,0,0,0,0.000
...,...,...,...,...,...,...
2020-05-24,22583,583855,429,11100,11054,0.039
2020-05-25,23615,596777,481,11917,11217,0.040
2020-05-26,24264,605991,524,12741,10999,0.040
2020-05-27,25937,634996,552,13451,11934,0.041


### Deaths divided by confirmed

In [14]:
# Share of cumulative confirmed cases that ended in death, to 3 decimals.
all_cum_data['deaths_div_by_confirmed'] = (
    all_cum_data['deaths'] / all_cum_data['confirmed']
).round(3)
# 0/0 on dates before the first confirmed case yields NaN; report those as 0.
all_cum_data = all_cum_data.fillna(0.000)
all_cum_data

Unnamed: 0_level_0,confirmed,tests,deaths,recovered,active,confirmed_div_by_tests,deaths_div_by_confirmed
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-02-11,0,61,0,0,0,0.000,0.000
2020-02-13,0,67,0,0,0,0.000,0.000
2020-02-14,0,71,0,0,0,0.000,0.000
2020-02-19,0,95,0,0,0,0.000,0.000
2020-02-20,0,106,0,0,0,0.000,0.000
...,...,...,...,...,...,...,...
2020-05-24,22583,583855,429,11100,11054,0.039,0.019
2020-05-25,23615,596777,481,11917,11217,0.040,0.020
2020-05-26,24264,605991,524,12741,10999,0.040,0.022
2020-05-27,25937,634996,552,13451,11934,0.041,0.021


### Recovered divided by confirmed

In [15]:
# Share of cumulative confirmed cases that have recovered, to 3 decimals.
all_cum_data['recovered_div_by_confirmed'] = (
    all_cum_data['recovered'] / all_cum_data['confirmed']
).round(3)
# 0/0 on dates before the first confirmed case yields NaN; report those as 0.
all_cum_data = all_cum_data.fillna(0.000)
all_cum_data

Unnamed: 0_level_0,confirmed,tests,deaths,recovered,active,confirmed_div_by_tests,deaths_div_by_confirmed,recovered_div_by_confirmed
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-02-11,0,61,0,0,0,0.000,0.000,0.000
2020-02-13,0,67,0,0,0,0.000,0.000,0.000
2020-02-14,0,71,0,0,0,0.000,0.000,0.000
2020-02-19,0,95,0,0,0,0.000,0.000,0.000
2020-02-20,0,106,0,0,0,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...
2020-05-24,22583,583855,429,11100,11054,0.039,0.019,0.492
2020-05-25,23615,596777,481,11917,11217,0.040,0.020,0.505
2020-05-26,24264,605991,524,12741,10999,0.040,0.022,0.525
2020-05-27,25937,634996,552,13451,11934,0.041,0.021,0.519


### Stats Per Million Population
[Source for stats](https://worldpopulationreview.com/countries/south-africa-population/)

In [16]:
# Total population of South Africa.
sa_tot_population = 59195720
# The same figure expressed in millions (no rounding is applied).
sa_tot_pop_mil = sa_tot_population / 10**6
sa_tot_pop_mil

59.19572

In [17]:
# Add a "<name>_per_mil" column (2 decimals) for every cumulative count.
tmp_cols = []
for base_col in ['confirmed', 'tests', 'deaths', 'recovered', 'active']:
    per_mil_col = base_col + '_per_mil'
    all_cum_data[per_mil_col] = (all_cum_data[base_col] / sa_tot_pop_mil).round(2)
    tmp_cols.append(per_mil_col)
all_cum_data = all_cum_data.fillna(0.00)
all_cum_data

Unnamed: 0_level_0,confirmed,tests,deaths,recovered,active,confirmed_div_by_tests,deaths_div_by_confirmed,recovered_div_by_confirmed,confirmed_per_mil,tests_per_mil,deaths_per_mil,recovered_per_mil,active_per_mil
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2020-02-11,0,61,0,0,0,0.000,0.000,0.000,0.00,1.03,0.00,0.00,0.00
2020-02-13,0,67,0,0,0,0.000,0.000,0.000,0.00,1.13,0.00,0.00,0.00
2020-02-14,0,71,0,0,0,0.000,0.000,0.000,0.00,1.20,0.00,0.00,0.00
2020-02-19,0,95,0,0,0,0.000,0.000,0.000,0.00,1.60,0.00,0.00,0.00
2020-02-20,0,106,0,0,0,0.000,0.000,0.000,0.00,1.79,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-05-24,22583,583855,429,11100,11054,0.039,0.019,0.492,381.50,9863.13,7.25,187.51,186.74
2020-05-25,23615,596777,481,11917,11217,0.040,0.020,0.505,398.93,10081.42,8.13,201.32,189.49
2020-05-26,24264,605991,524,12741,10999,0.040,0.022,0.525,409.89,10237.07,8.85,215.24,185.81
2020-05-27,25937,634996,552,13451,11934,0.041,0.021,0.519,438.16,10727.06,9.32,227.23,201.60


**Save to csv**

In [18]:
# all_cum_data.to_csv('data/all_cum_data.csv')

## All Daily Change Data Per Day

In [19]:
# One column per daily-change series, aligned on the union of all dates.
all_daily_data = pd.concat(
    [
        confirmed_data[['daily_no']].rename({"daily_no": "confirmed"}, axis=1),
        tests_data[['daily_no']].rename({"daily_no": "tests"}, axis=1),
        deaths_data[['daily_no']].rename({"daily_no": "deaths"}, axis=1),
        recovered_data[['daily_no']].rename({"daily_no": "recovered"}, axis=1),
        active_data[['daily_no']].rename({"daily_no": "active"}, axis=1),
    ],
    axis=1,
)
# These are increments, not running totals. A date that a series has no row
# for contributed no reported change, so it must be 0. Forward-filling here
# would repeat the previous day's increment and make the daily figures sum
# to more than the cumulative total.
all_daily_data = all_daily_data.fillna(0)
all_daily_data = all_daily_data.astype('int32')
all_daily_data

Unnamed: 0_level_0,confirmed,tests,deaths,recovered,active
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-02-11,0,61,0,0,0
2020-02-13,0,6,0,0,0
2020-02-14,0,4,0,0,0
2020-02-19,0,24,0,0,0
2020-02-20,0,11,0,0,0
...,...,...,...,...,...
2020-05-24,1240,19485,22,996,222
2020-05-25,1032,12922,52,817,163
2020-05-26,649,9214,43,824,-218
2020-05-27,1673,29005,28,710,935


**Save to csv**

In [20]:
# all_daily_data.to_csv("data/all_daily_data.csv")

### Change format of date

In [21]:
# Independent copy of the cumulative table with `date` as an ordinary column.
all_cum_data_alt = all_cum_data.copy()
all_cum_data_alt = all_cum_data_alt.reset_index()

# Per Province
___

# Daily/Cumulative Data Per Prov Per Day

In [3]:
# Generator method to get all dates in specified interval
from datetime import timedelta, datetime
def datetime_range(start_datetime, end_datetime):
    """Yield datetimes from start to end (inclusive) in one-day steps.

    Parameters
    ----------
    start_datetime, end_datetime : datetime-like
        Bounds of the interval; anything that supports ``<=`` and adding a
        ``timedelta`` works.

    Yields
    ------
    datetime-like
        ``start_datetime``, then one value per day, never later than
        ``end_datetime``. Nothing is yielded when ``start_datetime`` is
        after ``end_datetime``.
    """
    curr_date = start_datetime
    # Test before yielding so a span that is not a whole number of days
    # cannot produce a value past the end bound.
    while curr_date <= end_datetime:
        yield curr_date
        curr_date = curr_date + timedelta(days=1)

list(datetime_range(datetime(2020, 8, 1), datetime(2020, 8, 10)))

[datetime.datetime(2020, 8, 1, 0, 0),
 datetime.datetime(2020, 8, 2, 0, 0),
 datetime.datetime(2020, 8, 3, 0, 0),
 datetime.datetime(2020, 8, 4, 0, 0),
 datetime.datetime(2020, 8, 5, 0, 0),
 datetime.datetime(2020, 8, 6, 0, 0),
 datetime.datetime(2020, 8, 7, 0, 0),
 datetime.datetime(2020, 8, 8, 0, 0),
 datetime.datetime(2020, 8, 9, 0, 0),
 datetime.datetime(2020, 8, 10, 0, 0)]

In [4]:
# round_no - number of decimals kept in the percentage-of-population columns
def get_cum_daily_by_prov(data_url, fill_date_gaps=False, dropna=True, round_no=3):
    """Return cumulative and daily counts per province in long format.

    Parameters
    ----------
    data_url : str
        URL of a CSV with a ``date`` column (``dd-mm-YYYY``) followed by one
        cumulative column per province code plus ``UNKNOWN``.
    fill_date_gaps : bool
        When true, insert a row for every missing calendar day between the
        first and last row and forward-fill the cumulative values into it.
    dropna : bool
        When true, discard rows containing any missing value before
        processing.
    round_no : int
        Decimals used when rounding the percentage columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by date, one row per (date, province), with columns
        ``province``, ``cum_no``, ``daily_no`` (both ``int32``),
        ``cum_no_perc_pop`` and ``daily_no_perc_pop`` (count as a percentage
        of the province population; NaN for ``UNKNOWN``).
    """
    wanted = ['date', 'EC', 'FS', 'GP', 'KZN', 'LP', 'MP', 'NC', 'NW', 'WC', 'UNKNOWN']
    cum_data = df_from_url(data_url, {"usecols": wanted})

    if dropna:
        cum_data.dropna(inplace=True)

    cum_data['date'] = pd.to_datetime(cum_data['date'], format='%d-%m-%Y')

    if fill_date_gaps:
        # Reindex onto every day between the first and last row, then carry
        # the previous cumulative values into the newly created rows.
        first_day = cum_data['date'].iloc[0]
        last_day = cum_data['date'].iloc[-1]
        every_day = list(datetime_range(first_day, last_day))
        cum_data = cum_data.set_index('date').reindex(every_day).ffill().reset_index()

    # Daily change = row-to-row difference; the first row keeps its
    # cumulative value. Column 0 is the date and is left untouched.
    daily_data = cum_data.copy()
    daily_data.iloc[1:, 1:] = daily_data.iloc[:, 1:].diff().iloc[1:]

    def _to_long(wide, value_name):
        # Wide (one column per province) -> long (one row per date/province).
        melted = wide.melt(id_vars=['date'], var_name='province', value_name=value_name)
        return melted.set_index('date')

    cum_long = _to_long(cum_data, 'cum_no')
    daily_long = _to_long(daily_data, 'daily_no')

    data = pd.concat([cum_long, daily_long[['daily_no']]], axis=1)
    data[['cum_no', 'daily_no']] = data[['cum_no', 'daily_no']].astype('int32')

    prov_pops = {  # https://github.com/dsfsi/covid19za/blob/master/data/district_data/za_province_pop.csv
        "EC": 6712276.0,
        "FS": 2887465.0,
        "GP": 15176115.0,
        "KZN": 11289086.0,
        "LP": 5982584.0,
        "MP": 4592187.0,
        "NW": 4072160.0,
        "NC": 1263875.0,
        "WC": 6844272.0,
        "UNKNOWN": None
    }

    # Population of the province on each row (missing for UNKNOWN).
    population = data['province'].map(prov_pops)
    for count_col in ('cum_no', 'daily_no'):
        share = data[count_col] / population * 100
        data[count_col + '_perc_pop'] = share.round(round_no)

    return data

## Confirmed Cases Per Prov
Daily change and cumulative

In [7]:
# Per-province confirmed cases, with a row for every calendar day.
confirmed_by_prov_timeline = get_cum_daily_by_prov(
    "https://raw.githubusercontent.com/dsfsi/covid19za/master/"
    "data/covid19za_provincial_cumulative_timeline_confirmed.csv",
    fill_date_gaps=True,
)
confirmed_by_prov_timeline

Unnamed: 0_level_0,province,cum_no,daily_no,cum_no_perc_pop,daily_no_perc_pop
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-03-05,EC,0,0,0.0,0.0
2020-03-06,EC,0,0,0.0,0.0
2020-03-07,EC,0,0,0.0,0.0
2020-03-08,EC,0,0,0.0,0.0
2020-03-09,EC,0,0,0.0,0.0
...,...,...,...,...,...
2020-05-28,UNKNOWN,0,0,,
2020-05-29,UNKNOWN,32,32,,
2020-05-30,UNKNOWN,33,1,,
2020-05-31,UNKNOWN,6,-27,,


**Save to csv**

In [54]:
# confirmed_by_prov_timeline.to_csv("data/confirmed_by_prov_timeline.csv")

In [55]:
# Preview the first five rows.
confirmed_by_prov_timeline.head(5)

Unnamed: 0_level_0,province,cum_no,daily_no,cum_no_perc_pop,daily_no_perc_pop
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-03-05,EC,0,0,0.0,0.0
2020-03-06,EC,0,0,0.0,0.0
2020-03-07,EC,0,0,0.0,0.0
2020-03-08,EC,0,0,0.0,0.0
2020-03-09,EC,0,0,0.0,0.0


In [27]:
# prov_pops = { # https://github.com/dsfsi/covid19za/blob/master/data/district_data/za_province_pop.csv
#         "EC":6712276.0,
#         "FS":2887465.0,
#         "GP":15176115.0,
#         "KZN":11289086.0,
#         "LP":5982584.0,
#         "MP":4592187.0,
#         "NW":4072160.0,
#         "NC":1263875.0,
#         "WC":6844272.0,
#         "UNKNOWN":None
#     }

# confirmed_by_prov_timeline['cum_no_perc_pop'] = confirmed_by_prov_timeline['province'].map(prov_pops)
# confirmed_by_prov_timeline['cum_no_perc_pop'] = confirmed_by_prov_timeline['cum_no_perc_pop']/ \
#                                                 confirmed_by_prov_timeline['cum_no']
# confirmed_by_prov_timeline['daily_no_perc_pop'] = confirmed_by_prov_timeline['province'].map(prov_pops)
# confirmed_by_prov_timeline['daily_no_perc_pop'] = confirmed_by_prov_timeline['daily_no_perc_pop']/ \
#                                                 confirmed_by_prov_timeline['daily_no']
# confirmed_by_prov_timeline.replace(np.inf, 0, inplace=True)
# confirmed_by_prov_timeline.head()

## Deaths Per Prov
Daily change and cumulative

In [56]:
# Per-province deaths; only the dates present in the source file are kept.
deaths_by_prov_timeline = get_cum_daily_by_prov(
    "https://raw.githubusercontent.com/dsfsi/covid19za/master/"
    "data/covid19za_provincial_cumulative_timeline_deaths.csv"
)
deaths_by_prov_timeline

Unnamed: 0_level_0,province,cum_no,daily_no,cum_no_perc_pop,daily_no_perc_pop
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-03-27,EC,0,0,0.0,0.0
2020-03-28,EC,0,0,0.0,0.0
2020-03-30,EC,0,0,0.0,0.0
2020-03-31,EC,0,0,0.0,0.0
2020-04-03,EC,0,0,0.0,0.0
...,...,...,...,...,...
2020-05-24,UNKNOWN,0,0,,
2020-05-25,UNKNOWN,0,0,,
2020-05-26,UNKNOWN,0,0,,
2020-05-27,UNKNOWN,0,0,,


**Save to csv**

In [29]:
# deaths_by_prov_timeline.to_csv("data/deaths_by_prov_timeline.csv")

## Recoveries Per Prov

In [61]:
# Per-province recoveries, with a row for every calendar day.
recoveries_by_prov_timeline = get_cum_daily_by_prov(
    "https://raw.githubusercontent.com/dsfsi/covid19za/master/"
    "data/covid19za_provincial_cumulative_timeline_recoveries.csv",
    fill_date_gaps=True,
)
recoveries_by_prov_timeline

Unnamed: 0_level_0,province,cum_no,daily_no,cum_no_perc_pop,daily_no_perc_pop
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-04-17,EC,9,9,0.000134,0.000134
2020-04-18,EC,9,0,0.000134,0.000000
2020-04-19,EC,9,0,0.000134,0.000000
2020-04-20,EC,15,6,0.000223,0.000089
2020-04-21,EC,15,0,0.000223,0.000000
...,...,...,...,...,...
2020-05-24,UNKNOWN,0,0,,
2020-05-25,UNKNOWN,0,0,,
2020-05-26,UNKNOWN,0,0,,
2020-05-27,UNKNOWN,0,0,,


**Save to csv**

In [31]:
# recoveries_by_prov_timeline.to_csv("data/recoveries_by_prov_timeline.csv")

## Tests Per Prov

In [5]:
# Per-province tests; only the dates present in the source file are kept.
tests_by_prov_timeline = get_cum_daily_by_prov(
    "https://raw.githubusercontent.com/dsfsi/covid19za/master/"
    "data/covid19za_provincial_cumulative_timeline_testing.csv",
    fill_date_gaps=False,
)
tests_by_prov_timeline

Unnamed: 0_level_0,province,cum_no,daily_no,cum_no_perc_pop,daily_no_perc_pop
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-04-27,EC,14054,14054,0.209,0.209
2020-05-03,EC,24835,10781,0.37,0.161
2020-05-09,EC,34032,9197,0.507,0.137
2020-05-13,EC,38122,4090,0.568,0.061
2020-05-20,EC,51888,13766,0.773,0.205
2020-05-28,EC,66013,14125,0.983,0.21
2020-04-27,FS,6317,6317,0.219,0.219
2020-05-03,FS,12634,6317,0.438,0.219
2020-05-09,FS,17231,4597,0.597,0.159
2020-05-13,FS,19265,2034,0.667,0.07


In [13]:
# Write the per-province testing timeline to the `data` folder.
tests_by_prov_timeline.to_csv(path_or_buf="data/tests_by_prov_timeline.csv")

# Total & Latest Change

In [7]:
def get_tot_latest_change(data_url):
    """Return each province's latest total and latest change.

    Parameters
    ----------
    data_url : str
        URL of a CSV with a ``date`` column (``dd-mm-YYYY``) followed by one
        cumulative column per province code plus ``UNKNOWN``. Rows with any
        missing value are discarded.

    Returns
    -------
    pandas.DataFrame
        Indexed by full province name, with ``total`` and ``latest_change``
        (``int32``; the last row of the cumulative data and its difference to
        the row before), plus ``total_perc_pop`` and
        ``latest_change_perc_pop`` (those counts as a percentage of the
        province population, 3 decimals, NaN for "Unknown").
    """
    cum_data = df_from_url(
        data_url,
        {"usecols": ['date', 'EC', 'FS', 'GP', 'KZN', 'LP', 'MP', 'NC', 'NW', 'WC', 'UNKNOWN']},
    )
    cum_data.dropna(inplace=True)
    cum_data['date'] = pd.to_datetime(cum_data['date'], format='%d-%m-%Y')

    province_names = {
        "EC": "Eastern Cape",
        "FS": "Free State",
        "GP": "Gauteng",
        "KZN": "KwaZulu-Natal",
        "LP": "Limpopo",
        "MP": "Mpumalanga",
        "NW": "North West",
        "NC": "Northern Cape",
        "WC": "Western Cape",
        "UNKNOWN": "Unknown"
    }

    prov_pops = {  # https://github.com/dsfsi/covid19za/blob/master/data/district_data/za_province_pop.csv
        "Eastern Cape": 6712276.0,
        "Free State": 2887465.0,
        "Gauteng": 15176115.0,
        "KwaZulu-Natal": 11289086.0,
        "Limpopo": 5982584.0,
        "Mpumalanga": 4592187.0,
        "North West": 4072160.0,
        "Northern Cape": 1263875.0,
        "Western Cape": 6844272.0,
        "Unknown": None
    }

    def _last_row_by_province(wide, value_name):
        # Keep the last row only and turn it into one row per province,
        # keyed by the province's full name.
        melted = wide.tail(1).melt(id_vars=['date'], var_name='province', value_name=value_name)
        melted['province'] = melted['province'].map(province_names)
        return melted.set_index('province')

    # Row-to-row differences of the cumulative counts (the first row keeps
    # its cumulative value). Column 0 is the date and is left untouched.
    daily_data = cum_data.copy()
    daily_data.iloc[1:, 1:] = daily_data.iloc[:, 1:].diff().iloc[1:]

    totals = _last_row_by_province(cum_data, 'total')
    changes = _last_row_by_province(daily_data, 'latest_change')

    data = pd.concat([totals, changes[['latest_change']]], axis=1)
    data = data.drop(columns=['date']).astype('int32')

    # `province` has to be a column while the populations are looked up.
    data = data.reset_index()
    population = data['province'].map(prov_pops)
    for count_col in ('total', 'latest_change'):
        data[count_col + '_perc_pop'] = (data[count_col] / population * 100).round(3)

    return data.set_index('province')

**Only get total data just for Testing**

In [6]:
def get_tot(data_url):
    """Return each province's latest cumulative total.

    Parameters
    ----------
    data_url : str
        URL of a CSV with a ``date`` column (``dd-mm-YYYY``) followed by one
        cumulative column per province code plus ``UNKNOWN``. Rows with any
        missing value are discarded.

    Returns
    -------
    pandas.DataFrame
        Indexed by full province name, with a single ``int32`` column
        ``total`` taken from the last row of the data.
    """
    # NOTE(review): an earlier comment here mentioned a temporary fix for a
    # misspelt UNKNOWN column; the list below uses the normal spelling.
    wanted = ['date', 'EC', 'FS', 'GP', 'KZN', 'LP', 'MP', 'NC', 'NW', 'WC', 'UNKNOWN']
    cum_data = df_from_url(data_url, {"usecols": wanted})
    cum_data.dropna(inplace=True)
    cum_data['date'] = pd.to_datetime(cum_data['date'], format='%d-%m-%Y')

    province_names = {
        "EC": "Eastern Cape",
        "FS": "Free State",
        "GP": "Gauteng",
        "KZN": "KwaZulu-Natal",
        "LP": "Limpopo",
        "MP": "Mpumalanga",
        "NW": "North West",
        "NC": "Northern Cape",
        "WC": "Western Cape",
        "UNKNOWN": "Unknown"
    }

    # Last row only, reshaped to one row per province.
    latest = cum_data.tail(1).melt(id_vars=['date'], var_name='province', value_name='total')
    latest['province'] = latest['province'].map(province_names)

    data = latest.set_index('province').drop(columns=['date'])
    return data.astype('int32')

## Deaths

In [34]:
# Latest death total and latest change per province.
deaths_by_prov_total = get_tot_latest_change(
    "https://raw.githubusercontent.com/dsfsi/covid19za/master/"
    "data/covid19za_provincial_cumulative_timeline_deaths.csv"
)
deaths_by_prov_total

Unnamed: 0_level_0,total,latest_change,total_perc_pop,latest_change_perc_pop
province,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Eastern Cape,77,7,,
Free State,8,2,,
Gauteng,31,0,,
KwaZulu-Natal,50,1,,
Limpopo,3,0,,
Mpumalanga,0,0,,
Northern Cape,1,0,,
North West,1,0,,
Western Cape,406,15,,
Unknown,0,0,,


**Save to csv**

In [35]:
# deaths_by_prov_total.to_csv('data/tot_deaths_provinces.csv')

## Confirmed

In [36]:
# Latest confirmed-case total and latest change per province.
confirmed_by_prov_total = get_tot_latest_change(
    "https://raw.githubusercontent.com/dsfsi/covid19za/master/"
    "data/covid19za_provincial_cumulative_timeline_confirmed.csv"
)
confirmed_by_prov_total

Unnamed: 0_level_0,total,latest_change,total_perc_pop,latest_change_perc_pop
province,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Eastern Cape,3306,259,,
Free State,225,4,,
Gauteng,3329,162,,
KwaZulu-Natal,2349,163,,
Limpopo,144,3,,
Mpumalanga,111,5,,
Northern Cape,51,3,,
North West,134,6,,
Western Cape,17754,861,,
Unknown,0,0,,


**Save to csv**

In [37]:
# confirmed_by_prov_total.to_csv('data/tot_provinces.csv')

## Tests

In [8]:
# Latest test total and latest change per province.
tests_by_prov_total = get_tot_latest_change(
    "https://raw.githubusercontent.com/dsfsi/covid19za/master/"
    "data/covid19za_provincial_cumulative_timeline_testing.csv"
)
tests_by_prov_total

Unnamed: 0_level_0,total,latest_change,total_perc_pop,latest_change_perc_pop
province,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Eastern Cape,66013,14125,0.983,0.21
Free State,34760,10345,1.204,0.358
Gauteng,215959,49565,1.423,0.327
KwaZulu-Natal,109643,23195,0.971,0.205
Limpopo,14365,3786,0.24,0.063
Mpumalanga,18629,3665,0.406,0.08
Northern Cape,6061,1285,0.48,0.102
North West,11214,2384,0.275,0.059
Western Cape,148174,35562,2.165,0.52
Unknown,30905,4950,,


**Save to csv**

In [39]:
# tests_by_prov_total.to_csv('data/tot_tests_provinces.csv')

## Recoveries

In [40]:
# Latest recovery total and latest change per province.
recoveries_by_prov_total = get_tot_latest_change(
    "https://raw.githubusercontent.com/dsfsi/covid19za/master/"
    "data/covid19za_provincial_cumulative_timeline_recoveries.csv"
)
recoveries_by_prov_total

Unnamed: 0_level_0,total,latest_change,total_perc_pop,latest_change_perc_pop
province,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Eastern Cape,1700,209,,
Free State,123,0,,
Gauteng,1993,38,,
KwaZulu-Natal,1180,0,,
Limpopo,75,8,,
Mpumalanga,67,5,,
Northern Cape,30,0,,
North West,45,6,,
Western Cape,9157,653,,
Unknown,0,0,,


**Save to csv**

In [41]:
# Write the per-province recovery totals to the `data` folder.
recoveries_by_prov_total.to_csv(path_or_buf='data/tot_recoveries_provinces.csv')

# Data to be added to text of website
___

In [42]:
def zero_space(num):
    """Format an integer with spaces as thousands separators.

    For example ``1234567`` becomes ``"1 234 567"``.
    """
    with_commas = "{:,d}".format(num)
    return " ".join(with_commas.split(","))

In [43]:
# Display the test counts that the headline figures below are read from.
tests_data

Unnamed: 0_level_0,cum_no,daily_no
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-02-11,61,61
2020-02-13,67,6
2020-02-14,71,4
2020-02-19,95,24
2020-02-20,106,11
...,...,...
2020-05-24,583855,19485
2020-05-25,596777,12922
2020-05-26,605991,9214
2020-05-27,634996,29005


In [44]:
# Latest cumulative test count and latest daily change, formatted for display.
tot_tested = zero_space(int(tests_data['cum_no'].iloc[-1]))
change_tested = zero_space(int(tests_data['daily_no'].iloc[-1]))
print(", ".join([tot_tested, change_tested]))

655 723, 20 727


In [45]:
# Latest cumulative confirmed count and latest daily change, formatted for display.
tot_infected = zero_space(int(confirmed_data['cum_no'].iloc[-1]))
change_infected = zero_space(int(confirmed_data['daily_no'].iloc[-1]))
print(", ".join([tot_infected, change_infected]))

27 403, 1 466


In [46]:
# Latest cumulative death count and latest daily change, formatted for display.
tot_deaths = zero_space(int(deaths_data['cum_no'].iloc[-1]))
change_deaths = zero_space(int(deaths_data['daily_no'].iloc[-1]))
print(", ".join([tot_deaths, change_deaths]))

577, 25


In [47]:
# Latest cumulative recovery count and latest daily change, formatted for display.
tot_recoveries = zero_space(int(recovered_data['cum_no'].iloc[-1]))
change_recoveries = zero_space(int(recovered_data['daily_no'].iloc[-1]))
print(", ".join([tot_recoveries, change_recoveries]))

14 370, 919


In [48]:
from datetime import datetime
# Moment this cell ran, rendered as "HH:MM DD Month YYYY".
now = datetime.now()
current_time = datetime.strftime(now, "%H:%M %d %B %Y")
print("Current Time =", current_time)

Current Time = 21:15 29 May 2020


In [49]:
# Single-row table holding the headline figures and the update timestamp.
gen_data = pd.DataFrame({
    "tot_infected": [tot_infected],
    "change_infected": [change_infected],
    "tot_deaths": [tot_deaths],
    "change_deaths": [change_deaths],
    "tot_tested": [tot_tested],
    "change_tested": [change_tested],
    "tot_recoveries": [tot_recoveries],
    "change_recoveries": [change_recoveries],
    "datetime_updated": [current_time],
})
gen_data

Unnamed: 0,tot_infected,change_infected,tot_deaths,change_deaths,tot_tested,change_tested,tot_recoveries,change_recoveries,datetime_updated
0,27 403,1 466,577,25,655 723,20 727,14 370,919,21:15 29 May 2020


In [50]:
# gen_data.to_csv("data/gen_data.csv", index=False)

### Render Template

In [51]:
# import template_renderer as tr
# tr.render_all()