# Data Preprocessing - Neater Implementation
___

In [1]:
# Enable Intellisense
%config IPCompleter.greedy=True

import pandas as pd
import numpy as np
import io
import requests
import seaborn as sns
from matplotlib import pyplot, dates
import plotly.express as px 
import datetime
import plotly.graph_objects as go
import plotly.io as pio
# Apply seaborn's default plot styling.
sns.set()

# Silence pandas' chained-assignment (SettingWithCopy) warning.
# NOTE(review): this can also hide genuine chained-assignment mistakes.
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
# ['EC', 'FS', 'GP', 'KZN', 'LP', 'MP', 'NC', 'NW', 'WC']

## General Helper Functions

In [3]:
def df_from_url(df_url, pd_kwargs=None):
    """Download a CSV file over HTTP and parse it into a DataFrame.

    Parameters
    ----------
    df_url : str
        URL of the CSV file to download.
    pd_kwargs : dict, optional
        Extra keyword arguments forwarded to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the downloaded file.

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx/5xx status code.
    """
    # ``None`` instead of a shared mutable ``{}`` default.
    if pd_kwargs is None:
        pd_kwargs = {}
    response = requests.get(df_url)
    # Fail loudly rather than trying to parse an error page as CSV.
    response.raise_for_status()
    csv_text = response.content.decode('utf-8')
    return pd.read_csv(io.StringIO(csv_text), **pd_kwargs)

In [4]:
# Quick check of the helper: fetch only the date and national total columns.
tmp_kwargs = dict(usecols=['date', 'total'])
df_from_url(
    "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_confirmed.csv",
    pd_kwargs=tmp_kwargs,
).head()

Unnamed: 0,date,total
0,05-03-2020,1
1,07-03-2020,2
2,08-03-2020,3
3,09-03-2020,7
4,11-03-2020,13


# Cumulative/Daily Totals Per Day
Applicable to any cumulative series with a date column; used below for confirmed cases, deaths, tests and recoveries.

In [5]:
def get_cum_daily(data_url, cum_col='total', index_col='date'):
    """Build a table of cumulative and daily counts from a cumulative CSV.

    Parameters
    ----------
    data_url : str
        URL of a CSV file holding a cumulative time series.
    cum_col : str, default 'total'
        Name of the column holding the cumulative count.
    index_col : str, default 'date'
        Name of the date column; values must be formatted ``dd-mm-YYYY``.

    Returns
    -------
    pandas.DataFrame
        Indexed by the parsed dates, with int32 columns ``cum_no`` (the
        forward-filled cumulative count) and ``daily_no`` (change since the
        previous row; the first row keeps its cumulative value).
    """
    data = df_from_url(data_url, {"usecols": [cum_col, index_col]})
    data[index_col] = pd.to_datetime(data[index_col], format='%d-%m-%Y')
    data = data.set_index(index_col).rename(columns={cum_col: "cum_no"})
    # A missing cumulative reading carries the last known value forward.
    data = data.ffill()

    # diff() leaves NaN in the first row, which has no predecessor; the daily
    # figure there should equal the starting cumulative value. The column is
    # built as a whole Series instead of through chained
    # ``data[col][1:] = ...`` assignment, which is not guaranteed to write
    # back into ``data``.
    daily = data['cum_no'].diff()
    if len(daily):
        daily.iloc[0] = data['cum_no'].iloc[0]
    data['daily_no'] = daily

    # Cast both columns to integer.
    return data.astype('int32')

## Confirmed Cases

In [6]:
# National cumulative and daily confirmed cases.
confirmed_cases_url = (
    "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/"
    "covid19za_provincial_cumulative_timeline_confirmed.csv"
)
confirmed_data = get_cum_daily(data_url=confirmed_cases_url)
confirmed_data

Unnamed: 0_level_0,cum_no,daily_no
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-05,1,1
2020-03-07,2,1
2020-03-08,3,1
2020-03-09,7,4
2020-03-11,13,6
...,...,...
2020-05-18,16433,918
2020-05-19,17200,767
2020-05-20,18003,803
2020-05-21,19137,1134


## Deaths

In [7]:
# National cumulative and daily deaths.
deaths_url = (
    "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/"
    "covid19za_provincial_cumulative_timeline_deaths.csv"
)
deaths_data = get_cum_daily(data_url=deaths_url)
deaths_data

Unnamed: 0_level_0,cum_no,daily_no
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-27,1,1
2020-03-28,2,1
2020-03-30,3,1
2020-03-31,5,2
2020-04-03,9,4
2020-04-05,11,2
2020-04-06,12,1
2020-04-07,13,1
2020-04-08,18,5
2020-04-09,18,0


## Tests

In [8]:
# National cumulative and daily tests, read from the testing timeline.
tests_url = (
    "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/"
    "covid19za_timeline_testing.csv"
)
tests_data = get_cum_daily(tests_url, cum_col='cumulative_tests', index_col='date')
tests_data

Unnamed: 0_level_0,cum_no,daily_no
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-02-11,61,61
2020-02-13,67,6
2020-02-14,71,4
2020-02-19,95,24
2020-02-20,106,11
...,...,...
2020-05-18,475071,14198
2020-05-19,488609,13538
2020-05-20,505861,17252
2020-05-21,525433,19572


## Recoveries

In [9]:
# Recoveries come from the `recovered` column of the same testing timeline.
tests_url = (
    "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/"
    "covid19za_timeline_testing.csv"
)
recovered_data = get_cum_daily(tests_url, cum_col='recovered', index_col='date')
recovered_data

Unnamed: 0_level_0,cum_no,daily_no
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-02-11,0,0
2020-02-13,0,0
2020-02-14,0,0
2020-02-19,0,0
2020-02-20,0,0
...,...,...
2020-05-18,7298,292
2020-05-19,7960,662
2020-05-20,8950,990
2020-05-21,8950,0


## Active Cases
Cases that have not yet had an outcome, i.e. confirmed cases - recovered - deaths.

In [10]:
# Line up the cumulative confirmed, recovered and death counts on one shared
# date index (union of the three series' dates).
active_data = pd.concat(
    [
        confirmed_data[['cum_no']].rename(columns={"cum_no": "confirmed"}),
        recovered_data[['cum_no']].rename(columns={"cum_no": "recovered"}),
        deaths_data[['cum_no']].rename(columns={"cum_no": "deaths"}),
    ],
    axis=1,
)
# Make the chronological order explicit: the forward-fill below relies on it.
active_data = active_data.sort_index()
# Keep only the dates from the first confirmed case onwards. This replaces a
# hard-coded `iloc[9:]`, which silently depended on how many earlier rows the
# recoveries series happened to contain.
active_data = active_data[active_data.index >= confirmed_data.index.min()]
# Carry the last known cumulative value over gaps; a series that has not
# started yet counts as 0.
active_data = active_data.ffill().fillna(0)
active_data

Unnamed: 0_level_0,confirmed,recovered,deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-03-05,1.0,0.0,0.0
2020-03-06,1.0,0.0,0.0
2020-03-07,2.0,0.0,0.0
2020-03-08,3.0,0.0,0.0
2020-03-09,7.0,0.0,0.0
...,...,...,...
2020-05-18,16433.0,7298.0,286.0
2020-05-19,17200.0,7960.0,312.0
2020-05-20,18003.0,8950.0,339.0
2020-05-21,19137.0,8950.0,369.0


In [11]:
# Active cases = confirmed cases that have neither recovered nor died.
active_data['cum_no'] = active_data['confirmed'] - active_data['recovered'] - active_data['deaths']
active_data = active_data.drop(columns=['confirmed', 'recovered', 'deaths'])

# Daily change is the difference from the previous row; the first row has no
# predecessor, so it keeps its cumulative value. The column is built as a
# whole Series rather than through chained `df[col].iloc[1:] = ...`
# assignment, which is not guaranteed to write back into the frame.
daily_change = active_data['cum_no'].diff()
if len(daily_change):
    daily_change.iloc[0] = active_data['cum_no'].iloc[0]
active_data['daily_no'] = daily_change

active_data = active_data.astype('int32')
active_data

Unnamed: 0_level_0,cum_no,daily_no
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-05,1,1
2020-03-06,1,0
2020-03-07,2,1
2020-03-08,3,1
2020-03-09,7,4
...,...,...
2020-05-18,8849,604
2020-05-19,8928,79
2020-05-20,8714,-214
2020-05-21,9818,1104


## All Cumulative/Totals Per Day

In [12]:
# Gather every cumulative series into one table, one column per metric.
all_cum_data = pd.concat(
    [
        confirmed_data[['cum_no']].rename(columns={"cum_no": "confirmed"}),
        tests_data[['cum_no']].rename(columns={"cum_no": "tests"}),
        deaths_data[['cum_no']].rename(columns={"cum_no": "deaths"}),
        recovered_data[['cum_no']].rename(columns={"cum_no": "recovered"}),
        active_data[['cum_no']].rename(columns={"cum_no": "active"}),
    ],
    axis=1,
)
# Carry cumulative values over missing dates, then treat anything still
# missing (dates before a series starts) as 0.
all_cum_data.ffill(inplace=True)
all_cum_data.fillna(0, inplace=True)
all_cum_data = all_cum_data.astype('int32')
all_cum_data

Unnamed: 0_level_0,confirmed,tests,deaths,recovered,active
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-02-11,0,61,0,0,0
2020-02-13,0,67,0,0,0
2020-02-14,0,71,0,0,0
2020-02-19,0,95,0,0,0
2020-02-20,0,106,0,0,0
...,...,...,...,...,...
2020-05-18,16433,475071,286,7298,8849
2020-05-19,17200,488609,312,7960,8928
2020-05-20,18003,505861,339,8950,8714
2020-05-21,19137,525433,369,8950,9818


## Derived Stats
Added to cumulative/totals per day `all_cum_data`
### Confirmed divided by Tests

In [13]:
# Share of tests that came back confirmed, to three decimals.
all_cum_data['confirmed_div_by_tests'] = (
    all_cum_data['confirmed'].div(all_cum_data['tests']).round(3)
)
all_cum_data

Unnamed: 0_level_0,confirmed,tests,deaths,recovered,active,confirmed_div_by_tests
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-02-11,0,61,0,0,0,0.000
2020-02-13,0,67,0,0,0,0.000
2020-02-14,0,71,0,0,0,0.000
2020-02-19,0,95,0,0,0,0.000
2020-02-20,0,106,0,0,0,0.000
...,...,...,...,...,...,...
2020-05-18,16433,475071,286,7298,8849,0.035
2020-05-19,17200,488609,312,7960,8928,0.035
2020-05-20,18003,505861,339,8950,8714,0.036
2020-05-21,19137,525433,369,8950,9818,0.036


### Deaths divided by confirmed

In [14]:
# Share of confirmed cases that ended in death, to three decimals.
all_cum_data['deaths_div_by_confirmed'] = (
    all_cum_data['deaths'].div(all_cum_data['confirmed']).round(3)
)
# 0/0 (no confirmed cases yet) yields NaN; report those rows as 0.
all_cum_data.fillna(0.000, inplace=True)
all_cum_data

Unnamed: 0_level_0,confirmed,tests,deaths,recovered,active,confirmed_div_by_tests,deaths_div_by_confirmed
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-02-11,0,61,0,0,0,0.000,0.000
2020-02-13,0,67,0,0,0,0.000,0.000
2020-02-14,0,71,0,0,0,0.000,0.000
2020-02-19,0,95,0,0,0,0.000,0.000
2020-02-20,0,106,0,0,0,0.000,0.000
...,...,...,...,...,...,...,...
2020-05-18,16433,475071,286,7298,8849,0.035,0.017
2020-05-19,17200,488609,312,7960,8928,0.035,0.018
2020-05-20,18003,505861,339,8950,8714,0.036,0.019
2020-05-21,19137,525433,369,8950,9818,0.036,0.019


### Recovered divided by confirmed

In [15]:
# Share of confirmed cases that have recovered, to three decimals.
all_cum_data['recovered_div_by_confirmed'] = (
    all_cum_data['recovered'].div(all_cum_data['confirmed']).round(3)
)
# 0/0 (no confirmed cases yet) yields NaN; report those rows as 0.
all_cum_data.fillna(0.000, inplace=True)
all_cum_data

Unnamed: 0_level_0,confirmed,tests,deaths,recovered,active,confirmed_div_by_tests,deaths_div_by_confirmed,recovered_div_by_confirmed
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-02-11,0,61,0,0,0,0.000,0.000,0.000
2020-02-13,0,67,0,0,0,0.000,0.000,0.000
2020-02-14,0,71,0,0,0,0.000,0.000,0.000
2020-02-19,0,95,0,0,0,0.000,0.000,0.000
2020-02-20,0,106,0,0,0,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...
2020-05-18,16433,475071,286,7298,8849,0.035,0.017,0.444
2020-05-19,17200,488609,312,7960,8928,0.035,0.018,0.463
2020-05-20,18003,505861,339,8950,8714,0.036,0.019,0.497
2020-05-21,19137,525433,369,8950,9818,0.036,0.019,0.468


### Stats Per Million Population
[Source for stats](https://worldpopulationreview.com/countries/south-africa-population/)

In [16]:
# Total population of South Africa.
sa_tot_population = 59195720
# The same figure expressed in millions (not rounded).
sa_tot_pop_mil = sa_tot_population / 10 ** 6
sa_tot_pop_mil

59.19572

In [17]:
# Express every cumulative metric per million inhabitants, to two decimals.
tmp_cols = ['confirmed_per_mil','tests_per_mil','deaths_per_mil','recovered_per_mil','active_per_mil']
for source_col, per_mil_col in zip(['confirmed', 'tests', 'deaths', 'recovered', 'active'], tmp_cols):
    all_cum_data[per_mil_col] = (all_cum_data[source_col] / sa_tot_pop_mil).round(2)
all_cum_data.fillna(0.00, inplace=True)
all_cum_data

Unnamed: 0_level_0,confirmed,tests,deaths,recovered,active,confirmed_div_by_tests,deaths_div_by_confirmed,recovered_div_by_confirmed,confirmed_per_mil,tests_per_mil,deaths_per_mil,recovered_per_mil,active_per_mil
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2020-02-11,0,61,0,0,0,0.000,0.000,0.000,0.00,1.03,0.00,0.00,0.00
2020-02-13,0,67,0,0,0,0.000,0.000,0.000,0.00,1.13,0.00,0.00,0.00
2020-02-14,0,71,0,0,0,0.000,0.000,0.000,0.00,1.20,0.00,0.00,0.00
2020-02-19,0,95,0,0,0,0.000,0.000,0.000,0.00,1.60,0.00,0.00,0.00
2020-02-20,0,106,0,0,0,0.000,0.000,0.000,0.00,1.79,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-05-18,16433,475071,286,7298,8849,0.035,0.017,0.444,277.60,8025.43,4.83,123.29,149.49
2020-05-19,17200,488609,312,7960,8928,0.035,0.018,0.463,290.56,8254.13,5.27,134.47,150.82
2020-05-20,18003,505861,339,8950,8714,0.036,0.019,0.497,304.13,8545.57,5.73,151.19,147.21
2020-05-21,19137,525433,369,8950,9818,0.036,0.019,0.468,323.28,8876.20,6.23,151.19,165.86


**Save to csv**

In [18]:
# all_cum_data.to_csv('data/all_cum_data.csv')

## All Daily Change Data Per Day

In [19]:
# Gather every daily-change series into one table, one column per metric.
all_daily_data = pd.concat(
    [
        confirmed_data[['daily_no']].rename(columns={"daily_no": "confirmed"}),
        tests_data[['daily_no']].rename(columns={"daily_no": "tests"}),
        deaths_data[['daily_no']].rename(columns={"daily_no": "deaths"}),
        recovered_data[['daily_no']].rename(columns={"daily_no": "recovered"}),
        active_data[['daily_no']].rename(columns={"daily_no": "active"}),
    ],
    axis=1,
)
# A date that a series has no row for contributes no change of its own: the
# next row's difference already spans the whole gap. Such dates are therefore
# filled with 0. Forward-filling, which is right for cumulative values, would
# repeat the previous increment here and double-count it.
all_daily_data = all_daily_data.fillna(0).astype('int32')
all_daily_data

Unnamed: 0_level_0,confirmed,tests,deaths,recovered,active
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-02-11,0,61,0,0,0
2020-02-13,0,6,0,0,0
2020-02-14,0,4,0,0,0
2020-02-19,0,24,0,0,0
2020-02-20,0,11,0,0,0
...,...,...,...,...,...
2020-05-18,918,14198,22,292,604
2020-05-19,767,13538,26,662,79
2020-05-20,803,17252,27,990,-214
2020-05-21,1134,19572,30,0,1104


**Save to csv**

In [20]:
# all_daily_data.to_csv("data/all_daily_data.csv")

### Change format of date

In [21]:
# Separate copy of the cumulative table with the date index turned into an
# ordinary column.
all_cum_data_alt = all_cum_data.reset_index()

# Per Province
___

# Daily/Cumulative Data Per Prov Per Day

In [22]:
# Generator method to get all dates in specified interval
from datetime import timedelta, datetime
def datetime_range(start_datetime, end_datetime):
    """Yield datetimes one day apart, from ``start_datetime`` to ``end_datetime``.

    ``start_datetime`` is always the first value. ``end_datetime`` is included
    when it lies a whole number of days after the start. No value later than
    ``end_datetime`` is yielded, and nothing is yielded when the start is
    after the end.

    Parameters
    ----------
    start_datetime : datetime.datetime
        First value of the range.
    end_datetime : datetime.datetime
        Inclusive upper bound of the range.

    Yields
    ------
    datetime.datetime
        Successive values, one day apart.
    """
    curr_date = start_datetime
    # Test before yielding so the range can never overshoot the end bound.
    while curr_date <= end_datetime:
        yield curr_date
        curr_date += timedelta(days=1)
        
# Example: the ten days from 1 to 10 August 2020, both ends included.
list(datetime_range(start_datetime=datetime(2020, 8, 1), end_datetime=datetime(2020, 8, 10)))

[datetime.datetime(2020, 8, 1, 0, 0),
 datetime.datetime(2020, 8, 2, 0, 0),
 datetime.datetime(2020, 8, 3, 0, 0),
 datetime.datetime(2020, 8, 4, 0, 0),
 datetime.datetime(2020, 8, 5, 0, 0),
 datetime.datetime(2020, 8, 6, 0, 0),
 datetime.datetime(2020, 8, 7, 0, 0),
 datetime.datetime(2020, 8, 8, 0, 0),
 datetime.datetime(2020, 8, 9, 0, 0),
 datetime.datetime(2020, 8, 10, 0, 0)]

In [23]:
def get_cum_daily_by_prov(data_url, fill_date_gaps=False):
    """Build a long-format table of cumulative and daily counts per province.

    Parameters
    ----------
    data_url : str
        URL of a CSV file with a ``date`` column (``dd-mm-YYYY``) and one
        cumulative-count column per province code, plus ``UNKNOWN``.
    fill_date_gaps : bool, default False
        If True, add a row for every calendar day between the first and last
        row of the file and forward-fill the cumulative counts into the
        added rows.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``date``, one row per (date, province), with columns
        ``province``, ``cum_no`` and ``daily_no``. The two count columns are
        int32. The first date's ``daily_no`` equals its ``cum_no``.
    """
    cols = ['date','EC', 'FS', 'GP', 'KZN', 'LP', 'MP', 'NC', 'NW','WC', 'UNKNOWN']
    cum_data = df_from_url(data_url, {"usecols": cols})
    cum_data['date'] = pd.to_datetime(cum_data['date'], format='%d-%m-%Y')
    cum_data = cum_data.set_index('date')

    if fill_date_gaps and len(cum_data):
        date_range = list(datetime_range(cum_data.index[0], cum_data.index[-1]))
        # Reindexing with a plain list keeps the index name ('date').
        cum_data = cum_data.reindex(date_range).ffill()

    # Change since the previous row; the first row has no predecessor, so it
    # keeps its cumulative values.
    daily_data = cum_data.diff()
    if len(daily_data):
        daily_data.iloc[0] = cum_data.iloc[0]

    cum_data_melt = cum_data.reset_index().melt(
        id_vars=['date'], var_name='province', value_name='cum_no')
    daily_data_melt = daily_data.reset_index().melt(
        id_vars=['date'], var_name='province', value_name='daily_no')

    # Both frames were melted from identically shaped inputs, so their rows
    # line up by position. Copying the column positionally avoids aligning
    # on the date index, which repeats once per province.
    data = cum_data_melt.set_index('date')
    data['daily_no'] = daily_data_melt['daily_no'].to_numpy()
    # NOTE(review): values still missing at this point make the cast fail.
    data = data.astype({'cum_no': 'int32', 'daily_no': 'int32'})

    return data

## Confirmed Cases Per Prov
Daily change and cumulative

In [24]:
# Per-province confirmed cases, with missing calendar days filled in.
confirmed_by_prov_timeline = get_cum_daily_by_prov(
    data_url=(
        "https://raw.githubusercontent.com/dsfsi/covid19za/master/"
        "data/covid19za_provincial_cumulative_timeline_confirmed.csv"
    ),
    fill_date_gaps=True,
)
confirmed_by_prov_timeline

Unnamed: 0_level_0,province,cum_no,daily_no
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-03-05,EC,0,0
2020-03-06,EC,0,0
2020-03-07,EC,0,0
2020-03-08,EC,0,0
2020-03-09,EC,0,0
...,...,...,...
2020-05-18,UNKNOWN,0,0
2020-05-19,UNKNOWN,0,0
2020-05-20,UNKNOWN,0,0
2020-05-21,UNKNOWN,0,0


**Save to csv**

In [25]:
# Write the per-province confirmed timeline to disk (date index included).
confirmed_by_prov_timeline.to_csv(path_or_buf="data/confirmed_by_prov_timeline.csv")

## Deaths Per Prov
Daily change and cumulative

In [26]:
# Per-province deaths.
# NOTE(review): unlike the confirmed and recoveries timelines, date gaps are
# not filled here - confirm this is intended.
deaths_by_prov_timeline = get_cum_daily_by_prov(
    data_url=(
        "https://raw.githubusercontent.com/dsfsi/covid19za/master/"
        "data/covid19za_provincial_cumulative_timeline_deaths.csv"
    )
)
deaths_by_prov_timeline

Unnamed: 0_level_0,province,cum_no,daily_no
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-03-27,EC,0,0
2020-03-28,EC,0,0
2020-03-30,EC,0,0
2020-03-31,EC,0,0
2020-04-03,EC,0,0
...,...,...,...
2020-05-18,UNKNOWN,0,0
2020-05-19,UNKNOWN,0,0
2020-05-20,UNKNOWN,0,0
2020-05-21,UNKNOWN,0,0


**Save to csv**

In [27]:
# deaths_by_prov_timeline.to_csv("data/deaths_by_prov_timeline.csv")

## Recoveries Per Prov

In [28]:
# Per-province recoveries, with missing calendar days filled in.
recoveries_by_prov_timeline = get_cum_daily_by_prov(
    data_url=(
        "https://raw.githubusercontent.com/dsfsi/covid19za/master/"
        "data/covid19za_provincial_cumulative_timeline_recoveries.csv"
    ),
    fill_date_gaps=True,
)
recoveries_by_prov_timeline

Unnamed: 0_level_0,province,cum_no,daily_no
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-04-17,EC,9,9
2020-04-18,EC,9,0
2020-04-19,EC,9,0
2020-04-20,EC,15,6
2020-04-21,EC,15,0
...,...,...,...
2020-05-18,UNKNOWN,0,0
2020-05-19,UNKNOWN,0,0
2020-05-20,UNKNOWN,0,0
2020-05-21,UNKNOWN,0,0


**Save to csv**

In [29]:
# Write the per-province recoveries timeline to disk (date index included).
recoveries_by_prov_timeline.to_csv(path_or_buf="data/recoveries_by_prov_timeline.csv")

# Total & Latest Change

In [30]:
def get_tot_latest_change(data_url):
    """Return each province's latest total and its most recent change.

    Rows containing any missing value are dropped before the totals and
    changes are taken.

    Parameters
    ----------
    data_url : str
        URL of a CSV file with a ``date`` column (``dd-mm-YYYY``) and one
        cumulative-count column per province code, plus ``UNKNOWN``.

    Returns
    -------
    pandas.DataFrame
        Indexed by full province name, with int32 columns ``total`` (value
        in the last row) and ``latest_change`` (last row minus the row
        before it).
    """
    province_names = {
        "EC": "Eastern Cape",
        "FS": "Free State",
        "GP": "Gauteng",
        "KZN": "KwaZulu-Natal",
        "LP": "Limpopo",
        "MP": "Mpumalanga",
        "NW": "North West",
        "NC": "Northern Cape",
        "WC": "Western Cape",
        "UNKNOWN": "Unknown",
    }

    def _last_row_by_province(frame, value_name):
        # Melt the final row into one record per province, keyed by full name.
        melted = frame.tail(1).melt(id_vars=['date'], var_name='province', value_name=value_name)
        melted['province'] = melted['province'].map(province_names)
        return melted.set_index(['province'])

    wanted = ['date', 'EC', 'FS', 'GP', 'KZN', 'LP', 'MP', 'NC', 'NW', 'WC', 'UNKNOWN']
    totals = df_from_url(data_url, {"usecols": wanted})
    totals.dropna(inplace=True)
    totals['date'] = pd.to_datetime(totals['date'], format='%d-%m-%Y')

    # Row-to-row differences; the first row keeps its cumulative values.
    changes = totals.copy()
    changes.iloc[1:, 1:] = changes.iloc[:, 1:].diff().iloc[1:]

    latest_totals = _last_row_by_province(totals, 'total')
    latest_changes = _last_row_by_province(changes, 'latest_change')

    data = pd.concat([latest_totals, latest_changes[['latest_change']]], axis=1)
    return data.drop(['date'], axis=1).astype('int32')

**Get only the totals (used for the testing data)**

In [31]:
def get_tot(data_url):
    """Return each province's latest cumulative total.

    Rows containing any missing value are dropped before the last row is
    taken.

    Parameters
    ----------
    data_url : str
        URL of a CSV file with a ``date`` column (``dd-mm-YYYY``) and one
        cumulative-count column per province code, plus ``UNKNOWN``.

    Returns
    -------
    pandas.DataFrame
        Indexed by full province name, with a single int32 column ``total``.
    """
    province_names = {
        "EC": "Eastern Cape",
        "FS": "Free State",
        "GP": "Gauteng",
        "KZN": "KwaZulu-Natal",
        "LP": "Limpopo",
        "MP": "Mpumalanga",
        "NW": "North West",
        "NC": "Northern Cape",
        "WC": "Western Cape",
        "UNKNOWN": "Unknown",
    }

    wanted = ['date', 'EC', 'FS', 'GP', 'KZN', 'LP', 'MP', 'NC', 'NW', 'WC', 'UNKNOWN']
    totals = df_from_url(data_url, {"usecols": wanted})
    totals.dropna(inplace=True)
    totals['date'] = pd.to_datetime(totals['date'], format='%d-%m-%Y')

    # Melt the final row into one record per province, keyed by full name.
    latest = totals.tail(1).melt(id_vars=['date'], var_name='province', value_name='total')
    latest['province'] = latest['province'].map(province_names)

    data = latest.set_index(['province']).drop(['date'], axis=1)
    return data.astype('int32')

## Deaths

In [32]:
# Latest death total and most recent change for each province.
deaths_by_prov_total = get_tot_latest_change(
    data_url=(
        "https://raw.githubusercontent.com/dsfsi/covid19za/master/"
        "data/covid19za_provincial_cumulative_timeline_deaths.csv"
    )
)
deaths_by_prov_total

Unnamed: 0_level_0,total,latest_change
province,Unnamed: 1_level_1,Unnamed: 2_level_1
Eastern Cape,53,3
Free State,6,0
Gauteng,29,2
KwaZulu-Natal,47,1
Limpopo,3,0
Mpumalanga,0,0
Northern Cape,1,0
North West,1,0
Western Cape,257,22
Unknown,0,0


**Save to csv**

In [33]:
# deaths_by_prov_total.to_csv('data/tot_deaths_provinces.csv')

## Confirmed

In [34]:
# Latest confirmed-case total and most recent change for each province.
confirmed_by_prov_total = get_tot_latest_change(
    data_url=(
        "https://raw.githubusercontent.com/dsfsi/covid19za/master/"
        "data/covid19za_provincial_cumulative_timeline_confirmed.csv"
    )
)
confirmed_by_prov_total

Unnamed: 0_level_0,total,latest_change
province,Unnamed: 1_level_1,Unnamed: 2_level_1
Eastern Cape,2459,135
Free State,185,1
Gauteng,2521,68
KwaZulu-Natal,1735,42
Limpopo,120,-1
Mpumalanga,98,3
Northern Cape,39,2
North West,80,3
Western Cape,12888,735
Unknown,0,0


**Save to csv**

In [35]:
# confirmed_by_prov_total.to_csv('data/tot_provinces.csv')

## Tests

In [36]:
# Latest test total for each province (totals only, no latest change).
tests_by_prov_total = get_tot(
    data_url=(
        "https://raw.githubusercontent.com/dsfsi/covid19za/master/"
        "data/covid19za_provincial_cumulative_timeline_testing.csv"
    )
)
tests_by_prov_total

Unnamed: 0_level_0,total
province,Unnamed: 1_level_1
Eastern Cape,51888
Free State,24415
Gauteng,166394
KwaZulu-Natal,86448
Limpopo,10579
Mpumalanga,14964
Northern Cape,4776
North West,8830
Western Cape,112612
Unknown,25955


**Save to csv**

In [50]:
# Write the per-province test totals to disk (province index included).
tests_by_prov_total.to_csv(path_or_buf='data/tot_tests_provinces.csv')

## Recoveries

In [38]:
# Latest recoveries total and most recent change for each province.
recoveries_by_prov_total = get_tot_latest_change(
    data_url=(
        "https://raw.githubusercontent.com/dsfsi/covid19za/master/"
        "data/covid19za_provincial_cumulative_timeline_recoveries.csv"
    )
)
recoveries_by_prov_total

Unnamed: 0_level_0,total,latest_change
province,Unnamed: 1_level_1,Unnamed: 2_level_1
Eastern Cape,1036,0
Free State,121,3
Gauteng,1776,82
KwaZulu-Natal,880,38
Limpopo,51,9
Mpumalanga,58,0
Northern Cape,27,0
North West,29,1
Western Cape,6126,1021
Unknown,0,0


**Save to csv**

In [39]:
# Write the per-province recoveries totals to disk (province index included).
recoveries_by_prov_total.to_csv(path_or_buf='data/tot_recoveries_provinces.csv')

# Data to be added to text of website
___

In [40]:
def zero_space(num):
    """Format an integer with a space as the thousands separator.

    For example, ``1234567`` becomes ``"1 234 567"``.
    """
    with_commas = f"{num:,d}"
    return with_commas.replace(",", " ")

In [41]:
# Display the national testing table before pulling the headline numbers from it.
tests_data

Unnamed: 0_level_0,cum_no,daily_no
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-02-11,61,61
2020-02-13,67,6
2020-02-14,71,4
2020-02-19,95,24
2020-02-20,106,11
...,...,...
2020-05-18,475071,14198
2020-05-19,488609,13538
2020-05-20,505861,17252
2020-05-21,525433,19572


In [42]:
# Headline numbers for tests: latest cumulative total and latest daily change.
tot_tested = zero_space(int(tests_data['cum_no'].iloc[-1]))
change_tested = zero_space(int(tests_data['daily_no'].iloc[-1]))
print(f"{tot_tested}, {change_tested}")

543 032, 17 599


In [43]:
# Headline numbers for confirmed cases: latest cumulative total and latest daily change.
tot_infected = zero_space(int(confirmed_data['cum_no'].iloc[-1]))
change_infected = zero_space(int(confirmed_data['daily_no'].iloc[-1]))
print(f"{tot_infected}, {change_infected}")

20 125, 988


In [44]:
# Headline numbers for deaths: latest cumulative total and latest daily change.
tot_deaths = zero_space(int(deaths_data['cum_no'].iloc[-1]))
change_deaths = zero_space(int(deaths_data['daily_no'].iloc[-1]))
print(f"{tot_deaths}, {change_deaths}")

397, 28


In [45]:
# Headline numbers for recoveries: latest cumulative total and latest daily change.
tot_recoveries = zero_space(int(recovered_data['cum_no'].iloc[-1]))
change_recoveries = zero_space(int(recovered_data['daily_no'].iloc[-1]))
print(f"{tot_recoveries}, {change_recoveries}")

10 104, 1 154


In [46]:
from datetime import datetime

# Timestamp of this run (local time, no time zone), e.g. "14:53 23 May 2020".
now = datetime.now()
current_time = f"{now:%H:%M %d %B %Y}"
print("Current Time =", current_time)

Current Time = 14:53 23 May 2020


In [47]:
# One-row table holding the pre-formatted headline numbers and the update time.
gen_data = pd.DataFrame({
    "tot_infected": [tot_infected],
    "change_infected": [change_infected],
    "tot_deaths": [tot_deaths],
    "change_deaths": [change_deaths],
    "tot_tested": [tot_tested],
    "change_tested": [change_tested],
    "tot_recoveries": [tot_recoveries],
    "change_recoveries": [change_recoveries],
    "datetime_updated": [current_time],
})
gen_data

Unnamed: 0,tot_infected,change_infected,tot_deaths,change_deaths,tot_tested,change_tested,tot_recoveries,change_recoveries,datetime_updated
0,20 125,988,397,28,543 032,17 599,10 104,1 154,14:53 23 May 2020


In [48]:
# gen_data.to_csv("data/gen_data.csv", index=False)

### Render Template

In [49]:
# import template_renderer as tr
# tr.render_all()