# Data Preprocessing - Neater Implementation
___

In [1]:
# Enable Intellisense
%config IPCompleter.greedy=True

import pandas as pd
import numpy as np
import io
import requests
import seaborn as sns
from matplotlib import pyplot, dates
import plotly.express as px 
import datetime
import plotly.graph_objects as go
import plotly.io as pio
sns.set()

# Silence pandas' SettingWithCopyWarning (raised on chained assignment)
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
# ['EC', 'FS', 'GP', 'KZN', 'LP', 'MP', 'NC', 'NW', 'WC']

## General Helper Functions

In [3]:
def df_from_url(df_url, pd_kwargs=None):
    """Download a CSV file over HTTP and load it into a DataFrame.

    Parameters
    ----------
    df_url : str
        URL of the CSV file to download.
    pd_kwargs : dict, optional
        Extra keyword arguments forwarded to ``pandas.read_csv``
        (for example ``{"usecols": [...]}``). Defaults to no extra arguments.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the downloaded file.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    response = requests.get(df_url)
    # Fail loudly on HTTP errors instead of trying to parse an error page as CSV.
    response.raise_for_status()
    csv_text = response.content.decode('utf-8')
    # ``None`` rather than ``{}`` as the default avoids sharing one dict across calls.
    return pd.read_csv(io.StringIO(csv_text), **(pd_kwargs or {}))

In [4]:
# Quick check of df_from_url: load only the 'date' and 'total' columns of the
# provincial confirmed-cases timeline and preview the first rows.
tmp_kwargs = {"usecols":['date','total']}
df_from_url("https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_confirmed.csv", 
            tmp_kwargs).head()

Unnamed: 0,date,total
0,05-03-2020,1
1,07-03-2020,2
2,08-03-2020,3
3,09-03-2020,7
4,11-03-2020,13


# Cumulative/Daily Totals Per Day
Applicable to any cumulative series in the source data; used below for confirmed cases, deaths, tests and recoveries.

In [5]:
def get_cum_daily(data_url, cum_col='total', index_col='date'):
    """Load a cumulative time series and derive its per-row increments.

    Parameters
    ----------
    data_url : str
        URL of the CSV file, fetched through ``df_from_url``.
    cum_col : str, default 'total'
        Name of the column holding the running (cumulative) total.
    index_col : str, default 'date'
        Name of the column holding dates formatted as ``DD-MM-YYYY``.

    Returns
    -------
    pandas.DataFrame
        Indexed by the parsed dates, with two ``int32`` columns:
        ``cum_no`` (the forward-filled cumulative total) and ``daily_no``
        (the change since the previous row; the first row equals its own
        cumulative value).
    """
    data = df_from_url(data_url, {"usecols": [cum_col, index_col]})
    # Use the caller-supplied column name throughout (it was previously
    # hard-coded to 'date', which broke any other ``index_col``).
    data[index_col] = pd.to_datetime(data[index_col], format='%d-%m-%Y')
    data = data.set_index(index_col).rename(columns={cum_col: "cum_no"})
    # Gaps in a cumulative series carry the last reported total forward.
    data = data.ffill()

    # diff() leaves NaN in the first row; the increment there is the starting
    # value itself. Writing into the standalone Series avoids chained
    # assignment on the DataFrame.
    daily = data['cum_no'].diff()
    if not daily.empty:
        daily.iloc[0] = data['cum_no'].iloc[0]
    data['daily_no'] = daily

    # Cast columns to integer
    return data.astype('int32')

## Confirmed Cases

In [6]:
# Cumulative and daily confirmed cases, taken from the 'total' column
# (get_cum_daily's default) of the provincial confirmed-cases timeline.
confirmed_cases_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_confirmed.csv"
confirmed_data = get_cum_daily(confirmed_cases_url)
confirmed_data

Unnamed: 0_level_0,cum_no,daily_no
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-05,1,1
2020-03-07,2,1
2020-03-08,3,1
2020-03-09,7,4
2020-03-11,13,6
2020-03-12,16,3
2020-03-13,24,8
2020-03-14,38,14
2020-03-15,51,13
2020-03-16,62,11


## Deaths

In [7]:
# Cumulative and daily deaths, taken from the 'total' column
# (get_cum_daily's default) of the provincial deaths timeline.
deaths_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_deaths.csv"
deaths_data = get_cum_daily(deaths_url)
deaths_data

Unnamed: 0_level_0,cum_no,daily_no
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-27,1,1
2020-03-28,2,1
2020-03-30,3,1
2020-03-31,5,2
2020-04-03,9,4
2020-04-05,11,2
2020-04-06,12,1
2020-04-07,13,1
2020-04-08,18,5
2020-04-09,18,0


## Tests

In [8]:
# Cumulative and daily test counts, taken from the 'cumulative_tests' column
# of the testing timeline.
tests_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_timeline_testing.csv"
tests_data = get_cum_daily(tests_url, 'cumulative_tests', 'date')
tests_data

Unnamed: 0_level_0,cum_no,daily_no
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-02-11,61,61
2020-02-13,67,6
2020-02-14,71,4
2020-02-19,95,24
2020-02-20,106,11
...,...,...
2020-05-01,217522,9992
2020-05-02,230686,13164
2020-05-03,245747,15061
2020-05-04,257541,11794


## Recoveries

In [9]:
# Recoveries are read from the same testing timeline file, using its
# 'recovered' column; tests_url is re-declared with the same value as in the
# Tests cell.
tests_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_timeline_testing.csv"
recovered_data = get_cum_daily(tests_url, 'recovered', 'date')
recovered_data

Unnamed: 0_level_0,cum_no,daily_no
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-02-11,0,0
2020-02-13,0,0
2020-02-14,0,0
2020-02-19,0,0
2020-02-20,0,0
...,...,...
2020-05-01,2382,309
2020-05-02,2549,167
2020-05-03,2549,0
2020-05-04,2746,197


## Active Cases

In [23]:
# Put cumulative confirmed and recovered counts side by side, aligned on date.
# The two sources report on different dates, so the outer join leaves gaps.
active_data = pd.concat(
    [
        confirmed_data[['cum_no']].rename(columns={"cum_no": "confirmed"}),
        recovered_data[['cum_no']].rename(columns={"cum_no": "recovered"}),
    ],
    axis=1,
).sort_index()
# Keep only dates from the first confirmed case onward. Slicing by label
# replaces the earlier positional ``iloc[9:]``, which silently depended on how
# many recovery rows happened to precede that date in the upstream file.
active_data = active_data.loc[confirmed_data.index.min():]
# Carry the last known totals across missing dates; anything still missing
# (a series that has not started yet) counts as zero.
active_data = active_data.ffill().fillna(0)
active_data

Unnamed: 0_level_0,confirmed,recovered
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-05,1.0,0.0
2020-03-06,1.0,0.0
2020-03-07,2.0,0.0
2020-03-08,3.0,0.0
2020-03-09,7.0,0.0
...,...,...
2020-05-01,5951.0,2382.0
2020-05-02,6336.0,2549.0
2020-05-03,6783.0,2549.0
2020-05-04,7220.0,2746.0


In [35]:
# Active cases = cumulative confirmed minus cumulative recovered. Only the
# derived column is kept, stored as int32.
active_data = (
    active_data
    .assign(cum_no=active_data['confirmed'] - active_data['recovered'])
    .drop(columns=['confirmed', 'recovered'])
    .astype('int32')
)
active_data

Unnamed: 0_level_0,cum_no
date,Unnamed: 1_level_1
2020-03-05,1
2020-03-06,1
2020-03-07,2
2020-03-08,3
2020-03-09,7
...,...
2020-05-01,3569
2020-05-02,3787
2020-05-03,4234
2020-05-04,4474


## All Cumulative/Totals Per Day

In [39]:
# Collect every cumulative series into one frame, one column per metric,
# aligned on date.
all_cum_data = pd.concat(
    [
        frame[['cum_no']].rename(columns={"cum_no": label})
        for label, frame in (
            ("confirmed", confirmed_data),
            ("tests", tests_data),
            ("recovered", recovered_data),
            ("active", active_data),
        )
    ],
    axis=1,
)
# Carry the last known value across dates a series does not report; dates
# before a series starts become 0. Everything is then stored as int32.
all_cum_data = all_cum_data.ffill().fillna(0).astype('int32')
all_cum_data

Unnamed: 0_level_0,confirmed,tests,recovered,active
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-02-11,0,61,0,0
2020-02-13,0,67,0,0
2020-02-14,0,71,0,0
2020-02-19,0,95,0,0
2020-02-20,0,106,0,0
...,...,...,...,...
2020-05-01,5951,217522,2382,3569
2020-05-02,6336,230686,2549,3787
2020-05-03,6783,245747,2549,4234
2020-05-04,7220,257541,2746,4474


# Daily/Cumulative Data Per Province Per Day

In [12]:
def get_cum_daily_by_prov(data_url):
    """Load per-province cumulative counts in long form.

    NOTE: despite the name, only cumulative values are returned; no daily
    figures are derived here.

    Parameters
    ----------
    data_url : str
        URL of a CSV file with a ``date`` column (``DD-MM-YYYY``) and one
        column per province code, fetched through ``df_from_url``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``date``, with one row per (date, province) pair and two
        columns: ``province`` (the original column name) and ``cum_cases``
        (the value from that column).
    """
    cols = ['date', 'EC', 'FS', 'GP', 'KZN', 'LP', 'MP', 'NC', 'NW', 'WC', 'UNKNOWN']
    # Rows with a missing value in any selected column are discarded.
    cum_data = df_from_url(data_url, {"usecols": cols}).dropna()
    cum_data = cum_data.assign(
        date=pd.to_datetime(cum_data['date'], format='%d-%m-%Y')
    )

    # Wide (one column per province) -> long (one row per date and province).
    cum_data_melt = cum_data.melt(
        id_vars=['date'], var_name='province', value_name='cum_cases'
    )
    return cum_data_melt.set_index('date')

In [13]:
# Preview the long-form provincial data for confirmed cases.
get_cum_daily_by_prov("https://raw.githubusercontent.com/dsfsi/covid19za/master/data/" +
                      "covid19za_provincial_cumulative_timeline_confirmed.csv")

Unnamed: 0_level_0,province,cum_cases
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-05,EC,0.0
2020-03-07,EC,0.0
2020-03-08,EC,0.0
2020-03-09,EC,0.0
2020-03-11,EC,0.0
...,...,...
2020-05-01,UNKNOWN,0.0
2020-05-02,UNKNOWN,0.0
2020-05-03,UNKNOWN,0.0
2020-05-04,UNKNOWN,0.0
