# Data Preprocessing - Neater Implementation

In [6]:
# Enable Intellisense
%config IPCompleter.greedy=True

import pandas as pd
import numpy as np
import io
import requests
import seaborn as sns
from matplotlib import pyplot, dates
import plotly.express as px 
import datetime
import plotly.graph_objects as go
import plotly.io as pio
# Apply seaborn's default styling to all subsequent matplotlib plots
sns.set()

# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole notebook
pd.options.mode.chained_assignment = None  # default='warn'

In [7]:
# ['EC', 'FS', 'GP', 'KZN', 'LP', 'MP', 'NC', 'NW', 'WC']

## General Helper Functions

In [8]:
def df_from_url(df_url, pd_kwargs=None):
    """Download a CSV file over HTTP and load it into a DataFrame.

    Parameters
    ----------
    df_url : str
        URL of the CSV file to download.
    pd_kwargs : dict, optional
        Extra keyword arguments forwarded to ``pd.read_csv``
        (e.g. ``usecols``, ``index_col``). Defaults to no extra arguments.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx/5xx status code.
    """
    response = requests.get(df_url)
    # Fail loudly on HTTP errors instead of trying to parse an error page as CSV.
    response.raise_for_status()
    csv_text = response.content.decode('utf-8')
    # ``None`` default avoids sharing one mutable dict between calls.
    return pd.read_csv(io.StringIO(csv_text), **(pd_kwargs or {}))

In [9]:
# Quick check of df_from_url: load only the 'date' and 'total' columns of the
# provincial confirmed-cases timeline and preview the first rows.
tmp_kwargs = {"usecols":['date','total']}
df_from_url("https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_confirmed.csv", 
            tmp_kwargs).head()

Unnamed: 0,date,total
0,05-03-2020,1
1,07-03-2020,2
2,08-03-2020,3
3,09-03-2020,7
4,11-03-2020,13


# Cumulative/Daily Totals Per Day
Originally written for confirmed cases and deaths; the cells below also apply it to tests and recoveries by passing a different `cum_col`.

In [28]:
def get_cum_daily(data_url, cum_col='total', index_col='date'):
    """Load a cumulative timeline CSV and derive the per-row increments.

    Parameters
    ----------
    data_url : str
        URL of the CSV file holding the cumulative timeline.
    cum_col : str, default 'total'
        Name of the column containing the cumulative count.
    index_col : str, default 'date'
        Name of the column used as the index of the result.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``index_col`` with two ``int32`` columns:
        ``cum_no`` (the cumulative count, forward-filled over gaps) and
        ``daily_no`` (the difference to the previous row; the first row
        equals its own cumulative value).
    """
    pd_kwargs = {"usecols": [cum_col, index_col], "index_col": [index_col]}
    data = df_from_url(data_url, pd_kwargs)
    data = data.rename(columns={cum_col: "cum_no"})
    # Carry the last known cumulative value forward over missing entries.
    data = data.ffill()

    # diff() yields NaN for the first row, where the increment we want is the
    # starting cumulative value itself. Filling from cum_no in a single
    # assignment avoids chained indexing (``df[col][1:] = ...``), which does
    # not write through when pandas Copy-on-Write is active.
    data['daily_no'] = data['cum_no'].diff().fillna(data['cum_no'])

    # Cast columns to integer
    return data.astype('int32')

### Confirmed Cases

In [25]:
# Cumulative and daily confirmed cases (uses the default 'total' column)
confirmed_cases_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_confirmed.csv"
get_cum_daily(confirmed_cases_url)

Unnamed: 0_level_0,cum_no,daily_no
date,Unnamed: 1_level_1,Unnamed: 2_level_1
05-03-2020,1,1
07-03-2020,2,1
08-03-2020,3,1
09-03-2020,7,4
11-03-2020,13,6
12-03-2020,16,3
13-03-2020,24,8
14-03-2020,38,14
15-03-2020,51,13
16-03-2020,62,11


### Deaths

In [26]:
# Cumulative and daily deaths (uses the default 'total' column)
deaths_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_deaths.csv"
get_cum_daily(deaths_url)

Unnamed: 0_level_0,cum_no,daily_no
date,Unnamed: 1_level_1,Unnamed: 2_level_1
27-03-2020,1,1
28-03-2020,2,1
30-03-2020,3,1
31-03-2020,5,2
03-04-2020,9,4
05-04-2020,11,2
06-04-2020,12,1
07-04-2020,13,1
08-04-2020,18,5
09-04-2020,18,0


### Tests

In [29]:
# Cumulative and daily tests, read from the 'cumulative_tests' column
tests_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_timeline_testing.csv"
get_cum_daily(tests_url, 'cumulative_tests', 'date')

Unnamed: 0_level_0,cum_no,daily_no
date,Unnamed: 1_level_1,Unnamed: 2_level_1
11-02-2020,61,61
13-02-2020,67,6
14-02-2020,71,4
19-02-2020,95,24
20-02-2020,106,11
...,...,...
28-04-2020,185497,7027
29-04-2020,197127,11630
30-04-2020,207530,10403
01-05-2020,217522,9992


### Recoveries

In [30]:
# Recoveries are read from the 'recovered' column of the same testing
# timeline CSV, which is why the tests_url name is (re-)assigned here.
tests_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_timeline_testing.csv"
get_cum_daily(tests_url, 'recovered', 'date')

Unnamed: 0_level_0,cum_no,daily_no
date,Unnamed: 1_level_1,Unnamed: 2_level_1
11-02-2020,0,0
13-02-2020,0,0
14-02-2020,0,0
19-02-2020,0,0
20-02-2020,0,0
...,...,...
28-04-2020,2073,600
29-04-2020,2073,0
30-04-2020,2073,0
01-05-2020,2382,309


# Daily/Cumulative Data Per Prov Per Day

In [48]:
def get_cum_daily_by_prov(data_url):
    """Load a provincial cumulative timeline CSV in long (tidy) format.

    Parameters
    ----------
    data_url : str
        URL of a CSV file with a 'date' column (``dd-mm-YYYY``) and one
        cumulative-count column per province code.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``date`` (datetime) with columns ``province`` and
        ``cum_cases``. Rows of the source file that contain any missing
        value are dropped before reshaping.

    Notes
    -----
    Despite the name, only cumulative values are returned; no daily
    figures are computed here.
    """
    cols = ['date', 'EC', 'FS', 'GP', 'KZN', 'LP', 'MP', 'NC', 'NW', 'WC', 'UNKNOWN']
    cum_data = (
        df_from_url(data_url, {"usecols": cols})
        # Discard dates on which any province value is missing.
        .dropna()
        .assign(date=lambda df: pd.to_datetime(df['date'], format='%d-%m-%Y'))
    )

    # Wide -> long: one row per (date, province) pair.
    cum_data_melt = cum_data.melt(
        id_vars=['date'], var_name='province', value_name='cum_cases'
    )
    return cum_data_melt.set_index(['date'])

In [49]:
# Long-format cumulative confirmed cases per province
get_cum_daily_by_prov("https://raw.githubusercontent.com/dsfsi/covid19za/master/data/" +
                      "covid19za_provincial_cumulative_timeline_confirmed.csv")

Unnamed: 0_level_0,province,cum_cases
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-05,EC,0.0
2020-03-07,EC,0.0
2020-03-08,EC,0.0
2020-03-09,EC,0.0
2020-03-11,EC,0.0
...,...,...
2020-04-28,UNKNOWN,0.0
2020-04-29,UNKNOWN,0.0
2020-04-30,UNKNOWN,0.0
2020-05-01,UNKNOWN,0.0


In [None]:
# NOTE(review): province_data is not defined in any cell shown above and this
# cell was never executed — presumably a leftover draft; confirm before running.
province_data['date'] = pd.to_datetime(province_data['date'], format='%d-%m-%Y')