# Data Preprocessing - Test Space

In [1]:
# Enable Intellisense
%config IPCompleter.greedy=True

import pandas as pd
import numpy as np
import io
import requests
import seaborn as sns
from matplotlib import pyplot, dates
import plotly.express as px 
import datetime
import plotly.graph_objects as go
import plotly.io as pio
sns.set()

# Annoying warning
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
# ['EC', 'FS', 'GP', 'KZN', 'LP', 'MP', 'NC', 'NW', 'WC']

## General Helper Functions

In [32]:
def df_from_url(df_url, pd_kwargs=None):
    """Download a CSV file over HTTP and parse it into a DataFrame.

    Parameters
    ----------
    df_url : str
        URL of the CSV file to download.
    pd_kwargs : dict, optional
        Extra keyword arguments forwarded unchanged to ``pd.read_csv``
        (e.g. ``{"usecols": [...], "index_col": [...]}``). Defaults to no
        extra arguments.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the downloaded CSV.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    response = requests.get(df_url)
    # Fail loudly on an HTTP error instead of trying to parse the error page
    # as CSV, which would surface later as a confusing pandas exception.
    response.raise_for_status()
    csv_text = response.content.decode('utf-8')
    # `None` default (rather than `{}`) avoids sharing one dict across calls.
    return pd.read_csv(io.StringIO(csv_text), **(pd_kwargs or {}))

In [35]:
# Quick check of the helper: load only the date/total columns and preview
# the first rows.
tmp_kwargs = dict(usecols=['date', 'total'])
df_from_url(
    "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/"
    "covid19za_provincial_cumulative_timeline_confirmed.csv",
    tmp_kwargs,
).head()

Unnamed: 0,date,total
0,05-03-2020,1
1,07-03-2020,2
2,08-03-2020,3
3,09-03-2020,7
4,11-03-2020,13


## Cumulative & Daily Data Over Time
Currently only applicable for confirmed cases and deaths.

In [38]:
def get_cum_daily(data_url):
    """Load a cumulative timeline CSV and derive per-row (daily) counts.

    Reads the 'date' and 'total' columns of the CSV at `data_url` through
    `df_from_url`, using 'date' as the index.

    Parameters
    ----------
    data_url : str
        URL of a CSV file containing 'date' and 'total' columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'date', with two int32 columns:
        ``cum_no``   - the cumulative total as read from the CSV;
        ``daily_no`` - the difference between each row and the previous one.
        The first row has no predecessor, so it keeps its cumulative value.
    """
    pd_kwargs = {"usecols": ['date', 'total'], "index_col": ['date']}
    data = df_from_url(data_url, pd_kwargs)
    data = data.rename(columns={"total": "cum_no"})

    # diff() leaves NaN in the first row; fall back to the starting cumulative
    # value there. A single column assignment is used instead of the chained
    # `data['daily_no'][1:] = ...`, which may write to a temporary copy and
    # does nothing under pandas copy-on-write.
    data['daily_no'] = data['cum_no'].diff().fillna(data['cum_no'])

    # diff() produces floats; cast both columns back to integers.
    return data.astype('int32')

In [39]:
# Cumulative and daily confirmed cases.
confirmed_cases_url = (
    "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/"
    "covid19za_provincial_cumulative_timeline_confirmed.csv"
)
get_cum_daily(confirmed_cases_url)

Unnamed: 0_level_0,cum_no,daily_no
date,Unnamed: 1_level_1,Unnamed: 2_level_1
05-03-2020,1,1
07-03-2020,2,1
08-03-2020,3,1
09-03-2020,7,4
11-03-2020,13,6
12-03-2020,16,3
13-03-2020,24,8
14-03-2020,38,14
15-03-2020,51,13
16-03-2020,62,11


In [24]:
# Cumulative and daily deaths.
deaths_url = (
    "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/"
    "covid19za_provincial_cumulative_timeline_deaths.csv"
)
get_cum_daily(deaths_url)

Unnamed: 0_level_0,cum_no,daily_no
date,Unnamed: 1_level_1,Unnamed: 2_level_1
08-04-2020,18,18
09-04-2020,18,0
10-04-2020,24,6
11-04-2020,25,1
12-04-2020,25,0
13-04-2020,27,2
14-04-2020,27,0
15-04-2020,34,7
16-04-2020,48,14
17-04-2020,50,2


## Fix for incomplete deaths data
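`covid19za_provincial_cumulative_timeline_deaths.csv` covers every death except the first 18 (its first row already reports a cumulative total of 18), so only the first 18 entries of `covid19za_timeline_deaths.csv` are used; they are stored in `start_deaths_data`.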

In [46]:
# Per-death timeline: keep only the date and province of the first 18
# recorded deaths.
start_deaths_data_url = (
    "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/"
    "covid19za_timeline_deaths.csv"
)
start_deaths_data = df_from_url(
    start_deaths_data_url,
    pd_kwargs={"usecols": ['date', 'province']},
).head(18)
start_deaths_data

Unnamed: 0,date,province
0,27-03-2020,WC
1,28-03-2020,KZN
2,30-03-2020,FS
3,31-03-2020,GP
4,31-03-2020,KZN
5,03-04-2020,KZN
6,03-04-2020,KZN
7,03-04-2020,KZN
8,03-04-2020,KZN
9,05-04-2020,WC


**Group by date and province and count occurrences**

In [50]:
# Count deaths per (date, province): add a constant helper column and count
# it within each group, then turn the group keys back into ordinary columns.
# NOTE: 'date' is still a DD-MM-YYYY string here, so groups sort lexically.
start_deaths_by_prov = (
    start_deaths_data
    .assign(tmp=1)
    .groupby(['date', 'province'])
    .count()
    .reset_index()
)
start_deaths_by_prov

Unnamed: 0,date,province,tmp
0,03-04-2020,KZN,4
1,05-04-2020,KZN,1
2,05-04-2020,WC,1
3,06-04-2020,WC,1
4,07-04-2020,KZN,1
5,08-04-2020,FS,2
6,08-04-2020,GP,2
7,08-04-2020,KZN,1
8,27-03-2020,WC,1
9,28-03-2020,KZN,1


In [66]:
# Reshape to wide form: one row per date, one column per province.
# Passing `values` as a list yields two-level columns ('tmp', <province>).
tmp_pivot = start_deaths_by_prov.pivot(
    index='date',
    columns='province',
    values=['tmp'],
)
tmp_pivot

Unnamed: 0_level_0,tmp,tmp,tmp,tmp
province,FS,GP,KZN,WC
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
03-04-2020,,,4.0,
05-04-2020,,,1.0,1.0
06-04-2020,,,,1.0
07-04-2020,,,1.0,
08-04-2020,2.0,2.0,1.0,
27-03-2020,,,,1.0
28-03-2020,,,1.0,
30-03-2020,1.0,,,
31-03-2020,,1.0,1.0,


In [67]:
# Flatten the columns: drop the outer 'tmp' level, blank out the remaining
# level's name, and move 'date' from the index back into a regular column.
tmp_pivot.columns = tmp_pivot.columns.droplevel(0).rename("")
tmp_pivot.reset_index(inplace=True)
tmp_pivot

Unnamed: 0,date,FS,GP,KZN,WC
0,03-04-2020,,,4.0,
1,05-04-2020,,,1.0,1.0
2,06-04-2020,,,,1.0
3,07-04-2020,,,1.0,
4,08-04-2020,2.0,2.0,1.0,
5,27-03-2020,,,,1.0
6,28-03-2020,,,1.0,
7,30-03-2020,1.0,,,
8,31-03-2020,,1.0,1.0,


In [69]:
tmp_pivot.fillna(0, inplace=True)