# Data Preprocessing

In [32]:
# Enable Intellisense-style completion by turning on IPython's greedy tab-completer.
# This is an interactive-editing convenience only.
%config IPCompleter.greedy=True

import pandas as pd
import numpy as np
import io
import requests
import seaborn as sns
from matplotlib import pyplot, dates
import plotly.express as px 
import datetime
import plotly.graph_objects as go
import plotly.io as pio
# Apply seaborn's default plot theme.
sns.set()

# Silence pandas' SettingWithCopyWarning (the option defaults to 'warn').
# NOTE: this also hides genuine chained-assignment mistakes anywhere in the notebook.
pd.set_option('mode.chained_assignment', None)

# Gen Data
## All Confirmed Cases

In [42]:
# Case line list from the DSFSI covid19za repository; only the columns used below are loaded.
confirm_data_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_timeline_confirmed.csv"

# Fail fast on network problems: without a timeout the cell can hang indefinitely, and
# without raise_for_status() an HTTP error page would be handed to read_csv as if it were data.
confirm_data_response = requests.get(confirm_data_url, timeout=30)
confirm_data_response.raise_for_status()
confirm_data_req = confirm_data_response.content

confirmed_all_data = pd.read_csv(io.StringIO(confirm_data_req.decode('utf-8')), delimiter = ',',
                                 usecols=['date','province','age','gender','type'])
confirmed_all_data.head()

Unnamed: 0,date,province,age,gender,type
0,05-03-2020,KZN,38.0,male,travel
1,07-03-2020,GP,39.0,female,travel
2,08-03-2020,KZN,38.0,female,travel
3,09-03-2020,KZN,38.0,male,travel
4,09-03-2020,KZN,38.0,female,travel


# Over time

## Provinces
Taken from 'covid19za_provincial_cumulative_timeline_confirmed.csv'
### Cumulative

In [2]:
# Cumulative confirmed cases per province (one row per report date, one column per province).
province_data_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_confirmed.csv"

# Fail fast on network problems: without a timeout the cell can hang indefinitely, and
# without raise_for_status() an HTTP error page would be handed to read_csv as if it were data.
province_data_response = requests.get(province_data_url, timeout=30)
province_data_response.raise_for_status()
province_data_req = province_data_response.content

# 'YYYYMMDD' duplicates the 'date' column in another format, so it is dropped.
province_data = (
    pd.read_csv(io.StringIO(province_data_req.decode('utf-8')), delimiter = ',')
    .drop(columns=['YYYYMMDD'])
)
province_data.head()

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN
0,05-03-2020,0,0,0,1,0,0,0,0,0,0
1,07-03-2020,0,0,1,1,0,0,0,0,0,0
2,08-03-2020,0,0,1,2,0,0,0,0,0,0
3,09-03-2020,0,0,1,6,0,0,0,0,0,0
4,11-03-2020,0,0,5,7,0,0,0,0,1,0


In [3]:
# Parse the 'dd-mm-yyyy' strings into real timestamps so rows sort and plot chronologically.
province_data = province_data.assign(
    date=pd.to_datetime(province_data['date'], format='%d-%m-%Y')
)

In [108]:
# Reshape wide (one column per province) into long form: one row per (date, province).
province_data_melt = pd.melt(
    province_data,
    id_vars=['date'],
    var_name='province',
    value_name='cumulative_cases',
)
province_data_melt

Unnamed: 0,date,province,cumulative_cases
0,2020-03-05,EC,0
1,2020-03-07,EC,0
2,2020-03-08,EC,0
3,2020-03-09,EC,0
4,2020-03-11,EC,0
...,...,...,...
165,2020-03-19,UNKNOWN,0
166,2020-03-20,UNKNOWN,0
167,2020-03-21,UNKNOWN,0
168,2020-03-22,UNKNOWN,0


In [109]:
# Map the province abbreviations used as column headers to their full display names.
# Fix: "KZN" was misspelled "KwaZula-Natal"; the correct name is "KwaZulu-Natal".
province_names = {"EC":"Eastern Cape",
                  "FS" : "Free State",
                  "GP" : "Gauteng",
                  "KZN" : "KwaZulu-Natal",
                  "LP" : "Limpopo",
                  "MP" : "Mpumalanga",
                  "NW" : "North West",
                  "NC" : "Northern Cape",
                  "WC" : "Western Cape",
                  "UNKNOWN": "Unknown"}
# Abbreviations missing from the mapping become NaN.
province_data_melt['province'] = province_data_melt['province'].map(province_names)
province_data_melt.tail()

Unnamed: 0,date,province,cumulative_cases
165,2020-03-19,Unknown,0
166,2020-03-20,Unknown,0
167,2020-03-21,Unknown,0
168,2020-03-22,Unknown,0
169,2020-03-23,Unknown,1


### Daily

In [111]:
# Convert cumulative provincial counts into per-report increments.
# The columns are selected by name rather than by position, and the float result replaces
# whole columns instead of being slice-assigned into integer columns, so the resulting
# dtype does not depend on pandas' implicit upcasting rules.
case_columns = province_data.columns.drop('date')
daily_increments = province_data[case_columns].diff()
# diff() leaves the first row as NaN; the first report's increment is its own cumulative value.
daily_increments.iloc[:1] = province_data[case_columns].iloc[:1]

province_data_daily = province_data.copy()
province_data_daily[case_columns] = daily_increments
province_data_daily.head()

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN
0,2020-03-05,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2020-03-07,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2020-03-08,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2020-03-09,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2020-03-11,0.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [112]:
# Reshape the daily table from wide to long form: one row per (date, province).
province_data_daily_melt = pd.melt(
    province_data_daily,
    id_vars=['date'],
    var_name='province',
    value_name='daily_cases',
)
province_data_daily_melt.tail()

Unnamed: 0,date,province,daily_cases
165,2020-03-19,UNKNOWN,0.0
166,2020-03-20,UNKNOWN,0.0
167,2020-03-21,UNKNOWN,0.0
168,2020-03-22,UNKNOWN,0.0
169,2020-03-23,UNKNOWN,1.0


In [113]:
# Map the province abbreviations used as column headers to their full display names.
# Fix: "KZN" was misspelled "KwaZula-Natal"; the correct name is "KwaZulu-Natal".
province_names = {"EC":"Eastern Cape",
                  "FS" : "Free State",
                  "GP" : "Gauteng",
                  "KZN" : "KwaZulu-Natal",
                  "LP" : "Limpopo",
                  "MP" : "Mpumalanga",
                  "NW" : "North West",
                  "NC" : "Northern Cape",
                  "WC" : "Western Cape",
                  "UNKNOWN": "Unknown"}
# Abbreviations missing from the mapping become NaN.
province_data_daily_melt['province'] = province_data_daily_melt['province'].map(province_names)
province_data_daily_melt.tail()

Unnamed: 0,date,province,daily_cases
165,2020-03-19,Unknown,0.0
166,2020-03-20,Unknown,0.0
167,2020-03-21,Unknown,0.0
168,2020-03-22,Unknown,0.0
169,2020-03-23,Unknown,1.0


### Concatenate Cumulative & Daily

In [125]:
# Attach the daily column to the cumulative long table. concat(axis=1) aligns on the row
# index, so this relies on both melted frames sharing the same index.
daily_column = province_data_daily_melt['daily_cases']
prov_cumulative_daily = pd.concat([province_data_melt, daily_column], axis=1)
prov_cumulative_daily.tail()

Unnamed: 0,date,province,cumulative_cases,daily_cases
165,2020-03-19,Unknown,0,0.0
166,2020-03-20,Unknown,0,0.0
167,2020-03-21,Unknown,0,0.0
168,2020-03-22,Unknown,0,0.0
169,2020-03-23,Unknown,1,1.0


**Save to csv**

In [127]:
# Persist the per-province table; the positional row index is not written.
prov_cumulative_daily.to_csv(path_or_buf='data/daily_cumulative_confirmed_prov.csv', index=False)

## Tests

In [27]:
# Testing timeline from the DSFSI covid19za repository; only the report date and the
# cumulative test count are loaded.
tests_data_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_timeline_testing.csv"

# Fail fast on network problems: without a timeout the cell can hang indefinitely, and
# without raise_for_status() an HTTP error page would be handed to read_csv as if it were data.
tests_data_response = requests.get(tests_data_url, timeout=30)
tests_data_response.raise_for_status()
tests_data_req = tests_data_response.content

tests_data = pd.read_csv(io.StringIO(tests_data_req.decode('utf-8')), delimiter = ',',
                         usecols=['date','cumulative_tests'])

So far `tests_data` only includes the cumulative number of tests; the daily numbers are derived from it below.

In [28]:
# Display the loaded testing table for inspection.
tests_data

Unnamed: 0,date,cumulative_tests
0,11-02-2020,61
1,13-02-2020,67
2,14-02-2020,71
3,19-02-2020,95
4,20-02-2020,106
5,24-02-2020,116
6,26-02-2020,121
7,02-03-2020,160
8,03-03-2020,164
9,06-03-2020,200


### Daily

In [36]:
# Daily tests are the increments of the cumulative series. diff() yields NaN for the first
# report, whose daily value should equal its cumulative value, so only that row is back-filled.
# The series is built separately and assigned once: the previous chained assignment
# (tests_data['daily_tests'][1:] = ...) writes to a temporary under pandas Copy-on-Write
# and would silently leave the column unchanged.
daily_tests = tests_data['cumulative_tests'].diff()
daily_tests.iloc[:1] = tests_data['cumulative_tests'].iloc[:1]
tests_data['daily_tests'] = daily_tests
tests_data

Unnamed: 0,date,cumulative_tests,daily_tests
0,11-02-2020,61,61.0
1,13-02-2020,67,6.0
2,14-02-2020,71,4.0
3,19-02-2020,95,24.0
4,20-02-2020,106,11.0
5,24-02-2020,116,10.0
6,26-02-2020,121,5.0
7,02-03-2020,160,39.0
8,03-03-2020,164,4.0
9,06-03-2020,200,36.0


#### Save to CSV

In [37]:
# Persist the testing table; the positional row index is not written.
tests_data.to_csv(path_or_buf='data/daily_cumulative_tests.csv', index=False)

## Confirmed Cases
### Daily

In [56]:
# Number of confirmed cases reported on each date.
# size() counts rows, so a case with a missing province is still counted
# (count() on the 'province' column silently skipped such rows).
daily_case_counts = confirmed_all_data.groupby('date', sort=False).size()

# The dates are 'dd-mm-yyyy' strings, which do not sort chronologically as text
# (e.g. '01-04-2020' < '05-03-2020'). Order the rows by the parsed date instead, while
# keeping the original string labels so the saved CSV format is unchanged. Chronological
# order matters because the cumulative total is computed as a running sum over these rows.
chronological_order = pd.to_datetime(daily_case_counts.index, format='%d-%m-%Y').argsort()
confirmed_data = daily_case_counts.iloc[chronological_order].to_frame('daily_cases')
confirmed_data

Unnamed: 0_level_0,daily_cases
date,Unnamed: 1_level_1
05-03-2020,1
07-03-2020,1
08-03-2020,1
09-03-2020,4
11-03-2020,6
12-03-2020,3
13-03-2020,8
14-03-2020,14
15-03-2020,13
16-03-2020,11


### Cumulative

In [57]:
# Running total of the daily counts, taken in the frame's current row order.
running_total = confirmed_data['daily_cases'].cumsum()
confirmed_data['cumulative_cases'] = running_total
confirmed_data

Unnamed: 0_level_0,daily_cases,cumulative_cases
date,Unnamed: 1_level_1,Unnamed: 2_level_1
05-03-2020,1,1
07-03-2020,1,2
08-03-2020,1,3
09-03-2020,4,7
11-03-2020,6,13
12-03-2020,3,16
13-03-2020,8,24
14-03-2020,14,38
15-03-2020,13,51
16-03-2020,11,62


**Save to csv**

In [59]:
# Persist daily and cumulative confirmed cases; the 'date' index is written as the first column.
confirmed_data.to_csv(path_or_buf='data/daily_cumulative_confirmed.csv')

## Confirmed & Tests
### Daily

In [96]:
# Copy of the confirmed-case table re-indexed by parsed timestamps, so it can be aligned
# with other date-indexed tables.
confirmed_data_tmp = (
    confirmed_data
    .reset_index()
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='%d-%m-%Y'))
    .set_index('date')
)
confirmed_data_tmp.head()

Unnamed: 0_level_0,daily_cases,cumulative_cases
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-05,1,1
2020-03-07,1,2
2020-03-08,1,3
2020-03-09,4,7
2020-03-11,6,13


In [97]:
# Copy of the testing table indexed by parsed timestamps, so it can be aligned
# with other date-indexed tables.
tests_data_tmp = (
    tests_data
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='%d-%m-%Y'))
    .set_index('date')
)
tests_data_tmp.head()

Unnamed: 0_level_0,cumulative_tests,daily_tests
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-02-11,61,61.0
2020-02-13,67,6.0
2020-02-14,71,4.0
2020-02-19,95,24.0
2020-02-20,106,11.0


In [102]:
# Outer-join daily cases and daily tests on date (sorted), then treat dates missing from
# either source as zero for that series.
daily_tests_confirmed = pd.concat(
    [confirmed_data_tmp[['daily_cases']], tests_data_tmp['daily_tests']],
    axis=1,
    sort=True,
).fillna(0)
daily_tests_confirmed

Unnamed: 0_level_0,daily_cases,daily_tests
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-02-11,0.0,61.0
2020-02-13,0.0,6.0
2020-02-14,0.0,4.0
2020-02-19,0.0,24.0
2020-02-20,0.0,11.0
2020-02-24,0.0,10.0
2020-02-26,0.0,5.0
2020-03-02,0.0,39.0
2020-03-03,0.0,4.0
2020-03-05,1.0,0.0


**Save to csv**

In [103]:
# Persist the combined daily table; the 'date' index is written as the first column.
daily_tests_confirmed.to_csv(path_or_buf='data/daily_tests_confirmed.csv')

### Cumulative

In [104]:
# Running totals down the rows of the combined daily table.
# NOTE: the columns keep their 'daily_cases' / 'daily_tests' names even though the values
# are now cumulative.
cumulative_tests_confirmed = daily_tests_confirmed.cumsum(axis=0)
cumulative_tests_confirmed

Unnamed: 0_level_0,daily_cases,daily_tests
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-02-11,0.0,61.0
2020-02-13,0.0,67.0
2020-02-14,0.0,71.0
2020-02-19,0.0,95.0
2020-02-20,0.0,106.0
2020-02-24,0.0,116.0
2020-02-26,0.0,121.0
2020-03-02,0.0,160.0
2020-03-03,0.0,164.0
2020-03-05,1.0,164.0


**Save to csv**

In [105]:
# Persist the combined cumulative table; the 'date' index is written as the first column.
cumulative_tests_confirmed.to_csv(path_or_buf='data/cumulative_tests_confirmed.csv')

# Totals
## Province

In [23]:
# The last row of the cumulative provincial table holds the current total per province.
# Reshape it to one row per province, swap abbreviations for full names, and drop the date.
latest_cumulative = province_data.tail(1)
prov_totals = (
    latest_cumulative
    .melt(id_vars=['date'], var_name='province', value_name='total')
    .assign(province=lambda frame: frame['province'].map(province_names))
    .drop(columns='date')
)
prov_totals

Unnamed: 0,province,total
0,Eastern Cape,2
1,Free State,13
2,Gauteng,207
3,KwaZula-Natal,60
4,Limpopo,4
5,Mpumalanga,9
6,Northern Cape,2
7,North West,4
8,Western Cape,100
9,Unknown,1


**Save to csv**

In [24]:
# Persist the province totals; the positional row index is not written.
prov_totals.to_csv(path_or_buf='data/tot_provinces.csv', index=False)

## Age
For now this is done by hand, i.e. by copying the figures from the official report.

## Gender

In [123]:
# Cases per gender: count the non-null 'date' entries in each gender group, then relabel
# the count column as 'total' and the "not specified" group as "unknown".
gender_counts = confirmed_all_data.groupby('gender')[['date']].count()
tot_gender_data = (
    gender_counts
    .rename(columns={'date': 'total'})
    .rename(index={"not specified": "unknown"})
)
tot_gender_data

Unnamed: 0_level_0,total
gender,Unnamed: 1_level_1
female,100
male,169
unknown,133


**Save to csv**

In [124]:
# Persist the gender totals; the 'gender' index is written as the first column.
tot_gender_data.to_csv(path_or_buf='data/tot_gender.csv')

## Transmission Type

In [120]:
# Cases per transmission type: count the non-null 'date' entries in each type group, then
# relabel the count column as 'total' and the "pending" group as "unknown".
transmission_counts = confirmed_all_data.groupby('type')[['date']].count()
tot_transmission_data = (
    transmission_counts
    .rename(columns={'date': 'total'})
    .rename(index={"pending": "unknown"})
)
tot_transmission_data

Unnamed: 0_level_0,total
type,Unnamed: 1_level_1
local,26
unknown,177
travel,199


**Save to csv**

In [121]:
# Persist the transmission-type totals; the 'type' index is written as the first column.
tot_transmission_data.to_csv(path_or_buf='data/tot_transmission_type.csv')