# Data Preprocessing

In [1]:
# Enable Intellisense
%config IPCompleter.greedy=True

import pandas as pd
import numpy as np
import io
import requests
import seaborn as sns
from matplotlib import pyplot, dates
import plotly.express as px 
import datetime
import plotly.graph_objects as go
import plotly.io as pio
sns.set()

# Annoying warning
pd.options.mode.chained_assignment = None  # default='warn'

# Gen Data
## All Confirm

**No longer going to use above data**

# Over time

## Provinces
Taken from 'covid19za_provincial_cumulative_timeline_confirmed.csv'
### Cumulative

In [2]:
province_data_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_confirmed.csv"

province_data_req = requests.get(province_data_url).content

province_data = pd.read_csv(io.StringIO(province_data_req.decode('utf-8')), delimiter = ',')
province_data.drop(['YYYYMMDD'], axis = 1, inplace=True)
province_data.tail()

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN,total,source
47,23-04-2020,417.0,106.0,1252.0,807.0,27.0,23.0,16.0,25.0,1279.0,1.0,3953,https://sacoronavirus.co.za/2020/04/23/update-...
48,24-04-2020,480.0,111.0,1281.0,841.0,29.0,24.0,16.0,25.0,1413.0,0.0,4220,https://twitter.com/nicd_sa/status/12537692103...
49,25-04-2020,488.0,111.0,1304.0,841.0,30.0,23.0,16.0,28.0,1514.0,0.0,4361,https://twitter.com/DrZweliMkhize/status/12541...
50,26-04-2020,535.0,110.0,1331.0,863.0,31.0,23.0,17.0,28.0,1608.0,0.0,4546,https://twitter.com/DrZweliMkhize/status/12544...
51,27-04-2020,588.0,111.0,1353.0,902.0,31.0,26.0,17.0,31.0,1737.0,0.0,4793,https://twitter.com/drzwelimkhize/status/12548...


In [3]:
province_data.drop(['total','source'],axis=1, inplace = True)
province_data.dropna(inplace=True)
province_data.tail()

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN
47,23-04-2020,417.0,106.0,1252.0,807.0,27.0,23.0,16.0,25.0,1279.0,1.0
48,24-04-2020,480.0,111.0,1281.0,841.0,29.0,24.0,16.0,25.0,1413.0,0.0
49,25-04-2020,488.0,111.0,1304.0,841.0,30.0,23.0,16.0,28.0,1514.0,0.0
50,26-04-2020,535.0,110.0,1331.0,863.0,31.0,23.0,17.0,28.0,1608.0,0.0
51,27-04-2020,588.0,111.0,1353.0,902.0,31.0,26.0,17.0,31.0,1737.0,0.0


In [4]:
province_data.to_csv('data/daily_prov.csv', index = False)

In [5]:
province_data = pd.read_csv('data/daily_prov.csv')
province_data.tail()

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN
45,23-04-2020,417.0,106.0,1252.0,807.0,27.0,23.0,16.0,25.0,1279.0,1.0
46,24-04-2020,480.0,111.0,1281.0,841.0,29.0,24.0,16.0,25.0,1413.0,0.0
47,25-04-2020,488.0,111.0,1304.0,841.0,30.0,23.0,16.0,28.0,1514.0,0.0
48,26-04-2020,535.0,110.0,1331.0,863.0,31.0,23.0,17.0,28.0,1608.0,0.0
49,27-04-2020,588.0,111.0,1353.0,902.0,31.0,26.0,17.0,31.0,1737.0,0.0


In [6]:
province_data['date'] = pd.to_datetime(province_data['date'], format='%d-%m-%Y')

In [7]:
province_data_melt = province_data.melt(id_vars=['date'], var_name='province', value_name='cumulative_cases')
province_data_melt

Unnamed: 0,date,province,cumulative_cases
0,2020-03-05,EC,0.0
1,2020-03-07,EC,0.0
2,2020-03-08,EC,0.0
3,2020-03-09,EC,0.0
4,2020-03-11,EC,0.0
...,...,...,...
495,2020-04-23,UNKNOWN,1.0
496,2020-04-24,UNKNOWN,0.0
497,2020-04-25,UNKNOWN,0.0
498,2020-04-26,UNKNOWN,0.0


In [8]:
province_names = {
#     "EC":"Eastern Cape",
#     "FS" : "Free State",
#     "GP" : "Gauteng",
#     "KZN" : "KwaZula-Natal",
#     "LP" : "Limpopo",
#     "MP" : "Mpumalanga",
#     "NW" : "North West",
#     "NC" : "Northern Cape",
#     "WC" : "Western Cape",
    "UNKNOWN": "UNK"
}
# province_data_melt['province'] = province_data_melt['province'].map(province_names)
province_data_melt['province'] = province_data_melt['province'].replace("UNKNOWN","UNK")
province_data_melt

Unnamed: 0,date,province,cumulative_cases
0,2020-03-05,EC,0.0
1,2020-03-07,EC,0.0
2,2020-03-08,EC,0.0
3,2020-03-09,EC,0.0
4,2020-03-11,EC,0.0
...,...,...,...
495,2020-04-23,UNK,1.0
496,2020-04-24,UNK,0.0
497,2020-04-25,UNK,0.0
498,2020-04-26,UNK,0.0


### Daily

In [9]:
province_data_daily = province_data.copy()
province_data_daily.iloc[1:,1:] = province_data_daily.iloc[:,1:].diff().iloc[1:,:]
province_data_daily.tail()
# province_data_daily

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN
45,2020-04-23,40.0,0.0,28.0,49.0,0.0,0.0,0.0,1.0,200.0,0.0
46,2020-04-24,63.0,5.0,29.0,34.0,2.0,1.0,0.0,0.0,134.0,-1.0
47,2020-04-25,8.0,0.0,23.0,0.0,1.0,-1.0,0.0,3.0,101.0,0.0
48,2020-04-26,47.0,-1.0,27.0,22.0,1.0,0.0,1.0,0.0,94.0,0.0
49,2020-04-27,53.0,1.0,22.0,39.0,0.0,3.0,0.0,3.0,129.0,0.0


In [10]:
province_data_daily_melt = province_data_daily.melt(id_vars=['date'], var_name='province', 
                                                    value_name='daily_cases')
province_data_daily_melt.tail()

Unnamed: 0,date,province,daily_cases
495,2020-04-23,UNKNOWN,0.0
496,2020-04-24,UNKNOWN,-1.0
497,2020-04-25,UNKNOWN,0.0
498,2020-04-26,UNKNOWN,0.0
499,2020-04-27,UNKNOWN,0.0


In [11]:
province_names = {
#     "EC":"Eastern Cape",
#     "FS" : "Free State",
#     "GP" : "Gauteng",
#     "KZN" : "KwaZula-Natal",
#     "LP" : "Limpopo",
#     "MP" : "Mpumalanga",
#     "NW" : "North West",
#     "NC" : "Northern Cape",
#     "WC" : "Western Cape",
    "UNKNOWN": "UNK"
}
# province_data_daily_melt['province'] = province_data_daily_melt['province'].map(province_names)
province_data_daily_melt['province'] = province_data_daily_melt['province'].replace("UNKNOWN","UNK")
province_data_daily_melt

Unnamed: 0,date,province,daily_cases
0,2020-03-05,EC,0.0
1,2020-03-07,EC,0.0
2,2020-03-08,EC,0.0
3,2020-03-09,EC,0.0
4,2020-03-11,EC,0.0
...,...,...,...
495,2020-04-23,UNK,0.0
496,2020-04-24,UNK,-1.0
497,2020-04-25,UNK,0.0
498,2020-04-26,UNK,0.0


### Concatenate Cumulative & Daily

In [12]:
prov_cumulative_daily = pd.concat([province_data_melt, province_data_daily_melt['daily_cases']], axis =1)
prov_cumulative_daily.tail()

Unnamed: 0,date,province,cumulative_cases,daily_cases
495,2020-04-23,UNK,1.0,0.0
496,2020-04-24,UNK,0.0,-1.0
497,2020-04-25,UNK,0.0,0.0
498,2020-04-26,UNK,0.0,0.0
499,2020-04-27,UNK,0.0,0.0


**Save to csv**

In [13]:
prov_cumulative_daily.to_csv('data/daily_cumulative_confirmed_prov.csv', index=False)

## Tests

In [14]:
tests_data_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_timeline_testing.csv"

tests_data_req = requests.get(tests_data_url).content

tests_data = pd.read_csv(io.StringIO(tests_data_req.decode('utf-8')), delimiter = ',',
                         usecols=['date','cumulative_tests'])
tests_data.dropna(inplace=True)

So far tests_data only includes cumulative.

In [15]:
tests_data

Unnamed: 0,date,cumulative_tests
0,11-02-2020,61.0
1,13-02-2020,67.0
2,14-02-2020,71.0
3,19-02-2020,95.0
4,20-02-2020,106.0
5,24-02-2020,116.0
6,26-02-2020,121.0
7,02-03-2020,160.0
8,03-03-2020,164.0
9,06-03-2020,200.0


In [16]:
tests_data.to_csv('data/tests_data.csv', index=False)

### Daily

In [17]:
tests_data['daily_tests'] = tests_data['cumulative_tests']
# slice is required as first entry of diff will be NaN but data we want should be equal to the starting value
tests_data['daily_tests'][1:] = tests_data['cumulative_tests'].diff()[1:] 
tests_data

Unnamed: 0,date,cumulative_tests,daily_tests
0,11-02-2020,61.0,61.0
1,13-02-2020,67.0,6.0
2,14-02-2020,71.0,4.0
3,19-02-2020,95.0,24.0
4,20-02-2020,106.0,11.0
5,24-02-2020,116.0,10.0
6,26-02-2020,121.0,5.0
7,02-03-2020,160.0,39.0
8,03-03-2020,164.0,4.0
9,06-03-2020,200.0,36.0


#### Save to CSV

In [18]:
tests_data.to_csv('data/daily_cumulative_tests.csv', index=False)

## Confirmed Cases
Get no of cumulative cases from 'covid19za_provincial_cumulative_timeline_confirmed.csv'
### Cumulative

In [19]:
confirmed_data_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_confirmed.csv"

confirmed_data_req = requests.get(confirmed_data_url).content

confirmed_data = pd.read_csv(io.StringIO(confirmed_data_req.decode('utf-8')), usecols=['date','total'])

confirmed_data.tail()

Unnamed: 0,date,total
47,23-04-2020,3953
48,24-04-2020,4220
49,25-04-2020,4361
50,26-04-2020,4546
51,27-04-2020,4793


In [20]:
confirmed_data['date'] = pd.to_datetime(confirmed_data['date'], format='%d-%m-%Y')
confirmed_data.set_index('date', inplace=True)
confirmed_data.rename(columns={"total":"cumulative_cases"}, inplace = True)
confirmed_data.tail()

Unnamed: 0_level_0,cumulative_cases
date,Unnamed: 1_level_1
2020-04-23,3953
2020-04-24,4220
2020-04-25,4361
2020-04-26,4546
2020-04-27,4793


### Daily

In [21]:
confirmed_data['daily_cases'] = confirmed_data['cumulative_cases']
# slice is required as first entry of diff will be NaN but data we want should be equal to the starting value
confirmed_data['daily_cases'][1:] = confirmed_data['cumulative_cases'].diff()[1:] 
confirmed_data

Unnamed: 0_level_0,cumulative_cases,daily_cases
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-05,1,1.0
2020-03-07,2,1.0
2020-03-08,3,1.0
2020-03-09,7,4.0
2020-03-11,13,6.0
2020-03-12,16,3.0
2020-03-13,24,8.0
2020-03-14,38,14.0
2020-03-15,51,13.0
2020-03-16,62,11.0


**Save to csv**

In [22]:
confirmed_data.to_csv('data/daily_cumulative_confirmed.csv')

### Daily

In [23]:
# confirmed_data = confirmed_all_data.groupby(['date']).count()[['province']]
# confirmed_data.rename(columns={'province':'daily_cases'}, inplace = True)
# confirmed_data

### Cumulative

In [24]:
# confirmed_data['cumulative_cases'] = confirmed_data['daily_cases'].cumsum()
# confirmed_data

**Save to csv**

Above method is no longer being used. Thus why below is commented out.

In [25]:
# confirmed_data.to_csv('data/daily_cumulative_confirmed.csv')

**Use saved data rather than above**

In [26]:
# confirmed_data = pd.read_csv('data/daily_cumulative_confirmed.csv')
# confirmed_data['date'] = pd.to_datetime(confirmed_data['date'], format='%Y-%m-%d')
# confirmed_data.set_index('date', inplace=True)
# confirmed_data

**Add data to above**

Data to be added:

In [27]:
# # Format must be dd-mm-YYYY
# date_str = "29-03-2020"
# date_dt = pd.to_datetime(date_str, format='%d-%m-%Y')
# new_tot_cases = 1326

In [28]:
# new_daily_cases = new_tot_cases - confirmed_data.iloc[-1]['cumulative_cases']
# new_df_entry = pd.DataFrame({"date":[date_dt],
#              "daily_cases":[new_daily_cases],
#              "cumulative_cases":[new_tot_cases]}).set_index('date')
# confirmed_data = pd.concat([confirmed_data, new_df_entry])
# confirmed_data.tail()

**Save to csv**

In [29]:
# confirmed_data.to_csv('data/daily_cumulative_confirmed.csv')

## Confirmed & Tests
### Daily

In [30]:
confirmed_data_tmp = confirmed_data.reset_index()
confirmed_data_tmp['date'] = pd.to_datetime(confirmed_data_tmp['date'], format='%d-%m-%Y')
confirmed_data_tmp.set_index('date', inplace = True)
confirmed_data_tmp.tail()

Unnamed: 0_level_0,cumulative_cases,daily_cases
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-04-23,3953,318.0
2020-04-24,4220,267.0
2020-04-25,4361,141.0
2020-04-26,4546,185.0
2020-04-27,4793,247.0


In [31]:
tests_data_tmp = tests_data.copy()
# test_data_tmp.info()
tests_data_tmp['date'] = pd.to_datetime(tests_data_tmp['date'], format='%d-%m-%Y')
tests_data_tmp.set_index('date', inplace = True)
tests_data_tmp.tail()

Unnamed: 0_level_0,cumulative_tests,daily_tests
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-04-23,143570.0,9796.0
2020-04-24,152390.0,8820.0
2020-04-25,161004.0,8614.0
2020-04-26,168643.0,7639.0
2020-04-27,178470.0,9827.0


In [32]:
daily_tests_confirmed = pd.concat([confirmed_data_tmp[['daily_cases']], 
                                   tests_data_tmp['daily_tests']], axis = 1, sort = True)
daily_tests_confirmed['daily_cases'].fillna(0,inplace=True)
daily_tests_confirmed

Unnamed: 0_level_0,daily_cases,daily_tests
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-02-11,0.0,61.0
2020-02-13,0.0,6.0
2020-02-14,0.0,4.0
2020-02-19,0.0,24.0
2020-02-20,0.0,11.0
...,...,...
2020-04-23,318.0,9796.0
2020-04-24,267.0,8820.0
2020-04-25,141.0,8614.0
2020-04-26,185.0,7639.0


NaN for daily_tests corresponds to test data not available.
### Percentage of Positive Tests

In [33]:
daily_tests_confirmed['perc_positive'] = (daily_tests_confirmed['daily_cases']/
                                          daily_tests_confirmed['daily_tests']) * 100
daily_tests_confirmed['perc_positive'] = daily_tests_confirmed['perc_positive'].round(1)
daily_tests_confirmed

Unnamed: 0_level_0,daily_cases,daily_tests,perc_positive
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-02-11,0.0,61.0,0.0
2020-02-13,0.0,6.0,0.0
2020-02-14,0.0,4.0,0.0
2020-02-19,0.0,24.0,0.0
2020-02-20,0.0,11.0,0.0
...,...,...,...
2020-04-23,318.0,9796.0,3.2
2020-04-24,267.0,8820.0,3.0
2020-04-25,141.0,8614.0,1.6
2020-04-26,185.0,7639.0,2.4


**Save to csv**

In [34]:
daily_tests_confirmed.to_csv('data/daily_tests_confirmed.csv')

### Cumulative

In [35]:
cumulative_tests_confirmed = daily_tests_confirmed.cumsum()
cumulative_tests_confirmed

Unnamed: 0_level_0,daily_cases,daily_tests,perc_positive
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-02-11,0.0,61.0,0.0
2020-02-13,0.0,67.0,0.0
2020-02-14,0.0,71.0,0.0
2020-02-19,0.0,95.0,0.0
2020-02-20,0.0,106.0,0.0
...,...,...,...
2020-04-23,3953.0,143570.0,144.4
2020-04-24,4220.0,152390.0,147.4
2020-04-25,4361.0,161004.0,149.0
2020-04-26,4546.0,168643.0,151.4


// TODO - Rename to Cumulative as this is confusing

### Percentage of Positive Tests

In [36]:
cumulative_tests_confirmed['perc_positive'] = (cumulative_tests_confirmed['daily_cases']/
                                          cumulative_tests_confirmed['daily_tests']) * 100
cumulative_tests_confirmed['perc_positive'] = cumulative_tests_confirmed['perc_positive'].round(1)
cumulative_tests_confirmed

Unnamed: 0_level_0,daily_cases,daily_tests,perc_positive
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-02-11,0.0,61.0,0.0
2020-02-13,0.0,67.0,0.0
2020-02-14,0.0,71.0,0.0
2020-02-19,0.0,95.0,0.0
2020-02-20,0.0,106.0,0.0
...,...,...,...
2020-04-23,3953.0,143570.0,2.8
2020-04-24,4220.0,152390.0,2.8
2020-04-25,4361.0,161004.0,2.7
2020-04-26,4546.0,168643.0,2.7


**Save to csv**

In [37]:
cumulative_tests_confirmed.to_csv('data/cumulative_tests_confirmed.csv')

# Deaths & Recoveries
## Recoveries

In [38]:
recovered_data_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_timeline_testing.csv"
recovered_data_req = requests.get(recovered_data_url).content

recovered_data = pd.read_csv(io.StringIO(recovered_data_req.decode('utf-8')), delimiter = ',', 
                            usecols=['date','recovered'])
# rename recovered
recovered_data.rename(columns={"recovered":"cum_recovered"}, inplace=True)

# drop fields where recovered is 0
recovered_data.drop(recovered_data[recovered_data['cum_recovered']==0]['cum_recovered'].index, inplace=True)
recovered_data

Unnamed: 0,date,cum_recovered
22,22-03-2020,1
23,23-03-2020,1
24,24-03-2020,2
25,25-03-2020,4
26,26-03-2020,4
27,27-03-2020,31
28,28-03-2020,31
29,29-03-2020,31
30,30-03-2020,31
31,31-03-2020,31


In [39]:
recovered_data['date'] = pd.to_datetime(recovered_data['date'], format='%d-%m-%Y')
recovered_data.set_index('date', inplace = True)
recovered_data

Unnamed: 0_level_0,cum_recovered
date,Unnamed: 1_level_1
2020-03-22,1
2020-03-23,1
2020-03-24,2
2020-03-25,4
2020-03-26,4
2020-03-27,31
2020-03-28,31
2020-03-29,31
2020-03-30,31
2020-03-31,31


In [40]:
recovered_data['daily_recovered'] = recovered_data['cum_recovered']
# slice is required as first entry of diff will be NaN but data we want should be equal to the starting value
recovered_data['daily_recovered'][1:] = recovered_data['cum_recovered'].diff()[1:] 
recovered_data

Unnamed: 0_level_0,cum_recovered,daily_recovered
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-22,1,1.0
2020-03-23,1,0.0
2020-03-24,2,1.0
2020-03-25,4,2.0
2020-03-26,4,0.0
2020-03-27,31,27.0
2020-03-28,31,0.0
2020-03-29,31,0.0
2020-03-30,31,0.0
2020-03-31,31,0.0


**Save to csv**

In [41]:
recovered_data.to_csv('data/recovered_data.csv')

## Deaths

In [42]:
start_deaths_data_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_timeline_deaths.csv"
start_deaths_data_req = requests.get(start_deaths_data_url).content

start_deaths_data = pd.read_csv(io.StringIO(start_deaths_data_req.decode('utf-8')), delimiter = ',',
                            usecols=['date', 'province']).head(13)

start_deaths_data

Unnamed: 0,date,province
0,27-03-2020,WC
1,28-03-2020,KZN
2,30-03-2020,FS
3,31-03-2020,GP
4,31-03-2020,KZN
5,03-04-2020,KZN
6,03-04-2020,KZN
7,03-04-2020,KZN
8,03-04-2020,KZN
9,05-04-2020,WC


In [43]:
start_deaths_data['date'] = pd.to_datetime(start_deaths_data['date'], format='%d-%m-%Y')
start_deaths_data.tail()

Unnamed: 0,date,province
8,2020-04-03,KZN
9,2020-04-05,WC
10,2020-04-05,KZN
11,2020-04-06,WC
12,2020-04-07,KZN


In [44]:
start_deaths_data = start_deaths_data.groupby(['date']).count()
start_deaths_data.rename({"province":"daily_deaths"}, axis=1, inplace = True)
start_deaths_data

Unnamed: 0_level_0,daily_deaths
date,Unnamed: 1_level_1
2020-03-27,1
2020-03-28,1
2020-03-30,1
2020-03-31,2
2020-04-03,4
2020-04-05,2
2020-04-06,1
2020-04-07,1


In [45]:
start_deaths_data['cum_deaths'] = start_deaths_data[['daily_deaths']].cumsum()
start_deaths_data

Unnamed: 0_level_0,daily_deaths,cum_deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-27,1,1
2020-03-28,1,2
2020-03-30,1,3
2020-03-31,2,5
2020-04-03,4,9
2020-04-05,2,11
2020-04-06,1,12
2020-04-07,1,13


#### Deaths data from 8 April onwards
From https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_deaths.csv

In [46]:
deaths_data_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_deaths.csv"

deaths_data_req = requests.get(deaths_data_url).content

deaths_data = pd.read_csv(io.StringIO(deaths_data_req.decode('utf-8')), usecols=['date','total'])

deaths_data.tail()

Unnamed: 0,date,total
15,23-04-2020,75
16,24-04-2020,79
17,25-04-2020,86
18,26-04-2020,87
19,27-04-2020,90


In [47]:
deaths_data['date'] = pd.to_datetime(deaths_data['date'], format='%d-%m-%Y')
deaths_data.set_index('date', inplace=True)
deaths_data.rename(columns={"total":"cum_deaths"}, inplace = True)
deaths_data.tail()

Unnamed: 0_level_0,cum_deaths
date,Unnamed: 1_level_1
2020-04-23,75
2020-04-24,79
2020-04-25,86
2020-04-26,87
2020-04-27,90


### Daily

In [48]:
deaths_data.iloc[0]['cum_deaths'] - start_deaths_data.iloc[-1]['cum_deaths']

5

In [49]:
deaths_data['daily_deaths'] = deaths_data['cum_deaths'].diff()
deaths_data['daily_deaths'][0] = deaths_data.iloc[0]['cum_deaths'] - start_deaths_data.iloc[-1]['cum_deaths']
deaths_data.head()

Unnamed: 0_level_0,cum_deaths,daily_deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-04-08,18,5.0
2020-04-09,18,0.0
2020-04-10,24,6.0
2020-04-11,25,1.0
2020-04-12,25,0.0


**Concat missing data from before 8 April**

In [50]:
deaths_data=pd.concat([start_deaths_data, deaths_data])
deaths_data

Unnamed: 0_level_0,daily_deaths,cum_deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-27,1.0,1
2020-03-28,1.0,2
2020-03-30,1.0,3
2020-03-31,2.0,5
2020-04-03,4.0,9
2020-04-05,2.0,11
2020-04-06,1.0,12
2020-04-07,1.0,13
2020-04-08,5.0,18
2020-04-09,0.0,18


**Save to csv**

In [51]:
deaths_data.to_csv('data/daily_cum_deaths.csv')

### Death Per Province

In [52]:
deaths_data_prov_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_deaths.csv"
deaths_data_prov_req = requests.get(deaths_data_prov_url).content

deaths_prov_data = pd.read_csv(io.StringIO(deaths_data_prov_req.decode('utf-8'))).drop("YYYYMMDD", axis =1)
deaths_prov_data['date'] = pd.to_datetime(deaths_prov_data['date'], format='%d-%m-%Y')

deaths_prov_data.tail()

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN,total
15,2020-04-23,6,5,8,27,1,0,0,0,28,0,75
16,2020-04-24,6,5,8,29,1,0,0,0,30,0,79
17,2020-04-25,10,5,8,29,2,0,0,0,32,0,86
18,2020-04-26,10,5,8,29,2,0,0,0,33,0,87
19,2020-04-27,10,5,8,30,2,0,0,0,35,0,90


In [53]:
deaths_prov_data.drop(['total'], axis = 1, inplace = True)
deaths_prov_data.tail()

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN
15,2020-04-23,6,5,8,27,1,0,0,0,28,0
16,2020-04-24,6,5,8,29,1,0,0,0,30,0
17,2020-04-25,10,5,8,29,2,0,0,0,32,0
18,2020-04-26,10,5,8,29,2,0,0,0,33,0
19,2020-04-27,10,5,8,30,2,0,0,0,35,0


**Confirmed Data**

In [54]:
confirmed_data_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_confirmed.csv"

confirmed_data_req = requests.get(confirmed_data_url).content

confirmed_data = pd.read_csv(io.StringIO(confirmed_data_req.decode('utf-8')), usecols=['date','total'])

confirmed_data.tail()

Unnamed: 0,date,total
47,23-04-2020,3953
48,24-04-2020,4220
49,25-04-2020,4361
50,26-04-2020,4546
51,27-04-2020,4793


In [55]:
confirmed_data['date'] = pd.to_datetime(confirmed_data['date'], format='%d-%m-%Y')
confirmed_data.set_index('date', inplace=True)
confirmed_data.rename(columns={"total":"cumulative_cases"}, inplace = True)
confirmed_data.tail()

Unnamed: 0_level_0,cumulative_cases
date,Unnamed: 1_level_1
2020-04-23,3953
2020-04-24,4220
2020-04-25,4361
2020-04-26,4546
2020-04-27,4793


### Daily

In [56]:
confirmed_data['daily_cases'] = confirmed_data['cumulative_cases']
# slice is required as first entry of diff will be NaN but data we want should be equal to the starting value
confirmed_data['daily_cases'][1:] = confirmed_data['cumulative_cases'].diff()[1:] 
confirmed_data.head()

Unnamed: 0_level_0,cumulative_cases,daily_cases
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-05,1,1.0
2020-03-07,2,1.0
2020-03-08,3,1.0
2020-03-09,7,4.0
2020-03-11,13,6.0


**Save to csv**

In [57]:
confirmed_data.to_csv('data/daily_cumulative_confirmed.csv')

### Per Province

In [58]:
# deaths_data_per_day = deaths_data.groupby('date')[['province']].count()

### Total per day
#### Daily

In [59]:
# deaths_data_per_day = deaths_data.groupby('date')[['province']].count()
# deaths_data_per_day.rename(columns={"province":"daily_deaths"},inplace=True)
# deaths_data_per_day

Manually add entry

#### Cumulative

In [60]:
# deaths_data_per_day['cum_deaths']=deaths_data_per_day['daily_deaths'].cumsum()
# deaths_data_per_day

In [61]:
# def add_deaths(org_df, date_of_deaths, cumulative_deaths):
#     date_dt = pd.to_datetime(date_of_deaths, format='%d-%m-%Y') # Format must be dd-mm-YYYY
#     new_tot_deaths = cumulative_deaths

#     new_daily_deaths = new_tot_deaths - org_df.iloc[-1]['cum_deaths']
#     new_df_entry = pd.DataFrame({"date":[date_dt], 
#                              "daily_deaths":[new_daily_deaths],
#                              "cum_deaths":[new_tot_deaths],}).set_index('date')
#     new_df = pd.concat([org_df, new_df_entry])
#     return new_df

In [62]:
# # deaths_data_per_day = add_deaths(deaths_data_per_day, "17-04-2020", 50) # 17th April - 50 deaths
# # deaths_data_per_day = add_deaths(deaths_data_per_day, "18-04-2020", 52) # 18th April - 52 deaths
# deaths_data_per_day = add_deaths(deaths_data_per_day, "24-04-2020", 79)
# deaths_data_per_day = add_deaths(deaths_data_per_day, "25-04-2020", 86)

# deaths_data_per_day

**Save to csv**

In [63]:
# deaths_data_per_day.to_csv('data/daily_cum_deaths.csv')

#### Deaths Vs Recovered

In [64]:
deaths_vs_recoveries = pd.concat([deaths_data[['cum_deaths']], recovered_data['cum_recovered']], 
                                 axis =1)
deaths_vs_recoveries.iloc[0,0] = 0
deaths_vs_recoveries['cum_deaths'].ffill(inplace=True)
deaths_vs_recoveries['cum_recovered'].ffill(inplace=True)
deaths_vs_recoveries

Unnamed: 0_level_0,cum_deaths,cum_recovered
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-22,0.0,1
2020-03-23,0.0,1
2020-03-24,0.0,2
2020-03-25,0.0,4
2020-03-26,0.0,4
2020-03-27,1.0,31
2020-03-28,2.0,31
2020-03-29,2.0,31
2020-03-30,3.0,31
2020-03-31,5.0,31


**Save to csv**

In [65]:
deaths_vs_recoveries.to_csv('data/deaths_vs_recoveries.csv')

# Totals
## Province
### Confirmed Cases

In [66]:
province_names = {
    "EC":"Eastern Cape",
    "FS" : "Free State",
    "GP" : "Gauteng",
    "KZN" : "KwaZula-Natal",
    "LP" : "Limpopo",
    "MP" : "Mpumalanga",
    "NW" : "North West",
    "NC" : "Northern Cape",
    "WC" : "Western Cape",
    "UNKNOWN": "Unknown"
}

In [67]:
prov_totals = province_data.tail(1).copy()
prov_totals = prov_totals.melt(id_vars=['date'], var_name='province', value_name='total')
prov_totals['province'] = prov_totals['province'].map(province_names)
prov_totals.drop('date',axis=1,inplace=True)
prov_totals

Unnamed: 0,province,total
0,Eastern Cape,588.0
1,Free State,111.0
2,Gauteng,1353.0
3,KwaZula-Natal,902.0
4,Limpopo,31.0
5,Mpumalanga,26.0
6,Northern Cape,17.0
7,North West,31.0
8,Western Cape,1737.0
9,Unknown,0.0


**Save to csv**

In [68]:
prov_totals.to_csv('data/tot_provinces.csv',index=False)

### Deaths

In [69]:
tot_deaths_per_province = deaths_prov_data.tail(1)
tot_deaths_per_province = tot_deaths_per_province.melt(id_vars=['date'], var_name='province', 
                                              value_name='tot_deaths')
tot_deaths_per_province.drop(['date'], axis=1, inplace = True)
tot_deaths_per_province['province'] = tot_deaths_per_province['province'].map(province_names)
tot_deaths_per_province.set_index(['province'], inplace = True)
tot_deaths_per_province

Unnamed: 0_level_0,tot_deaths
province,Unnamed: 1_level_1
Eastern Cape,10
Free State,5
Gauteng,8
KwaZula-Natal,30
Limpopo,2
Mpumalanga,0
Northern Cape,0
North West,0
Western Cape,35
Unknown,0


**Save to csv**

In [70]:
tot_deaths_per_province.to_csv('data/tot_deaths_provinces.csv',index=True)

## Deaths Per Prov

In [71]:
# deaths_data_copy.drop(['gender','age'], axis =1, inplace=True)
# deaths_data_copy

#### Daily

In [72]:
# deaths_data_per_prov = deaths_data_copy.copy().fillna('Unknown')
# deaths_data_per_prov['total'] = 1
# # deaths_data_per_prov
# deaths_data_per_prov = deaths_data_per_prov.groupby(['date','province']).count()
# deaths_data_per_prov.tail()

In [73]:
# cum_deaths_data_per_prov = deaths_data_per_prov.groupby(level=-1)[['total']].cumsum()
# cum_deaths_data_per_prov.head()
# # deaths_data_per_prov
# # cum_deaths_data_per_prov.groupby(level=-1)['total'].cumsum()

## Data to be displayed as text on website

In [74]:
format(200003,',d').replace(","," ")

'200 003'

In [75]:
def zero_space(num):
    return format(num,',d').replace(","," ")

In [76]:
tot_tested = zero_space(tests_data.tail(1).iloc[0]['cumulative_tests'].astype(int))
change_tested = zero_space(tests_data.tail(1).iloc[0]['daily_tests'].astype(int))
print(tot_tested, change_tested)

178 470 9 827


In [77]:
tot_infected = zero_space(confirmed_data.tail(1).iloc[0]['cumulative_cases'].astype(int))
change_infected = zero_space(confirmed_data.tail(1).iloc[0]['daily_cases'].astype(int))
print(tot_infected, change_infected)

4 793 247


In [78]:
tot_deaths = zero_space(deaths_vs_recoveries.tail(1).iloc[0]['cum_deaths'].astype(int))
change_deaths = zero_space(deaths_vs_recoveries['cum_deaths'].diff().tail(1).iloc[0].astype(int))
print(tot_deaths, change_deaths)

90 3


In [79]:
tot_recoveries = zero_space(deaths_vs_recoveries.tail(1).iloc[0]['cum_recovered'].astype(int))
change_recoveries = zero_space(deaths_vs_recoveries['cum_recovered'].diff().tail(1).iloc[0].astype(int))
print(tot_recoveries, change_recoveries)

1 473 0


In [80]:
from datetime import datetime
now = datetime.now()
current_time = now.strftime("%H:%M %d %B %Y")
print("Current Time =", current_time)

Current Time = 15:01 28 April 2020


In [81]:
gen_data = pd.DataFrame(dict(tot_infected=[tot_infected], change_infected=[change_infected], 
                             tot_deaths=[tot_deaths], change_deaths=[change_deaths],
                             tot_tested=[tot_tested], change_tested=[change_tested], 
                             tot_recoveries=[tot_recoveries], change_recoveries=[change_recoveries], 
                             datetime_updated=[current_time]))
gen_data

Unnamed: 0,tot_infected,change_infected,tot_deaths,change_deaths,tot_tested,change_tested,tot_recoveries,change_recoveries,datetime_updated
0,4 793,247,90,3,178 470,9 827,1 473,0,15:01 28 April 2020


In [82]:
gen_data.to_csv("data/gen_data.csv", index=False)

# Data from Image
Use data_from_img.py code to load data from NICD infographic image specified

In [83]:
# import data_from_img

**Path of image to be processed:**

In [84]:
# img_path = "NICD_updates/NICD_Updates_04_25.jpg"

**Get data from image**

In [85]:
# date, prov_deaths_totals, prov_recovered_totals, gen_totals = data_from_img.get_tot_data(img_path)

### Province Names Dict

In [86]:
province_names = {
    "EC":"Eastern Cape",
    "FS" : "Free State",
    "GP" : "Gauteng",
    "KZN" : "KwaZula-Natal",
    "LP" : "Limpopo",
    "MP" : "Mpumalanga",
    "NW" : "North West",
    "NC" : "Northern Cape",
    "WC" : "Western Cape",
    "UNKNOWN": "Unknown"
}

### Deaths

In [87]:
# tot_deaths_per_province = deaths_prov_data.tail(1)
# tot_deaths_per_province = tot_deaths_per_province.melt(id_vars=['date'], var_name='province', 
#                                               value_name='tot_deaths')
# tot_deaths_per_province.drop(['date'], axis=1, inplace = True)
# tot_deaths_per_province['province'] = tot_deaths_per_province['province'].map(province_names)
# tot_deaths_per_province.set_index(['province'], inplace = True)
# tot_deaths_per_province

**Save to csv**

In [88]:
# tot_deaths_per_province.to_csv('data/tot_deaths_provinces.csv',index=True)

### Recovered

In [89]:
# prov_recovered_totals_df = pd.DataFrame([prov_recovered_totals])
# prov_recovered_totals_df['date'] = date
# prov_recovered_totals_df =prov_recovered_totals_df.melt(id_vars=['date'], var_name='province', 
#                                                   value_name='recovered').set_index(['date'])
# prov_recovered_totals_df['province'] = prov_recovered_totals_df['province'].map(province_names)
# prov_recovered_totals_df

**Save to csv**

In [90]:
# prov_recovered_totals_df.to_csv('data/tot_recovered_provinces.csv',index=True)