# Data Preprocessing

In [406]:
# Enable Intellisense
%config IPCompleter.greedy=True

import pandas as pd
import numpy as np
import io
import requests
import seaborn as sns
from matplotlib import pyplot, dates
import plotly.express as px 
import datetime
import plotly.graph_objects as go
import plotly.io as pio
sns.set()

# Annoying warning
pd.options.mode.chained_assignment = None  # default='warn'

# Gen Data
## All Confirm

**No longer going to use above data**

# Over time

## Provinces
Taken from 'covid19za_provincial_cumulative_timeline_confirmed.csv'
### Cumulative

In [407]:
province_data_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_confirmed.csv"

province_data_req = requests.get(province_data_url).content

province_data = pd.read_csv(io.StringIO(province_data_req.decode('utf-8')), delimiter = ',')
province_data.drop(['YYYYMMDD'], axis = 1, inplace=True)
province_data.tail()

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN,total,source
50,26-04-2020,535.0,110.0,1331.0,863.0,31.0,23.0,17.0,28.0,1608.0,0.0,4546,https://twitter.com/DrZweliMkhize/status/12544...
51,27-04-2020,588.0,111.0,1353.0,902.0,31.0,26.0,17.0,31.0,1737.0,0.0,4793,https://twitter.com/drzwelimkhize/status/12548...
52,28-04-2020,616.0,113.0,1377.0,919.0,31.0,26.0,17.0,29.0,1870.0,0.0,4996,https://twitter.com/COVID_19_ZA/status/1255200...
53,29-04-2020,630.0,113.0,1408.0,956.0,31.0,31.0,17.0,29.0,2135.0,0.0,5350,https://sacoronavirus.co.za/2020/04/29/update-...
54,30-04-2020,647.0,116.0,1446.0,980.0,32.0,36.0,17.0,31.0,2342.0,0.0,5647,


In [408]:
province_data.drop(['total','source'],axis=1, inplace = True)
province_data.dropna(inplace=True)
province_data.tail()

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN
50,26-04-2020,535.0,110.0,1331.0,863.0,31.0,23.0,17.0,28.0,1608.0,0.0
51,27-04-2020,588.0,111.0,1353.0,902.0,31.0,26.0,17.0,31.0,1737.0,0.0
52,28-04-2020,616.0,113.0,1377.0,919.0,31.0,26.0,17.0,29.0,1870.0,0.0
53,29-04-2020,630.0,113.0,1408.0,956.0,31.0,31.0,17.0,29.0,2135.0,0.0
54,30-04-2020,647.0,116.0,1446.0,980.0,32.0,36.0,17.0,31.0,2342.0,0.0


In [409]:
province_data.to_csv('data/daily_prov.csv', index = False)

In [410]:
province_data = pd.read_csv('data/daily_prov.csv')
province_data.tail()

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN
48,26-04-2020,535.0,110.0,1331.0,863.0,31.0,23.0,17.0,28.0,1608.0,0.0
49,27-04-2020,588.0,111.0,1353.0,902.0,31.0,26.0,17.0,31.0,1737.0,0.0
50,28-04-2020,616.0,113.0,1377.0,919.0,31.0,26.0,17.0,29.0,1870.0,0.0
51,29-04-2020,630.0,113.0,1408.0,956.0,31.0,31.0,17.0,29.0,2135.0,0.0
52,30-04-2020,647.0,116.0,1446.0,980.0,32.0,36.0,17.0,31.0,2342.0,0.0


In [411]:
province_data['date'] = pd.to_datetime(province_data['date'], format='%d-%m-%Y')

In [412]:
province_data_melt = province_data.melt(id_vars=['date'], var_name='province', value_name='cumulative_cases')
province_data_melt

Unnamed: 0,date,province,cumulative_cases
0,2020-03-05,EC,0.0
1,2020-03-07,EC,0.0
2,2020-03-08,EC,0.0
3,2020-03-09,EC,0.0
4,2020-03-11,EC,0.0
...,...,...,...
525,2020-04-26,UNKNOWN,0.0
526,2020-04-27,UNKNOWN,0.0
527,2020-04-28,UNKNOWN,0.0
528,2020-04-29,UNKNOWN,0.0


In [413]:
province_names = {
#     "EC":"Eastern Cape",
#     "FS" : "Free State",
#     "GP" : "Gauteng",
#     "KZN" : "KwaZula-Natal",
#     "LP" : "Limpopo",
#     "MP" : "Mpumalanga",
#     "NW" : "North West",
#     "NC" : "Northern Cape",
#     "WC" : "Western Cape",
    "UNKNOWN": "UNK"
}
# province_data_melt['province'] = province_data_melt['province'].map(province_names)
province_data_melt['province'] = province_data_melt['province'].replace("UNKNOWN","UNK")
province_data_melt

Unnamed: 0,date,province,cumulative_cases
0,2020-03-05,EC,0.0
1,2020-03-07,EC,0.0
2,2020-03-08,EC,0.0
3,2020-03-09,EC,0.0
4,2020-03-11,EC,0.0
...,...,...,...
525,2020-04-26,UNK,0.0
526,2020-04-27,UNK,0.0
527,2020-04-28,UNK,0.0
528,2020-04-29,UNK,0.0


### Daily

In [414]:
province_data_daily = province_data.copy()
province_data_daily.iloc[1:,1:] = province_data_daily.iloc[:,1:].diff().iloc[1:,:]
province_data_daily.tail()
# province_data_daily

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN
48,2020-04-26,47.0,-1.0,27.0,22.0,1.0,0.0,1.0,0.0,94.0,0.0
49,2020-04-27,53.0,1.0,22.0,39.0,0.0,3.0,0.0,3.0,129.0,0.0
50,2020-04-28,28.0,2.0,24.0,17.0,0.0,0.0,0.0,-2.0,133.0,0.0
51,2020-04-29,14.0,0.0,31.0,37.0,0.0,5.0,0.0,0.0,265.0,0.0
52,2020-04-30,17.0,3.0,38.0,24.0,1.0,5.0,0.0,2.0,207.0,0.0


In [415]:
province_data_daily_melt = province_data_daily.melt(id_vars=['date'], var_name='province', 
                                                    value_name='daily_cases')
province_data_daily_melt.tail()

Unnamed: 0,date,province,daily_cases
525,2020-04-26,UNKNOWN,0.0
526,2020-04-27,UNKNOWN,0.0
527,2020-04-28,UNKNOWN,0.0
528,2020-04-29,UNKNOWN,0.0
529,2020-04-30,UNKNOWN,0.0


In [416]:
province_names = {
#     "EC":"Eastern Cape",
#     "FS" : "Free State",
#     "GP" : "Gauteng",
#     "KZN" : "KwaZula-Natal",
#     "LP" : "Limpopo",
#     "MP" : "Mpumalanga",
#     "NW" : "North West",
#     "NC" : "Northern Cape",
#     "WC" : "Western Cape",
    "UNKNOWN": "UNK"
}
# province_data_daily_melt['province'] = province_data_daily_melt['province'].map(province_names)
province_data_daily_melt['province'] = province_data_daily_melt['province'].replace("UNKNOWN","UNK")
province_data_daily_melt

Unnamed: 0,date,province,daily_cases
0,2020-03-05,EC,0.0
1,2020-03-07,EC,0.0
2,2020-03-08,EC,0.0
3,2020-03-09,EC,0.0
4,2020-03-11,EC,0.0
...,...,...,...
525,2020-04-26,UNK,0.0
526,2020-04-27,UNK,0.0
527,2020-04-28,UNK,0.0
528,2020-04-29,UNK,0.0


### Concatenate Cumulative & Daily

In [417]:
prov_cumulative_daily = pd.concat([province_data_melt, province_data_daily_melt['daily_cases']], axis =1)
prov_cumulative_daily.tail()

Unnamed: 0,date,province,cumulative_cases,daily_cases
525,2020-04-26,UNK,0.0,0.0
526,2020-04-27,UNK,0.0,0.0
527,2020-04-28,UNK,0.0,0.0
528,2020-04-29,UNK,0.0,0.0
529,2020-04-30,UNK,0.0,0.0


**Save to csv**

In [418]:
prov_cumulative_daily.to_csv('data/daily_cumulative_confirmed_prov.csv', index=False)

## Tests

In [419]:
tests_data_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_timeline_testing.csv"

tests_data_req = requests.get(tests_data_url).content

tests_data = pd.read_csv(io.StringIO(tests_data_req.decode('utf-8')), delimiter = ',',
                         usecols=['date','cumulative_tests'])
tests_data.dropna(inplace=True)

So far tests_data only includes cumulative.

In [420]:
tests_data

Unnamed: 0,date,cumulative_tests
0,11-02-2020,61.0
1,13-02-2020,67.0
2,14-02-2020,71.0
3,19-02-2020,95.0
4,20-02-2020,106.0
5,24-02-2020,116.0
6,26-02-2020,121.0
7,02-03-2020,160.0
8,03-03-2020,164.0
9,06-03-2020,200.0


In [421]:
tests_data.to_csv('data/tests_data.csv', index=False)

### Daily

In [422]:
tests_data['daily_tests'] = tests_data['cumulative_tests']
# slice is required as first entry of diff will be NaN but data we want should be equal to the starting value
tests_data['daily_tests'][1:] = tests_data['cumulative_tests'].diff()[1:] 
tests_data

Unnamed: 0,date,cumulative_tests,daily_tests
0,11-02-2020,61.0,61.0
1,13-02-2020,67.0,6.0
2,14-02-2020,71.0,4.0
3,19-02-2020,95.0,24.0
4,20-02-2020,106.0,11.0
5,24-02-2020,116.0,10.0
6,26-02-2020,121.0,5.0
7,02-03-2020,160.0,39.0
8,03-03-2020,164.0,4.0
9,06-03-2020,200.0,36.0


#### Save to CSV

In [423]:
tests_data.to_csv('data/daily_cumulative_tests.csv', index=False)

## Confirmed Cases
Get no of cumulative cases from 'covid19za_provincial_cumulative_timeline_confirmed.csv'
### Cumulative

In [424]:
confirmed_data_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_confirmed.csv"

confirmed_data_req = requests.get(confirmed_data_url).content

confirmed_data = pd.read_csv(io.StringIO(confirmed_data_req.decode('utf-8')), usecols=['date','total'])

confirmed_data.tail()

Unnamed: 0,date,total
50,26-04-2020,4546
51,27-04-2020,4793
52,28-04-2020,4996
53,29-04-2020,5350
54,30-04-2020,5647


In [425]:
confirmed_data['date'] = pd.to_datetime(confirmed_data['date'], format='%d-%m-%Y')
confirmed_data.set_index('date', inplace=True)
confirmed_data.rename(columns={"total":"cumulative_cases"}, inplace = True)
confirmed_data.tail()

Unnamed: 0_level_0,cumulative_cases
date,Unnamed: 1_level_1
2020-04-26,4546
2020-04-27,4793
2020-04-28,4996
2020-04-29,5350
2020-04-30,5647


### Daily

In [426]:
confirmed_data['daily_cases'] = confirmed_data['cumulative_cases']
# slice is required as first entry of diff will be NaN but data we want should be equal to the starting value
confirmed_data['daily_cases'][1:] = confirmed_data['cumulative_cases'].diff()[1:] 
confirmed_data

Unnamed: 0_level_0,cumulative_cases,daily_cases
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-05,1,1.0
2020-03-07,2,1.0
2020-03-08,3,1.0
2020-03-09,7,4.0
2020-03-11,13,6.0
2020-03-12,16,3.0
2020-03-13,24,8.0
2020-03-14,38,14.0
2020-03-15,51,13.0
2020-03-16,62,11.0


**Save to csv**

In [427]:
confirmed_data.to_csv('data/daily_cumulative_confirmed.csv')

### Daily

In [428]:
# confirmed_data = confirmed_all_data.groupby(['date']).count()[['province']]
# confirmed_data.rename(columns={'province':'daily_cases'}, inplace = True)
# confirmed_data

### Cumulative

In [429]:
# confirmed_data['cumulative_cases'] = confirmed_data['daily_cases'].cumsum()
# confirmed_data

**Save to csv**

Above method is no longer being used. Thus why below is commented out.

In [430]:
# confirmed_data.to_csv('data/daily_cumulative_confirmed.csv')

**Use saved data rather than above**

In [431]:
# confirmed_data = pd.read_csv('data/daily_cumulative_confirmed.csv')
# confirmed_data['date'] = pd.to_datetime(confirmed_data['date'], format='%Y-%m-%d')
# confirmed_data.set_index('date', inplace=True)
# confirmed_data

**Add data to above**

Data to be added:

In [432]:
# # Format must be dd-mm-YYYY
# date_str = "29-03-2020"
# date_dt = pd.to_datetime(date_str, format='%d-%m-%Y')
# new_tot_cases = 1326

In [433]:
# new_daily_cases = new_tot_cases - confirmed_data.iloc[-1]['cumulative_cases']
# new_df_entry = pd.DataFrame({"date":[date_dt],
#              "daily_cases":[new_daily_cases],
#              "cumulative_cases":[new_tot_cases]}).set_index('date')
# confirmed_data = pd.concat([confirmed_data, new_df_entry])
# confirmed_data.tail()

**Save to csv**

In [434]:
# confirmed_data.to_csv('data/daily_cumulative_confirmed.csv')

## Confirmed & Tests
### Daily

In [435]:
confirmed_data_tmp = confirmed_data.reset_index()
confirmed_data_tmp['date'] = pd.to_datetime(confirmed_data_tmp['date'], format='%d-%m-%Y')
confirmed_data_tmp.set_index('date', inplace = True)
confirmed_data_tmp.tail()

Unnamed: 0_level_0,cumulative_cases,daily_cases
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-04-26,4546,185.0
2020-04-27,4793,247.0
2020-04-28,4996,203.0
2020-04-29,5350,354.0
2020-04-30,5647,297.0


In [436]:
tests_data_tmp = tests_data.copy()
# test_data_tmp.info()
tests_data_tmp['date'] = pd.to_datetime(tests_data_tmp['date'], format='%d-%m-%Y')
tests_data_tmp.set_index('date', inplace = True)
tests_data_tmp.tail()

Unnamed: 0_level_0,cumulative_tests,daily_tests
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-04-26,168643.0,7639.0
2020-04-27,178470.0,9827.0
2020-04-28,185497.0,7027.0
2020-04-29,197127.0,11630.0
2020-04-30,207530.0,10403.0


In [437]:
daily_tests_confirmed = pd.concat([confirmed_data_tmp[['daily_cases']], 
                                   tests_data_tmp['daily_tests']], axis = 1, sort = True)
daily_tests_confirmed['daily_cases'].fillna(0,inplace=True)
daily_tests_confirmed

Unnamed: 0_level_0,daily_cases,daily_tests
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-02-11,0.0,61.0
2020-02-13,0.0,6.0
2020-02-14,0.0,4.0
2020-02-19,0.0,24.0
2020-02-20,0.0,11.0
...,...,...
2020-04-26,185.0,7639.0
2020-04-27,247.0,9827.0
2020-04-28,203.0,7027.0
2020-04-29,354.0,11630.0


NaN for daily_tests corresponds to test data not available.
### Percentage of Positive Tests

In [438]:
daily_tests_confirmed['perc_positive'] = (daily_tests_confirmed['daily_cases']/
                                          daily_tests_confirmed['daily_tests']) * 100
daily_tests_confirmed['perc_positive'] = daily_tests_confirmed['perc_positive'].round(1)
daily_tests_confirmed

Unnamed: 0_level_0,daily_cases,daily_tests,perc_positive
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-02-11,0.0,61.0,0.0
2020-02-13,0.0,6.0,0.0
2020-02-14,0.0,4.0,0.0
2020-02-19,0.0,24.0,0.0
2020-02-20,0.0,11.0,0.0
...,...,...,...
2020-04-26,185.0,7639.0,2.4
2020-04-27,247.0,9827.0,2.5
2020-04-28,203.0,7027.0,2.9
2020-04-29,354.0,11630.0,3.0


**Save to csv**

In [439]:
daily_tests_confirmed.to_csv('data/daily_tests_confirmed.csv')

### Cumulative

In [440]:
cumulative_tests_confirmed = daily_tests_confirmed.cumsum()
cumulative_tests_confirmed

Unnamed: 0_level_0,daily_cases,daily_tests,perc_positive
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-02-11,0.0,61.0,0.0
2020-02-13,0.0,67.0,0.0
2020-02-14,0.0,71.0,0.0
2020-02-19,0.0,95.0,0.0
2020-02-20,0.0,106.0,0.0
...,...,...,...
2020-04-26,4546.0,168643.0,151.4
2020-04-27,4793.0,178470.0,153.9
2020-04-28,4996.0,185497.0,156.8
2020-04-29,5350.0,197127.0,159.8


// TODO - Rename to Cumulative as this is confusing

### Percentage of Positive Tests

In [441]:
cumulative_tests_confirmed['perc_positive'] = (cumulative_tests_confirmed['daily_cases']/
                                          cumulative_tests_confirmed['daily_tests']) * 100
cumulative_tests_confirmed['perc_positive'] = cumulative_tests_confirmed['perc_positive'].round(1)
cumulative_tests_confirmed

Unnamed: 0_level_0,daily_cases,daily_tests,perc_positive
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-02-11,0.0,61.0,0.0
2020-02-13,0.0,67.0,0.0
2020-02-14,0.0,71.0,0.0
2020-02-19,0.0,95.0,0.0
2020-02-20,0.0,106.0,0.0
...,...,...,...
2020-04-26,4546.0,168643.0,2.7
2020-04-27,4793.0,178470.0,2.7
2020-04-28,4996.0,185497.0,2.7
2020-04-29,5350.0,197127.0,2.7


**Save to csv**

In [442]:
cumulative_tests_confirmed.to_csv('data/cumulative_tests_confirmed.csv')

# Deaths & Recoveries
## Recoveries

In [443]:
recovered_data_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_timeline_testing.csv"
recovered_data_req = requests.get(recovered_data_url).content

recovered_data = pd.read_csv(io.StringIO(recovered_data_req.decode('utf-8')), delimiter = ',', 
                            usecols=['date','recovered'])
# rename recovered
recovered_data.rename(columns={"recovered":"cum_recovered"}, inplace=True)

# drop fields where recovered is 0
recovered_data.drop(recovered_data[recovered_data['cum_recovered']==0]['cum_recovered'].index, inplace=True)
recovered_data

Unnamed: 0,date,cum_recovered
22,22-03-2020,1
23,23-03-2020,1
24,24-03-2020,2
25,25-03-2020,4
26,26-03-2020,4
27,27-03-2020,31
28,28-03-2020,31
29,29-03-2020,31
30,30-03-2020,31
31,31-03-2020,31


In [444]:
recovered_data['date'] = pd.to_datetime(recovered_data['date'], format='%d-%m-%Y')
recovered_data.set_index('date', inplace = True)
recovered_data

Unnamed: 0_level_0,cum_recovered
date,Unnamed: 1_level_1
2020-03-22,1
2020-03-23,1
2020-03-24,2
2020-03-25,4
2020-03-26,4
2020-03-27,31
2020-03-28,31
2020-03-29,31
2020-03-30,31
2020-03-31,31


In [445]:
recovered_data['daily_recovered'] = recovered_data['cum_recovered']
# slice is required as first entry of diff will be NaN but data we want should be equal to the starting value
recovered_data['daily_recovered'][1:] = recovered_data['cum_recovered'].diff()[1:] 
recovered_data

Unnamed: 0_level_0,cum_recovered,daily_recovered
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-22,1,1.0
2020-03-23,1,0.0
2020-03-24,2,1.0
2020-03-25,4,2.0
2020-03-26,4,0.0
2020-03-27,31,27.0
2020-03-28,31,0.0
2020-03-29,31,0.0
2020-03-30,31,0.0
2020-03-31,31,0.0


**Save to csv**

In [446]:
recovered_data.to_csv('data/recovered_data.csv')

## Deaths

In [447]:
start_deaths_data_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_timeline_deaths.csv"
start_deaths_data_req = requests.get(start_deaths_data_url).content

start_deaths_data = pd.read_csv(io.StringIO(start_deaths_data_req.decode('utf-8')), delimiter = ',',
                            usecols=['date', 'province']).head(13)

start_deaths_data

Unnamed: 0,date,province
0,27-03-2020,WC
1,28-03-2020,KZN
2,30-03-2020,FS
3,31-03-2020,GP
4,31-03-2020,KZN
5,03-04-2020,KZN
6,03-04-2020,KZN
7,03-04-2020,KZN
8,03-04-2020,KZN
9,05-04-2020,WC


In [448]:
start_deaths_data['date'] = pd.to_datetime(start_deaths_data['date'], format='%d-%m-%Y')
start_deaths_data.tail()

Unnamed: 0,date,province
8,2020-04-03,KZN
9,2020-04-05,WC
10,2020-04-05,KZN
11,2020-04-06,WC
12,2020-04-07,KZN


In [449]:
start_deaths_data = start_deaths_data.groupby(['date']).count()
start_deaths_data.rename({"province":"daily_deaths"}, axis=1, inplace = True)
start_deaths_data

Unnamed: 0_level_0,daily_deaths
date,Unnamed: 1_level_1
2020-03-27,1
2020-03-28,1
2020-03-30,1
2020-03-31,2
2020-04-03,4
2020-04-05,2
2020-04-06,1
2020-04-07,1


In [450]:
start_deaths_data['cum_deaths'] = start_deaths_data[['daily_deaths']].cumsum()
start_deaths_data

Unnamed: 0_level_0,daily_deaths,cum_deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-27,1,1
2020-03-28,1,2
2020-03-30,1,3
2020-03-31,2,5
2020-04-03,4,9
2020-04-05,2,11
2020-04-06,1,12
2020-04-07,1,13


#### Deaths data from 8 April onwards
From https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_deaths.csv

In [451]:
deaths_data_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_deaths.csv"

deaths_data_req = requests.get(deaths_data_url).content

deaths_data = pd.read_csv(io.StringIO(deaths_data_req.decode('utf-8')), usecols=['date','total'])

deaths_data.tail()

Unnamed: 0,date,total
26,26-04-2020,87
27,27-04-2020,90
28,28-04-2020,93
29,29-04-2020,103
30,30-04-2020,103


In [452]:
deaths_data['date'] = pd.to_datetime(deaths_data['date'], format='%d-%m-%Y')
deaths_data.set_index('date', inplace=True)
deaths_data.rename(columns={"total":"cum_deaths"}, inplace = True)
deaths_data.tail()

Unnamed: 0_level_0,cum_deaths
date,Unnamed: 1_level_1
2020-04-26,87
2020-04-27,90
2020-04-28,93
2020-04-29,103
2020-04-30,103


### Daily

In [453]:
# deaths_data.iloc[0]['cum_deaths'] - start_deaths_data.iloc[-1]['cum_deaths']

In [454]:
# deaths_data['daily_deaths'] = deaths_data['cum_deaths'].diff()
# deaths_data['daily_deaths'][0] = deaths_data.iloc[0]['cum_deaths'] - start_deaths_data.iloc[-1]['cum_deaths']
# deaths_data.head()

**Concat missing data from before 8 April**

In [455]:
# deaths_data=pd.concat([start_deaths_data, deaths_data])
# deaths_data

**Save to csv**

In [456]:
deaths_data.to_csv('data/daily_cum_deaths.csv')

### Death Per Province

In [457]:
deaths_data_prov_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_deaths.csv"
deaths_data_prov_req = requests.get(deaths_data_prov_url).content

deaths_prov_data = pd.read_csv(io.StringIO(deaths_data_prov_req.decode('utf-8'))).drop("YYYYMMDD", axis =1)
deaths_prov_data['date'] = pd.to_datetime(deaths_prov_data['date'], format='%d-%m-%Y')

deaths_prov_data.tail()

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN,total
26,2020-04-26,10,5,8,29,2,0,0,0,33,0,87
27,2020-04-27,10,5,8,30,2,0,0,0,35,0,90
28,2020-04-28,10,5,8,30,2,0,0,0,38,0,93
29,2020-04-29,11,5,11,32,2,0,0,0,42,0,103
30,2020-04-30,11,5,11,32,2,0,0,0,42,0,103


In [458]:
deaths_prov_data.drop(['total'], axis = 1, inplace = True)
deaths_prov_data.tail()

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN
26,2020-04-26,10,5,8,29,2,0,0,0,33,0
27,2020-04-27,10,5,8,30,2,0,0,0,35,0
28,2020-04-28,10,5,8,30,2,0,0,0,38,0
29,2020-04-29,11,5,11,32,2,0,0,0,42,0
30,2020-04-30,11,5,11,32,2,0,0,0,42,0


**Confirmed Data**

In [459]:
confirmed_data_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_confirmed.csv"

confirmed_data_req = requests.get(confirmed_data_url).content

confirmed_data = pd.read_csv(io.StringIO(confirmed_data_req.decode('utf-8')), usecols=['date','total'])

confirmed_data.tail()

Unnamed: 0,date,total
50,26-04-2020,4546
51,27-04-2020,4793
52,28-04-2020,4996
53,29-04-2020,5350
54,30-04-2020,5647


In [460]:
confirmed_data['date'] = pd.to_datetime(confirmed_data['date'], format='%d-%m-%Y')
confirmed_data.set_index('date', inplace=True)
confirmed_data.rename(columns={"total":"cumulative_cases"}, inplace = True)
confirmed_data.tail()

Unnamed: 0_level_0,cumulative_cases
date,Unnamed: 1_level_1
2020-04-26,4546
2020-04-27,4793
2020-04-28,4996
2020-04-29,5350
2020-04-30,5647


### Daily

In [461]:
confirmed_data['daily_cases'] = confirmed_data['cumulative_cases']
# slice is required as first entry of diff will be NaN but data we want should be equal to the starting value
confirmed_data['daily_cases'][1:] = confirmed_data['cumulative_cases'].diff()[1:] 
confirmed_data.head()

Unnamed: 0_level_0,cumulative_cases,daily_cases
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-05,1,1.0
2020-03-07,2,1.0
2020-03-08,3,1.0
2020-03-09,7,4.0
2020-03-11,13,6.0


**Save to csv**

In [462]:
confirmed_data.to_csv('data/daily_cumulative_confirmed.csv')

### Per Province

In [463]:
# deaths_data_per_day = deaths_data.groupby('date')[['province']].count()

### Total per day
#### Daily

In [464]:
# deaths_data_per_day = deaths_data.groupby('date')[['province']].count()
# deaths_data_per_day.rename(columns={"province":"daily_deaths"},inplace=True)
# deaths_data_per_day

Manually add entry

#### Cumulative

In [465]:
# deaths_data_per_day['cum_deaths']=deaths_data_per_day['daily_deaths'].cumsum()
# deaths_data_per_day

In [466]:
def add_deaths(org_df, date_of_deaths, cumulative_deaths):
    date_dt = pd.to_datetime(date_of_deaths, format='%d-%m-%Y') # Format must be dd-mm-YYYY
    new_tot_deaths = cumulative_deaths

    new_daily_deaths = new_tot_deaths - org_df.iloc[-1]['cum_deaths']
    new_df_entry = pd.DataFrame({"date":[date_dt], 
                             "daily_deaths":[new_daily_deaths],
                             "cum_deaths":[new_tot_deaths],}).set_index('date')
    new_df = pd.concat([org_df, new_df_entry])
    return new_df

In [467]:
# # deaths_data_per_day = add_deaths(deaths_data_per_day, "17-04-2020", 50) # 17th April - 50 deaths
# # deaths_data_per_day = add_deaths(deaths_data_per_day, "18-04-2020", 52) # 18th April - 52 deaths
# deaths_data = add_deaths(deaths_data, "29-04-2020", 103)

# deaths_data

**Save to csv**

In [468]:
deaths_data

Unnamed: 0_level_0,cum_deaths
date,Unnamed: 1_level_1
2020-03-27,1
2020-03-28,2
2020-03-30,3
2020-03-31,5
2020-04-03,9
2020-04-05,11
2020-04-06,12
2020-04-07,13
2020-04-08,18
2020-04-09,18


In [469]:
deaths_data.to_csv('data/daily_cum_deaths.csv')

#### Deaths Vs Recovered

In [470]:
deaths_vs_recoveries = pd.concat([deaths_data[['cum_deaths']], recovered_data['cum_recovered']], 
                                 axis =1)
deaths_vs_recoveries.iloc[0,0] = 0
deaths_vs_recoveries['cum_deaths'].ffill(inplace=True)
deaths_vs_recoveries['cum_recovered'].ffill(inplace=True)
deaths_vs_recoveries

Unnamed: 0_level_0,cum_deaths,cum_recovered
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-22,0.0,1
2020-03-23,0.0,1
2020-03-24,0.0,2
2020-03-25,0.0,4
2020-03-26,0.0,4
2020-03-27,1.0,31
2020-03-28,2.0,31
2020-03-29,2.0,31
2020-03-30,3.0,31
2020-03-31,5.0,31


**Save to csv**

In [471]:
deaths_vs_recoveries.to_csv('data/deaths_vs_recoveries.csv')

# Totals
## Province
### Confirmed Cases

In [472]:
province_names = {
    "EC":"Eastern Cape",
    "FS" : "Free State",
    "GP" : "Gauteng",
    "KZN" : "KwaZulu-Natal",
    "LP" : "Limpopo",
    "MP" : "Mpumalanga",
    "NW" : "North West",
    "NC" : "Northern Cape",
    "WC" : "Western Cape",
    "UNKNOWN": "Unknown"
}

In [473]:
prov_totals = province_data.tail(1).copy()
prov_totals = prov_totals.melt(id_vars=['date'], var_name='province', value_name='total')
prov_totals['province'] = prov_totals['province'].map(province_names)
prov_totals.drop('date',axis=1,inplace=True)
prov_totals

Unnamed: 0,province,total
0,Eastern Cape,647.0
1,Free State,116.0
2,Gauteng,1446.0
3,KwaZulu-Natal,980.0
4,Limpopo,32.0
5,Mpumalanga,36.0
6,Northern Cape,17.0
7,North West,31.0
8,Western Cape,2342.0
9,Unknown,0.0


**Save to csv**

In [474]:
prov_totals.to_csv('data/tot_provinces.csv',index=False)

### Deaths

In [475]:
tot_deaths_per_province = deaths_prov_data.tail(1)
tot_deaths_per_province = tot_deaths_per_province.melt(id_vars=['date'], var_name='province', 
                                              value_name='tot_deaths')
tot_deaths_per_province.drop(['date'], axis=1, inplace = True)
tot_deaths_per_province['province'] = tot_deaths_per_province['province'].map(province_names)
tot_deaths_per_province.set_index(['province'], inplace = True)
tot_deaths_per_province

Unnamed: 0_level_0,tot_deaths
province,Unnamed: 1_level_1
Eastern Cape,11
Free State,5
Gauteng,11
KwaZulu-Natal,32
Limpopo,2
Mpumalanga,0
Northern Cape,0
North West,0
Western Cape,42
Unknown,0


**Save to csv**

In [476]:
tot_deaths_per_province.to_csv('data/tot_deaths_provinces.csv',index=True)

## Deaths Per Prov

In [477]:
# deaths_data_copy.drop(['gender','age'], axis =1, inplace=True)
# deaths_data_copy

#### Daily

In [478]:
# deaths_data_per_prov = deaths_data_copy.copy().fillna('Unknown')
# deaths_data_per_prov['total'] = 1
# # deaths_data_per_prov
# deaths_data_per_prov = deaths_data_per_prov.groupby(['date','province']).count()
# deaths_data_per_prov.tail()

In [479]:
# cum_deaths_data_per_prov = deaths_data_per_prov.groupby(level=-1)[['total']].cumsum()
# cum_deaths_data_per_prov.head()
# # deaths_data_per_prov
# # cum_deaths_data_per_prov.groupby(level=-1)['total'].cumsum()

## Data to be displayed as text on website

In [480]:
format(200003,',d').replace(","," ")

'200 003'

In [481]:
def zero_space(num):
    return format(num,',d').replace(","," ")

In [482]:
tot_tested = zero_space(tests_data.tail(1).iloc[0]['cumulative_tests'].astype(int))
change_tested = zero_space(tests_data.tail(1).iloc[0]['daily_tests'].astype(int))
print(tot_tested, change_tested)

207 530 10 403


In [483]:
tot_infected = zero_space(confirmed_data.tail(1).iloc[0]['cumulative_cases'].astype(int))
change_infected = zero_space(confirmed_data.tail(1).iloc[0]['daily_cases'].astype(int))
print(tot_infected, change_infected)

5 647 297


In [484]:
tot_deaths = zero_space(deaths_vs_recoveries.tail(1).iloc[0]['cum_deaths'].astype(int))
change_deaths = zero_space(deaths_vs_recoveries['cum_deaths'].diff().tail(1).iloc[0].astype(int))
print(tot_deaths, change_deaths)

103 0


In [485]:
tot_recoveries = zero_space(deaths_vs_recoveries.tail(1).iloc[0]['cum_recovered'].astype(int))
change_recoveries = zero_space(deaths_vs_recoveries['cum_recovered'].diff().tail(1).iloc[0].astype(int))
print(tot_recoveries, change_recoveries)

2 073 0


In [486]:
from datetime import datetime
now = datetime.now()
current_time = now.strftime("%H:%M %d %B %Y")
print("Current Time =", current_time)

Current Time = 12:58 01 May 2020


In [487]:
gen_data = pd.DataFrame(dict(tot_infected=[tot_infected], change_infected=[change_infected], 
                             tot_deaths=[tot_deaths], change_deaths=[change_deaths],
                             tot_tested=[tot_tested], change_tested=[change_tested], 
                             tot_recoveries=[tot_recoveries], change_recoveries=[change_recoveries], 
                             datetime_updated=[current_time]))
gen_data

Unnamed: 0,tot_infected,change_infected,tot_deaths,change_deaths,tot_tested,change_tested,tot_recoveries,change_recoveries,datetime_updated
0,5 647,297,103,0,207 530,10 403,2 073,0,12:58 01 May 2020


In [488]:
gen_data.to_csv("data/gen_data.csv", index=False)

# Data from Image
Use data_from_img.py code to load data from NICD infographic image specified

In [489]:
# import data_from_img

**Path of image to be processed:**

In [490]:
img_path = "NICD_updates/NICD_Updates_04_25.jpg"

**Get data from image**

In [491]:
# date, prov_deaths_totals, prov_recovered_totals, gen_totals = data_from_img.get_tot_data(img_path)

### Province Names Dict

In [492]:
province_names = {
    "EC":"Eastern Cape",
    "FS" : "Free State",
    "GP" : "Gauteng",
    "KZN" : "KwaZula-Natal",
    "LP" : "Limpopo",
    "MP" : "Mpumalanga",
    "NW" : "North West",
    "NC" : "Northern Cape",
    "WC" : "Western Cape",
    "UNKNOWN": "Unknown"
}

### Deaths

In [493]:
# tot_deaths_per_province = deaths_prov_data.tail(1)
# tot_deaths_per_province = tot_deaths_per_province.melt(id_vars=['date'], var_name='province', 
#                                               value_name='tot_deaths')
# tot_deaths_per_province.drop(['date'], axis=1, inplace = True)
# tot_deaths_per_province['province'] = tot_deaths_per_province['province'].map(province_names)
# tot_deaths_per_province.set_index(['province'], inplace = True)
# tot_deaths_per_province

**Save to csv**

In [494]:
# tot_deaths_per_province.to_csv('data/tot_deaths_provinces.csv',index=True)

### Recovered

In [495]:
# prov_recovered_totals_df = pd.DataFrame([prov_recovered_totals])
# prov_recovered_totals_df['date'] = date
# prov_recovered_totals_df =prov_recovered_totals_df.melt(id_vars=['date'], var_name='province', 
#                                                   value_name='recovered').set_index(['date'])
# prov_recovered_totals_df['province'] = prov_recovered_totals_df['province'].map(province_names)
# prov_recovered_totals_df

**Save to csv**

In [496]:
# prov_recovered_totals_df.to_csv('data/tot_recovered_provinces.csv',index=True)