# Data Preprocessing

In [269]:
# Enable Intellisense
%config IPCompleter.greedy=True

import pandas as pd
import numpy as np
import io
import requests
import seaborn as sns
from matplotlib import pyplot, dates
import plotly.express as px 
import datetime
import plotly.graph_objects as go
import plotly.io as pio
sns.set()

# Annoying warning
pd.options.mode.chained_assignment = None  # default='warn'

# Gen Data
## All Confirm

**No longer going to use above data**

# Over time

## Provinces
Taken from 'covid19za_provincial_cumulative_timeline_confirmed.csv'
### Cumulative

In [270]:
province_data_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_confirmed.csv"

province_data_req = requests.get(province_data_url).content

province_data = pd.read_csv(io.StringIO(province_data_req.decode('utf-8')), delimiter = ',')
province_data.drop(['YYYYMMDD'], axis = 1, inplace=True)
province_data.tail()

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN,total,source
45,21-04-2020,345.0,106.0,1199.0,671.0,27.0,24.0,16.0,24.0,1010.0,43.0,3465,https://twitter.com/DrZweliMkhize/status/12526...
46,22-04-2020,377.0,106.0,1224.0,758.0,27.0,23.0,16.0,24.0,1079.0,1.0,3635,https://sacoronavirus.co.za/2020/04/22/update-...
47,23-04-2020,417.0,106.0,1252.0,807.0,27.0,23.0,16.0,25.0,1279.0,1.0,3953,https://sacoronavirus.co.za/2020/04/23/update-...
48,24-04-2020,480.0,111.0,1281.0,841.0,29.0,24.0,16.0,25.0,1413.0,0.0,4220,https://twitter.com/nicd_sa/status/12537692103...
49,25-04-2020,488.0,111.0,1304.0,841.0,30.0,23.0,16.0,28.0,1514.0,0.0,4361,https://twitter.com/DrZweliMkhize/status/12541...


In [271]:
province_data.drop(['total','source'],axis=1, inplace = True)
province_data.dropna(inplace=True)
province_data.tail()

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN
45,21-04-2020,345.0,106.0,1199.0,671.0,27.0,24.0,16.0,24.0,1010.0,43.0
46,22-04-2020,377.0,106.0,1224.0,758.0,27.0,23.0,16.0,24.0,1079.0,1.0
47,23-04-2020,417.0,106.0,1252.0,807.0,27.0,23.0,16.0,25.0,1279.0,1.0
48,24-04-2020,480.0,111.0,1281.0,841.0,29.0,24.0,16.0,25.0,1413.0,0.0
49,25-04-2020,488.0,111.0,1304.0,841.0,30.0,23.0,16.0,28.0,1514.0,0.0


In [272]:
province_data.to_csv('data/daily_prov.csv', index = False)

In [273]:
province_data = pd.read_csv('data/daily_prov.csv')
province_data.tail()

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN
43,21-04-2020,345.0,106.0,1199.0,671.0,27.0,24.0,16.0,24.0,1010.0,43.0
44,22-04-2020,377.0,106.0,1224.0,758.0,27.0,23.0,16.0,24.0,1079.0,1.0
45,23-04-2020,417.0,106.0,1252.0,807.0,27.0,23.0,16.0,25.0,1279.0,1.0
46,24-04-2020,480.0,111.0,1281.0,841.0,29.0,24.0,16.0,25.0,1413.0,0.0
47,25-04-2020,488.0,111.0,1304.0,841.0,30.0,23.0,16.0,28.0,1514.0,0.0


In [274]:
province_data['date'] = pd.to_datetime(province_data['date'], format='%d-%m-%Y')

In [275]:
province_data_melt = province_data.melt(id_vars=['date'], var_name='province', value_name='cumulative_cases')
province_data_melt

Unnamed: 0,date,province,cumulative_cases
0,2020-03-05,EC,0.0
1,2020-03-07,EC,0.0
2,2020-03-08,EC,0.0
3,2020-03-09,EC,0.0
4,2020-03-11,EC,0.0
...,...,...,...
475,2020-04-21,UNKNOWN,43.0
476,2020-04-22,UNKNOWN,1.0
477,2020-04-23,UNKNOWN,1.0
478,2020-04-24,UNKNOWN,0.0


In [276]:
province_names = {
#     "EC":"Eastern Cape",
#     "FS" : "Free State",
#     "GP" : "Gauteng",
#     "KZN" : "KwaZula-Natal",
#     "LP" : "Limpopo",
#     "MP" : "Mpumalanga",
#     "NW" : "North West",
#     "NC" : "Northern Cape",
#     "WC" : "Western Cape",
    "UNKNOWN": "UNK"
}
# province_data_melt['province'] = province_data_melt['province'].map(province_names)
province_data_melt['province'] = province_data_melt['province'].replace("UNKNOWN","UNK")
province_data_melt

Unnamed: 0,date,province,cumulative_cases
0,2020-03-05,EC,0.0
1,2020-03-07,EC,0.0
2,2020-03-08,EC,0.0
3,2020-03-09,EC,0.0
4,2020-03-11,EC,0.0
...,...,...,...
475,2020-04-21,UNK,43.0
476,2020-04-22,UNK,1.0
477,2020-04-23,UNK,1.0
478,2020-04-24,UNK,0.0


### Daily

In [277]:
province_data_daily = province_data.copy()
province_data_daily.iloc[1:,1:] = province_data_daily.iloc[:,1:].diff().iloc[1:,:]
province_data_daily.tail()
# province_data_daily

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN
43,2020-04-21,35.0,1.0,29.0,32.0,0.0,1.0,-2.0,-1.0,70.0,0.0
44,2020-04-22,32.0,0.0,25.0,87.0,0.0,-1.0,0.0,0.0,69.0,-42.0
45,2020-04-23,40.0,0.0,28.0,49.0,0.0,0.0,0.0,1.0,200.0,0.0
46,2020-04-24,63.0,5.0,29.0,34.0,2.0,1.0,0.0,0.0,134.0,-1.0
47,2020-04-25,8.0,0.0,23.0,0.0,1.0,-1.0,0.0,3.0,101.0,0.0


In [278]:
province_data_daily_melt = province_data_daily.melt(id_vars=['date'], var_name='province', 
                                                    value_name='daily_cases')
province_data_daily_melt.tail()

Unnamed: 0,date,province,daily_cases
475,2020-04-21,UNKNOWN,0.0
476,2020-04-22,UNKNOWN,-42.0
477,2020-04-23,UNKNOWN,0.0
478,2020-04-24,UNKNOWN,-1.0
479,2020-04-25,UNKNOWN,0.0


In [279]:
province_names = {
#     "EC":"Eastern Cape",
#     "FS" : "Free State",
#     "GP" : "Gauteng",
#     "KZN" : "KwaZula-Natal",
#     "LP" : "Limpopo",
#     "MP" : "Mpumalanga",
#     "NW" : "North West",
#     "NC" : "Northern Cape",
#     "WC" : "Western Cape",
    "UNKNOWN": "UNK"
}
# province_data_daily_melt['province'] = province_data_daily_melt['province'].map(province_names)
province_data_daily_melt['province'] = province_data_daily_melt['province'].replace("UNKNOWN","UNK")
province_data_daily_melt

Unnamed: 0,date,province,daily_cases
0,2020-03-05,EC,0.0
1,2020-03-07,EC,0.0
2,2020-03-08,EC,0.0
3,2020-03-09,EC,0.0
4,2020-03-11,EC,0.0
...,...,...,...
475,2020-04-21,UNK,0.0
476,2020-04-22,UNK,-42.0
477,2020-04-23,UNK,0.0
478,2020-04-24,UNK,-1.0


### Concatenate Cumulative & Daily

In [280]:
prov_cumulative_daily = pd.concat([province_data_melt, province_data_daily_melt['daily_cases']], axis =1)
prov_cumulative_daily.tail()

Unnamed: 0,date,province,cumulative_cases,daily_cases
475,2020-04-21,UNK,43.0,0.0
476,2020-04-22,UNK,1.0,-42.0
477,2020-04-23,UNK,1.0,0.0
478,2020-04-24,UNK,0.0,-1.0
479,2020-04-25,UNK,0.0,0.0


**Save to csv**

In [281]:
prov_cumulative_daily.to_csv('data/daily_cumulative_confirmed_prov.csv', index=False)

## Tests

In [282]:
tests_data_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_timeline_testing.csv"

tests_data_req = requests.get(tests_data_url).content

tests_data = pd.read_csv(io.StringIO(tests_data_req.decode('utf-8')), delimiter = ',',
                         usecols=['date','cumulative_tests'])
tests_data.dropna(inplace=True)

So far tests_data only includes cumulative.

In [283]:
tests_data

Unnamed: 0,date,cumulative_tests
0,11-02-2020,61.0
1,13-02-2020,67.0
2,14-02-2020,71.0
3,19-02-2020,95.0
4,20-02-2020,106.0
5,24-02-2020,116.0
6,26-02-2020,121.0
7,02-03-2020,160.0
8,03-03-2020,164.0
9,06-03-2020,200.0


In [284]:
tests_data.to_csv('data/tests_data.csv', index=False)

### Daily

In [285]:
tests_data['daily_tests'] = tests_data['cumulative_tests']
# slice is required as first entry of diff will be NaN but data we want should be equal to the starting value
tests_data['daily_tests'][1:] = tests_data['cumulative_tests'].diff()[1:] 
tests_data

Unnamed: 0,date,cumulative_tests,daily_tests
0,11-02-2020,61.0,61.0
1,13-02-2020,67.0,6.0
2,14-02-2020,71.0,4.0
3,19-02-2020,95.0,24.0
4,20-02-2020,106.0,11.0
5,24-02-2020,116.0,10.0
6,26-02-2020,121.0,5.0
7,02-03-2020,160.0,39.0
8,03-03-2020,164.0,4.0
9,06-03-2020,200.0,36.0


#### Save to CSV

In [286]:
tests_data.to_csv('data/daily_cumulative_tests.csv', index=False)

## Confirmed Cases
Get no of cumulative cases from 'covid19za_provincial_cumulative_timeline_confirmed.csv'
### Cumulative

In [287]:
confirmed_data_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_confirmed.csv"

confirmed_data_req = requests.get(confirmed_data_url).content

confirmed_data = pd.read_csv(io.StringIO(confirmed_data_req.decode('utf-8')), usecols=['date','total'])

confirmed_data.tail()

Unnamed: 0,date,total
45,21-04-2020,3465
46,22-04-2020,3635
47,23-04-2020,3953
48,24-04-2020,4220
49,25-04-2020,4361


In [288]:
confirmed_data['date'] = pd.to_datetime(confirmed_data['date'], format='%d-%m-%Y')
confirmed_data.set_index('date', inplace=True)
confirmed_data.rename(columns={"total":"cumulative_cases"}, inplace = True)
confirmed_data

Unnamed: 0_level_0,cumulative_cases
date,Unnamed: 1_level_1
2020-03-05,1
2020-03-07,2
2020-03-08,3
2020-03-09,7
2020-03-11,13
2020-03-12,16
2020-03-13,24
2020-03-14,38
2020-03-15,51
2020-03-16,62


### Daily

In [289]:
confirmed_data['daily_cases'] = confirmed_data['cumulative_cases']
# slice is required as first entry of diff will be NaN but data we want should be equal to the starting value
confirmed_data['daily_cases'][1:] = confirmed_data['cumulative_cases'].diff()[1:] 
confirmed_data

Unnamed: 0_level_0,cumulative_cases,daily_cases
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-05,1,1.0
2020-03-07,2,1.0
2020-03-08,3,1.0
2020-03-09,7,4.0
2020-03-11,13,6.0
2020-03-12,16,3.0
2020-03-13,24,8.0
2020-03-14,38,14.0
2020-03-15,51,13.0
2020-03-16,62,11.0


**Save to csv**

In [290]:
confirmed_data.to_csv('data/daily_cumulative_confirmed.csv')

### Daily

In [291]:
# confirmed_data = confirmed_all_data.groupby(['date']).count()[['province']]
# confirmed_data.rename(columns={'province':'daily_cases'}, inplace = True)
# confirmed_data

### Cumulative

In [292]:
# confirmed_data['cumulative_cases'] = confirmed_data['daily_cases'].cumsum()
# confirmed_data

**Save to csv**

Above method is no longer being used. Thus why below is commented out.

In [293]:
# confirmed_data.to_csv('data/daily_cumulative_confirmed.csv')

**Use saved data rather than above**

In [294]:
# confirmed_data = pd.read_csv('data/daily_cumulative_confirmed.csv')
# confirmed_data['date'] = pd.to_datetime(confirmed_data['date'], format='%Y-%m-%d')
# confirmed_data.set_index('date', inplace=True)
# confirmed_data

**Add data to above**

Data to be added:

In [295]:
# # Format must be dd-mm-YYYY
# date_str = "29-03-2020"
# date_dt = pd.to_datetime(date_str, format='%d-%m-%Y')
# new_tot_cases = 1326

In [296]:
# new_daily_cases = new_tot_cases - confirmed_data.iloc[-1]['cumulative_cases']
# new_df_entry = pd.DataFrame({"date":[date_dt],
#              "daily_cases":[new_daily_cases],
#              "cumulative_cases":[new_tot_cases]}).set_index('date')
# confirmed_data = pd.concat([confirmed_data, new_df_entry])
# confirmed_data.tail()

**Save to csv**

In [297]:
# confirmed_data.to_csv('data/daily_cumulative_confirmed.csv')

## Confirmed & Tests
### Daily

In [298]:
confirmed_data_tmp = confirmed_data.reset_index()
confirmed_data_tmp['date'] = pd.to_datetime(confirmed_data_tmp['date'], format='%d-%m-%Y')
confirmed_data_tmp.set_index('date', inplace = True)
confirmed_data_tmp.tail()

Unnamed: 0_level_0,cumulative_cases,daily_cases
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-04-21,3465,165.0
2020-04-22,3635,170.0
2020-04-23,3953,318.0
2020-04-24,4220,267.0
2020-04-25,4361,141.0


In [299]:
tests_data_tmp = tests_data.copy()
# test_data_tmp.info()
tests_data_tmp['date'] = pd.to_datetime(tests_data_tmp['date'], format='%d-%m-%Y')
tests_data_tmp.set_index('date', inplace = True)
tests_data_tmp.tail()

Unnamed: 0_level_0,cumulative_tests,daily_tests
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-04-21,126937.0,5427.0
2020-04-22,133774.0,6837.0
2020-04-23,143570.0,9796.0
2020-04-24,152390.0,8820.0
2020-04-25,161004.0,8614.0


In [300]:
daily_tests_confirmed = pd.concat([confirmed_data_tmp[['daily_cases']], 
                                   tests_data_tmp['daily_tests']], axis = 1, sort = True)
daily_tests_confirmed['daily_cases'].fillna(0,inplace=True)
daily_tests_confirmed

Unnamed: 0_level_0,daily_cases,daily_tests
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-02-11,0.0,61.0
2020-02-13,0.0,6.0
2020-02-14,0.0,4.0
2020-02-19,0.0,24.0
2020-02-20,0.0,11.0
2020-02-24,0.0,10.0
2020-02-26,0.0,5.0
2020-03-02,0.0,39.0
2020-03-03,0.0,4.0
2020-03-05,1.0,


NaN for daily_tests corresponds to test data not available.
### Percentage of Positive Tests

In [301]:
daily_tests_confirmed['perc_positive'] = (daily_tests_confirmed['daily_cases']/
                                          daily_tests_confirmed['daily_tests']) * 100
daily_tests_confirmed['perc_positive'] = daily_tests_confirmed['perc_positive'].round(1)
daily_tests_confirmed

Unnamed: 0_level_0,daily_cases,daily_tests,perc_positive
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-02-11,0.0,61.0,0.0
2020-02-13,0.0,6.0,0.0
2020-02-14,0.0,4.0,0.0
2020-02-19,0.0,24.0,0.0
2020-02-20,0.0,11.0,0.0
2020-02-24,0.0,10.0,0.0
2020-02-26,0.0,5.0,0.0
2020-03-02,0.0,39.0,0.0
2020-03-03,0.0,4.0,0.0
2020-03-05,1.0,,


**Save to csv**

In [302]:
daily_tests_confirmed.to_csv('data/daily_tests_confirmed.csv')

### Cumulative

In [303]:
cumulative_tests_confirmed = daily_tests_confirmed.cumsum()
cumulative_tests_confirmed

Unnamed: 0_level_0,daily_cases,daily_tests,perc_positive
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-02-11,0.0,61.0,0.0
2020-02-13,0.0,67.0,0.0
2020-02-14,0.0,71.0,0.0
2020-02-19,0.0,95.0,0.0
2020-02-20,0.0,106.0,0.0
2020-02-24,0.0,116.0,0.0
2020-02-26,0.0,121.0,0.0
2020-03-02,0.0,160.0,0.0
2020-03-03,0.0,164.0,0.0
2020-03-05,1.0,,


// TODO - Rename to Cumulative as this is confusing

### Percentage of Positive Tests

In [304]:
cumulative_tests_confirmed['perc_positive'] = (cumulative_tests_confirmed['daily_cases']/
                                          cumulative_tests_confirmed['daily_tests']) * 100
cumulative_tests_confirmed['perc_positive'] = cumulative_tests_confirmed['perc_positive'].round(1)
cumulative_tests_confirmed

Unnamed: 0_level_0,daily_cases,daily_tests,perc_positive
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-02-11,0.0,61.0,0.0
2020-02-13,0.0,67.0,0.0
2020-02-14,0.0,71.0,0.0
2020-02-19,0.0,95.0,0.0
2020-02-20,0.0,106.0,0.0
2020-02-24,0.0,116.0,0.0
2020-02-26,0.0,121.0,0.0
2020-03-02,0.0,160.0,0.0
2020-03-03,0.0,164.0,0.0
2020-03-05,1.0,,


**Save to csv**

In [305]:
cumulative_tests_confirmed.to_csv('data/cumulative_tests_confirmed.csv')

# Deaths & Recoveries
## Recoveries

In [306]:
recovered_data_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_timeline_testing.csv"
recovered_data_req = requests.get(recovered_data_url).content

recovered_data = pd.read_csv(io.StringIO(recovered_data_req.decode('utf-8')), delimiter = ',', 
                            usecols=['date','recovered'])
# rename recovered
recovered_data.rename(columns={"recovered":"cum_recovered"}, inplace=True)

# drop fields where recovered is 0
recovered_data.drop(recovered_data[recovered_data['cum_recovered']==0]['cum_recovered'].index, inplace=True)
recovered_data

Unnamed: 0,date,cum_recovered
22,22-03-2020,1
23,23-03-2020,1
24,24-03-2020,2
25,25-03-2020,4
26,26-03-2020,4
27,27-03-2020,31
28,28-03-2020,31
29,29-03-2020,31
30,30-03-2020,31
31,31-03-2020,31


In [307]:
recovered_data['date'] = pd.to_datetime(recovered_data['date'], format='%d-%m-%Y')
recovered_data.set_index('date', inplace = True)
recovered_data

Unnamed: 0_level_0,cum_recovered
date,Unnamed: 1_level_1
2020-03-22,1
2020-03-23,1
2020-03-24,2
2020-03-25,4
2020-03-26,4
2020-03-27,31
2020-03-28,31
2020-03-29,31
2020-03-30,31
2020-03-31,31


In [308]:
recovered_data['daily_recovered'] = recovered_data['cum_recovered']
# slice is required as first entry of diff will be NaN but data we want should be equal to the starting value
recovered_data['daily_recovered'][1:] = recovered_data['cum_recovered'].diff()[1:] 
recovered_data

Unnamed: 0_level_0,cum_recovered,daily_recovered
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-22,1,1.0
2020-03-23,1,0.0
2020-03-24,2,1.0
2020-03-25,4,2.0
2020-03-26,4,0.0
2020-03-27,31,27.0
2020-03-28,31,0.0
2020-03-29,31,0.0
2020-03-30,31,0.0
2020-03-31,31,0.0


**Save to csv**

In [309]:
recovered_data.to_csv('data/recovered_data.csv')

## Deaths
From https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_timeline_deaths.csv

**Per Province**

In [310]:
deaths_data_prov_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_deaths.csv"
deaths_data_prov_req = requests.get(deaths_data_prov_url).content

deaths_prov_data = pd.read_csv(io.StringIO(deaths_data_prov_req.decode('utf-8'))).drop("YYYYMMDD", axis =1)
deaths_prov_data['date'] = pd.to_datetime(deaths_prov_data['date'], format='%d-%m-%Y')

deaths_prov_data

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN,total
0,2020-04-08,0,3,3,9,0,0,0,0,3,0,18
1,2020-04-09,0,3,3,9,0,0,0,0,3,0,18
2,2020-04-10,0,3,3,12,0,0,0,0,6,0,24
3,2020-04-11,0,3,3,12,0,0,0,0,7,0,25
4,2020-04-12,0,3,3,12,0,0,0,0,7,0,25
5,2020-04-13,0,3,4,12,0,0,0,0,8,0,27
6,2020-04-14,0,3,4,12,0,0,0,0,8,0,27
7,2020-04-15,0,3,5,18,0,0,0,0,8,0,34
8,2020-04-16,4,4,6,20,1,0,0,0,13,0,48
9,2020-04-17,4,4,6,20,1,0,0,0,15,0,50


In [311]:
deaths_prov_data.drop(['total'], axis = 1, inplace = True)
deaths_prov_data.tail()

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN
13,2020-04-21,5,5,7,23,1,0,0,0,17,0
14,2020-04-22,5,5,7,25,1,0,0,0,22,0
15,2020-04-23,6,5,8,27,1,0,0,0,28,0
16,2020-04-24,6,5,8,29,1,0,0,0,30,0
17,2020-04-25,10,5,8,29,2,0,0,0,32,0


In [312]:
deaths_prov_data_melt = deaths_prov_data.melt(id_vars=['date'], var_name='province', 
                                              value_name='cumulative_deaths')
deaths_prov_data_melt

Unnamed: 0,date,province,cumulative_deaths
0,2020-04-08,EC,0
1,2020-04-09,EC,0
2,2020-04-10,EC,0
3,2020-04-11,EC,0
4,2020-04-12,EC,0
...,...,...,...
175,2020-04-21,UNKNOWN,0
176,2020-04-22,UNKNOWN,0
177,2020-04-23,UNKNOWN,0
178,2020-04-24,UNKNOWN,0


In [313]:
deaths_prov_data_melt.set_index(['date'], inplace=True)
deaths_prov_data_melt.tail()

Unnamed: 0_level_0,province,cumulative_deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-04-21,UNKNOWN,0
2020-04-22,UNKNOWN,0
2020-04-23,UNKNOWN,0
2020-04-24,UNKNOWN,0
2020-04-25,UNKNOWN,0


### Timeline deaths

In [314]:
deaths_data_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_timeline_deaths.csv"
deaths_data_req = requests.get(deaths_data_url).content

deaths_data = pd.read_csv(io.StringIO(deaths_data_req.decode('utf-8')), delimiter = ',',
                            usecols=['date', 'province', ' gender', ' age'])
deaths_data.rename(columns={" gender":"gender", " age":"age"}, inplace=True)

deaths_data_copy = deaths_data.copy()
deaths_data

Unnamed: 0,date,province,gender,age
0,27-03-2020,WC,female,48.0
1,28-03-2020,KZN,male,74.0
2,30-03-2020,FS,male,85.0
3,31-03-2020,GP,male,79.0
4,31-03-2020,KZN,female,46.0
...,...,...,...,...
70,23-04-2020,WC,female,61.0
71,23-04-2020,WC,male,41.0
72,23-04-2020,WC,,
73,23-04-2020,WC,,


In [315]:
deaths_data['date'] = pd.to_datetime(deaths_data['date'], format='%d-%m-%Y')
deaths_data

Unnamed: 0,date,province,gender,age
0,2020-03-27,WC,female,48.0
1,2020-03-28,KZN,male,74.0
2,2020-03-30,FS,male,85.0
3,2020-03-31,GP,male,79.0
4,2020-03-31,KZN,female,46.0
...,...,...,...,...
70,2020-04-23,WC,female,61.0
71,2020-04-23,WC,male,41.0
72,2020-04-23,WC,,
73,2020-04-23,WC,,


In [316]:
deaths_data['province'].fillna("UNKOWN", inplace = True)
deaths_data

Unnamed: 0,date,province,gender,age
0,2020-03-27,WC,female,48.0
1,2020-03-28,KZN,male,74.0
2,2020-03-30,FS,male,85.0
3,2020-03-31,GP,male,79.0
4,2020-03-31,KZN,female,46.0
...,...,...,...,...
70,2020-04-23,WC,female,61.0
71,2020-04-23,WC,male,41.0
72,2020-04-23,WC,,
73,2020-04-23,WC,,


### Per Province

In [317]:
deaths_data_per_day = deaths_data.groupby('date')[['province']].count()

### Total per day
#### Daily

In [318]:
deaths_data_per_day = deaths_data.groupby('date')[['province']].count()
deaths_data_per_day.rename(columns={"province":"daily_deaths"},inplace=True)
deaths_data_per_day

Unnamed: 0_level_0,daily_deaths
date,Unnamed: 1_level_1
2020-03-27,1
2020-03-28,1
2020-03-30,1
2020-03-31,2
2020-04-03,4
2020-04-05,2
2020-04-06,1
2020-04-07,1
2020-04-08,5
2020-04-09,1


Manually add entry

#### Cumulative

In [319]:
deaths_data_per_day['cum_deaths']=deaths_data_per_day['daily_deaths'].cumsum()
deaths_data_per_day

Unnamed: 0_level_0,daily_deaths,cum_deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-27,1,1
2020-03-28,1,2
2020-03-30,1,3
2020-03-31,2,5
2020-04-03,4,9
2020-04-05,2,11
2020-04-06,1,12
2020-04-07,1,13
2020-04-08,5,18
2020-04-09,1,19


In [320]:
def add_deaths(org_df, date_of_deaths, cumulative_deaths):
    date_dt = pd.to_datetime(date_of_deaths, format='%d-%m-%Y') # Format must be dd-mm-YYYY
    new_tot_deaths = cumulative_deaths

    new_daily_deaths = new_tot_deaths - org_df.iloc[-1]['cum_deaths']
    new_df_entry = pd.DataFrame({"date":[date_dt], 
                             "daily_deaths":[new_daily_deaths],
                             "cum_deaths":[new_tot_deaths],}).set_index('date')
    new_df = pd.concat([org_df, new_df_entry])
    return new_df

In [321]:
# deaths_data_per_day = add_deaths(deaths_data_per_day, "17-04-2020", 50) # 17th April - 50 deaths
# deaths_data_per_day = add_deaths(deaths_data_per_day, "18-04-2020", 52) # 18th April - 52 deaths
deaths_data_per_day = add_deaths(deaths_data_per_day, "24-04-2020", 79)
deaths_data_per_day = add_deaths(deaths_data_per_day, "25-04-2020", 86)

deaths_data_per_day

Unnamed: 0_level_0,daily_deaths,cum_deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-27,1,1
2020-03-28,1,2
2020-03-30,1,3
2020-03-31,2,5
2020-04-03,4,9
2020-04-05,2,11
2020-04-06,1,12
2020-04-07,1,13
2020-04-08,5,18
2020-04-09,1,19


**Save to csv**

In [322]:
deaths_data_per_day.to_csv('data/daily_cum_deaths.csv')

#### Deaths Vs Recovered

In [323]:
deaths_vs_recoveries = pd.concat([deaths_data_per_day[['cum_deaths']], recovered_data['cum_recovered']], 
                                 axis =1)
deaths_vs_recoveries.iloc[0,0] = 0
deaths_vs_recoveries['cum_deaths'].ffill(inplace=True)
deaths_vs_recoveries['cum_recovered'].ffill(inplace=True)
deaths_vs_recoveries

Unnamed: 0_level_0,cum_deaths,cum_recovered
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-22,0.0,1
2020-03-23,0.0,1
2020-03-24,0.0,2
2020-03-25,0.0,4
2020-03-26,0.0,4
2020-03-27,1.0,31
2020-03-28,2.0,31
2020-03-29,2.0,31
2020-03-30,3.0,31
2020-03-31,5.0,31


**Save to csv**

In [324]:
deaths_vs_recoveries.to_csv('data/deaths_vs_recoveries.csv')

# Totals
## Province
### Confirmed Cases

In [325]:
province_names = {
    "EC":"Eastern Cape",
    "FS" : "Free State",
    "GP" : "Gauteng",
    "KZN" : "KwaZula-Natal",
    "LP" : "Limpopo",
    "MP" : "Mpumalanga",
    "NW" : "North West",
    "NC" : "Northern Cape",
    "WC" : "Western Cape",
    "UNKNOWN": "Unknown"
}

In [326]:
prov_totals = province_data.tail(1).copy()
prov_totals = prov_totals.melt(id_vars=['date'], var_name='province', value_name='total')
prov_totals['province'] = prov_totals['province'].map(province_names)
prov_totals.drop('date',axis=1,inplace=True)
prov_totals

Unnamed: 0,province,total
0,Eastern Cape,488.0
1,Free State,111.0
2,Gauteng,1304.0
3,KwaZula-Natal,841.0
4,Limpopo,30.0
5,Mpumalanga,23.0
6,Northern Cape,16.0
7,North West,28.0
8,Western Cape,1514.0
9,Unknown,0.0


**Save to csv**

In [327]:
prov_totals.to_csv('data/tot_provinces.csv',index=False)

### Deaths

In [328]:
# prov_deaths_totals = deaths_prov_data.tail(1).copy().drop(['total'], axis =1)
# prov_deaths_totals = prov_deaths_totals.melt(id_vars=['date'], var_name='province', value_name='total')
# prov_deaths_totals['province'] = prov_deaths_totals['province'].map(province_names)
# prov_deaths_totals

**Save to csv**

In [329]:
#prov_deaths_totals.to_csv('data/tot_deaths_provinces.csv',index=False)

## Deaths Per Prov

In [330]:
deaths_data_copy.drop(['gender','age'], axis =1, inplace=True)
deaths_data_copy

Unnamed: 0,date,province
0,27-03-2020,WC
1,28-03-2020,KZN
2,30-03-2020,FS
3,31-03-2020,GP
4,31-03-2020,KZN
...,...,...
70,23-04-2020,WC
71,23-04-2020,WC
72,23-04-2020,WC
73,23-04-2020,WC


#### Daily

In [331]:
deaths_data_per_prov = deaths_data_copy.copy().fillna('Unknown')
deaths_data_per_prov['total'] = 1
# deaths_data_per_prov
deaths_data_per_prov = deaths_data_per_prov.groupby(['date','province']).count()
deaths_data_per_prov.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,total
date,province,Unnamed: 2_level_1
27-03-2020,WC,1
28-03-2020,KZN,1
30-03-2020,FS,1
31-03-2020,GP,1
31-03-2020,KZN,1


In [332]:
cum_deaths_data_per_prov = deaths_data_per_prov.groupby(level=-1)[['total']].cumsum()
cum_deaths_data_per_prov.head()
# deaths_data_per_prov
# cum_deaths_data_per_prov.groupby(level=-1)['total'].cumsum()

Unnamed: 0_level_0,Unnamed: 1_level_0,total
date,province,Unnamed: 2_level_1
03-04-2020,KZN,4
05-04-2020,KZN,5
05-04-2020,WC,1
06-04-2020,WC,2
07-04-2020,KZN,6


## Data to be displayed as text on website

In [333]:
format(200003,',d').replace(","," ")

'200 003'

In [334]:
def zero_space(num):
    return format(num,',d').replace(","," ")

In [335]:
tot_tested = zero_space(tests_data.tail(1).iloc[0]['cumulative_tests'].astype(int))
change_tested = zero_space(tests_data.tail(1).iloc[0]['daily_tests'].astype(int))
print(tot_tested, change_tested)

161 004 8 614


In [336]:
tot_infected = zero_space(confirmed_data.tail(1).iloc[0]['cumulative_cases'].astype(int))
change_infected = zero_space(confirmed_data.tail(1).iloc[0]['daily_cases'].astype(int))
print(tot_infected, change_infected)

4 361 141


In [337]:
tot_deaths = zero_space(deaths_vs_recoveries.tail(1).iloc[0]['cum_deaths'].astype(int))
change_deaths = zero_space(deaths_vs_recoveries['cum_deaths'].diff().tail(1).iloc[0].astype(int))
print(tot_deaths, change_deaths)

86 7


In [338]:
tot_recoveries = zero_space(deaths_vs_recoveries.tail(1).iloc[0]['cum_recovered'].astype(int))
change_recoveries = zero_space(deaths_vs_recoveries['cum_recovered'].diff().tail(1).iloc[0].astype(int))
print(tot_recoveries, change_recoveries)

1 437 0


In [339]:
from datetime import datetime
now = datetime.now()
current_time = now.strftime("%H:%M %d %B %Y")
print("Current Time =", current_time)

Current Time = 12:55 26 April 2020


In [340]:
gen_data = pd.DataFrame(dict(tot_infected=[tot_infected], change_infected=[change_infected], 
                             tot_deaths=[tot_deaths], change_deaths=[change_deaths],
                             tot_tested=[tot_tested], change_tested=[change_tested], 
                             tot_recoveries=[tot_recoveries], change_recoveries=[change_recoveries], 
                             datetime_updated=[current_time]))
gen_data

Unnamed: 0,tot_infected,change_infected,tot_deaths,change_deaths,tot_tested,change_tested,tot_recoveries,change_recoveries,datetime_updated
0,4 361,141,86,7,161 004,8 614,1 437,0,12:55 26 April 2020


In [341]:
gen_data.to_csv("data/gen_data.csv", index=False)

# Data from Image
Use data_from_img.py code to load data from NICD infographic image specified

In [342]:
import data_from_img

**Path of image to be processed:**

In [343]:
img_path = "NICD_updates/NICD_Updates_04_25.jpg"

**Get data from image**

In [344]:
date, prov_deaths_totals, prov_recovered_totals, gen_totals = data_from_img.get_tot_data(img_path)

Date done
prov_death_totals done
prov_recovered_totals done
gen_totals done


### Province Names Dict

In [345]:
province_names = {
    "EC":"Eastern Cape",
    "FS" : "Free State",
    "GP" : "Gauteng",
    "KZN" : "KwaZula-Natal",
    "LP" : "Limpopo",
    "MP" : "Mpumalanga",
    "NW" : "North West",
    "NC" : "Northern Cape",
    "WC" : "Western Cape",
    "UNKNOWN": "Unknown"
}

### Deaths

In [346]:
prov_deaths_totals_df = pd.DataFrame([prov_deaths_totals])
prov_deaths_totals_df['date'] = date
prov_deaths_totals_df =prov_deaths_totals_df.melt(id_vars=['date'], var_name='province', 
                                                  value_name='deaths').set_index(['date'])
prov_deaths_totals_df['province'] = prov_deaths_totals_df['province'].map(province_names)
prov_deaths_totals_df

Unnamed: 0_level_0,province,deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1
25-04-2020,Eastern Cape,10
25-04-2020,Free State,5
25-04-2020,Gauteng,8
25-04-2020,KwaZula-Natal,29
25-04-2020,Limpopo,2
25-04-2020,Mpumalanga,0
25-04-2020,North West,0
25-04-2020,Northern Cape,0
25-04-2020,Western Cape,32


**Save to csv**

In [347]:
prov_deaths_totals_df.to_csv('data/tot_deaths_provinces.csv',index=True)

### Recovered

In [348]:
prov_recovered_totals_df = pd.DataFrame([prov_recovered_totals])
prov_recovered_totals_df['date'] = date
prov_recovered_totals_df =prov_recovered_totals_df.melt(id_vars=['date'], var_name='province', 
                                                  value_name='recovered').set_index(['date'])
prov_recovered_totals_df['province'] = prov_recovered_totals_df['province'].map(province_names)
prov_recovered_totals_df

Unnamed: 0_level_0,province,recovered
date,Unnamed: 1_level_1,Unnamed: 2_level_1
25-04-2020,Eastern Cape,19
25-04-2020,Free State,76
25-04-2020,Gauteng,843
25-04-2020,KwaZula-Natal,241
25-04-2020,Limpopo,24
25-04-2020,Mpumalanga,15
25-04-2020,North West,13
25-04-2020,Northern Cape,6
25-04-2020,Western Cape,236


**Save to csv**

In [349]:
prov_recovered_totals_df.to_csv('data/tot_recovered_provinces.csv',index=True)