### Is Graph Valid

In [1]:
def is_chart_valid(file_path):
    chart_file = open(file_path)
    last_lines = []
    for line in chart_file:
        last_lines.append(line)
        if len(last_lines) > 3:
            last_lines.pop(0)
    chart_file.close()
    return last_lines

In [2]:
is_chart_valid('date_vs_daily_cases.html')

['        </div>\n', '</body>\n', '</html>']

# Data Preprocessing

In [1]:
# Enable Intellisense
%config IPCompleter.greedy=True

import pandas as pd
import numpy as np
import io
import requests
import seaborn as sns
from matplotlib import pyplot, dates
import plotly.express as px 
import datetime
import plotly.graph_objects as go
import plotly.io as pio
sns.set()

# Annoying warning
pd.options.mode.chained_assignment = None  # default='warn'

# Gen Data
## All Confirm

**No longer going to use above data**

# Over time

## Provinces
Taken from 'covid19za_provincial_cumulative_timeline_confirmed.csv'
### Cumulative

In [2]:
province_data_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_confirmed.csv"

province_data_req = requests.get(province_data_url).content

province_data = pd.read_csv(io.StringIO(province_data_req.decode('utf-8')), delimiter = ',')
province_data.drop(['YYYYMMDD'], axis = 1, inplace=True)
province_data.tail()

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN,total,source
60,06-05-2020,849.0,130.0,1720.0,1149.0,40.0,57.0,26.0,37.0,3760.0,0.0,7808,https://twitter.com/HealthZA/status/1258130333...
61,07-05-2020,929.0,134.0,1804.0,1204.0,41.0,59.0,27.0,40.0,3994.0,0.0,8232,https://sacoronavirus.co.za/2020/05/07/update-...
62,08-05-2020,989.0,133.0,1851.0,1253.0,43.0,60.0,27.0,42.0,4497.0,0.0,8895,https://sacoronavirus.co.za/2020/05/08/update-...
63,09-05-2020,1078.0,134.0,1910.0,1308.0,51.0,61.0,28.0,41.0,4809.0,0.0,9420,https://sacoronavirus.co.za/2020/05/09/update-...
64,10-05-2020,1218.0,135.0,1952.0,1353.0,54.0,61.0,29.0,45.0,5168.0,0.0,10015,https://sacoronavirus.co.za/2020/05/10/update-...


In [3]:
province_data.drop(['total','source'],axis=1, inplace = True)
province_data.dropna(inplace=True)
province_data.tail()

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN
60,06-05-2020,849.0,130.0,1720.0,1149.0,40.0,57.0,26.0,37.0,3760.0,0.0
61,07-05-2020,929.0,134.0,1804.0,1204.0,41.0,59.0,27.0,40.0,3994.0,0.0
62,08-05-2020,989.0,133.0,1851.0,1253.0,43.0,60.0,27.0,42.0,4497.0,0.0
63,09-05-2020,1078.0,134.0,1910.0,1308.0,51.0,61.0,28.0,41.0,4809.0,0.0
64,10-05-2020,1218.0,135.0,1952.0,1353.0,54.0,61.0,29.0,45.0,5168.0,0.0


In [4]:
province_data.to_csv('data/daily_prov.csv', index = False)

In [5]:
province_data = pd.read_csv('data/daily_prov.csv')
province_data.tail()

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN
58,06-05-2020,849.0,130.0,1720.0,1149.0,40.0,57.0,26.0,37.0,3760.0,0.0
59,07-05-2020,929.0,134.0,1804.0,1204.0,41.0,59.0,27.0,40.0,3994.0,0.0
60,08-05-2020,989.0,133.0,1851.0,1253.0,43.0,60.0,27.0,42.0,4497.0,0.0
61,09-05-2020,1078.0,134.0,1910.0,1308.0,51.0,61.0,28.0,41.0,4809.0,0.0
62,10-05-2020,1218.0,135.0,1952.0,1353.0,54.0,61.0,29.0,45.0,5168.0,0.0


In [6]:
province_data['date'] = pd.to_datetime(province_data['date'], format='%d-%m-%Y')

In [7]:
province_data_melt = province_data.melt(id_vars=['date'], var_name='province', value_name='cumulative_cases')
province_data_melt

Unnamed: 0,date,province,cumulative_cases
0,2020-03-05,EC,0.0
1,2020-03-07,EC,0.0
2,2020-03-08,EC,0.0
3,2020-03-09,EC,0.0
4,2020-03-11,EC,0.0
...,...,...,...
625,2020-05-06,UNKNOWN,0.0
626,2020-05-07,UNKNOWN,0.0
627,2020-05-08,UNKNOWN,0.0
628,2020-05-09,UNKNOWN,0.0


In [8]:
province_names = {
#     "EC":"Eastern Cape",
#     "FS" : "Free State",
#     "GP" : "Gauteng",
#     "KZN" : "KwaZula-Natal",
#     "LP" : "Limpopo",
#     "MP" : "Mpumalanga",
#     "NW" : "North West",
#     "NC" : "Northern Cape",
#     "WC" : "Western Cape",
    "UNKNOWN": "UNK"
}
# province_data_melt['province'] = province_data_melt['province'].map(province_names)
province_data_melt['province'] = province_data_melt['province'].replace("UNKNOWN","UNK")
province_data_melt

Unnamed: 0,date,province,cumulative_cases
0,2020-03-05,EC,0.0
1,2020-03-07,EC,0.0
2,2020-03-08,EC,0.0
3,2020-03-09,EC,0.0
4,2020-03-11,EC,0.0
...,...,...,...
625,2020-05-06,UNK,0.0
626,2020-05-07,UNK,0.0
627,2020-05-08,UNK,0.0
628,2020-05-09,UNK,0.0


### Daily

In [9]:
province_data_daily = province_data.copy()
province_data_daily.iloc[1:,1:] = province_data_daily.iloc[:,1:].diff().iloc[1:,:]
province_data_daily.tail()
# province_data_daily

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN
58,2020-05-06,11.0,2.0,23.0,7.0,0.0,0.0,0.0,2.0,151.0,0.0
59,2020-05-07,80.0,4.0,84.0,55.0,1.0,2.0,1.0,3.0,234.0,0.0
60,2020-05-08,60.0,-1.0,47.0,49.0,2.0,1.0,0.0,2.0,503.0,0.0
61,2020-05-09,89.0,1.0,59.0,55.0,8.0,1.0,1.0,-1.0,312.0,0.0
62,2020-05-10,140.0,1.0,42.0,45.0,3.0,0.0,1.0,4.0,359.0,0.0


In [10]:
province_data_daily_melt = province_data_daily.melt(id_vars=['date'], var_name='province', 
                                                    value_name='daily_cases')
province_data_daily_melt.tail()

Unnamed: 0,date,province,daily_cases
625,2020-05-06,UNKNOWN,0.0
626,2020-05-07,UNKNOWN,0.0
627,2020-05-08,UNKNOWN,0.0
628,2020-05-09,UNKNOWN,0.0
629,2020-05-10,UNKNOWN,0.0


In [11]:
province_names = {
#     "EC":"Eastern Cape",
#     "FS" : "Free State",
#     "GP" : "Gauteng",
#     "KZN" : "KwaZula-Natal",
#     "LP" : "Limpopo",
#     "MP" : "Mpumalanga",
#     "NW" : "North West",
#     "NC" : "Northern Cape",
#     "WC" : "Western Cape",
    "UNKNOWN": "UNK"
}
# province_data_daily_melt['province'] = province_data_daily_melt['province'].map(province_names)
province_data_daily_melt['province'] = province_data_daily_melt['province'].replace("UNKNOWN","UNK")
province_data_daily_melt

Unnamed: 0,date,province,daily_cases
0,2020-03-05,EC,0.0
1,2020-03-07,EC,0.0
2,2020-03-08,EC,0.0
3,2020-03-09,EC,0.0
4,2020-03-11,EC,0.0
...,...,...,...
625,2020-05-06,UNK,0.0
626,2020-05-07,UNK,0.0
627,2020-05-08,UNK,0.0
628,2020-05-09,UNK,0.0


### Concatenate Cumulative & Daily

In [12]:
prov_cumulative_daily = pd.concat([province_data_melt, province_data_daily_melt['daily_cases']], axis =1)
prov_cumulative_daily.tail()

Unnamed: 0,date,province,cumulative_cases,daily_cases
625,2020-05-06,UNK,0.0,0.0
626,2020-05-07,UNK,0.0,0.0
627,2020-05-08,UNK,0.0,0.0
628,2020-05-09,UNK,0.0,0.0
629,2020-05-10,UNK,0.0,0.0


**Save to csv**

In [13]:
prov_cumulative_daily.to_csv('data/daily_cumulative_confirmed_prov.csv', index=False)

## Tests

In [14]:
tests_data_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_timeline_testing.csv"

tests_data_req = requests.get(tests_data_url).content

tests_data = pd.read_csv(io.StringIO(tests_data_req.decode('utf-8')), delimiter = ',',
                         usecols=['date','cumulative_tests'])
tests_data.dropna(inplace=True)

So far tests_data only includes cumulative.

In [15]:
tests_data

Unnamed: 0,date,cumulative_tests
0,11-02-2020,61.0
1,13-02-2020,67.0
2,14-02-2020,71.0
3,19-02-2020,95.0
4,20-02-2020,106.0
...,...,...
67,06-05-2020,279379.0
68,07-05-2020,292153.0
69,08-05-2020,307752.0
70,09-05-2020,324079.0


In [16]:
tests_data.to_csv('data/tests_data.csv', index=False)

### Daily

In [17]:
tests_data['daily_tests'] = tests_data['cumulative_tests']
# slice is required as first entry of diff will be NaN but data we want should be equal to the starting value
tests_data['daily_tests'][1:] = tests_data['cumulative_tests'].diff()[1:] 
tests_data

Unnamed: 0,date,cumulative_tests,daily_tests
0,11-02-2020,61.0,61.0
1,13-02-2020,67.0,6.0
2,14-02-2020,71.0,4.0
3,19-02-2020,95.0,24.0
4,20-02-2020,106.0,11.0
...,...,...,...
67,06-05-2020,279379.0,11315.0
68,07-05-2020,292153.0,12774.0
69,08-05-2020,307752.0,15599.0
70,09-05-2020,324079.0,16327.0


#### Save to CSV

In [18]:
tests_data.to_csv('data/daily_cumulative_tests.csv', index=False)

## Confirmed Cases
Get no of cumulative cases from 'covid19za_provincial_cumulative_timeline_confirmed.csv'
### Cumulative

In [19]:
confirmed_data_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_confirmed.csv"

confirmed_data_req = requests.get(confirmed_data_url).content

confirmed_data = pd.read_csv(io.StringIO(confirmed_data_req.decode('utf-8')), usecols=['date','total'])

confirmed_data.tail()

Unnamed: 0,date,total
60,06-05-2020,7808
61,07-05-2020,8232
62,08-05-2020,8895
63,09-05-2020,9420
64,10-05-2020,10015


In [20]:
confirmed_data['date'] = pd.to_datetime(confirmed_data['date'], format='%d-%m-%Y')
confirmed_data.set_index('date', inplace=True)
confirmed_data.rename(columns={"total":"cumulative_cases"}, inplace = True)
confirmed_data.tail()

Unnamed: 0_level_0,cumulative_cases
date,Unnamed: 1_level_1
2020-05-06,7808
2020-05-07,8232
2020-05-08,8895
2020-05-09,9420
2020-05-10,10015


### Daily

In [21]:
confirmed_data['daily_cases'] = confirmed_data['cumulative_cases']
# slice is required as first entry of diff will be NaN but data we want should be equal to the starting value
confirmed_data['daily_cases'][1:] = confirmed_data['cumulative_cases'].diff()[1:] 
confirmed_data

Unnamed: 0_level_0,cumulative_cases,daily_cases
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-05,1,1.0
2020-03-07,2,1.0
2020-03-08,3,1.0
2020-03-09,7,4.0
2020-03-11,13,6.0
...,...,...
2020-05-06,7808,236.0
2020-05-07,8232,424.0
2020-05-08,8895,663.0
2020-05-09,9420,525.0


**Save to csv**

In [22]:
confirmed_data.to_csv('data/daily_cumulative_confirmed.csv')

### Daily

In [23]:
# confirmed_data = confirmed_all_data.groupby(['date']).count()[['province']]
# confirmed_data.rename(columns={'province':'daily_cases'}, inplace = True)
# confirmed_data

### Cumulative

In [24]:
# confirmed_data['cumulative_cases'] = confirmed_data['daily_cases'].cumsum()
# confirmed_data

**Save to csv**

Above method is no longer being used. Thus why below is commented out.

In [25]:
# confirmed_data.to_csv('data/daily_cumulative_confirmed.csv')

**Use saved data rather than above**

In [26]:
# confirmed_data = pd.read_csv('data/daily_cumulative_confirmed.csv')
# confirmed_data['date'] = pd.to_datetime(confirmed_data['date'], format='%Y-%m-%d')
# confirmed_data.set_index('date', inplace=True)
# confirmed_data

**Add data to above**

Data to be added:

In [27]:
# # Format must be dd-mm-YYYY
# date_str = "29-03-2020"
# date_dt = pd.to_datetime(date_str, format='%d-%m-%Y')
# new_tot_cases = 1326

In [28]:
# new_daily_cases = new_tot_cases - confirmed_data.iloc[-1]['cumulative_cases']
# new_df_entry = pd.DataFrame({"date":[date_dt],
#              "daily_cases":[new_daily_cases],
#              "cumulative_cases":[new_tot_cases]}).set_index('date')
# confirmed_data = pd.concat([confirmed_data, new_df_entry])
# confirmed_data.tail()

**Save to csv**

In [29]:
# confirmed_data.to_csv('data/daily_cumulative_confirmed.csv')

## Confirmed & Tests
### Daily

In [30]:
confirmed_data_tmp = confirmed_data.reset_index()
confirmed_data_tmp['date'] = pd.to_datetime(confirmed_data_tmp['date'], format='%d-%m-%Y')
confirmed_data_tmp.set_index('date', inplace = True)
confirmed_data_tmp.tail()

Unnamed: 0_level_0,cumulative_cases,daily_cases
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-05-06,7808,236.0
2020-05-07,8232,424.0
2020-05-08,8895,663.0
2020-05-09,9420,525.0
2020-05-10,10015,595.0


In [31]:
tests_data_tmp = tests_data.copy()
# test_data_tmp.info()
tests_data_tmp['date'] = pd.to_datetime(tests_data_tmp['date'], format='%d-%m-%Y')
tests_data_tmp.set_index('date', inplace = True)
tests_data_tmp.tail()

Unnamed: 0_level_0,cumulative_tests,daily_tests
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-05-06,279379.0,11315.0
2020-05-07,292153.0,12774.0
2020-05-08,307752.0,15599.0
2020-05-09,324079.0,16327.0
2020-05-10,341336.0,17257.0


In [32]:
daily_tests_confirmed = pd.concat([confirmed_data_tmp[['daily_cases']], 
                                   tests_data_tmp['daily_tests']], axis = 1, sort = True)
daily_tests_confirmed['daily_cases'].fillna(0,inplace=True)
daily_tests_confirmed

Unnamed: 0_level_0,daily_cases,daily_tests
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-02-11,0.0,61.0
2020-02-13,0.0,6.0
2020-02-14,0.0,4.0
2020-02-19,0.0,24.0
2020-02-20,0.0,11.0
...,...,...
2020-05-06,236.0,11315.0
2020-05-07,424.0,12774.0
2020-05-08,663.0,15599.0
2020-05-09,525.0,16327.0


NaN for daily_tests corresponds to test data not available.
### Percentage of Positive Tests

In [33]:
daily_tests_confirmed['perc_positive'] = (daily_tests_confirmed['daily_cases']/
                                          daily_tests_confirmed['daily_tests']) * 100
daily_tests_confirmed['perc_positive'] = daily_tests_confirmed['perc_positive'].round(1)
daily_tests_confirmed

Unnamed: 0_level_0,daily_cases,daily_tests,perc_positive
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-02-11,0.0,61.0,0.0
2020-02-13,0.0,6.0,0.0
2020-02-14,0.0,4.0,0.0
2020-02-19,0.0,24.0,0.0
2020-02-20,0.0,11.0,0.0
...,...,...,...
2020-05-06,236.0,11315.0,2.1
2020-05-07,424.0,12774.0,3.3
2020-05-08,663.0,15599.0,4.3
2020-05-09,525.0,16327.0,3.2


**Save to csv**

In [34]:
daily_tests_confirmed.to_csv('data/daily_tests_confirmed.csv')

### Cumulative

In [35]:
cumulative_tests_confirmed = daily_tests_confirmed.cumsum()
cumulative_tests_confirmed

Unnamed: 0_level_0,daily_cases,daily_tests,perc_positive
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-02-11,0.0,61.0,0.0
2020-02-13,0.0,67.0,0.0
2020-02-14,0.0,71.0,0.0
2020-02-19,0.0,95.0,0.0
2020-02-20,0.0,106.0,0.0
...,...,...,...
2020-05-06,7808.0,279379.0,180.7
2020-05-07,8232.0,292153.0,184.0
2020-05-08,8895.0,307752.0,188.3
2020-05-09,9420.0,324079.0,191.5


// TODO - Rename to Cumulative as this is confusing

### Percentage of Positive Tests

In [36]:
cumulative_tests_confirmed['perc_positive'] = (cumulative_tests_confirmed['daily_cases']/
                                          cumulative_tests_confirmed['daily_tests']) * 100
cumulative_tests_confirmed['perc_positive'] = cumulative_tests_confirmed['perc_positive'].round(1)
cumulative_tests_confirmed

Unnamed: 0_level_0,daily_cases,daily_tests,perc_positive
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-02-11,0.0,61.0,0.0
2020-02-13,0.0,67.0,0.0
2020-02-14,0.0,71.0,0.0
2020-02-19,0.0,95.0,0.0
2020-02-20,0.0,106.0,0.0
...,...,...,...
2020-05-06,7808.0,279379.0,2.8
2020-05-07,8232.0,292153.0,2.8
2020-05-08,8895.0,307752.0,2.9
2020-05-09,9420.0,324079.0,2.9


**Save to csv**

In [37]:
cumulative_tests_confirmed.to_csv('data/cumulative_tests_confirmed.csv')

# Deaths & Recoveries
## Recoveries

In [38]:
recovered_data_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_timeline_testing.csv"
recovered_data_req = requests.get(recovered_data_url).content

recovered_data = pd.read_csv(io.StringIO(recovered_data_req.decode('utf-8')), delimiter = ',', 
                            usecols=['date','recovered'])
# rename recovered
recovered_data.rename(columns={"recovered":"cum_recovered"}, inplace=True)

# drop fields where recovered is 0
recovered_data.drop(recovered_data[recovered_data['cum_recovered']==0]['cum_recovered'].index, inplace=True)
recovered_data

Unnamed: 0,date,cum_recovered
22,22-03-2020,1
23,23-03-2020,1
24,24-03-2020,2
25,25-03-2020,4
26,26-03-2020,4
27,27-03-2020,31
28,28-03-2020,31
29,29-03-2020,31
30,30-03-2020,31
31,31-03-2020,31


In [39]:
recovered_data['date'] = pd.to_datetime(recovered_data['date'], format='%d-%m-%Y')
recovered_data.set_index('date', inplace = True)
recovered_data

Unnamed: 0_level_0,cum_recovered
date,Unnamed: 1_level_1
2020-03-22,1
2020-03-23,1
2020-03-24,2
2020-03-25,4
2020-03-26,4
2020-03-27,31
2020-03-28,31
2020-03-29,31
2020-03-30,31
2020-03-31,31


In [40]:
recovered_data['daily_recovered'] = recovered_data['cum_recovered']
# slice is required as first entry of diff will be NaN but data we want should be equal to the starting value
recovered_data['daily_recovered'][1:] = recovered_data['cum_recovered'].diff()[1:] 
recovered_data

Unnamed: 0_level_0,cum_recovered,daily_recovered
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-22,1,1.0
2020-03-23,1,0.0
2020-03-24,2,1.0
2020-03-25,4,2.0
2020-03-26,4,0.0
2020-03-27,31,27.0
2020-03-28,31,0.0
2020-03-29,31,0.0
2020-03-30,31,0.0
2020-03-31,31,0.0


**Save to csv**

In [41]:
recovered_data.to_csv('data/recovered_data.csv')

## Deaths

In [42]:
start_deaths_data_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_timeline_deaths.csv"
start_deaths_data_req = requests.get(start_deaths_data_url).content

start_deaths_data = pd.read_csv(io.StringIO(start_deaths_data_req.decode('utf-8')), delimiter = ',',
                            usecols=['date', 'province']).head(13)

start_deaths_data

Unnamed: 0,date,province
0,27-03-2020,WC
1,28-03-2020,KZN
2,30-03-2020,FS
3,31-03-2020,GP
4,31-03-2020,KZN
5,03-04-2020,KZN
6,03-04-2020,KZN
7,03-04-2020,KZN
8,03-04-2020,KZN
9,05-04-2020,WC


In [43]:
start_deaths_data['date'] = pd.to_datetime(start_deaths_data['date'], format='%d-%m-%Y')
start_deaths_data.tail()

Unnamed: 0,date,province
8,2020-04-03,KZN
9,2020-04-05,WC
10,2020-04-05,KZN
11,2020-04-06,WC
12,2020-04-07,KZN


In [44]:
start_deaths_data = start_deaths_data.groupby(['date']).count()
start_deaths_data.rename({"province":"daily_deaths"}, axis=1, inplace = True)
start_deaths_data

Unnamed: 0_level_0,daily_deaths
date,Unnamed: 1_level_1
2020-03-27,1
2020-03-28,1
2020-03-30,1
2020-03-31,2
2020-04-03,4
2020-04-05,2
2020-04-06,1
2020-04-07,1


In [45]:
start_deaths_data['cum_deaths'] = start_deaths_data[['daily_deaths']].cumsum()
start_deaths_data

Unnamed: 0_level_0,daily_deaths,cum_deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-27,1,1
2020-03-28,1,2
2020-03-30,1,3
2020-03-31,2,5
2020-04-03,4,9
2020-04-05,2,11
2020-04-06,1,12
2020-04-07,1,13


#### Deaths data from 8 April onwards
From https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_deaths.csv

In [46]:
deaths_data_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_deaths.csv"

deaths_data_req = requests.get(deaths_data_url).content

deaths_data = pd.read_csv(io.StringIO(deaths_data_req.decode('utf-8')), usecols=['date','total'])

deaths_data.tail()

Unnamed: 0,date,total
36,06-05-2020,153
37,07-05-2020,161
38,08-05-2020,178
39,09-05-2020,186
40,10-05-2020,194


In [47]:
deaths_data['date'] = pd.to_datetime(deaths_data['date'], format='%d-%m-%Y')
deaths_data.set_index('date', inplace=True)
deaths_data.rename(columns={"total":"cum_deaths"}, inplace = True)
deaths_data.tail()

Unnamed: 0_level_0,cum_deaths
date,Unnamed: 1_level_1
2020-05-06,153
2020-05-07,161
2020-05-08,178
2020-05-09,186
2020-05-10,194


### Daily

In [48]:
# deaths_data.iloc[0]['cum_deaths'] - start_deaths_data.iloc[-1]['cum_deaths']

In [49]:
deaths_data['daily_deaths'] = deaths_data['cum_deaths']
deaths_data['daily_deaths'].iloc[1:] = deaths_data['cum_deaths'].diff()[1:]
# deaths_data['daily_deaths'][0] = deaths_data.iloc[0]['cum_deaths'] - start_deaths_data.iloc[-1]['cum_deaths']
deaths_data

Unnamed: 0_level_0,cum_deaths,daily_deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-27,1,1.0
2020-03-28,2,1.0
2020-03-30,3,1.0
2020-03-31,5,2.0
2020-04-03,9,4.0
2020-04-05,11,2.0
2020-04-06,12,1.0
2020-04-07,13,1.0
2020-04-08,18,5.0
2020-04-09,18,0.0


**Concat missing data from before 8 April**

In [50]:
# deaths_data=pd.concat([start_deaths_data, deaths_data])
# deaths_data

**Save to csv**

In [51]:
deaths_data.to_csv('data/daily_cum_deaths.csv')

### Death Per Province

In [52]:
deaths_data_prov_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_deaths.csv"
deaths_data_prov_req = requests.get(deaths_data_prov_url).content

deaths_prov_data = pd.read_csv(io.StringIO(deaths_data_prov_req.decode('utf-8'))).drop("YYYYMMDD", axis =1)
deaths_prov_data['date'] = pd.to_datetime(deaths_prov_data['date'], format='%d-%m-%Y')

deaths_prov_data.tail()

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN,total,source
36,2020-05-06,18,6,15,38,3,0,0,0,73,0,153,https://twitter.com/HealthZA/status/1258130333...
37,2020-05-07,18,6,15,40,3,0,0,0,79,0,161,https://twitter.com/nicd_sa/status/12584671537...
38,2020-05-08,21,6,18,42,3,0,0,0,88,0,178,https://twitter.com/nicd_sa/status/12588518999...
39,2020-05-09,21,6,19,42,3,0,0,0,95,0,186,https://twitter.com/nicd_sa/status/12591254301...
40,2020-05-10,22,6,22,43,3,0,0,0,98,0,194,https://sacoronavirus.co.za/2020/05/10/update-...


In [53]:
deaths_prov_data.drop(['total','source'], axis = 1, inplace = True)
deaths_prov_data.tail()

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN
36,2020-05-06,18,6,15,38,3,0,0,0,73,0
37,2020-05-07,18,6,15,40,3,0,0,0,79,0
38,2020-05-08,21,6,18,42,3,0,0,0,88,0
39,2020-05-09,21,6,19,42,3,0,0,0,95,0
40,2020-05-10,22,6,22,43,3,0,0,0,98,0


**Confirmed Data**

In [54]:
confirmed_data_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_confirmed.csv"

confirmed_data_req = requests.get(confirmed_data_url).content

confirmed_data = pd.read_csv(io.StringIO(confirmed_data_req.decode('utf-8')), usecols=['date','total'])

confirmed_data.tail()

Unnamed: 0,date,total
60,06-05-2020,7808
61,07-05-2020,8232
62,08-05-2020,8895
63,09-05-2020,9420
64,10-05-2020,10015


In [55]:
confirmed_data['date'] = pd.to_datetime(confirmed_data['date'], format='%d-%m-%Y')
confirmed_data.set_index('date', inplace=True)
confirmed_data.rename(columns={"total":"cumulative_cases"}, inplace = True)
confirmed_data.tail()

Unnamed: 0_level_0,cumulative_cases
date,Unnamed: 1_level_1
2020-05-06,7808
2020-05-07,8232
2020-05-08,8895
2020-05-09,9420
2020-05-10,10015


### Daily

In [56]:
confirmed_data['daily_cases'] = confirmed_data['cumulative_cases']
# slice is required as first entry of diff will be NaN but data we want should be equal to the starting value
confirmed_data['daily_cases'][1:] = confirmed_data['cumulative_cases'].diff()[1:] 
confirmed_data.head()

Unnamed: 0_level_0,cumulative_cases,daily_cases
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-05,1,1.0
2020-03-07,2,1.0
2020-03-08,3,1.0
2020-03-09,7,4.0
2020-03-11,13,6.0


**Save to csv**

In [57]:
confirmed_data.to_csv('data/daily_cumulative_confirmed.csv')

### Per Province

In [58]:
# deaths_data_per_day = deaths_data.groupby('date')[['province']].count()

### Total per day
#### Daily

In [59]:
# deaths_data_per_day = deaths_data.groupby('date')[['province']].count()
# deaths_data_per_day.rename(columns={"province":"daily_deaths"},inplace=True)
# deaths_data_per_day

Manually add entry

#### Cumulative

In [60]:
# deaths_data_per_day['cum_deaths']=deaths_data_per_day['daily_deaths'].cumsum()
# deaths_data_per_day

In [61]:
def add_deaths(org_df, date_of_deaths, cumulative_deaths):
    date_dt = pd.to_datetime(date_of_deaths, format='%d-%m-%Y') # Format must be dd-mm-YYYY
    new_tot_deaths = cumulative_deaths

    new_daily_deaths = new_tot_deaths - org_df.iloc[-1]['cum_deaths']
    new_df_entry = pd.DataFrame({"date":[date_dt], 
                             "daily_deaths":[new_daily_deaths],
                             "cum_deaths":[new_tot_deaths],}).set_index('date')
    new_df = pd.concat([org_df, new_df_entry])
    return new_df

In [62]:
# # deaths_data_per_day = add_deaths(deaths_data_per_day, "17-04-2020", 50) # 17th April - 50 deaths
# # deaths_data_per_day = add_deaths(deaths_data_per_day, "18-04-2020", 52) # 18th April - 52 deaths
# deaths_data = add_deaths(deaths_data, "2-05-2020", 123)

# deaths_data

**Save to csv**

In [63]:
deaths_data

Unnamed: 0_level_0,cum_deaths,daily_deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-27,1,1.0
2020-03-28,2,1.0
2020-03-30,3,1.0
2020-03-31,5,2.0
2020-04-03,9,4.0
2020-04-05,11,2.0
2020-04-06,12,1.0
2020-04-07,13,1.0
2020-04-08,18,5.0
2020-04-09,18,0.0


In [64]:
deaths_data.to_csv('data/daily_cum_deaths.csv')

#### Deaths Vs Recovered

In [65]:
deaths_vs_recoveries = pd.concat([deaths_data[['cum_deaths']], recovered_data['cum_recovered']], 
                                 axis =1)
deaths_vs_recoveries.iloc[0,0] = 0
deaths_vs_recoveries['cum_deaths'].ffill(inplace=True)
deaths_vs_recoveries['cum_recovered'].ffill(inplace=True)
deaths_vs_recoveries

Unnamed: 0_level_0,cum_deaths,cum_recovered
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-22,0.0,1
2020-03-23,0.0,1
2020-03-24,0.0,2
2020-03-25,0.0,4
2020-03-26,0.0,4
2020-03-27,1.0,31
2020-03-28,2.0,31
2020-03-29,2.0,31
2020-03-30,3.0,31
2020-03-31,5.0,31


**Save to csv**

In [66]:
deaths_vs_recoveries.to_csv('data/deaths_vs_recoveries.csv')

# Totals
## Province
### Confirmed Cases

In [67]:
province_names = {
    "EC":"Eastern Cape",
    "FS" : "Free State",
    "GP" : "Gauteng",
    "KZN" : "KwaZulu-Natal",
    "LP" : "Limpopo",
    "MP" : "Mpumalanga",
    "NW" : "North West",
    "NC" : "Northern Cape",
    "WC" : "Western Cape",
    "UNKNOWN": "Unknown"
}

In [68]:
prov_totals = province_data.tail(1).copy()
prov_totals = prov_totals.melt(id_vars=['date'], var_name='province', value_name='total')
prov_totals['province'] = prov_totals['province'].map(province_names)
prov_totals.drop('date',axis=1,inplace=True)
prov_totals

Unnamed: 0,province,total
0,Eastern Cape,1218.0
1,Free State,135.0
2,Gauteng,1952.0
3,KwaZulu-Natal,1353.0
4,Limpopo,54.0
5,Mpumalanga,61.0
6,Northern Cape,29.0
7,North West,45.0
8,Western Cape,5168.0
9,Unknown,0.0


**Save to csv**

In [69]:
prov_totals.to_csv('data/tot_provinces.csv',index=False)

### Deaths

In [70]:
tot_deaths_per_province = deaths_prov_data.tail(1)
tot_deaths_per_province = tot_deaths_per_province.melt(id_vars=['date'], var_name='province', 
                                              value_name='tot_deaths')
tot_deaths_per_province.drop(['date'], axis=1, inplace = True)
tot_deaths_per_province['province'] = tot_deaths_per_province['province'].map(province_names)
tot_deaths_per_province.set_index(['province'], inplace = True)
tot_deaths_per_province

Unnamed: 0_level_0,tot_deaths
province,Unnamed: 1_level_1
Eastern Cape,22
Free State,6
Gauteng,22
KwaZulu-Natal,43
Limpopo,3
Mpumalanga,0
Northern Cape,0
North West,0
Western Cape,98
Unknown,0


**Save to csv**

In [71]:
tot_deaths_per_province.to_csv('data/tot_deaths_provinces.csv',index=True)

## Deaths Per Prov

In [72]:
# deaths_data_copy.drop(['gender','age'], axis =1, inplace=True)
# deaths_data_copy

#### Daily

In [73]:
# deaths_data_per_prov = deaths_data_copy.copy().fillna('Unknown')
# deaths_data_per_prov['total'] = 1
# # deaths_data_per_prov
# deaths_data_per_prov = deaths_data_per_prov.groupby(['date','province']).count()
# deaths_data_per_prov.tail()

In [74]:
# cum_deaths_data_per_prov = deaths_data_per_prov.groupby(level=-1)[['total']].cumsum()
# cum_deaths_data_per_prov.head()
# # deaths_data_per_prov
# # cum_deaths_data_per_prov.groupby(level=-1)['total'].cumsum()

## Data to be displayed as text on website

In [76]:
def zero_space(num):
    return format(num,',d').replace(","," ")

In [77]:
tot_tested = zero_space(tests_data.tail(1).iloc[0]['cumulative_tests'].astype(int))
change_tested = zero_space(tests_data.tail(1).iloc[0]['daily_tests'].astype(int))
print(tot_tested, change_tested)

341 336 17 257


In [78]:
tot_infected = zero_space(confirmed_data.tail(1).iloc[0]['cumulative_cases'].astype(int))
change_infected = zero_space(confirmed_data.tail(1).iloc[0]['daily_cases'].astype(int))
print(tot_infected, change_infected)

10 015 595


In [79]:
tot_deaths = zero_space(deaths_vs_recoveries.tail(1).iloc[0]['cum_deaths'].astype(int))
change_deaths = zero_space(deaths_vs_recoveries['cum_deaths'].diff().tail(1).iloc[0].astype(int))
print(tot_deaths, change_deaths)

194 8


In [80]:
tot_recoveries = zero_space(deaths_vs_recoveries.tail(1).iloc[0]['cum_recovered'].astype(int))
change_recoveries = zero_space(deaths_vs_recoveries['cum_recovered'].diff().tail(1).iloc[0].astype(int))
print(tot_recoveries, change_recoveries)

4 173 190


In [81]:
from datetime import datetime
now = datetime.now()
current_time = now.strftime("%H:%M %d %B %Y")
print("Current Time =", current_time)

Current Time = 10:34 11 May 2020


In [82]:
gen_data = pd.DataFrame(dict(tot_infected=[tot_infected], change_infected=[change_infected], 
                             tot_deaths=[tot_deaths], change_deaths=[change_deaths],
                             tot_tested=[tot_tested], change_tested=[change_tested], 
                             tot_recoveries=[tot_recoveries], change_recoveries=[change_recoveries], 
                             datetime_updated=[current_time]))
gen_data

Unnamed: 0,tot_infected,change_infected,tot_deaths,change_deaths,tot_tested,change_tested,tot_recoveries,change_recoveries,datetime_updated
0,10 015,595,194,8,341 336,17 257,4 173,190,10:34 11 May 2020


In [83]:
gen_data.to_csv("data/gen_data.csv", index=False)

**Render Template**

In [84]:
import template_renderer as tr # will render automatically

Template Rendered


# Data from Image
Use data_from_img.py code to load data from NICD infographic image specified

In [85]:
# import data_from_img

**Path of image to be processed:**

In [86]:
img_path = "NICD_updates/NICD_Updates_05_02.jpg"

**Get data from image**

In [87]:
# date, prov_deaths_totals, prov_recovered_totals = data_from_img.get_tot_data(img_path, load_deaths=True, 
#                                                                              load_recovered=True)
# # , prov_recovered_totals, gen_totals

In [88]:
# prov_recovered_totals

### Province Names Dict

In [89]:
province_names = {
    "EC":"Eastern Cape",
    "FS" : "Free State",
    "GP" : "Gauteng",
    "KZN" : "KwaZula-Natal",
    "LP" : "Limpopo",
    "MP" : "Mpumalanga",
    "NW" : "North West",
    "NC" : "Northern Cape",
    "WC" : "Western Cape",
    "UNKNOWN": "Unknown"
}

### Deaths

In [90]:
# tot_deaths_per_province = deaths_prov_data.tail(1)
# tot_deaths_per_province = tot_deaths_per_province.melt(id_vars=['date'], var_name='province', 
#                                               value_name='tot_deaths')
# tot_deaths_per_province.drop(['date'], axis=1, inplace = True)
# tot_deaths_per_province['province'] = tot_deaths_per_province['province'].map(province_names)
# tot_deaths_per_province.set_index(['province'], inplace = True)
# tot_deaths_per_province

**Save to csv**

In [91]:
# tot_deaths_per_province.to_csv('data/tot_deaths_provinces.csv',index=True)

### Recovered

In [92]:
# prov_recovered_totals_df = pd.DataFrame([prov_recovered_totals])
# prov_recovered_totals_df['date'] = date
# prov_recovered_totals_df =prov_recovered_totals_df.melt(id_vars=['date'], var_name='province', 
#                                                   value_name='recovered').set_index(['date'])
# prov_recovered_totals_df['province'] = prov_recovered_totals_df['province'].map(province_names)
# prov_recovered_totals_df

**Save to csv**

In [93]:
# prov_recovered_totals_df.to_csv('data/tot_recovered_provinces.csv',index=True)