# Data Preprocessing

In [1]:
# Enable Intellisense
%config IPCompleter.greedy=True

import pandas as pd
import numpy as np
import io
import requests
import seaborn as sns
from matplotlib import pyplot, dates
import plotly.express as px 
import datetime
import plotly.graph_objects as go
import plotly.io as pio
sns.set()

# Annoying warning
pd.options.mode.chained_assignment = None  # default='warn'

# Gen Data
## All Confirm

In [2]:
# confirm_data_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_timeline_confirmed.csv"

# confirm_data_req = requests.get(confirm_data_url).content

# confirmed_all_data = pd.read_csv(io.StringIO(confirm_data_req.decode('utf-8')), delimiter = ',', 
#                                  usecols=['date','province','age','gender','type'])
# confirmed_all_data.tail()
# del confirmed_all_data

**No longer going to use above data**

# Over time

## Provinces
Taken from 'covid19za_provincial_cumulative_timeline_confirmed.csv'
### Cumulative

In [3]:
province_data_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_confirmed.csv"

province_data_req = requests.get(province_data_url).content

province_data = pd.read_csv(io.StringIO(province_data_req.decode('utf-8')), delimiter = ',')
province_data.drop(['YYYYMMDD'], axis = 1, inplace=True)
province_data.tail()

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN,total
26,02-04-2020,17.0,84.0,663.0,206.0,16.0,13.0,7.0,9.0,353.0,94.0,1462
27,03-04-2020,21.0,84.0,672.0,215.0,16.0,13.0,7.0,9.0,374.0,94.0,1505
28,04-04-2020,25.0,85.0,693.0,232.0,18.0,18.0,7.0,11.0,433.0,63.0,1585
29,05-04-2020,31.0,87.0,704.0,246.0,19.0,18.0,8.0,11.0,454.0,77.0,1655
30,06-04-2020,32.0,89.0,713.0,257.0,19.0,18.0,8.0,11.0,462.0,77.0,1686


In [4]:
province_data.drop('total',axis=1, inplace = True)
province_data.dropna(inplace=True)
province_data.tail()

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN
26,02-04-2020,17.0,84.0,663.0,206.0,16.0,13.0,7.0,9.0,353.0,94.0
27,03-04-2020,21.0,84.0,672.0,215.0,16.0,13.0,7.0,9.0,374.0,94.0
28,04-04-2020,25.0,85.0,693.0,232.0,18.0,18.0,7.0,11.0,433.0,63.0
29,05-04-2020,31.0,87.0,704.0,246.0,19.0,18.0,8.0,11.0,454.0,77.0
30,06-04-2020,32.0,89.0,713.0,257.0,19.0,18.0,8.0,11.0,462.0,77.0


In [5]:
province_data.to_csv('data/daily_prov.csv', index = False)

In [6]:
province_data = pd.read_csv('data/daily_prov.csv')
province_data.tail()

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN
25,02-04-2020,17.0,84.0,663.0,206.0,16.0,13.0,7.0,9.0,353.0,94.0
26,03-04-2020,21.0,84.0,672.0,215.0,16.0,13.0,7.0,9.0,374.0,94.0
27,04-04-2020,25.0,85.0,693.0,232.0,18.0,18.0,7.0,11.0,433.0,63.0
28,05-04-2020,31.0,87.0,704.0,246.0,19.0,18.0,8.0,11.0,454.0,77.0
29,06-04-2020,32.0,89.0,713.0,257.0,19.0,18.0,8.0,11.0,462.0,77.0


In [7]:
province_data['date'] = pd.to_datetime(province_data['date'], format='%d-%m-%Y')

In [8]:
province_data_melt = province_data.melt(id_vars=['date'], var_name='province', value_name='cumulative_cases')
province_data_melt

Unnamed: 0,date,province,cumulative_cases
0,2020-03-05,EC,0.0
1,2020-03-07,EC,0.0
2,2020-03-08,EC,0.0
3,2020-03-09,EC,0.0
4,2020-03-11,EC,0.0
...,...,...,...
295,2020-04-02,UNKNOWN,94.0
296,2020-04-03,UNKNOWN,94.0
297,2020-04-04,UNKNOWN,63.0
298,2020-04-05,UNKNOWN,77.0


In [9]:
province_names = {"EC":"Eastern Cape",
                  "FS" : "Free State",
                  "GP" : "Gauteng",
                  "KZN" : "KwaZula-Natal",
                  "LP" : "Limpopo",
                  "MP" : "Mpumalanga",
                  "NW" : "North West",
                  "NC" : "Northern Cape",
                  "WC" : "Western Cape",
                  "UNKNOWN": "Unknown"}
province_data_melt['province'] = province_data_melt['province'].map(province_names)
province_data_melt.tail()

Unnamed: 0,date,province,cumulative_cases
295,2020-04-02,Unknown,94.0
296,2020-04-03,Unknown,94.0
297,2020-04-04,Unknown,63.0
298,2020-04-05,Unknown,77.0
299,2020-04-06,Unknown,77.0


### Daily

In [10]:
province_data_daily = province_data.copy()
province_data_daily.iloc[1:,1:] = province_data_daily.iloc[:,1:].diff().iloc[1:,:]
province_data_daily.tail()
# province_data_daily

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN
25,2020-04-02,2.0,8.0,18.0,20.0,2.0,1.0,0.0,0.0,27.0,4.0
26,2020-04-03,4.0,0.0,9.0,9.0,0.0,0.0,0.0,0.0,21.0,0.0
27,2020-04-04,4.0,1.0,21.0,17.0,2.0,5.0,0.0,2.0,59.0,-31.0
28,2020-04-05,6.0,2.0,11.0,14.0,1.0,0.0,1.0,0.0,21.0,14.0
29,2020-04-06,1.0,2.0,9.0,11.0,0.0,0.0,0.0,0.0,8.0,0.0


In [11]:
province_data_daily_melt = province_data_daily.melt(id_vars=['date'], var_name='province', 
                                                    value_name='daily_cases')
province_data_daily_melt.tail()

Unnamed: 0,date,province,daily_cases
295,2020-04-02,UNKNOWN,4.0
296,2020-04-03,UNKNOWN,0.0
297,2020-04-04,UNKNOWN,-31.0
298,2020-04-05,UNKNOWN,14.0
299,2020-04-06,UNKNOWN,0.0


In [12]:
province_names = {"EC":"Eastern Cape",
                  "FS" : "Free State",
                  "GP" : "Gauteng",
                  "KZN" : "KwaZula-Natal",
                  "LP" : "Limpopo",
                  "MP" : "Mpumalanga",
                  "NW" : "North West",
                  "NC" : "Northern Cape",
                  "WC" : "Western Cape",
                  "UNKNOWN": "Unknown"}
province_data_daily_melt['province'] = province_data_daily_melt['province'].map(province_names)
province_data_daily_melt.tail()

Unnamed: 0,date,province,daily_cases
295,2020-04-02,Unknown,4.0
296,2020-04-03,Unknown,0.0
297,2020-04-04,Unknown,-31.0
298,2020-04-05,Unknown,14.0
299,2020-04-06,Unknown,0.0


### Concatenate Cumulative & Daily

In [13]:
prov_cumulative_daily = pd.concat([province_data_melt, province_data_daily_melt['daily_cases']], axis =1)
prov_cumulative_daily.tail()

Unnamed: 0,date,province,cumulative_cases,daily_cases
295,2020-04-02,Unknown,94.0,4.0
296,2020-04-03,Unknown,94.0,0.0
297,2020-04-04,Unknown,63.0,-31.0
298,2020-04-05,Unknown,77.0,14.0
299,2020-04-06,Unknown,77.0,0.0


**Save to csv**

In [14]:
prov_cumulative_daily.to_csv('data/daily_cumulative_confirmed_prov.csv', index=False)

## Tests

In [15]:
tests_data_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_timeline_testing.csv"

tests_data_req = requests.get(tests_data_url).content

tests_data = pd.read_csv(io.StringIO(tests_data_req.decode('utf-8')), delimiter = ',',
                         usecols=['date','cumulative_tests'])
tests_data.dropna(inplace=True)

So far tests_data only includes cumulative.

In [16]:
tests_data

Unnamed: 0,date,cumulative_tests
0,11-02-2020,61.0
1,13-02-2020,67.0
2,14-02-2020,71.0
3,19-02-2020,95.0
4,20-02-2020,106.0
5,24-02-2020,116.0
6,26-02-2020,121.0
7,02-03-2020,160.0
8,03-03-2020,164.0
9,06-03-2020,200.0


In [17]:
tests_data.to_csv('data/tests_data.csv', index=False)

### Daily

In [18]:
tests_data['daily_tests'] = tests_data['cumulative_tests']
# slice is required as first entry of diff will be NaN but data we want should be equal to the starting value
tests_data['daily_tests'][1:] = tests_data['cumulative_tests'].diff()[1:] 
tests_data

Unnamed: 0,date,cumulative_tests,daily_tests
0,11-02-2020,61.0,61.0
1,13-02-2020,67.0,6.0
2,14-02-2020,71.0,4.0
3,19-02-2020,95.0,24.0
4,20-02-2020,106.0,11.0
5,24-02-2020,116.0,10.0
6,26-02-2020,121.0,5.0
7,02-03-2020,160.0,39.0
8,03-03-2020,164.0,4.0
9,06-03-2020,200.0,36.0


#### Save to CSV

In [19]:
tests_data.to_csv('data/daily_cumulative_tests.csv', index=False)

## Confirmed Cases
Get no of cumulative cases from 'covid19za_provincial_cumulative_timeline_confirmed.csv'
### Cumulative

In [20]:
confirmed_data_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_confirmed.csv"

confirmed_data_req = requests.get(confirmed_data_url).content

confirmed_data = pd.read_csv(io.StringIO(confirmed_data_req.decode('utf-8')), usecols=['date','total'])

confirmed_data.tail()

Unnamed: 0,date,total
26,02-04-2020,1462
27,03-04-2020,1505
28,04-04-2020,1585
29,05-04-2020,1655
30,06-04-2020,1686


In [21]:
confirmed_data['date'] = pd.to_datetime(confirmed_data['date'], format='%d-%m-%Y')
confirmed_data.set_index('date', inplace=True)
confirmed_data.rename(columns={"total":"cumulative_cases"}, inplace = True)
confirmed_data

Unnamed: 0_level_0,cumulative_cases
date,Unnamed: 1_level_1
2020-03-05,1
2020-03-07,2
2020-03-08,3
2020-03-09,7
2020-03-11,13
2020-03-12,16
2020-03-13,24
2020-03-14,38
2020-03-15,51
2020-03-16,62


### Daily

In [22]:
confirmed_data['daily_cases'] = confirmed_data['cumulative_cases']
# slice is required as first entry of diff will be NaN but data we want should be equal to the starting value
confirmed_data['daily_cases'][1:] = confirmed_data['cumulative_cases'].diff()[1:] 
confirmed_data

Unnamed: 0_level_0,cumulative_cases,daily_cases
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-05,1,1.0
2020-03-07,2,1.0
2020-03-08,3,1.0
2020-03-09,7,4.0
2020-03-11,13,6.0
2020-03-12,16,3.0
2020-03-13,24,8.0
2020-03-14,38,14.0
2020-03-15,51,13.0
2020-03-16,62,11.0


**Save to csv**

In [23]:
confirmed_data.to_csv('data/daily_cumulative_confirmed.csv')

### Daily

In [24]:
# confirmed_data = confirmed_all_data.groupby(['date']).count()[['province']]
# confirmed_data.rename(columns={'province':'daily_cases'}, inplace = True)
# confirmed_data

### Cumulative

In [25]:
# confirmed_data['cumulative_cases'] = confirmed_data['daily_cases'].cumsum()
# confirmed_data

**Save to csv**

Above method is no longer being used. Thus why below is commented out.

In [26]:
# confirmed_data.to_csv('data/daily_cumulative_confirmed.csv')

**Use saved data rather than above**

In [27]:
# confirmed_data = pd.read_csv('data/daily_cumulative_confirmed.csv')
# confirmed_data['date'] = pd.to_datetime(confirmed_data['date'], format='%Y-%m-%d')
# confirmed_data.set_index('date', inplace=True)
# confirmed_data

**Add data to above**

Data to be added:

In [28]:
# # Format must be dd-mm-YYYY
# date_str = "29-03-2020"
# date_dt = pd.to_datetime(date_str, format='%d-%m-%Y')
# new_tot_cases = 1326

In [29]:
# new_daily_cases = new_tot_cases - confirmed_data.iloc[-1]['cumulative_cases']
# new_df_entry = pd.DataFrame({"date":[date_dt],
#              "daily_cases":[new_daily_cases],
#              "cumulative_cases":[new_tot_cases]}).set_index('date')
# confirmed_data = pd.concat([confirmed_data, new_df_entry])
# confirmed_data.tail()

**Save to csv**

In [30]:
# confirmed_data.to_csv('data/daily_cumulative_confirmed.csv')

## Confirmed & Tests
### Daily

In [31]:
confirmed_data_tmp = confirmed_data.reset_index()
confirmed_data_tmp['date'] = pd.to_datetime(confirmed_data_tmp['date'], format='%d-%m-%Y')
confirmed_data_tmp.set_index('date', inplace = True)
confirmed_data_tmp.tail()

Unnamed: 0_level_0,cumulative_cases,daily_cases
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-04-02,1462,82.0
2020-04-03,1505,43.0
2020-04-04,1585,80.0
2020-04-05,1655,70.0
2020-04-06,1686,31.0


In [32]:
tests_data_tmp = tests_data.copy()
# test_data_tmp.info()
tests_data_tmp['date'] = pd.to_datetime(tests_data_tmp['date'], format='%d-%m-%Y')
tests_data_tmp.set_index('date', inplace = True)
tests_data_tmp.tail()

Unnamed: 0_level_0,cumulative_tests,daily_tests
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-04-02,47965.0,3673.0
2020-04-03,50361.0,2396.0
2020-04-04,53937.0,3576.0
2020-04-05,56873.0,2936.0
2020-04-06,58098.0,1225.0


In [33]:
daily_tests_confirmed = pd.concat([confirmed_data_tmp[['daily_cases']], 
                                   tests_data_tmp['daily_tests']], axis = 1, sort = True)
daily_tests_confirmed['daily_cases'].fillna(0,inplace=True)
daily_tests_confirmed

Unnamed: 0_level_0,daily_cases,daily_tests
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-02-11,0.0,61.0
2020-02-13,0.0,6.0
2020-02-14,0.0,4.0
2020-02-19,0.0,24.0
2020-02-20,0.0,11.0
2020-02-24,0.0,10.0
2020-02-26,0.0,5.0
2020-03-02,0.0,39.0
2020-03-03,0.0,4.0
2020-03-05,1.0,


NaN for daily_tests corresponds to test data not available.
### Percentage of Positive Tests

In [34]:
daily_tests_confirmed['perc_positive'] = (daily_tests_confirmed['daily_cases']/
                                          daily_tests_confirmed['daily_tests']) * 100
daily_tests_confirmed['perc_positive'] = daily_tests_confirmed['perc_positive'].round(1)
daily_tests_confirmed

Unnamed: 0_level_0,daily_cases,daily_tests,perc_positive
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-02-11,0.0,61.0,0.0
2020-02-13,0.0,6.0,0.0
2020-02-14,0.0,4.0,0.0
2020-02-19,0.0,24.0,0.0
2020-02-20,0.0,11.0,0.0
2020-02-24,0.0,10.0,0.0
2020-02-26,0.0,5.0,0.0
2020-03-02,0.0,39.0,0.0
2020-03-03,0.0,4.0,0.0
2020-03-05,1.0,,


**Save to csv**

In [35]:
daily_tests_confirmed.to_csv('data/daily_tests_confirmed.csv')

### Cumulative

In [36]:
cumulative_tests_confirmed = daily_tests_confirmed.cumsum()
cumulative_tests_confirmed

Unnamed: 0_level_0,daily_cases,daily_tests,perc_positive
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-02-11,0.0,61.0,0.0
2020-02-13,0.0,67.0,0.0
2020-02-14,0.0,71.0,0.0
2020-02-19,0.0,95.0,0.0
2020-02-20,0.0,106.0,0.0
2020-02-24,0.0,116.0,0.0
2020-02-26,0.0,121.0,0.0
2020-03-02,0.0,160.0,0.0
2020-03-03,0.0,164.0,0.0
2020-03-05,1.0,,


// TODO - Rename to Cumulative as this is confusing

### Percentage of Positive Tests

In [37]:
cumulative_tests_confirmed['perc_positive'] = (cumulative_tests_confirmed['daily_cases']/
                                          cumulative_tests_confirmed['daily_tests']) * 100
cumulative_tests_confirmed['perc_positive'] = cumulative_tests_confirmed['perc_positive'].round(1)
cumulative_tests_confirmed

Unnamed: 0_level_0,daily_cases,daily_tests,perc_positive
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-02-11,0.0,61.0,0.0
2020-02-13,0.0,67.0,0.0
2020-02-14,0.0,71.0,0.0
2020-02-19,0.0,95.0,0.0
2020-02-20,0.0,106.0,0.0
2020-02-24,0.0,116.0,0.0
2020-02-26,0.0,121.0,0.0
2020-03-02,0.0,160.0,0.0
2020-03-03,0.0,164.0,0.0
2020-03-05,1.0,,


**Save to csv**

In [38]:
cumulative_tests_confirmed.to_csv('data/cumulative_tests_confirmed.csv')

## Deaths & Recoveries
### Recoveries

In [39]:
recovered_data_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_timeline_testing.csv"
recovered_data_req = requests.get(recovered_data_url).content

recovered_data = pd.read_csv(io.StringIO(recovered_data_req.decode('utf-8')), delimiter = ',', 
                            usecols=['date','recovered'])
# rename recovered
recovered_data.rename(columns={"recovered":"cum_recovered"}, inplace=True)

# drop fields where recovered is 0
recovered_data.drop(recovered_data[recovered_data['cum_recovered']==0]['cum_recovered'].index, inplace=True)
recovered_data

Unnamed: 0,date,cum_recovered
22,22-03-2020,1
23,23-03-2020,1
24,24-03-2020,2
25,25-03-2020,4
26,26-03-2020,4
27,27-03-2020,31
28,28-03-2020,31
29,29-03-2020,31
30,30-03-2020,31
31,31-03-2020,31


In [40]:
recovered_data['date'] = pd.to_datetime(recovered_data['date'], format='%d-%m-%Y')
recovered_data.set_index('date', inplace = True)
recovered_data

Unnamed: 0_level_0,cum_recovered
date,Unnamed: 1_level_1
2020-03-22,1
2020-03-23,1
2020-03-24,2
2020-03-25,4
2020-03-26,4
2020-03-27,31
2020-03-28,31
2020-03-29,31
2020-03-30,31
2020-03-31,31


In [41]:
recovered_data['daily_recovered'] = recovered_data['cum_recovered']
# slice is required as first entry of diff will be NaN but data we want should be equal to the starting value
recovered_data['daily_recovered'][1:] = recovered_data['cum_recovered'].diff()[1:] 
recovered_data

Unnamed: 0_level_0,cum_recovered,daily_recovered
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-22,1,1.0
2020-03-23,1,0.0
2020-03-24,2,1.0
2020-03-25,4,2.0
2020-03-26,4,0.0
2020-03-27,31,27.0
2020-03-28,31,0.0
2020-03-29,31,0.0
2020-03-30,31,0.0
2020-03-31,31,0.0


**Save to csv**

In [42]:
recovered_data.to_csv('data/recovered_data.csv')

## Deaths
From https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_timeline_deaths.csv

In [43]:
deaths_data_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_timeline_deaths.csv"
deaths_data_req = requests.get(deaths_data_url).content

deaths_data = pd.read_csv(io.StringIO(deaths_data_req.decode('utf-8')), delimiter = ',',
                            usecols=['date', 'province', ' gender', ' age'])
deaths_data.rename(columns={" gender":"gender", " age":"age"}, inplace=True)

deaths_data

Unnamed: 0,date,province,gender,age
0,27-03-2020,WC,female,48
1,28-03-2020,KZN,male,74
2,30-03-2020,FS,male,86
3,31-03-2020,GP,male,79
4,31-03-2020,KZN,female,46
5,03-04-2020,KZN,female,63
6,03-04-2020,KZN,female,81
7,03-04-2020,KZN,female,80
8,03-04-2020,KZN,male,80
9,05-04-2020,WC,female,82


In [44]:
deaths_data['date'] = pd.to_datetime(deaths_data['date'], format='%d-%m-%Y')
deaths_data

Unnamed: 0,date,province,gender,age
0,2020-03-27,WC,female,48
1,2020-03-28,KZN,male,74
2,2020-03-30,FS,male,86
3,2020-03-31,GP,male,79
4,2020-03-31,KZN,female,46
5,2020-04-03,KZN,female,63
6,2020-04-03,KZN,female,81
7,2020-04-03,KZN,female,80
8,2020-04-03,KZN,male,80
9,2020-04-05,WC,female,82


### Total per day
#### Daily

In [45]:
deaths_data_per_day = deaths_data.groupby('date')[['province']].count()
deaths_data_per_day.rename(columns={"province":"daily_deaths"},inplace=True)
deaths_data_per_day

Unnamed: 0_level_0,daily_deaths
date,Unnamed: 1_level_1
2020-03-27,1
2020-03-28,1
2020-03-30,1
2020-03-31,2
2020-04-03,4
2020-04-05,2
2020-04-06,1


Manually add entry

#### Cumulative

In [46]:
deaths_data_per_day['cum_deaths']=deaths_data_per_day['daily_deaths'].cumsum()
deaths_data_per_day

Unnamed: 0_level_0,daily_deaths,cum_deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-27,1,1
2020-03-28,1,2
2020-03-30,1,3
2020-03-31,2,5
2020-04-03,4,9
2020-04-05,2,11
2020-04-06,1,12


In [47]:
# # Format must be dd-mm-YYYY
# date_str = "04-04-2020"
# date_dt = pd.to_datetime(date_str, format='%d-%m-%Y')
# new_tot_deaths = 9

# new_daily_deaths = new_tot_deaths - deaths_data_per_day.iloc[-1]['cum_deaths']
# new_df_entry = pd.DataFrame({"date":[date_dt], 
#                              "daily_deaths":[new_daily_deaths],
#                              "cum_deaths":[new_tot_deaths],}).set_index('date')
# deaths_data_per_day = pd.concat([deaths_data_per_day, new_df_entry])
# deaths_data_per_day

**Save to csv**

In [48]:
deaths_data_per_day.to_csv('data/daily_cum_deaths.csv')

#### Deaths Vs Recovered

In [49]:
deaths_vs_recoveries = pd.concat([deaths_data_per_day[['cum_deaths']], recovered_data['cum_recovered']], 
                                 axis =1)
deaths_vs_recoveries.iloc[0,0] = 0
deaths_vs_recoveries['cum_deaths'].ffill(inplace=True)
deaths_vs_recoveries['cum_recovered'].ffill(inplace=True)
deaths_vs_recoveries

Unnamed: 0_level_0,cum_deaths,cum_recovered
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-22,0.0,1
2020-03-23,0.0,1
2020-03-24,0.0,2
2020-03-25,0.0,4
2020-03-26,0.0,4
2020-03-27,1.0,31
2020-03-28,2.0,31
2020-03-29,2.0,31
2020-03-30,3.0,31
2020-03-31,5.0,31


**Save to csv**

In [50]:
deaths_vs_recoveries.to_csv('data/deaths_vs_recoveries.csv')

# Totals
## Province

In [51]:
prov_totals = province_data.tail(1).copy()
prov_totals = prov_totals.melt(id_vars=['date'], var_name='province', value_name='total')
prov_totals['province'] = prov_totals['province'].map(province_names)
prov_totals.drop('date',axis=1,inplace=True)
prov_totals

Unnamed: 0,province,total
0,Eastern Cape,32.0
1,Free State,89.0
2,Gauteng,713.0
3,KwaZula-Natal,257.0
4,Limpopo,19.0
5,Mpumalanga,18.0
6,Northern Cape,8.0
7,North West,11.0
8,Western Cape,462.0
9,Unknown,77.0


**Save to csv**

In [52]:
prov_totals.to_csv('data/tot_provinces.csv',index=False)

## Data to be displayed as text on website

In [53]:
round(1.1)

1

In [54]:
tot_tested = str(tests_data.tail(1).iloc[0]['cumulative_tests'].astype(int))
change_tested = str(tests_data.tail(1).iloc[0]['daily_tests'].astype(int))
print(tot_tested, change_tested)

58098 1225


In [55]:
tot_infected = str(confirmed_data.tail(1).iloc[0]['cumulative_cases'].astype(int))
change_infected = str(confirmed_data.tail(1).iloc[0]['daily_cases'].astype(int))
print(tot_infected, change_infected)

1686 31


In [63]:
tot_deaths = str(deaths_data_per_day.tail(1).iloc[0]['cum_deaths'].astype(int))
change_deaths = str(deaths_data_per_day.tail(1).iloc[0]['daily_deaths'].astype(int))
print(tot_deaths, change_deaths)

12 1


In [64]:
from datetime import datetime
now = datetime.now()
current_time = now.strftime("%H:%M %d %B %Y")
print("Current Time =", current_time)

Current Time = 13:15 07 April 2020


In [67]:
gen_data = pd.DataFrame(dict(tot_infected=[tot_infected], change_infected=[change_infected], tot_deaths=[tot_deaths],
                             tot_tested=[tot_tested], change_tested=[change_tested], change_deaths=[change_deaths],
                             datetime_updated=[current_time]))
gen_data

Unnamed: 0,tot_infected,change_infected,tot_deaths,tot_tested,change_tested,change_deaths,datetime_updated
0,1686,31,12,58098,1225,1,13:15 07 April 2020


In [68]:
gen_data.to_csv("data/gen_data.csv", index=False)

## Age
For now this is just being done by hand. I.e. copying from official report.

## Gender

In [59]:
# tot_gender_data =  confirmed_all_data.groupby(['gender']).count()[['date']]
# tot_gender_data.rename(columns={'date':'total'}, inplace=True)
# tot_gender_data.rename({"not specified":"unknown"}, axis='index',inplace=True)
# tot_gender_data

**Save to csv**

In [60]:
# tot_gender_data.to_csv('data/tot_gender.csv')

## Transmission Type

In [61]:
# tot_transmission_data =  confirmed_all_data.groupby(['type']).count()[['date']]
# tot_transmission_data.rename(columns={'date':'total'}, inplace=True)
# tot_transmission_data.rename({"pending":"unknown"}, axis='index',inplace=True)
# tot_transmission_data

**Save to csv**

In [62]:
# tot_transmission_data.to_csv('data/tot_transmission_type.csv')