# Data to contribute to UP Repo

In [42]:
# Enable Intellisense
%config IPCompleter.greedy=True

import pandas as pd
import numpy as np
import io
import requests
import seaborn as sns
from matplotlib import pyplot, dates
import plotly.express as px 
import datetime
import plotly.graph_objects as go
import plotly.io as pio
# Apply seaborn's default plot styling.
sns.set()

# NOTE: this rebinds the name `datetime` (bound to the module by the earlier
# `import datetime`) to the `datetime.datetime` class.
from datetime import datetime

# Silence pandas' chained-assignment warning (SettingWithCopyWarning).
pd.options.mode.chained_assignment = None  # default='warn'

## General Helper Functions

In [5]:
def df_from_url(df_url, pd_kwargs=None, timeout=30):
    """Download a CSV file over HTTP and parse it into a DataFrame.

    Parameters
    ----------
    df_url : str
        Address of the CSV file to fetch.
    pd_kwargs : dict, optional
        Extra keyword arguments forwarded to ``pd.read_csv``. Defaults to
        ``None`` (no extra arguments) rather than a shared mutable ``{}``.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the downloaded file.

    Raises
    ------
    requests.HTTPError
        If the server responds with an error status. Without this check an
        error page would be handed to the CSV parser as if it were data.
    """
    response = requests.get(df_url, timeout=timeout)
    response.raise_for_status()
    csv_text = response.content.decode('utf-8')
    return pd.read_csv(io.StringIO(csv_text), **(pd_kwargs or {}))

In [15]:
# Lookup table: full province name -> abbreviation used as a column header
# in the timeline data ("Unknown" maps to the UNKNOWN column).
prov_dict = {
    "Eastern Cape": "EC",
    "Free State": "FS",
    "Gauteng": "GP",
    "KwaZulu-Natal": "KZN",
    "Limpopo": "LP",
    "Mpumalanga": "MP",
    "Northern Cape": "NC",
    "North West": "NW",
    "Western Cape": "WC",
    "Unknown": "UNKNOWN",
}

In [28]:
def change_date_format(in_date):
    """Render a date-like object as a compact ``YYYYMMDD`` string.

    ``in_date`` may be any object exposing ``strftime`` (for example a
    ``datetime`` or a pandas ``Timestamp``).
    """
    compact_pattern = '%Y%m%d'
    return in_date.strftime(compact_pattern)

## Deaths per Province

In [11]:
# Fetch the published provincial cumulative deaths timeline from the
# dsfsi/covid19za GitHub repository.
deaths_timeline_url = ("https://raw.githubusercontent.com/dsfsi/covid19za/master/data/"
                       "covid19za_provincial_cumulative_timeline_deaths.csv")
orig_data = df_from_url(deaths_timeline_url)
orig_data

Unnamed: 0,date,YYYYMMDD,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN,total
0,27-03-2020,20200327,0,0,0,0,0,0,0,0,1,0,1
1,28-03-2020,20200328,0,0,0,1,0,0,0,0,1,0,2
2,30-03-2020,20200330,0,1,0,1,0,0,0,0,1,0,3
3,31-03-2020,20200331,0,1,1,2,0,0,0,0,1,0,5
4,03-04-2020,20200403,0,1,1,6,0,0,0,0,1,0,9
5,05-04-2020,20200405,0,1,1,7,0,0,0,0,2,0,11
6,06-04-2020,20200406,0,1,1,7,0,0,0,0,3,0,12
7,07-04-2020,20200407,0,1,1,8,0,0,0,0,3,0,13
8,08-04-2020,20200408,0,3,3,9,0,0,0,0,3,0,18
9,09-04-2020,20200409,0,3,3,9,0,0,0,0,3,0,18


In [52]:
# Do some pre-processing before add
def add_to_data(orig_data, add_data, date):
    """Append one day's per-province death totals to the cumulative timeline.

    Parameters
    ----------
    orig_data : pandas.DataFrame
        Existing timeline. It is concatenated as-is and never modified.
    add_data : pandas.DataFrame
        One row per province, with a ``province`` column holding full
        province names (the keys of ``prov_dict``) and a ``tot_deaths``
        column. The caller's frame is not modified.
    date : str
        Reporting date for every row of ``add_data``, in ``dd-mm-YYYY`` form.

    Returns
    -------
    pandas.DataFrame
        ``orig_data`` followed by one new row holding a column per province
        abbreviation, ``total`` (sum over the provinces), ``date`` as a
        ``dd-mm-YYYY`` string and ``YYYYMMDD`` as an integer. The result is
        re-indexed 0..n-1 so the appended row does not reuse an index label.

    Raises
    ------
    ValueError
        If ``date`` does not match ``dd-mm-YYYY``, if a province name has no
        entry in ``prov_dict``, or if a province appears more than once
        (raised by the pivot).
    """
    # assign() returns a new frame, so the caller's add_data stays untouched.
    day_rows = add_data.assign(date=date)
    day_rows['date'] = pd.to_datetime(day_rows['date'], format='%d-%m-%Y')

    # Fail loudly on unknown names: a silent NaN here would become a NaN
    # column after the pivot and corrupt the timeline layout.
    province_codes = add_data['province'].map(prov_dict)
    unknown_names = add_data.loc[province_codes.isna(), 'province'].unique().tolist()
    if unknown_names:
        raise ValueError(
            f"No abbreviation in prov_dict for province(s): {unknown_names}"
        )
    day_rows['province'] = province_codes

    # Long -> wide: a single row for the date, one column per province code.
    add_data_piv = day_rows.pivot(index='date', columns='province', values='tot_deaths')
    add_data_piv.columns.name = ""
    add_data_piv = add_data_piv.reset_index()

    add_data_piv['total'] = add_data_piv.drop(columns=['date']).sum(axis=1)
    # Cast to integer so the column does not become a mix of int and str
    # when the timeline's YYYYMMDD column was parsed from CSV as integers.
    add_data_piv['YYYYMMDD'] = add_data_piv['date'].dt.strftime('%Y%m%d').astype('int64')
    add_data_piv['date'] = add_data_piv['date'].dt.strftime('%d-%m-%Y')

    return pd.concat([orig_data, add_data_piv], axis=0, ignore_index=True)

In [54]:
# Extend the downloaded timeline with the provincial totals for 1 May 2020.
add_data = pd.read_csv(filepath_or_buffer='data/tot_deaths_provinces_01.csv')
new_data = add_to_data(orig_data=orig_data, add_data=add_data, date='01-05-2020')
new_data

Unnamed: 0,date,YYYYMMDD,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN,total
0,27-03-2020,20200327,0,0,0,0,0,0,0,0,1,0,1
1,28-03-2020,20200328,0,0,0,1,0,0,0,0,1,0,2
2,30-03-2020,20200330,0,1,0,1,0,0,0,0,1,0,3
3,31-03-2020,20200331,0,1,1,2,0,0,0,0,1,0,5
4,03-04-2020,20200403,0,1,1,6,0,0,0,0,1,0,9
5,05-04-2020,20200405,0,1,1,7,0,0,0,0,2,0,11
6,06-04-2020,20200406,0,1,1,7,0,0,0,0,3,0,12
7,07-04-2020,20200407,0,1,1,8,0,0,0,0,3,0,13
8,08-04-2020,20200408,0,3,3,9,0,0,0,0,3,0,18
9,09-04-2020,20200409,0,3,3,9,0,0,0,0,3,0,18


In [55]:
# Append the provincial totals for 2 May 2020.
# Any row already stamped with this date is dropped first, so re-running the
# cell replaces that day's row instead of appending a duplicate (the cell
# feeds new_data back into itself and would otherwise grow on every run).
report_date = '02-05-2020'
add_data = pd.read_csv('data/tot_deaths_provinces_02.csv')
new_data = add_to_data(new_data[new_data['date'] != report_date], add_data, report_date)
new_data

Unnamed: 0,date,YYYYMMDD,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN,total
0,27-03-2020,20200327,0,0,0,0,0,0,0,0,1,0,1
1,28-03-2020,20200328,0,0,0,1,0,0,0,0,1,0,2
2,30-03-2020,20200330,0,1,0,1,0,0,0,0,1,0,3
3,31-03-2020,20200331,0,1,1,2,0,0,0,0,1,0,5
4,03-04-2020,20200403,0,1,1,6,0,0,0,0,1,0,9
5,05-04-2020,20200405,0,1,1,7,0,0,0,0,2,0,11
6,06-04-2020,20200406,0,1,1,7,0,0,0,0,3,0,12
7,07-04-2020,20200407,0,1,1,8,0,0,0,0,3,0,13
8,08-04-2020,20200408,0,3,3,9,0,0,0,0,3,0,18
9,09-04-2020,20200409,0,3,3,9,0,0,0,0,3,0,18


In [56]:
# Save the extended timeline next to the notebook, leaving out the row index.
output_csv = 'covid19za_provincial_cumulative_timeline_deaths.csv'
new_data.to_csv(output_csv, index=False)

In [57]:
# Read the exported file back in to inspect what was written to disk.
pd.read_csv(filepath_or_buffer='covid19za_provincial_cumulative_timeline_deaths.csv')

Unnamed: 0,date,YYYYMMDD,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN,total
0,27-03-2020,20200327,0,0,0,0,0,0,0,0,1,0,1
1,28-03-2020,20200328,0,0,0,1,0,0,0,0,1,0,2
2,30-03-2020,20200330,0,1,0,1,0,0,0,0,1,0,3
3,31-03-2020,20200331,0,1,1,2,0,0,0,0,1,0,5
4,03-04-2020,20200403,0,1,1,6,0,0,0,0,1,0,9
5,05-04-2020,20200405,0,1,1,7,0,0,0,0,2,0,11
6,06-04-2020,20200406,0,1,1,7,0,0,0,0,3,0,12
7,07-04-2020,20200407,0,1,1,8,0,0,0,0,3,0,13
8,08-04-2020,20200408,0,3,3,9,0,0,0,0,3,0,18
9,09-04-2020,20200409,0,3,3,9,0,0,0,0,3,0,18


In [51]:
# Sanity check: which Python type does the newest 'date' entry have?
# NOTE(review): the recorded output looks stale - this cell's execution count
# (51) precedes the add_to_data cell (52); re-run to confirm the current type.
newest_date = new_data['date'].iloc[-1]
type(newest_date)

pandas._libs.tslibs.timestamps.Timestamp

In [43]:
# Today's date rendered in the dd-mm-YYYY layout.
current_moment = datetime.now()
current_moment.strftime("%d-%m-%Y")

'03-05-2020'