# Data Pre-Processing at SA Level

In [1]:
# Enable Intellisense
%config IPCompleter.greedy=True

import pandas as pd
import numpy as np
import io
import requests 
import datetime

# Silence pandas' chained-assignment warning (SettingWithCopyWarning) notebook-wide.
# NOTE(review): this hides genuine chained-assignment mistakes as well; prefer
# fixing the offending assignment with .loc/.iloc over suppressing the warning.
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
# get dataframe from specified url using kwargs specified for read_csv
def df_from_url(df_url, pd_kwargs=None, use_base_url=True, timeout=30):
    """Download a CSV over HTTP and parse it into a DataFrame.

    Parameters
    ----------
    df_url : str
        Full URL, or (when ``use_base_url`` is true) a path relative to the
        covid19za ``data/`` directory on GitHub.
    pd_kwargs : dict, optional
        Keyword arguments forwarded unchanged to ``pd.read_csv``.
    use_base_url : bool, default True
        Prefix ``df_url`` with the covid19za raw-content base URL.
    timeout : float, default 30
        Seconds to wait for the server before giving up.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    base_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/"
    if use_base_url:
        df_url = base_url + df_url

    response = requests.get(df_url, timeout=timeout)
    # Without this check an error page (e.g. "404: Not Found") would be
    # handed to read_csv and surface later as a confusing parse/column error.
    response.raise_for_status()

    csv_text = response.content.decode('utf-8')
    # `pd_kwargs or {}` avoids sharing a mutable default dict between calls.
    return pd.read_csv(io.StringIO(csv_text), **(pd_kwargs or {}))

# Generator method to get all dates in specified interval
def datetime_range(start_datetime, end_datetime):
    """Yield ``start_datetime`` and then every following day up to the end.

    ``start_datetime`` is always yielded first; afterwards one day is added
    and yielded for as long as the previous value is strictly before
    ``end_datetime``. When both arguments fall on whole-day boundaries the
    result is the inclusive range of days between them.

    Parameters
    ----------
    start_datetime, end_datetime : datetime.date or datetime.datetime
        Interval bounds; both must be mutually comparable.

    Yields
    ------
    Same type as ``start_datetime``
        Consecutive values one day apart.
    """
    # Imported locally: the notebook only does `import datetime`, so the bare
    # name `timedelta` used previously was undefined and raised NameError.
    from datetime import timedelta

    one_day = timedelta(days=1)
    curr_date = start_datetime
    yield curr_date
    while curr_date < end_datetime:
        curr_date += one_day
        yield curr_date

In [3]:
    def get_cum_daily(data_url, cum_col='total', index_col='date'):
        """Load a cumulative time series and derive the per-row increments.

        Parameters
        ----------
        data_url : str
            Location of the CSV, passed on to ``df_from_url``.
        cum_col : str, default 'total'
            Name of the CSV column holding the cumulative count.
        index_col : str, default 'date'
            Name of the CSV column holding dates formatted as ``DD-MM-YYYY``.

        Returns
        -------
        pandas.DataFrame
            Indexed by the parsed ``index_col`` dates, with two ``int32``
            columns: ``cum_no`` (the forward-filled cumulative count) and
            ``daily_no`` (difference to the previous row; the first row
            carries its own cumulative value).
        """
        data = df_from_url(data_url, {"usecols": [cum_col, index_col]})

        # Use the caller-supplied column names throughout; the previous
        # hard-coded 'date' raised KeyError for any other `index_col`.
        data[index_col] = pd.to_datetime(data[index_col], format='%d-%m-%Y')
        data = (
            data
            .set_index(index_col)
            .rename(columns={cum_col: "cum_no"})
            .ffill()  # gaps in a cumulative series repeat the last known total
        )

        # Subtracting the series shifted by one row (padded with 0) gives the
        # first row its own cumulative value and every later row the increment.
        # This replaces a chained assignment, which does not write through
        # under pandas copy-on-write.
        data['daily_no'] = data['cum_no'] - data['cum_no'].shift(1, fill_value=0)

        # Cast columns to integer
        return data.astype('int32')

In [4]:
# File name only: df_from_url prepends the covid19za GitHub base URL by default.
confirmed_cases_url = "covid19za_provincial_cumulative_timeline_confirmed.csv"
# Reads the default 'total' column and returns cum_no / daily_no indexed by date.
confirmed_data = get_cum_daily(confirmed_cases_url)
# Bare last expression so the notebook renders the frame.
confirmed_data

Unnamed: 0_level_0,cum_no,daily_no
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-05,1,1
2020-03-07,2,1
2020-03-08,3,1
2020-03-09,7,4
2020-03-11,13,6
...,...,...
2020-12-12,852965,7882
2020-12-13,860964,7999
2020-12-14,866127,5163
2020-12-15,873679,7552


In [7]:
# Cumulative count in the last row, as a plain Python int. Selecting the column
# first avoids building a whole row Series just to read one value.
int(confirmed_data['cum_no'].iloc[-1])

883687

In [5]:
# Last entry of the date index (the latest date, assuming the CSV rows are in
# chronological order — TODO confirm).
confirmed_data.index[-1]

Timestamp('2020-12-16 00:00:00')