# Data Preprocessing Per Province

In [1]:
# Enable Intellisense
%config IPCompleter.greedy=True

import pandas as pd
import numpy as np
import io
import requests
import seaborn as sns
from matplotlib import pyplot, dates
import plotly.express as px 
import datetime
import plotly.graph_objects as go
import plotly.io as pio
# Apply seaborn's default plot theme.
sns.set()

# Silence pandas' chained-assignment check (SettingWithCopyWarning) for the
# whole notebook. NOTE(review): this also hides genuine chained-assignment
# mistakes, so prefer explicit .loc/.copy() where possible.
pd.options.mode.chained_assignment = None  # default='warn'

# Confirmed Cases
Taken from `data/covid19za_provincial_cumulative_timeline_confirmed.csv` in the [dsfsi/covid19za](https://github.com/dsfsi/covid19za) GitHub repository.
___
## Over Time
### Cumulative

In [5]:
# Raw CSV of cumulative confirmed cases per province (dsfsi/covid19za repository).
province_data_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_confirmed.csv"

# Fail fast: the timeout stops the cell from hanging on a stalled connection,
# and raise_for_status() stops an HTTP error page from being parsed as CSV.
province_data_resp = requests.get(province_data_url, timeout=30)
province_data_resp.raise_for_status()
province_data_req = province_data_resp.content

province_data = pd.read_csv(io.StringIO(province_data_req.decode('utf-8')), delimiter=',')
# Keep only the date and the per-province columns.
province_data = province_data.drop(columns=['YYYYMMDD', 'total', 'source'])
# The 'date' column is parsed as day-month-year.
province_data['date'] = pd.to_datetime(province_data['date'], format='%d-%m-%Y')

province_data.tail()

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN
44,2020-04-20,310.0,105.0,1170.0,639.0,27.0,23.0,18.0,25.0,940.0,43.0
45,2020-04-21,345.0,106.0,1199.0,671.0,27.0,24.0,16.0,24.0,1010.0,43.0
46,2020-04-22,377.0,106.0,1224.0,758.0,27.0,23.0,16.0,24.0,1079.0,1.0
47,2020-04-23,417.0,106.0,1252.0,807.0,27.0,23.0,16.0,25.0,1279.0,1.0
48,2020-04-24,480.0,111.0,1281.0,841.0,29.0,24.0,16.0,25.0,1413.0,0.0


In [7]:
# Reshape from wide (one column per province) to long:
# one row per (date, province) pair holding the cumulative count.
province_data_melt = pd.melt(
    province_data,
    id_vars=['date'],
    var_name='province',
    value_name='cumulative_cases',
)
province_data_melt.tail()

Unnamed: 0,date,province,cumulative_cases
485,2020-04-20,UNKNOWN,43.0
486,2020-04-21,UNKNOWN,43.0
487,2020-04-22,UNKNOWN,1.0
488,2020-04-23,UNKNOWN,1.0
489,2020-04-24,UNKNOWN,0.0


In [8]:
# Shorten the placeholder province label.
province_data_melt['province'] = province_data_melt['province'].replace({"UNKNOWN": "UNK"})
province_data_melt.tail()

Unnamed: 0,date,province,cumulative_cases
485,2020-04-20,UNK,43.0
486,2020-04-21,UNK,43.0
487,2020-04-22,UNK,1.0
488,2020-04-23,UNK,1.0
489,2020-04-24,UNK,0.0


### Daily

In [9]:
# Daily counts are the row-to-row difference of the cumulative counts.
# The first row has no predecessor, so it keeps its cumulative value.
# A day can come out negative when a cumulative figure decreases.
province_data_daily = province_data.copy()
day_over_day_change = province_data_daily.iloc[:, 1:].diff()
province_data_daily.iloc[1:, 1:] = day_over_day_change.iloc[1:, :]
province_data_daily.tail()

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN
44,2020-04-20,17.0,5.0,22.0,22.0,0.0,0.0,2.0,1.0,72.0,1.0
45,2020-04-21,35.0,1.0,29.0,32.0,0.0,1.0,-2.0,-1.0,70.0,0.0
46,2020-04-22,32.0,0.0,25.0,87.0,0.0,-1.0,0.0,0.0,69.0,-42.0
47,2020-04-23,40.0,0.0,28.0,49.0,0.0,0.0,0.0,1.0,200.0,0.0
48,2020-04-24,63.0,5.0,29.0,34.0,2.0,1.0,0.0,0.0,134.0,-1.0


In [10]:
# Reshape the daily counts from wide to long: one row per (date, province) pair.
province_data_daily_melt = pd.melt(
    province_data_daily,
    id_vars=['date'],
    var_name='province',
    value_name='daily_cases',
)
province_data_daily_melt.tail()

Unnamed: 0,date,province,daily_cases
485,2020-04-20,UNKNOWN,1.0
486,2020-04-21,UNKNOWN,0.0
487,2020-04-22,UNKNOWN,-42.0
488,2020-04-23,UNKNOWN,0.0
489,2020-04-24,UNKNOWN,-1.0


In [11]:
# Shorten the placeholder province label.
province_data_daily_melt['province'] = province_data_daily_melt['province'].replace({"UNKNOWN": "UNK"})
province_data_daily_melt.tail()

Unnamed: 0,date,province,daily_cases
485,2020-04-20,UNK,1.0
486,2020-04-21,UNK,0.0
487,2020-04-22,UNK,-42.0
488,2020-04-23,UNK,0.0
489,2020-04-24,UNK,-1.0


### Concatenate Cumulative and Daily

In [12]:
# Attach the daily counts as an extra column beside the cumulative ones.
# pd.concat along columns aligns on the row index, so this relies on both
# melted frames having the same row order.
prov_cumulative_daily = pd.concat(
    objs=[province_data_melt, province_data_daily_melt['daily_cases']],
    axis='columns',
)
prov_cumulative_daily.tail()

Unnamed: 0,date,province,cumulative_cases,daily_cases
485,2020-04-20,UNK,43.0,1.0
486,2020-04-21,UNK,43.0,0.0
487,2020-04-22,UNK,1.0,-42.0
488,2020-04-23,UNK,1.0,0.0
489,2020-04-24,UNK,0.0,-1.0


**Save to CSV**

In [13]:
# Write the combined long-format table to disk without the row index.
prov_cumulative_daily.to_csv(
    path_or_buf='data/daily_cumulative_confirmed_prov.csv',
    index=False,
)

## Summary
### Current Total

In [35]:
# The last row of the cumulative table holds the current total per province.
prov_confirmed_total = province_data.iloc[-1:]
prov_confirmed_total

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN
48,2020-04-24,480.0,111.0,1281.0,841.0,29.0,24.0,16.0,25.0,1413.0,0.0


In [36]:
# Per-province current totals, indexed by province code.
# This is built from the last row of `province_data`, not from
# `prov_confirmed_total` itself. The cell used to read and overwrite its own
# input, so running it a second time raised KeyError on 'date'.
prov_confirmed_total = (
    province_data.tail(1)
    # wide -> long: one row per province
    .melt(id_vars=['date'], var_name='province', value_name='total')
    .assign(province=lambda totals: totals['province'].replace("UNKNOWN", "UNK"))
    # the date is the same for every row, so it is dropped
    .drop(columns=['date'])
    .set_index(['province'])
)
prov_confirmed_total

Unnamed: 0_level_0,total
province,Unnamed: 1_level_1
EC,480.0
FS,111.0
GP,1281.0
KZN,841.0
LP,29.0
MP,24.0
NC,16.0
NW,25.0
WC,1413.0
UNK,0.0


### Latest Daily

In [37]:
# The last row of the daily table holds the most recent day's new cases per province.
prov_latest_daily = province_data_daily.copy().iloc[-1:]
prov_latest_daily

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN
48,2020-04-24,63.0,5.0,29.0,34.0,2.0,1.0,0.0,0.0,134.0,-1.0


In [38]:
# Per-province latest daily counts, indexed by province code.
# This is built from the last row of `province_data_daily`, not from
# `prov_latest_daily` itself. The cell used to read and overwrite its own
# input, so running it a second time raised KeyError on 'date'.
prov_latest_daily = (
    province_data_daily.tail(1)
    # wide -> long: one row per province
    .melt(id_vars=['date'], var_name='province', value_name='latest')
    .assign(province=lambda latest: latest['province'].replace("UNKNOWN", "UNK"))
    # the date is the same for every row, so it is dropped
    .drop(columns=['date'])
    .set_index(['province'])
)
prov_latest_daily

Unnamed: 0_level_0,latest
province,Unnamed: 1_level_1
EC,63.0
FS,5.0
GP,29.0
KZN,34.0
LP,2.0
MP,1.0
NC,0.0
NW,0.0
WC,134.0
UNK,-1.0


### Concatenate Total and Latest

In [39]:
# Summary table: latest daily count and running total side by side,
# aligned on the shared province index.
prov_total_latest_daily = pd.concat(
    objs=[prov_latest_daily, prov_confirmed_total],
    axis='columns',
)
prov_total_latest_daily

Unnamed: 0_level_0,latest,total
province,Unnamed: 1_level_1,Unnamed: 2_level_1
EC,63.0,480.0
FS,5.0,111.0
GP,29.0,1281.0
KZN,34.0,841.0
LP,2.0,29.0
MP,1.0,24.0
NC,0.0,16.0
NW,0.0,25.0
WC,134.0,1413.0
UNK,-1.0,0.0
