# Data Preprocessing Per Province

In [1]:
# Enable Intellisense
%config IPCompleter.greedy=True

import pandas as pd
import numpy as np
import io
import requests
import seaborn as sns
from matplotlib import pyplot, dates
import plotly.express as px 
import datetime
import plotly.graph_objects as go
import plotly.io as pio
sns.set()

# Silence pandas' chained-assignment warning (SettingWithCopyWarning) for the whole notebook.
# NOTE(review): this hides the warning globally instead of fixing its cause - confirm it is still needed.
pd.options.mode.chained_assignment = None  # default='warn'

# Confirmed Cases
Taken from `data/covid19za_provincial_cumulative_timeline_confirmed.csv` in the [dsfsi/covid19za](https://github.com/dsfsi/covid19za) GitHub repository.
___
## Over Time
### Cumulative

In [2]:
# Location of the provincial cumulative confirmed-case timeline in the dsfsi/covid19za repository.
province_data_url = "https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_confirmed.csv"

# Fetch the raw CSV bytes. The timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() makes an HTTP error fail here instead of letting an
# error page be parsed as if it were CSV data.
province_data_response = requests.get(province_data_url, timeout=30)
province_data_response.raise_for_status()
province_data_req = province_data_response.content

# Parse the CSV and discard the columns this notebook does not use, leaving 'date' followed
# by one cumulative-count column per province.
province_data = (
    pd.read_csv(io.StringIO(province_data_req.decode('utf-8')), delimiter=',')
    .drop(columns=['YYYYMMDD', 'total', 'source'])
)
# Dates are published as day-month-year strings; convert them to real timestamps.
province_data['date'] = pd.to_datetime(province_data['date'], format='%d-%m-%Y')

province_data.tail()

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN
50,2020-04-26,535.0,110.0,1331.0,863.0,31.0,23.0,17.0,28.0,1608.0,0.0
51,2020-04-27,588.0,111.0,1353.0,902.0,31.0,26.0,17.0,31.0,1737.0,0.0
52,2020-04-28,616.0,113.0,1377.0,919.0,31.0,26.0,17.0,29.0,1870.0,0.0
53,2020-04-29,630.0,113.0,1408.0,956.0,31.0,31.0,17.0,29.0,2135.0,0.0
54,2020-04-30,647.0,116.0,1446.0,980.0,32.0,36.0,17.0,31.0,2342.0,0.0


In [3]:
# Reshape wide -> long: one row per (date, province) holding the running case total.
province_data_melt = pd.melt(
    province_data,
    id_vars=['date'],
    var_name='province',
    value_name='cumulative_cases',
)
province_data_melt.tail()

Unnamed: 0,date,province,cumulative_cases
545,2020-04-26,UNKNOWN,0.0
546,2020-04-27,UNKNOWN,0.0
547,2020-04-28,UNKNOWN,0.0
548,2020-04-29,UNKNOWN,0.0
549,2020-04-30,UNKNOWN,0.0


In [4]:
# Shorten the placeholder province label "UNKNOWN" to "UNK".
province_data_melt['province'] = province_data_melt['province'].replace({"UNKNOWN": "UNK"})
province_data_melt.tail()

Unnamed: 0,date,province,cumulative_cases
545,2020-04-26,UNK,0.0
546,2020-04-27,UNK,0.0
547,2020-04-28,UNK,0.0
548,2020-04-29,UNK,0.0
549,2020-04-30,UNK,0.0


### Daily

In [5]:
# Day-over-day change of every province column (column 0 is 'date' and is skipped).
daily_changes = province_data.iloc[:, 1:].diff()

# Start from a copy of the cumulative frame and overwrite all rows except the first with the
# differences; the first row has no predecessor, so it keeps its cumulative value.
province_data_daily = province_data.copy()
province_data_daily.iloc[1:, 1:] = daily_changes.iloc[1:, :]
province_data_daily.tail()

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN
50,2020-04-26,47.0,-1.0,27.0,22.0,1.0,0.0,1.0,0.0,94.0,0.0
51,2020-04-27,53.0,1.0,22.0,39.0,0.0,3.0,0.0,3.0,129.0,0.0
52,2020-04-28,28.0,2.0,24.0,17.0,0.0,0.0,0.0,-2.0,133.0,0.0
53,2020-04-29,14.0,0.0,31.0,37.0,0.0,5.0,0.0,0.0,265.0,0.0
54,2020-04-30,17.0,3.0,38.0,24.0,1.0,5.0,0.0,2.0,207.0,0.0


In [6]:
# Reshape wide -> long: one row per (date, province) holding that day's new cases.
province_data_daily_melt = pd.melt(
    province_data_daily,
    id_vars=['date'],
    var_name='province',
    value_name='daily_cases',
)
province_data_daily_melt.tail()

Unnamed: 0,date,province,daily_cases
545,2020-04-26,UNKNOWN,0.0
546,2020-04-27,UNKNOWN,0.0
547,2020-04-28,UNKNOWN,0.0
548,2020-04-29,UNKNOWN,0.0
549,2020-04-30,UNKNOWN,0.0


In [7]:
# Shorten the placeholder province label "UNKNOWN" to "UNK".
province_data_daily_melt['province'] = province_data_daily_melt['province'].replace({"UNKNOWN": "UNK"})
province_data_daily_melt.tail()

Unnamed: 0,date,province,daily_cases
545,2020-04-26,UNK,0.0
546,2020-04-27,UNK,0.0
547,2020-04-28,UNK,0.0
548,2020-04-29,UNK,0.0
549,2020-04-30,UNK,0.0


### Concatenate Cumulative and Daily

In [8]:
# Place the daily counts next to the cumulative ones. The two long frames are joined on their
# row index, so this relies on both melts having produced the same row order.
prov_cumulative_daily = pd.concat(
    objs=[province_data_melt, province_data_daily_melt['daily_cases']],
    axis='columns',
)
prov_cumulative_daily.tail()

Unnamed: 0,date,province,cumulative_cases,daily_cases
545,2020-04-26,UNK,0.0,0.0
546,2020-04-27,UNK,0.0,0.0
547,2020-04-28,UNK,0.0,0.0
548,2020-04-29,UNK,0.0,0.0
549,2020-04-30,UNK,0.0,0.0


**Save to CSV**

In [9]:
# Write the combined long-format table to disk without the row index column.
prov_cumulative_daily.to_csv(
    path_or_buf='data/daily_cumulative_confirmed_prov.csv',
    index=False,
)

## Summary
### Current Total

In [10]:
# The last row of the cumulative table holds the most recent running totals.
prov_confirmed_total = province_data.iloc[-1:]
prov_confirmed_total

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN
54,2020-04-30,647.0,116.0,1446.0,980.0,32.0,36.0,17.0,31.0,2342.0,0.0


In [11]:
# Build the per-province total table straight from the last row of `province_data`.
# Deriving it from the source frame (rather than from the previous value of
# `prov_confirmed_total`) keeps this cell idempotent: it can be re-run without raising a
# KeyError for the already-dropped 'date' column.
prov_confirmed_total = (
    province_data.tail(1)
    # wide -> long: one row per province
    .melt(id_vars=['date'], var_name='province', value_name='total')
    # shorten the placeholder province label
    .assign(province=lambda frame: frame['province'].replace("UNKNOWN", "UNK"))
    # only one date remains, so the column carries no information
    .drop(columns=['date'])
    .set_index(['province'])
)
prov_confirmed_total

Unnamed: 0_level_0,total
province,Unnamed: 1_level_1
EC,647.0
FS,116.0
GP,1446.0
KZN,980.0
LP,32.0
MP,36.0
NC,17.0
NW,31.0
WC,2342.0
UNK,0.0


### Latest Daily

In [12]:
# The last row of the daily table holds the most recent day's new cases.
prov_latest_daily = province_data_daily.copy().iloc[-1:]
prov_latest_daily

Unnamed: 0,date,EC,FS,GP,KZN,LP,MP,NC,NW,WC,UNKNOWN
54,2020-04-30,17.0,3.0,38.0,24.0,1.0,5.0,0.0,2.0,207.0,0.0


In [13]:
# Build the per-province latest-daily table straight from the last row of
# `province_data_daily`. Deriving it from the source frame (rather than from the previous
# value of `prov_latest_daily`) keeps this cell idempotent: it can be re-run without raising
# a KeyError for the already-dropped 'date' column.
prov_latest_daily = (
    province_data_daily.tail(1)
    # wide -> long: one row per province
    .melt(id_vars=['date'], var_name='province', value_name='latest')
    # shorten the placeholder province label
    .assign(province=lambda frame: frame['province'].replace("UNKNOWN", "UNK"))
    # only one date remains, so the column carries no information
    .drop(columns=['date'])
    .set_index(['province'])
)
prov_latest_daily

Unnamed: 0_level_0,latest
province,Unnamed: 1_level_1
EC,17.0
FS,3.0
GP,38.0
KZN,24.0
LP,1.0
MP,5.0
NC,0.0
NW,2.0
WC,207.0
UNK,0.0


### Concatenate Total and Latest

In [14]:
# Join the two summary tables side by side; both are indexed by province code.
prov_total_latest_daily = pd.concat(
    objs=[prov_latest_daily, prov_confirmed_total],
    axis='columns',
)
prov_total_latest_daily

Unnamed: 0_level_0,latest,total
province,Unnamed: 1_level_1,Unnamed: 2_level_1
EC,17.0,647.0
FS,3.0,116.0
GP,38.0,1446.0
KZN,24.0,980.0
LP,1.0,32.0
MP,5.0,36.0
NC,0.0,17.0
NW,2.0,31.0
WC,207.0,2342.0
UNK,0.0,0.0
