In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import math 

import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [176]:
%%time
# Read every sheet of the Excel workbook into a dict of DataFrames keyed by
# sheet name. The context manager closes the workbook handle once all sheets
# have been parsed (the name `xl_file` stays bound, but the file is closed).
with pd.ExcelFile('./data.xls') as xl_file:
    dfs = {sheet_name: xl_file.parse(sheet_name)
           for sheet_name in xl_file.sheet_names}

CPU times: user 1.18 s, sys: 20.3 ms, total: 1.2 s
Wall time: 1.2 s


In [175]:
%%time
# Data from each sheet can be accessed via key
keyList = list(dfs.keys())

# Data cleansing.
# This cell is written to be safe to re-run: every derived column is rebuilt
# from the parsed timestamp instead of being patched by string concatenation
# (prefixing '0' a second time, or in front of a two-digit month, produces a
# string that no longer matches the date format).
caseColumns = ['Confirmed', 'Deaths', 'Recovered']
for key in keyList:
    # Missing counts become 0 and the count columns become integers.
    # fillna/astype return new frames, so nothing relies on in-place
    # mutation of a temporary `.loc` slice.
    sheet = (
        dfs[key]
        .fillna({col: 0 for col in caseColumns})
        .astype({col: 'int64' for col in caseColumns})
    )
    # Change as China for coordinate search
    sheet = sheet.replace({'Country/Region': 'Mainland China'}, 'China')
    # Parse the timestamp; the format accepts both padded and unpadded
    # month/day values.
    lastUpdate = pd.to_datetime(sheet['Last Update'], format='%m/%d/%Y %H:%M')
    # Keep 'Last Update' as a zero-padded string.
    sheet['Last Update'] = lastUpdate.dt.strftime('%m/%d/%Y %H:%M')
    # Shift by 16 hours to get Australian eastern daylight time.
    # NOTE(review): this assumes a fixed 16-hour offset between the source
    # timestamps and AEDT -- confirm the timezone of the source data.
    sheet['Date_last_updated_AEDT'] = lastUpdate + timedelta(hours=16)
    dfs[key] = sheet
    
# Check 
#dfs[keyList[0]].head()

CPU times: user 1.86 s, sys: 8.07 ms, total: 1.87 s
Wall time: 1.87 s


In [6]:
# Rank countries by confirmed cases for each sheet. dfTpm is overwritten on
# every pass, so after the loop it holds the ranking for the last sheet only.
for key, df in dfs.items():
    dfTpm = (
        df.groupby(['Country/Region'])['Confirmed']
        .sum()
        .rename_axis('Code')
        .reset_index()
        .sort_values(by='Confirmed', ascending=False)
        .reset_index(drop=True)
    )
dfTpm.shape

(5, 2)

In [9]:
# Use the sheet stored under the first key of keyList as a working sample
# and preview its first rows.
df = dfs[keyList[0]]
df.head()

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,Date_last_updated_AEDT
0,Hubei,China,03/04/2020 18:00,67332,2871,38556,2020-03-05 10:00:00
1,Guangdong,China,03/04/2020 18:00,1350,7,1133,2020-03-05 10:00:00
2,Zhejiang,China,03/04/2020 18:00,1213,1,1114,2020-03-05 10:00:00
3,Shandong,China,03/04/2020 18:00,758,6,515,2020-03-05 10:00:00
4,Henan,China,03/04/2020 18:00,1272,22,1234,2020-03-05 10:00:00


In [13]:
# Per-country totals of the three case columns for the sample sheet.
# The columns are selected with a list (double brackets): selecting several
# GroupBy columns with a bare tuple is deprecated and rejected by newer pandas.
dfTpm = df.groupby(['Country/Region'])[['Confirmed', 'Deaths', 'Recovered']].sum()

dfTpm

Unnamed: 0_level_0,Confirmed,Deaths,Recovered
Country/Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,1,0,0
Algeria,17,0,0
Andorra,1,0,0
Argentina,1,0,0
Armenia,1,0,0
...,...,...,...
UK,87,0,8
US,155,11,7
Ukraine,1,0,0
United Arab Emirates,28,0,5


In [120]:
def df_for_lineplot_diff(dfs, CaseType):
    '''Build the per-day line-plot frame and the 24-hour change for one case type.

    Parameters
    ----------
    dfs : dict
        Mapping of sheet name -> DataFrame. Each frame must provide the
        columns 'Country/Region', `CaseType` and 'Date_last_updated_AEDT'.
        The sheets are assumed to be ordered newest first: the first sheet
        is used as "now" when computing the 24-hour change.
    CaseType : str
        Name of the count column to aggregate (Confirmed/Recovered/Deaths).

    Returns
    -------
    df : pandas.DataFrame
        One row per calendar day (the latest sheet of that day), in the
        order the days first appear in `dfs`, with the columns 'Date',
        'Mainland China', 'Other locations' and 'Total'.
    plusNum : integer
        Total of the first sheet minus the total of the first sheet that is
        at least 24 hours older. If no sheet is that old, the last sheet is
        used as the baseline.
    plusPercentNum : float
        `plusNum` divided by the baseline total (a fraction, not multiplied
        by 100). There is no guard against a zero baseline.

    Raises
    ------
    AssertionError
        If `CaseType` is not a string.
    ValueError
        If `dfs` is empty.
    '''

    assert isinstance(CaseType, str), "CaseType must be one of the following three strings Confirmed/Recovered/Deaths"

    if not dfs:
        raise ValueError("dfs must contain at least one sheet")

    # China is identified by name rather than by "largest country", so the
    # split stays correct when another country has more cases. Both spellings
    # are accepted in case the sheets were not renamed beforehand.
    chinaLabels = ('China', 'Mainland China')

    # One summary record per sheet.
    records = []
    for sheet in dfs.values():
        perCountry = sheet.groupby('Country/Region')[CaseType].sum()
        chinaTotal = perCountry[perCountry.index.isin(chinaLabels)].sum()
        records.append({
            # The timestamp of a sheet is taken from its first row.
            'Date': sheet['Date_last_updated_AEDT'].iloc[0],
            'Mainland China': chinaTotal,
            'Other locations': perCountry.sum() - chinaTotal,
        })

    df = pd.DataFrame(records, columns=['Date', 'Mainland China', 'Other locations'])
    df['Date'] = pd.to_datetime(df['Date'])
    df['Total'] = df['Mainland China'] + df['Other locations']

    # Calculate the difference in a 24-hour window: find the first sheet that
    # is at least 24 hours older than the first one, falling back to the last
    # sheet when the history is shorter than that.
    ageHours = (df['Date'].iloc[0] - df['Date']).dt.total_seconds() / 3600
    oldEnough = (ageHours >= 24).to_numpy()
    baselinePos = int(oldEnough.argmax()) if oldEnough.any() else len(df) - 1
    baselineTotal = df['Total'].iloc[baselinePos]
    plusNum = df['Total'].iloc[0] - baselineTotal
    plusPercentNum = plusNum / baselineTotal

    # Select the latest data from a given date: keep, for every calendar day,
    # the row with the greatest timestamp (a column-wise max would mix values
    # from different sheets of the same day).
    dateDay = df['Date'].dt.date.rename('date_day')
    latestRows = df.groupby(dateDay, sort=False)['Date'].idxmax().to_numpy()
    df = df.loc[latestRows].reset_index(drop=True)

    return df, plusNum, plusPercentNum

In [122]:
%%time
# Build the per-day 'Confirmed' frame together with the 24-hour change
# (absolute and fractional).
df_confirmed, plusConfirmedNum, plusPercentNum1 = df_for_lineplot_diff(dfs, 'Confirmed')

CPU times: user 760 ms, sys: 3.47 ms, total: 763 ms
Wall time: 766 ms


In [123]:
# Preview the first rows of the per-day confirmed-case frame.
df_confirmed.head()

Unnamed: 0,Date,Mainland China,Other locations,Total
0,2020-03-05 12:00:00,80408,15048,95456
1,2020-03-04 22:00:00,80266,13298,93564
2,2020-03-03 22:00:00,80151,12034,92185
3,2020-03-02 22:00:00,80026,9753,89779
4,2020-03-01 22:00:00,79823,7170,86993


In [124]:
# Absolute change in total confirmed cases over the 24-hour window.
plusConfirmedNum

2570

In [125]:
# Same change expressed as a fraction of the baseline total (not multiplied by 100).
plusPercentNum1

0.027668324612966432

In [141]:
# Build the per-day 'Recovered' frame together with the 24-hour change
# (absolute and fractional).
df_recovered, plusRecoveredNum, plusPercentNum2 = df_for_lineplot_diff(dfs, 'Recovered')

In [142]:
# Preview the first rows of the per-day recovered-case frame.
df_recovered.head()

Unnamed: 0,Date,Mainland China,Other locations,Total
0,2020-03-05 12:00:00,52077,1483,53560
1,2020-03-04 22:00:00,49906,1081,50987
2,2020-03-03 22:00:00,47335,914,48249
3,2020-03-02 22:00:00,44546,626,45172
4,2020-03-01 22:00:00,42070,520,42590


In [136]:
# Absolute change in total recovered cases over the 24-hour window.
plusRecoveredNum

5154

In [137]:
# Same change expressed as a fraction of the baseline total (not multiplied by 100).
plusPercentNum2

0.1064744039995042

In [132]:
# Build the per-day 'Deaths' frame together with the 24-hour change.
# Uses df_for_lineplot_diff, the helper defined in this notebook and used for
# the other case types; the previous name `df_for_lineplot` is not defined in
# the visible notebook and would fail on a fresh kernel.
df_deaths, plusDeathNum, plusPercentNum3 = df_for_lineplot_diff(dfs, 'Deaths')

In [133]:
# Absolute change in total deaths returned by the helper call above.
plusDeathNum

122

In [134]:
# Relative change in total deaths returned by the helper call above.
plusPercentNum3

0.03859538120847833