In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import math 

import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [215]:
%%time
import os

# Import csv file and store each csv in to a df list

filename = os.listdir('./raw_data/')
sheet_name = [i.replace('.csv', '') for i in filename]
sheet_name = sheet_name[::-1]

dfs = {sheet_name: pd.read_csv('./raw_data/{}.csv'.format(sheet_name))
          for sheet_name in sheet_name}

CPU times: user 654 ms, sys: 29.3 ms, total: 683 ms
Wall time: 711 ms


In [219]:
filename = os.listdir('./')
filename

['.DS_Store',
 '.gitignore',
 '.ipynb_checkpoints',
 '2020-01-30-21-30_data.csv',
 '2020-01-31-14-00_data.csv',
 '2020-01-31-18-00_data.csv',
 '2020-01-31-19-00_data.csv',
 '2020-02-01-10-00_data.csv',
 '2020-02-01-18-00_data.csv',
 '2020-02-01-23-00_data.csv',
 '2020-02-02-05-00_data.csv',
 '2020-02-02-14-00_data.csv',
 '2020-02-02-20-00_data.csv',
 '2020-02-02-22-00_data.csv',
 '2020-02-03-03-00_data.csv',
 '2020-02-03-05-00_data.csv',
 '2020-02-03-15-00_data.csv',
 '2020-02-03-18-00_data.csv',
 '2020-02-03-20-00_data.csv',
 '2020-02-04-01-00_data.csv',
 '2020-02-04-05-00_data.csv',
 '2020-02-04-14-00_data.csv',
 '2020-02-04-17-00_data.csv',
 '2020-02-04-20-00_data.csv',
 '2020-02-05-02-00_data.csv',
 '2020-02-05-06-00_data.csv',
 '2020-02-05-15-00_data.csv',
 '2020-02-05-19-00_data.csv',
 '2020-02-05-23-00_data.csv',
 '2020-02-06-03-00_data.csv',
 '2020-02-06-06-00_data.csv',
 '2020-02-06-15-00_data.csv',
 '2020-02-06-18-00_data.csv',
 '2020-02-06-20-00_data.csv',
 '2020-02-06-21-30

In [216]:
%%time
# Data from each sheet can be accessed via key
keyList = list(dfs.keys())
# Data cleansing
for key, df in dfs.items():
    dfs[key].loc[:,'Confirmed'].fillna(value=0, inplace=True)
    dfs[key].loc[:,'Deaths'].fillna(value=0, inplace=True)
    dfs[key].loc[:,'Recovered'].fillna(value=0, inplace=True)
    dfs[key]=dfs[key].astype({'Confirmed':'int64', 'Deaths':'int64', 'Recovered':'int64'})
    # Change as China for coordinate search
    dfs[key]=dfs[key].replace({'Country/Region':'Mainland China'}, 'China')
    # Add a zero to the date so can be convert by datetime.strptime as 0-padded date
    dfs[key]['Last Update'] = '0' + dfs[key]['Last Update']
    # Convert time as Australian eastern daylight time
    dfs[key]['Date_last_updated_AEDT'] = [datetime.strptime(d, '%m/%d/%Y %H:%M') for d in dfs[key]['Last Update']]
    dfs[key]['Date_last_updated_AEDT'] = dfs[key]['Date_last_updated_AEDT'] + timedelta(hours=16)
    
# Check 
#dfs[keyList[0]].head()

CPU times: user 2.06 s, sys: 9.09 ms, total: 2.07 s
Wall time: 2.08 s


In [210]:
def df_for_lineplot_diff(dfs, CaseType):
    '''This is the function for construct df for line plot'''
    
    assert type(CaseType) is str, "CaseType must be one of the following three strings Confirmed/Recovered/Deaths"
    
    
    # Construct confirmed cases dataframe for line plot
    DateList = []
    ChinaList =[]
    OtherList = []

    for key, df in dfs.items():
        dfTpm = df.groupby(['Country/Region'])[CaseType].agg(np.sum)
        dfTpm = pd.DataFrame({'Code':dfTpm.index, CaseType:dfTpm.values})
        dfTpm = dfTpm.sort_values(by=CaseType, ascending=False).reset_index(drop=True)
        DateList.append(df['Date_last_updated_AEDT'][0])
        ChinaList.append(dfTpm[CaseType][0])
        OtherList.append(dfTpm[CaseType][1:].sum())

    df = pd.DataFrame({'Date':DateList,
                       'Mainland China':ChinaList,
                       'Other locations':OtherList})
    df['Total']=df['Mainland China']+df['Other locations']

    # Calculate differenec in a 24-hour window
    for index, _ in df.iterrows():
        # Calculate the time differnece in hour
        diff=(df['Date'][0] - df['Date'][index]).total_seconds()/3600
        # find out the latest time after 24-hour
        if diff >= 24:
            break
    plusNum = df['Total'][0] - df['Total'][index]
    plusPercentNum = (df['Total'][0] - df['Total'][index])/df['Total'][index]

    # Select the latest data from a given date
    df['date_day']=[d.date() for d in df['Date']]
    df=df.groupby(by=df['date_day'], sort=False).transform(max).drop_duplicates(['Date'])
    
    df=df.reset_index(drop=True)
    
    return df, plusNum, plusPercentNum 

In [211]:
%%time
df_confirmed, plusConfirmedNum, plusPercentNum1 = df_for_lineplot_diff(dfs, 'Confirmed')

CPU times: user 864 ms, sys: 3.36 ms, total: 867 ms
Wall time: 873 ms


In [212]:
df_confirmed.head()

Unnamed: 0,Date,Mainland China,Other locations,Total
0,2020-03-05 14:00:00,80408,15054,95462
1,2020-03-04 22:00:00,80266,13298,93564
2,2020-03-03 22:00:00,80151,12034,92185
3,2020-03-02 22:00:00,80026,9753,89779
4,2020-03-01 22:00:00,79823,7170,86993


In [213]:
plusConfirmedNum

2303

In [214]:
plusPercentNum1

0.02472117562446999

In [141]:
df_recovered, plusRecoveredNum, plusPercentNum2 = df_for_lineplot_diff(dfs, 'Recovered')

In [142]:
df_recovered.head()

Unnamed: 0,Date,Mainland China,Other locations,Total
0,2020-03-05 12:00:00,52077,1483,53560
1,2020-03-04 22:00:00,49906,1081,50987
2,2020-03-03 22:00:00,47335,914,48249
3,2020-03-02 22:00:00,44546,626,45172
4,2020-03-01 22:00:00,42070,520,42590


In [136]:
plusRecoveredNum

5154

In [137]:
plusPercentNum2

0.1064744039995042

In [132]:
df_deaths, plusDeathNum, plusPercentNum3 = df_for_lineplot(dfs, 'Deaths')

In [133]:
plusDeathNum

122

In [134]:
plusPercentNum3

0.03859538120847833

In [1]:
import os
import glob
filename = os.listdir('./')
sheet_name = [i.replace('.csv', '') for i in filename]
sheet_name = sheet_name[::-1]

In [2]:
sheet_name

['~$data.xlsx',
 'Updated_coordinates.ipynb',
 'Statistics_each_region.ipynb',
 'requirements.txt',
 'README.md',
 'raw_data',
 'Procfile',
 'markdown_chinese',
 'LICENSE',
 'Data_cleansing.ipynb',
 'data.xlsx',
 'data.xls',
 'data.json',
 'dashboard-virus.ipynb',
 'dashboard-callback-virus.ipynb',
 'dashboard-callback-virus-combine.ipynb',
 'dashboard-callback-virus-combine-past24.ipynb',
 'dashboard-callback-virus-combine-past24-scatterSize.ipynb',
 'dash_table_example.ipynb',
 'AU_cases.xlsx',
 'assets',
 'app_withDataSource.py',
 'app_screenshot.png',
 'app_screenshot.gif',
 'app_replaced_20200305.py',
 'app_old.py',
 'app_news.py',
 'app.py',
 '2020-03-04-22-00_data',
 '2020-03-04-20-00_data',
 '2020-03-04-18-00_data',
 '2020-03-04-15-00_data',
 '2020-03-04-06-00_data',
 '2020-03-04-03-00_data',
 '2020-03-03-23-30_data',
 '2020-03-03-20-30_data',
 '2020-03-03-19-00_data',
 '2020-03-03-17-00_data',
 '2020-03-03-15-00_data',
 '2020-03-03-06-00_data',
 '2020-03-03-05-00_data',
 '2020

In [6]:
import os
import glob
path = './'
extension = 'csv'
os.chdir(path)
result = glob.glob('*_data.{}'.format(extension))

In [7]:
result

['2020-01-30-21-30_data.csv',
 '2020-01-31-14-00_data.csv',
 '2020-01-31-18-00_data.csv',
 '2020-01-31-19-00_data.csv',
 '2020-02-01-10-00_data.csv',
 '2020-02-01-18-00_data.csv',
 '2020-02-01-23-00_data.csv',
 '2020-02-02-05-00_data.csv',
 '2020-02-02-14-00_data.csv',
 '2020-02-02-20-00_data.csv',
 '2020-02-02-22-00_data.csv',
 '2020-02-03-03-00_data.csv',
 '2020-02-03-05-00_data.csv',
 '2020-02-03-15-00_data.csv',
 '2020-02-03-18-00_data.csv',
 '2020-02-03-20-00_data.csv',
 '2020-02-04-01-00_data.csv',
 '2020-02-04-05-00_data.csv',
 '2020-02-04-14-00_data.csv',
 '2020-02-04-17-00_data.csv',
 '2020-02-04-20-00_data.csv',
 '2020-02-05-02-00_data.csv',
 '2020-02-05-06-00_data.csv',
 '2020-02-05-15-00_data.csv',
 '2020-02-05-19-00_data.csv',
 '2020-02-05-23-00_data.csv',
 '2020-02-06-03-00_data.csv',
 '2020-02-06-06-00_data.csv',
 '2020-02-06-15-00_data.csv',
 '2020-02-06-18-00_data.csv',
 '2020-02-06-20-00_data.csv',
 '2020-02-06-21-30_data.csv',
 '2020-02-07-00-30_data.csv',
 '2020-02-