<a href="https://colab.research.google.com/github/Strata-Tech/Covid19/blob/main/covid19.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [33]:
!pip3 install plotly==4.8



In [34]:
# import the relevant packages
import os
import pandas as pd
import numpy as np

# ploty is an interactive visualization package built using javascript. 
# it is mostly used for explantory visualization (presenting to others)
import plotly.express as px

# requests is a package that allows Python to talk to the world wide web (www)
import requests

In [35]:
#Using the data provided by John Hopkins at https://github.com/CSSEGISandData/COVID-19
#datasets used are confirmed,deaths and recovered cases.
COVID_DATA_URLS = [
  'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv',                 
  'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv',
  'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv'              
]

In [36]:
#to check if links are working

for url in COVID_DATA_URLS:
  response=requests.head(url)
  print(str(response.status_code)+':'+response.request.url)

200:https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv
200:https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv
200:https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv


In [37]:
#Creating a function which downloads file and save it in colab hosted runtime if file does not exist. If file exists, reuse the saved file


url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'


os.listdir()

['.config',
 'time_series_covid19_deaths_global.csv',
 'time_series_covid19_recovered_global.csv',
 'bubbles.html',
 'time_series_covid19_confirmed_global.csv',
 'sample_data']

In [38]:
def load_df_from_url(url):
  filename=url.split('/')[-1]
  if filename in os.listdir():
    df=pd.read_csv(filename)
    print(f"Using saved file {filename} on colab hosted runtime")

  else:
    df=pd.read_csv(url)
    df.to_csv(f"{filename}",index=False)
    print(f"Downloaded file {filename} from internet and saved to colab hosted runtime")

  return df, filename
  


In [39]:
#testing function with first url in the list

url = COVID_DATA_URLS[0]
df, filename = load_df_from_url(url)

#to see all columns
pd.options.display.width=9999
pd.options.display.max_columns=15

print(df.head())
print(df.tail())


Using saved file time_series_covid19_confirmed_global.csv on colab hosted runtime
  Province/State Country/Region       Lat       Long  1/22/20  1/23/20  1/24/20  ...  8/1/21  8/2/21  8/3/21  8/4/21  8/5/21  8/6/21  8/7/21
0            NaN    Afghanistan  33.93911  67.709953        0        0        0  ...  146523  147985  148572  148933  149361  149810  149810
1            NaN        Albania  41.15330  20.168300        0        0        0  ...  133121  133146  133211  133310  133442  133591  133730
2            NaN        Algeria  28.03390   1.659600        0        0        0  ...  172564  173922  175229  176724  178013  179216  180356
3            NaN        Andorra  42.50630   1.521800        0        0        0  ...   14678   14747   14766   14797   14809   14836   14836
4            NaN         Angola -11.20270  17.873900        0        0        0  ...   42815   42970   43070   43158   43269   43487   43592

[5 rows x 568 columns]
    Province/State      Country/Region        La

In [40]:
#writing a loop for each datafile in the list 
#so that we can load the file in memory
#rename the columns from Province/State to Province,Country/Region to Country
#change all columns to lower case
#Fill missing values in province with values from country
#Check and drop columns with overwhelming null values.
#Convert the datset into a panel set with dates and countries as rows.
#Merge the panel dataframe into the main dataset, to arrive at confirmed, death and recovered columns

In [41]:
# ## For each data file,
# # enumerate creates a tuple with a counter that starts from 0 and the elements of the list, e.g. (0, URL1), (1, URL2), etc.
# for i, url in enumerate(COVID_DATA_URLS):

#     # Load the file into memory
#     df, filename = load_df_from_url(url)

#     # rename columns
#     df.rename(columns={'Province/State': 'Province', 'Country/Region':'Country'}, inplace=True)
#     # and make the column names lowercase, 
#     # general method, can be used in all dfs
#     df.columns = [c.lower() for c in df.columns]

#     # fill missing provinces/states with the country/region name
#     df['province'].fillna(df['country'], inplace=True)

#     # Create a pivot table with 'Province/State', 'Country/Region', 'Lat', 'Long' groups
#     df = df.groupby(['province','country', 'lat', 'long']).sum().reset_index()

#     # split the filename to get the keywords we want (confirmed, deaths, recovered)
#     value_name = filename.split('_')[-2]

#     # pivot the dataframe into a panel format.
#     df = df.melt(id_vars=['province','country', 'lat', 'long'], var_name='date',value_name=value_name)

#     # convert date column to datetime
#     df['date'] = pd.to_datetime(df['date'])

#     # Merge the dataframe into the main dataset
#     # checking if it's the first df to be read
#     # i should be equal to 0
#     # i is created from enumerate()
#     if i == 0:
#         dataset = df
#     # print(list(df))
#     else:
#         dataset = pd.merge(dataset, df, how='outer', on=['province','country', 'date', 'lat', 'long'])

# dataset.sort_values(['province','country', 'date'], inplace=True)


In [42]:
#writing a loop for each datafile in the list 
#so that we can load the file in memory
#rename the columns from Province/State to Province,Country/Region to Country
#change all columns to lower case
#Fill missing values in province with values from country
#Check and drop columns with overwhelming null values.
#Convert the datset into a panel set with dates and countries as rows.
#Merge the panel dataframe into the main dataset, to arrive at confirmed, death and recovered columns

for i, url in enumerate(COVID_DATA_URLS):

  #load file
  df,filename=load_df_from_url(url)
  

  #rename columns
  df.rename(columns={'Province/State':'Province','Country/Region':'Country'},inplace=True)


  
  #Change columns to lower case
  df.columns=[col.lower() for col in df.columns]


  #Fill missing values in province with values from country 
  df['province'].fillna(df['country'],inplace=True)
  

  # Create a pivot table with 'Province/State', 'Country/Region', 'Lat', 'Long' groups
  df=df.groupby(['province','country','lat','long']).sum().reset_index()
 


   # split the filename to get the keywords we want (confirmed, deaths, recovered)
  value_name=filename.split('_')[-2]
 

  # # pivot the dataframe into a panel format.
  df=df.melt(id_vars=['province','country','lat','long'],var_name='date',value_name=value_name)
 

  

  #convert date to datetime
  df['date']=pd.to_datetime(df['date'])
  

  # Merge the dataframe into the main dataset
  # checking if it's the first df to be read
  # i should be equal to 0
  # i is created from enumerate()

  if i==0:
    dataset=df

  else:
    dataset=pd.merge(dataset,df,how='outer',on=['province','country','lat','long','date'])



dataset.sort_values(['province','country','lat','long'],inplace=True)



Using saved file time_series_covid19_confirmed_global.csv on colab hosted runtime
Using saved file time_series_covid19_deaths_global.csv on colab hosted runtime
Using saved file time_series_covid19_recovered_global.csv on colab hosted runtime


In [43]:
dataset.head()

Unnamed: 0,province,country,lat,long,date,confirmed,deaths,recovered
0,Afghanistan,Afghanistan,33.93911,67.709953,2020-01-22,0.0,0.0,0.0
277,Afghanistan,Afghanistan,33.93911,67.709953,2020-01-23,0.0,0.0,0.0
554,Afghanistan,Afghanistan,33.93911,67.709953,2020-01-24,0.0,0.0,0.0
831,Afghanistan,Afghanistan,33.93911,67.709953,2020-01-25,0.0,0.0,0.0
1108,Afghanistan,Afghanistan,33.93911,67.709953,2020-01-26,0.0,0.0,0.0


In [44]:
# check if all the status rows are filled, return those rows with missing data
# isnull() returns any rows that has null values

null_data=dataset[dataset[['confirmed','deaths','recovered']].isnull().any(axis=1)]
null_data

Unnamed: 0,province,country,lat,long,date,confirmed,deaths,recovered
2,Alberta,Canada,53.9333,-116.5765,2020-01-22,0.0,0.0,
279,Alberta,Canada,53.9333,-116.5765,2020-01-23,0.0,0.0,
556,Alberta,Canada,53.9333,-116.5765,2020-01-24,0.0,0.0,
833,Alberta,Canada,53.9333,-116.5765,2020-01-25,0.0,0.0,
1110,Alberta,Canada,53.9333,-116.5765,2020-01-26,0.0,0.0,
...,...,...,...,...,...,...,...,...
155115,Yukon,Canada,64.2823,-135.0000,2021-08-03,610.0,8.0,
155392,Yukon,Canada,64.2823,-135.0000,2021-08-04,616.0,8.0,
155669,Yukon,Canada,64.2823,-135.0000,2021-08-05,623.0,8.0,
155946,Yukon,Canada,64.2823,-135.0000,2021-08-06,624.0,8.0,


In [45]:
# set maximum number of rows to display to be all
pd.options.display.max_rows=30

# groupby the key columns (except date) and look at their mean to discover what's wrong
null_countries=null_data.groupby(['province','country','lat','long']).mean().reset_index()
null_countries.sort_values(['country','province'])

# Canada reports confirmed and deaths by province, but recovered by entire country sum
# Hebei, Henan, Mozambique's Recovered lat long are smaller than Confirmed,Deaths lat long
# Syria's, Timor-Leste's Recovered lat long are larger than Confirmed,Deaths lat long


Unnamed: 0,province,country,lat,long,confirmed,deaths,recovered
0,Alberta,Canada,53.9333,-116.5765,80029.567376,908.264184,
1,British Columbia,Canada,53.7267,-127.6476,48835.008865,703.329787,
2,Canada,Canada,56.1304,-106.3468,,,468525.475177
3,Diamond Princess,Canada,0.0,0.0,0.056738,0.893617,
4,Grand Princess,Canada,0.0,0.0,11.712766,0.0,
9,Manitoba,Canada,53.7609,-98.8139,18112.093972,421.544326,
12,New Brunswick,Canada,46.5653,-66.4619,808.148936,14.219858,
13,Newfoundland and Labrador,Canada,53.1355,-57.6604,539.606383,3.836879,
14,Northwest Territories,Canada,64.8255,-124.8457,34.320922,0.0,
15,Nova Scotia,Canada,44.682,-63.7443,1824.826241,56.611702,


In [46]:
# solution to lat long problem
# objective: drop the lat long from recovered dataframe
# merge in the lat long from confirmed set

latlong_country = dataset.groupby(['province', 'country', 'lat', 'long'], as_index=False).mean()
latlong_country = latlong_country[['province', 'country', 'lat', 'long']]
latlong_country

Unnamed: 0,province,country,lat,long
0,Afghanistan,Afghanistan,33.939110,67.709953
1,Albania,Albania,41.153300,20.168300
2,Alberta,Canada,53.933300,-116.576500
3,Algeria,Algeria,28.033900,1.659600
4,Andorra,Andorra,42.506300,1.521800
...,...,...,...,...
278,Yukon,Canada,64.282300,-135.000000
279,Yunnan,China,24.974000,101.487000
280,Zambia,Zambia,-13.133897,27.849332
281,Zhejiang,China,29.183200,120.093400


In [47]:
#Make changes to the loop to address the Canada and Lat Long problem.


In [48]:
# solution to lat long problem
# objective: drop the lat long from recovered dataframe
# merge in the lat long from confirmed set

latlong_country=dataset.groupby(['province', 'country', 'lat', 'long'],as_index=False).mean()
latlong_country = latlong_country[['province', 'country', 'lat', 'long']]
latlong_country

Unnamed: 0,province,country,lat,long
0,Afghanistan,Afghanistan,33.939110,67.709953
1,Albania,Albania,41.153300,20.168300
2,Alberta,Canada,53.933300,-116.576500
3,Algeria,Algeria,28.033900,1.659600
4,Andorra,Andorra,42.506300,1.521800
...,...,...,...,...
278,Yukon,Canada,64.282300,-135.000000
279,Yunnan,China,24.974000,101.487000
280,Zambia,Zambia,-13.133897,27.849332
281,Zhejiang,China,29.183200,120.093400


In [49]:
# list of countries without province data
wo_province = ['Canada']

## For each data file,
# enumerate creates a tuple with a counter that starts from 0 and the elements of the list, e.g. (0, URL1), (1, URL2), etc.
for i, url in enumerate(COVID_DATA_URLS):

    # Load the file into memory
    df, filename = load_df_from_url(url)

    # rename columns and make them lowercase
    df.rename(columns={'Province/State': 'Province', 'Country/Region':'Country'}, inplace=True)
    df.columns = [c.lower() for c in df.columns]

    ## NEW CODES TO ADDRESS ISSUES
    # handle those countries that don't report Recovered by provinces
    df_wo_province = df[df['country'].isin(wo_province)]
    df_wo_province_group = df_wo_province.groupby(['country']).sum().reset_index()
    # print(df_wo_province_group)

    # ~ means NOT in pandas
    df = df[~df['country'].isin(wo_province)] # this will remove countries in wo_province
    df = pd.concat([df, df_wo_province_group])
    # print(df)

    ## BACK TO ORIGINAL CODES
    # fill missing provinces/states with the country/region name
    df['province'].fillna(df['country'], inplace=True)
    # print(df)

    # Create a pivot table with 'Province/State', 'Country/Region', 'Lat', 'Long' groups
    df = df.groupby(['province','country', 'lat', 'long']).sum().reset_index()
    print(df)

    # split the filename to get the keywords we want (confirmed, deaths, recovered)
    value_name = filename.split('_')[-2]

#     # NEW CODES to handle lat long problem
#     # 1. drop lat long columns from recovered
#     # 2. make new dataframe latlong_country from master dataset (confirmed) by grouping the 4 variables
#     # 3. keep only the 4 variables in the latlong_country
#     # 4. merge lat long back into recovered set
    if value_name == 'recovered':
      df.drop(columns=['lat', 'long'], inplace=True)
      latlong_country = dataset.groupby(['province', 'country', 'lat', 'long'], as_index=False).mean()
      latlong_country = latlong_country[['province', 'country', 'lat', 'long']]
      df = df.merge(latlong_country, how='left', on=['province', 'country'])

      # print(df)

    # pivot the dataframe into a panel format.
    df = df.melt(id_vars=['province','country', 'lat', 'long'], var_name='date',value_name=value_name)

    # Merge the dataframe into the main dataset
    if i==0:
        dataset = df
        # print(list(df))
    else:
        dataset = pd.merge(dataset, df, how='left', on=['province','country', 'date', 'lat', 'long'])

# convert date column to datetime
dataset['date'] = pd.to_datetime(dataset['date'])

# sort dataset  
dataset.sort_values(['province','country', 'date'], inplace=True)

# ## no longer needed with new fix: 7 Aug 2021
# to address different lat long for same province or country problem
# use fillna to fill recovered values to the same record as confirmed, deaths
# dropna to drop the recovered record
#dataset.recovered.fillna(method='backfill', inplace=True)
#dataset.dropna(subset=['confirmed', 'deaths'],how='any', inplace=True)

# reset index
dataset.reset_index(inplace=True, drop=True)

Using saved file time_series_covid19_confirmed_global.csv on colab hosted runtime
        province      country        lat        long  1/22/20  1/23/20  1/24/20  ...  8/1/21  8/2/21  8/3/21  8/4/21  8/5/21  8/6/21  8/7/21
0    Afghanistan  Afghanistan  33.939110   67.709953        0        0        0  ...  146523  147985  148572  148933  149361  149810  149810
1        Albania      Albania  41.153300   20.168300        0        0        0  ...  133121  133146  133211  133310  133442  133591  133730
2        Algeria      Algeria  28.033900    1.659600        0        0        0  ...  172564  173922  175229  176724  178013  179216  180356
3        Andorra      Andorra  42.506300    1.521800        0        0        0  ...   14678   14747   14766   14797   14809   14836   14836
4         Angola       Angola -11.202700   17.873900        0        0        0  ...   42815   42970   43070   43158   43269   43487   43592
..           ...          ...        ...         ...      ...      ...  

In [50]:
# check again there's no more null data
null_data = dataset[dataset[['confirmed', 'deaths', 'recovered']].isnull().any(axis=1)]
null_data

Unnamed: 0,province,country,lat,long,date,confirmed,deaths,recovered


In [51]:
dataset

Unnamed: 0,province,country,lat,long,date,confirmed,deaths,recovered
0,Afghanistan,Afghanistan,33.939110,67.709953,2020-01-22,0,0,0
1,Afghanistan,Afghanistan,33.939110,67.709953,2020-01-23,0,0,0
2,Afghanistan,Afghanistan,33.939110,67.709953,2020-01-24,0,0,0
3,Afghanistan,Afghanistan,33.939110,67.709953,2020-01-25,0,0,0
4,Afghanistan,Afghanistan,33.939110,67.709953,2020-01-26,0,0,0
...,...,...,...,...,...,...,...,...
148327,Zimbabwe,Zimbabwe,-19.015438,29.154857,2021-08-03,112435,3676,81570
148328,Zimbabwe,Zimbabwe,-19.015438,29.154857,2021-08-04,113526,3711,82994
148329,Zimbabwe,Zimbabwe,-19.015438,29.154857,2021-08-05,114489,3754,0
148330,Zimbabwe,Zimbabwe,-19.015438,29.154857,2021-08-06,115445,3805,0


In [52]:
#Create new columns: confirmed_daily, deaths_daily, recovered_daily by taking the differences between each day. Also create a death_pct column by taking deaths divded by confirmed

In [53]:
col_names=['confirmed', 'deaths', 'recovered']
dataset.sort_values(['confirmed', 'deaths', 'recovered'],inplace=True)

#example of 1 column

# dataset['confirmed_daily']=dataset.groupby(['province', 'country', 'lat', 'long'])['confirmed'].diff()
# loop for all three columns
# diff() default takes the difference between current row and previous row

for col in col_names:
  dataset[str(col)+'_daily']=dataset.groupby(['province', 'country', 'lat', 'long'])[col].diff()
  # for daily figures, there should not be negative numbers or missing values
  # clip() set any numbers beyond the limit to a number you define
  dataset[str(col)+'_daily'].clip(lower=0,inplace=True,axis=0)
  dataset.fillna(value=0,inplace=True)

# create death and recovered percentage columns
# Must clip to limit of 1 else we will get NaN values in the death_pct and recovered_pct column
dataset['death_pct']=dataset['deaths']/dataset['confirmed'].clip(lower=1)*100
dataset['recovered_pct']=dataset['recovered']/dataset['confirmed'].clip(lower=1)*100
dataset.dropna(how='all', inplace=True)
# sort them in order
dataset.sort_values(['country', 'province', 'date'], inplace=True)
dataset

Unnamed: 0,province,country,lat,long,date,confirmed,deaths,recovered,confirmed_daily,deaths_daily,recovered_daily,death_pct,recovered_pct
0,Afghanistan,Afghanistan,33.939110,67.709953,2020-01-22,0,0,0,0.0,0.0,0.0,0.000000,0.000000
1,Afghanistan,Afghanistan,33.939110,67.709953,2020-01-23,0,0,0,0.0,0.0,0.0,0.000000,0.000000
2,Afghanistan,Afghanistan,33.939110,67.709953,2020-01-24,0,0,0,0.0,0.0,0.0,0.000000,0.000000
3,Afghanistan,Afghanistan,33.939110,67.709953,2020-01-25,0,0,0,0.0,0.0,0.0,0.000000,0.000000
4,Afghanistan,Afghanistan,33.939110,67.709953,2020-01-26,0,0,0,0.0,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
148327,Zimbabwe,Zimbabwe,-19.015438,29.154857,2021-08-03,112435,3676,81570,1580.0,41.0,2150.0,3.269445,72.548584
148328,Zimbabwe,Zimbabwe,-19.015438,29.154857,2021-08-04,113526,3711,82994,1091.0,35.0,1424.0,3.268855,73.105720
148329,Zimbabwe,Zimbabwe,-19.015438,29.154857,2021-08-05,114489,3754,0,963.0,43.0,0.0,3.278918,0.000000
148330,Zimbabwe,Zimbabwe,-19.015438,29.154857,2021-08-06,115445,3805,0,956.0,51.0,0.0,3.295942,0.000000


In [54]:
# display only 30 rows
pd.options.display.max_rows=30

In [55]:
# check if daily columns have values less than 0
# | indicates OR condition
# & indicates AND condition

dataset[(dataset['confirmed_daily']<0)|(dataset['recovered_daily']<0)|(dataset['deaths_daily']<0)]

Unnamed: 0,province,country,lat,long,date,confirmed,deaths,recovered,confirmed_daily,deaths_daily,recovered_daily,death_pct,recovered_pct


In [56]:
#cleaned dataset
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 148332 entries, 0 to 148331
Data columns (total 13 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   province         148332 non-null  object        
 1   country          148332 non-null  object        
 2   lat              148332 non-null  float64       
 3   long             148332 non-null  float64       
 4   date             148332 non-null  datetime64[ns]
 5   confirmed        148332 non-null  int64         
 6   deaths           148332 non-null  int64         
 7   recovered        148332 non-null  int64         
 8   confirmed_daily  148332 non-null  float64       
 9   deaths_daily     148332 non-null  float64       
 10  recovered_daily  148332 non-null  float64       
 11  death_pct        148332 non-null  float64       
 12  recovered_pct    148332 non-null  float64       
dtypes: datetime64[ns](1), float64(7), int64(3), object(2)
memory usage: 15.8+ 

In [57]:
#To see Covid 19 situation in the region(i.e Southeast Asia)
#Create a subset that only contains Singapore, Malaysia, Indonesia and Thailand data

country_list = ['Singapore', 'Malaysia', 'Indonesia', 'Thailand']
df_sub=dataset[dataset['country'].isin(country_list)].reset_index(drop=True)
df_sub

Unnamed: 0,province,country,lat,long,date,confirmed,deaths,recovered,confirmed_daily,deaths_daily,recovered_daily,death_pct,recovered_pct
0,Indonesia,Indonesia,-0.789300,113.921300,2020-01-22,0,0,0,0.0,0.0,0.0,0.000000,0.000000
1,Indonesia,Indonesia,-0.789300,113.921300,2020-01-23,0,0,0,0.0,0.0,0.0,0.000000,0.000000
2,Indonesia,Indonesia,-0.789300,113.921300,2020-01-24,0,0,0,0.0,0.0,0.0,0.000000,0.000000
3,Indonesia,Indonesia,-0.789300,113.921300,2020-01-25,0,0,0,0.0,0.0,0.0,0.000000,0.000000
4,Indonesia,Indonesia,-0.789300,113.921300,2020-01-26,0,0,0,0.0,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2251,Thailand,Thailand,15.870032,100.992541,2021-08-03,652185,5315,26873,18901.0,147.0,0.0,0.814953,4.120457
2252,Thailand,Thailand,15.870032,100.992541,2021-08-04,672385,5503,26873,20200.0,188.0,0.0,0.818430,3.996669
2253,Thailand,Thailand,15.870032,100.992541,2021-08-05,693305,5663,0,20920.0,160.0,0.0,0.816812,0.000000
2254,Thailand,Thailand,15.870032,100.992541,2021-08-06,714684,5854,0,21379.0,191.0,0.0,0.819103,0.000000


In [58]:
# check the descriptive statistics
df_sub[['confirmed_daily','deaths_daily','recovered_daily', 'death_pct', 'recovered_pct']].describe()

Unnamed: 0,confirmed_daily,deaths_daily,recovered_daily,death_pct,recovered_pct
count,2256.0,2256.0,2256.0,2256.0,2256.0
mean,2520.244681,54.120124,1755.858599,1.407884,71.403979
std,5952.096521,178.262619,4438.634419,1.772574,31.687986
min,0.0,0.0,0.0,0.0,0.0
25%,11.0,0.0,5.0,0.057283,57.549636
50%,142.5,0.0,40.0,0.723075,84.135598
75%,2682.5,36.25,1477.25,1.843689,95.372376
max,56757.0,2069.0,47128.0,9.497207,100.0


In [59]:
# find the median for each of the countries using median()
sub_median = df_sub.groupby('country')[['confirmed_daily', 'deaths_daily', 'recovered_daily', 'death_pct', 'recovered_pct']].median().reset_index()
sub_median

Unnamed: 0,country,confirmed_daily,deaths_daily,recovered_daily,death_pct,recovered_pct
0,Indonesia,4099.5,105.5,3754.5,2.988857,79.10741
1,Malaysia,851.5,3.0,646.0,0.663419,83.805149
2,Singapore,24.0,0.0,21.0,0.04962,99.17975
3,Thailand,15.0,0.0,2.0,0.95761,91.306222


In [60]:
#Plot the lines of confirmed cases of each country in separate charts

# import plotly.express as px

# for country in country_list:
#     fig = px.line(df_sub[df_sub['country']==country],
#                   x='date',
#                   y='confirmed_daily')
    
#     fig.update_layout(width = 600, 
#                       height = 400,
#                       bargap=0.2,
#                       xaxis_title_text=f'Number of daily confirmed cases in: {country}',
#                       yaxis_title_text='No. of Cases')
#     fig.show()

In [61]:
# import plotly.express as px
# # Plots a line chart of all the confirmed cases by date, for all the countries(world-wide)
# # since we are looking at all the countries, lwe will use back the dataset instead of df_sub(South East Asia)

# # https://plotly.com/python-api-reference/generated/plotly.express.line.html
# fig_line = px.line(dataset, x='date', y='confirmed',
#                    color='country',
#                    title='Worldwide Covid-19 confirmed cases'
#                    )

# fig_line.show()

In [62]:
# # Create a function that determines the log-base-10 to roundup a number to. 
# # This will help to determine the maximum range to plot the graph
# import math
# def rounduplog(x):
#   # Use math.log10() and math.ceil()
#   return 10**math.ceil(math.log10(x))

In [63]:
# import plotly.express as px
# # Plot an animated bubble chart with scatter plot

# fig = px.scatter(dataset, 
#                   # confirmed cases on the x axis on a log scale
#                   x="confirmed", 
#                   log_x = True,
#                   # recovered cases on the y axis on a log scale
#                   y="recovered", 
#                   log_y = True,
#                   # deaths for the size of the bubble
#                   size="deaths",
#                   size_max=55, 
#                   # death_pct for the intensity of the colour
#                   color="death_pct",
#                   # date for animation frame
#                   animation_frame=dataset.date.astype(str), 
#                   # other settings to prettify the chart
#                   animation_group='country',
#                   text='country',
#                   hover_name='country',
#                   # determine the range using roupuplog()
#                   range_x=[0.5,rounduplog(dataset['confirmed'].max())], 
#                   range_y=[0.5,rounduplog(dataset['recovered'].max())], 
#                   range_color=[0,5], 
#                   title='Timeline of Covid-19 Worldwide cases'
# )
# fig.update_layout(template="plotly_dark")
# fig.show()# 

In [64]:
# # to export, instead of writing to csv, we can write to html
# fig.write_html('bubbles.html')

# # trigger download
# from google.colab import files
# files.download('bubbles.html')