<a href="https://colab.research.google.com/github/Strata-Tech/Covid19/blob/main/covid19.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip3 install plotly==4.8



In [4]:
# import the relevant packages
import os
import pandas as pd
import numpy as np

# plotly is an interactive visualization package built using javascript.
# it is mostly used for explanatory visualization (presenting to others)
import plotly.express as px

# requests is a package that allows Python to talk to the world wide web (www)
import requests

In [5]:
# Data provided by Johns Hopkins at https://github.com/CSSEGISandData/COVID-19
# The three global time-series files used here are: confirmed, deaths and recovered cases.
_JHU_TIME_SERIES_DIR = (
    'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/'
    'csse_covid_19_data/csse_covid_19_time_series/'
)

# One URL per case type, all living in the same directory of the repository.
COVID_DATA_URLS = [
    f'{_JHU_TIME_SERIES_DIR}time_series_covid19_{case_type}_global.csv'
    for case_type in ('confirmed', 'deaths', 'recovered')
]

In [6]:
# Check that every data link is reachable before downloading anything.
# requests.head() sends a HEAD request, so the CSV bodies are not transferred.
# The timeout keeps the cell from hanging forever if the host never answers.
for url in COVID_DATA_URLS:
  response = requests.head(url, timeout=10)
  print(f"{response.status_code}:{response.request.url}")

200:https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv
200:https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv
200:https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv


In [7]:
# Scratch cell: keep one of the data URLs at hand and list the current working
# directory, to see which CSV files are already stored on the colab hosted runtime.
url = (
    'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/'
    'csse_covid_19_data/csse_covid_19_time_series/'
    'time_series_covid19_confirmed_global.csv'
)

# Last expression of the cell, so the directory listing is displayed as the output.
os.listdir('.')

['.config',
 'time_series_covid19_deaths_global.csv',
 'time_series_covid19_recovered_global.csv',
 'time_series_covid19_confirmed_global.csv',
 'sample_data']

In [8]:
def load_df_from_url(url):
  """Load a CSV into a DataFrame, reusing a local copy when one exists.

  The file name is the last segment of the URL's path (a query string or
  fragment is ignored). If a file with that name is already present in the
  current working directory it is read from disk; otherwise the CSV is read
  from ``url`` and a copy is written to the working directory for later runs.

  Args:
    url: Address of the CSV file to load.

  Returns:
    A ``(df, filename)`` tuple: the loaded DataFrame and the local file name.

  Raises:
    ValueError: If no file name can be derived from ``url``.
  """
  # Function-scope import keeps this cell runnable without touching the imports cell.
  from urllib.parse import urlsplit

  # Use only the path component so that ".../data.csv?raw=true" maps to "data.csv".
  filename = urlsplit(url).path.split('/')[-1]
  if not filename:
    # Fail before downloading rather than when trying to write to an empty path.
    raise ValueError(f"Cannot derive a file name from URL: {url!r}")

  # Test for the file directly instead of scanning the whole directory listing.
  if os.path.isfile(filename):
    df = pd.read_csv(filename)
    print(f"Using saved file {filename} on colab hosted runtime")
  else:
    df = pd.read_csv(url)
    df.to_csv(filename, index=False)
    print(f"Downloaded file {filename} from internet and saved to colab hosted runtime")

  return df, filename
  


In [9]:
# Try the loader on the first URL in the list.
url = COVID_DATA_URLS[0]
df, filename = load_df_from_url(url)

# Widen the printed table and cap it at 10 columns so wide frames print without wrapping.
pd.set_option('display.width', 9999)
pd.set_option('display.max_columns', 10)

# Show the first and the last rows of the frame, one preview after the other.
print(df.head(), df.tail(), sep='\n')


Using saved file time_series_covid19_confirmed_global.csv on colab hosted runtime
  Province/State Country/Region       Lat       Long  1/22/20  ...  8/3/21  8/4/21  8/5/21  8/6/21  8/7/21
0            NaN    Afghanistan  33.93911  67.709953        0  ...  148572  148933  149361  149810  149810
1            NaN        Albania  41.15330  20.168300        0  ...  133211  133310  133442  133591  133730
2            NaN        Algeria  28.03390   1.659600        0  ...  175229  176724  178013  179216  180356
3            NaN        Andorra  42.50630   1.521800        0  ...   14766   14797   14809   14836   14836
4            NaN         Angola -11.20270  17.873900        0  ...   43070   43158   43269   43487   43592

[5 rows x 568 columns]
    Province/State      Country/Region        Lat        Long  1/22/20  ...  8/3/21  8/4/21  8/5/21  8/6/21  8/7/21
274            NaN             Vietnam  14.058324  108.277199        0  ...  174461  181756  189066  193381  205656
275            NaN  

In [None]:
# Plan: write a loop over each data file in the list, so that we can
# - load the file into memory
# - rename the columns Province/State to Province and Country/Region to Country
# - change all column names to lower case
# - fill missing values in province with the values from country
# - check for and drop columns with overwhelmingly null values
# - convert the dataset into a panel set with dates and countries as rows
# - merge the panel dataframe into the main dataset, to arrive at confirmed, deaths and recovered columns

In [10]:
## Build one long-format ("panel") dataset out of the wide time-series files.
# enumerate creates a tuple with a counter that starts from 0 and the elements of the list,
# e.g. (0, URL1), (1, URL2), etc. The counter tells us which file seeds `dataset`.
for i, url in enumerate(COVID_DATA_URLS):

    # Load the file into memory (from the saved local copy when one exists)
    df, filename = load_df_from_url(url)

    # Rename the two slash-separated columns, then make every column name lowercase.
    df = df.rename(columns={'Province/State': 'Province', 'Country/Region': 'Country'})
    df.columns = [c.lower() for c in df.columns]

    # Fill missing provinces/states with the country/region name.
    # The result is assigned back instead of calling fillna(..., inplace=True) on
    # df['province']: that chained in-place form is deprecated in pandas and does
    # not update df once Copy-on-Write is active.
    df['province'] = df['province'].fillna(df['country'])

    # Collapse to one row per (province, country, lat, long), summing the date columns.
    # NOTE: groupby excludes NaN keys by default, so any row with a missing
    # lat/long is dropped at this step.
    df = df.groupby(['province','country', 'lat', 'long']).sum().reset_index()

    # The second-to-last '_'-separated token of the filename is the keyword we
    # want as the value column name (confirmed, deaths, recovered).
    value_name = filename.split('_')[-2]

    # Reshape from wide (one column per date) to panel format (one row per date).
    df = df.melt(id_vars=['province','country', 'lat', 'long'], var_name='date',value_name=value_name)

    # The date column holds the former column headers, e.g. 1/22/20
    # (month/day/two-digit year); parse them with an explicit format.
    df['date'] = pd.to_datetime(df['date'], format='%m/%d/%y')

    # Merge the dataframe into the main dataset.
    # The first file (i == 0) becomes the dataset; later files are outer-merged
    # onto it so that each one contributes its own value column.
    if i == 0:
        dataset = df
    else:
        dataset = pd.merge(dataset, df, how='outer', on=['province','country', 'date', 'lat', 'long'])

# Order the rows by location and date; the existing index labels are kept.
dataset = dataset.sort_values(['province','country', 'date'])


Using saved file time_series_covid19_confirmed_global.csv on colab hosted runtime
Using saved file time_series_covid19_deaths_global.csv on colab hosted runtime
Using saved file time_series_covid19_recovered_global.csv on colab hosted runtime


In [11]:
dataset

Unnamed: 0,province,country,lat,long,date,confirmed,deaths,recovered
0,Afghanistan,Afghanistan,33.939110,67.709953,2020-01-22,0.0,0.0,0.0
277,Afghanistan,Afghanistan,33.939110,67.709953,2020-01-23,0.0,0.0,0.0
554,Afghanistan,Afghanistan,33.939110,67.709953,2020-01-24,0.0,0.0,0.0
831,Afghanistan,Afghanistan,33.939110,67.709953,2020-01-25,0.0,0.0,0.0
1108,Afghanistan,Afghanistan,33.939110,67.709953,2020-01-26,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
155119,Zimbabwe,Zimbabwe,-19.015438,29.154857,2021-08-03,112435.0,3676.0,81570.0
155396,Zimbabwe,Zimbabwe,-19.015438,29.154857,2021-08-04,113526.0,3711.0,82994.0
155673,Zimbabwe,Zimbabwe,-19.015438,29.154857,2021-08-05,114489.0,3754.0,0.0
155950,Zimbabwe,Zimbabwe,-19.015438,29.154857,2021-08-06,115445.0,3805.0,0.0
