In [1]:
import pandas as pd

# URL template for the time-series CSV files in the CSSEGISandData/COVID-19
# GitHub repository. The `{kind}` and `{group}` placeholders are filled in
# with `str.format` by `download_data`.
DOWNLOAD_URL = (
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/"
    "master/csse_covid_19_data/csse_covid_19_time_series/"
    "time_series_covid19_{kind}_{group}.csv"
)


def download_data(group, kind):
    """
    Reads in a single dataset from the Johns Hopkins GitHub repo
    as a DataFrame
    
    Parameters
    ----------
    group : "world" or "usa"
    
    kind : "deaths" or "cases"
    
    Returns
    -------
    DataFrame
    
    Raises
    ------
    ValueError
        If `group` or `kind` is not one of the values listed above.
    """
    # Translate the notebook's names into the tokens used in the remote
    # file names (time_series_covid19_{kind}_{group}.csv).
    group_tokens = {"world": "global", "usa": "US"}
    kind_tokens = {"deaths": "deaths", "cases": "confirmed"}

    # Reject unknown values up front instead of silently falling back to
    # the global/deaths file, which would hide typos such as "us" or "case".
    if group not in group_tokens:
        raise ValueError(
            f"group must be one of {sorted(group_tokens)}, got {group!r}"
        )
    if kind not in kind_tokens:
        raise ValueError(
            f"kind must be one of {sorted(kind_tokens)}, got {kind!r}"
        )

    url = DOWNLOAD_URL.format(kind=kind_tokens[kind], group=group_tokens[group])
    return pd.read_csv(url)

In [2]:
usa_deaths = download_data('usa', 'deaths')
usa_deaths.head()

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,4/11/22,4/12/22,4/13/22,4/14/22,4/15/22,4/16/22,4/17/22,4/18/22,4/19/22,4/20/22
0,84001001,US,USA,840,1001.0,Autauga,Alabama,US,32.539527,-86.644082,...,213,214,214,214,214,214,214,215,215,215
1,84001003,US,USA,840,1003.0,Baldwin,Alabama,US,30.72775,-87.722071,...,677,678,679,679,679,679,679,679,679,679
2,84001005,US,USA,840,1005.0,Barbour,Alabama,US,31.868263,-85.387129,...,98,98,98,98,98,98,98,98,98,98
3,84001007,US,USA,840,1007.0,Bibb,Alabama,US,32.996421,-87.125115,...,101,101,101,103,103,103,103,103,103,103
4,84001009,US,USA,840,1009.0,Blount,Alabama,US,33.982109,-86.567906,...,239,241,241,241,242,242,242,243,243,243


In [3]:
# Group and kind names passed to `download_data`; `read_all_data` loops over
# every (group, kind) combination of these two tuples.
GROUPS = "world", "usa"
KINDS = "deaths", "cases"

def read_all_data():
    """
    Download one DataFrame for every combination of GROUPS and KINDS
    
    Returns
    -------
    Dictionary mapping "<group>_<kind>" (e.g. "usa_deaths") to the
    DataFrame returned by `download_data` for that pair
    """
    # Groups vary slowest, kinds fastest, so keys are inserted in the same
    # order as a nested for-loop over GROUPS then KINDS.
    return {
        f'{group}_{kind}': download_data(group, kind)
        for group in GROUPS
        for kind in KINDS
    }

In [4]:
data = read_all_data()

In [5]:
data['usa_cases'].head()

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,4/11/22,4/12/22,4/13/22,4/14/22,4/15/22,4/16/22,4/17/22,4/18/22,4/19/22,4/20/22
0,84001001,US,USA,840,1001.0,Autauga,Alabama,US,32.539527,-86.644082,...,15747,15749,15751,15751,15752,15752,15752,15771,15755,15757
1,84001003,US,USA,840,1003.0,Baldwin,Alabama,US,30.72775,-87.722071,...,55499,55512,55522,55536,55547,55547,55547,55579,55564,55571
2,84001005,US,USA,840,1005.0,Barbour,Alabama,US,31.868263,-85.387129,...,5658,5658,5658,5658,5658,5658,5658,5659,5658,5658
3,84001007,US,USA,840,1007.0,Bibb,Alabama,US,32.996421,-87.125115,...,6425,6425,6425,6428,6428,6428,6428,6428,6428,6428
4,84001009,US,USA,840,1009.0,Blount,Alabama,US,33.982109,-86.567906,...,14934,14934,14935,14953,14959,14959,14959,14963,14961,14963


In [6]:
data['usa_deaths'].head()

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,4/11/22,4/12/22,4/13/22,4/14/22,4/15/22,4/16/22,4/17/22,4/18/22,4/19/22,4/20/22
0,84001001,US,USA,840,1001.0,Autauga,Alabama,US,32.539527,-86.644082,...,213,214,214,214,214,214,214,215,215,215
1,84001003,US,USA,840,1003.0,Baldwin,Alabama,US,30.72775,-87.722071,...,677,678,679,679,679,679,679,679,679,679
2,84001005,US,USA,840,1005.0,Barbour,Alabama,US,31.868263,-85.387129,...,98,98,98,98,98,98,98,98,98,98
3,84001007,US,USA,840,1007.0,Bibb,Alabama,US,32.996421,-87.125115,...,101,101,101,103,103,103,103,103,103,103
4,84001009,US,USA,840,1009.0,Blount,Alabama,US,33.982109,-86.567906,...,239,241,241,241,242,242,242,243,243,243


Let's use the `read_all_data` function to read in all four datasets and display the first few rows of two of them.

In [7]:
data['usa_cases'].head(3)

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,4/11/22,4/12/22,4/13/22,4/14/22,4/15/22,4/16/22,4/17/22,4/18/22,4/19/22,4/20/22
0,84001001,US,USA,840,1001.0,Autauga,Alabama,US,32.539527,-86.644082,...,15747,15749,15751,15751,15752,15752,15752,15771,15755,15757
1,84001003,US,USA,840,1003.0,Baldwin,Alabama,US,30.72775,-87.722071,...,55499,55512,55522,55536,55547,55547,55547,55579,55564,55571
2,84001005,US,USA,840,1005.0,Barbour,Alabama,US,31.868263,-85.387129,...,5658,5658,5658,5658,5658,5658,5658,5659,5658,5658


In [8]:
def write_data(data, directory, **kwargs):
    """
    Writes each raw data DataFrame to a file as a CSV
    
    The target directory (including any missing parent directories) is
    created if it does not already exist.
    
    Parameters
    ----------
    data : dictionary of DataFrames; each key is used as the file name
        (without the ".csv" extension)

    directory : string name of directory to save files i.e. "data/raw"
    
    kwargs : extra keyword arguments for the `to_csv` DataFrame method
    
    Returns
    -------
    None
    """
    # Imported locally so this cell does not depend on another import cell.
    from pathlib import Path

    target_dir = Path(directory)
    # `to_csv` does not create missing directories, so make sure the
    # destination exists before writing anything.
    target_dir.mkdir(parents=True, exist_ok=True)

    for name, df in data.items():
        df.to_csv(target_dir / f'{name}.csv', **kwargs)

In [9]:
write_data(data, 'data/raw', index=False)

In [10]:
def read_local_data(group, kind, directory):
    """
    Load a single locally stored CSV into a DataFrame
    
    The file read is "<directory>/<group>_<kind>.csv".
    
    Parameters
    ----------
    group : "world" or "usa"
    
    kind : "deaths" or "cases"
    
    directory : string name of directory containing the files i.e. "data/raw"
    
    Returns
    -------
    DataFrame
    """
    csv_path = f'{directory}/{group}_{kind}.csv'
    frame = pd.read_csv(csv_path)
    return frame

In [11]:
usa_cases = read_local_data('usa', 'cases', 'data/raw')

In [12]:
usa_cases.head()

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,4/11/22,4/12/22,4/13/22,4/14/22,4/15/22,4/16/22,4/17/22,4/18/22,4/19/22,4/20/22
0,84001001,US,USA,840,1001.0,Autauga,Alabama,US,32.539527,-86.644082,...,15747,15749,15751,15751,15752,15752,15752,15771,15755,15757
1,84001003,US,USA,840,1003.0,Baldwin,Alabama,US,30.72775,-87.722071,...,55499,55512,55522,55536,55547,55547,55547,55579,55564,55571
2,84001005,US,USA,840,1005.0,Barbour,Alabama,US,31.868263,-85.387129,...,5658,5658,5658,5658,5658,5658,5658,5659,5658,5658
3,84001007,US,USA,840,1007.0,Bibb,Alabama,US,32.996421,-87.125115,...,6425,6425,6425,6428,6428,6428,6428,6428,6428,6428
4,84001009,US,USA,840,1009.0,Blount,Alabama,US,33.982109,-86.567906,...,14934,14934,14935,14953,14959,14959,14959,14963,14961,14963


In [13]:
# Group and kind names that `run` iterates over. These repeat the values
# assigned to the same names in an earlier cell of this notebook.
GROUPS = "world", "usa"
KINDS = "deaths", "cases"

def run():
    """
    Load the locally saved CSV for every combination of GROUPS and KINDS
    
    Each file is read from the "data/raw" directory with `read_local_data`.
    As written, this function only reads the files; it applies no cleaning
    or transformation itself.
    
    Returns
    -------
    Dictionary mapping "<group>_<kind>" (e.g. "usa_deaths") to its DataFrame
    """
    # Same iteration order as a nested loop: GROUPS outer, KINDS inner.
    return {
        f'{group}_{kind}': read_local_data(group, kind, 'data/raw')
        for group in GROUPS
        for kind in KINDS
    }

In [14]:
data = run()

In [15]:
data['usa_deaths']

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,4/11/22,4/12/22,4/13/22,4/14/22,4/15/22,4/16/22,4/17/22,4/18/22,4/19/22,4/20/22
0,84001001,US,USA,840,1001.0,Autauga,Alabama,US,32.539527,-86.644082,...,213,214,214,214,214,214,214,215,215,215
1,84001003,US,USA,840,1003.0,Baldwin,Alabama,US,30.727750,-87.722071,...,677,678,679,679,679,679,679,679,679,679
2,84001005,US,USA,840,1005.0,Barbour,Alabama,US,31.868263,-85.387129,...,98,98,98,98,98,98,98,98,98,98
3,84001007,US,USA,840,1007.0,Bibb,Alabama,US,32.996421,-87.125115,...,101,101,101,103,103,103,103,103,103,103
4,84001009,US,USA,840,1009.0,Blount,Alabama,US,33.982109,-86.567906,...,239,241,241,241,242,242,242,243,243,243
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3337,84056039,US,USA,840,56039.0,Teton,Wyoming,US,43.935225,-110.589080,...,16,16,16,16,16,16,16,16,16,16
3338,84056041,US,USA,840,56041.0,Uinta,Wyoming,US,41.287818,-110.547578,...,39,39,39,39,39,39,39,39,39,39
3339,84090056,US,USA,840,90056.0,Unassigned,Wyoming,US,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
3340,84056043,US,USA,840,56043.0,Washakie,Wyoming,US,43.904516,-107.680187,...,43,43,43,43,43,43,43,43,44,44
