In [41]:
from pathlib import Path

import pandas as pd

# URL template for one time-series CSV in the CSSEGISandData/COVID-19 repo.
# The `{kind}` and `{group}` placeholders are filled in with `str.format`.
DOWNLOAD_URL = "".join(
    [
        "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/",
        "master/csse_covid_19_data/csse_covid_19_time_series/",
        "time_series_covid19_{kind}_{group}.csv",
    ]
)


def download_data(group, kind):
    """
    Reads in a single dataset from the Johns Hopkins GitHub repo
    as a DataFrame
    
    Parameters
    ----------
    group : "world" or "usa"
    
    kind : "deaths" or "cases"
    
    Returns
    -------
    DataFrame
    
    Raises
    ------
    ValueError
        If `group` or `kind` is not one of the values listed above
    """
    # Translate the notebook's names into the tokens used in the file names.
    group_tokens = {"world": "global", "usa": "US"}
    kind_tokens = {"deaths": "deaths", "cases": "confirmed"}

    # Reject unknown values up front; otherwise a typo such as "US" would
    # silently download a different dataset than the one requested.
    if group not in group_tokens:
        raise ValueError(
            f"group must be one of {sorted(group_tokens)}, got {group!r}"
        )
    if kind not in kind_tokens:
        raise ValueError(
            f"kind must be one of {sorted(kind_tokens)}, got {kind!r}"
        )

    url = DOWNLOAD_URL.format(kind=kind_tokens[kind], group=group_tokens[group])
    return pd.read_csv(url)

In [42]:
# Try the downloader on a single dataset and preview its first rows.
usa_deaths = download_data('usa', 'deaths')
usa_deaths.head()

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,2/19/22,2/20/22,2/21/22,2/22/22,2/23/22,2/24/22,2/25/22,2/26/22,2/27/22,2/28/22
0,84001001,US,USA,840,1001.0,Autauga,Alabama,US,32.539527,-86.644082,...,184,184,184,184,184,192,194,194,194,194
1,84001003,US,USA,840,1003.0,Baldwin,Alabama,US,30.72775,-87.722071,...,635,635,635,635,636,640,640,640,640,640
2,84001005,US,USA,840,1005.0,Barbour,Alabama,US,31.868263,-85.387129,...,92,92,92,92,92,92,93,93,93,93
3,84001007,US,USA,840,1007.0,Bibb,Alabama,US,32.996421,-87.125115,...,99,99,99,99,99,99,99,99,99,99
4,84001009,US,USA,840,1009.0,Blount,Alabama,US,33.982109,-86.567906,...,216,216,216,216,216,216,217,218,218,218


In [43]:
# Values accepted for the `group` and `kind` arguments, stored as tuples.
GROUPS = ("world", "usa")
KINDS = ("deaths", "cases")

def read_all_data():
    """
    Download every group/kind combination as a DataFrame
    
    Returns
    -------
    Dictionary of DataFrames keyed by "{group}_{kind}", e.g. "usa_deaths"
    """
    # Same iteration order as a nested loop: groups outer, kinds inner.
    return {
        f'{group}_{kind}': download_data(group, kind)
        for group in GROUPS
        for kind in KINDS
    }

In [44]:
# Fetch all four datasets; each one is read from its URL, so this needs network access.
data = read_all_data()

In [45]:
# Preview the first rows of the US cases table.
data['usa_cases'].head()

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,2/19/22,2/20/22,2/21/22,2/22/22,2/23/22,2/24/22,2/25/22,2/26/22,2/27/22,2/28/22
0,84001001,US,USA,840,1001.0,Autauga,Alabama,US,32.539527,-86.644082,...,15420,15431,15436,15442,15451,15468,15479,15503,15509,15510
1,84001003,US,USA,840,1003.0,Baldwin,Alabama,US,30.72775,-87.722071,...,54734,54763,54784,54805,54837,54874,54904,54957,54972,54978
2,84001005,US,USA,840,1005.0,Barbour,Alabama,US,31.868263,-85.387129,...,5426,5429,5429,5430,5433,5436,5438,5440,5445,5445
3,84001007,US,USA,840,1007.0,Bibb,Alabama,US,32.996421,-87.125115,...,6351,6354,6355,6360,6364,6367,6369,6374,6375,6375
4,84001009,US,USA,840,1009.0,Blount,Alabama,US,33.982109,-86.567906,...,14663,14672,14682,14688,14706,14710,14734,14753,14757,14760


In [46]:
# Preview the first rows of the US deaths table.
data['usa_deaths'].head()

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,2/19/22,2/20/22,2/21/22,2/22/22,2/23/22,2/24/22,2/25/22,2/26/22,2/27/22,2/28/22
0,84001001,US,USA,840,1001.0,Autauga,Alabama,US,32.539527,-86.644082,...,184,184,184,184,184,192,194,194,194,194
1,84001003,US,USA,840,1003.0,Baldwin,Alabama,US,30.72775,-87.722071,...,635,635,635,635,636,640,640,640,640,640
2,84001005,US,USA,840,1005.0,Barbour,Alabama,US,31.868263,-85.387129,...,92,92,92,92,92,92,93,93,93,93
3,84001007,US,USA,840,1007.0,Bibb,Alabama,US,32.996421,-87.125115,...,99,99,99,99,99,99,99,99,99,99
4,84001009,US,USA,840,1009.0,Blount,Alabama,US,33.982109,-86.567906,...,216,216,216,216,216,216,217,218,218,218


Let's use this function to read in all of the data and output the head of two of the resulting DataFrames.

In [47]:
# Shorter three-row preview of the US cases table.
data['usa_cases'].head(3)

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,2/19/22,2/20/22,2/21/22,2/22/22,2/23/22,2/24/22,2/25/22,2/26/22,2/27/22,2/28/22
0,84001001,US,USA,840,1001.0,Autauga,Alabama,US,32.539527,-86.644082,...,15420,15431,15436,15442,15451,15468,15479,15503,15509,15510
1,84001003,US,USA,840,1003.0,Baldwin,Alabama,US,30.72775,-87.722071,...,54734,54763,54784,54805,54837,54874,54904,54957,54972,54978
2,84001005,US,USA,840,1005.0,Barbour,Alabama,US,31.868263,-85.387129,...,5426,5429,5429,5430,5433,5436,5438,5440,5445,5445


In [48]:
def write_data(data, directory, **kwargs):
    """
    Writes each raw data DataFrame to a file as a CSV
    
    The target directory, including any missing parents, is created
    first, so the call also works when `directory` does not exist yet.
    
    Parameters
    ----------
    data : dictionary of DataFrames; each key becomes the file name
        (without the ".csv" extension)

    directory : string name of directory to save files i.e. "data/raw"
    
    kwargs : extra keyword arguments for the `to_csv` DataFrame method
    
    Returns
    -------
    None
    """
    out_dir = Path(directory)
    # `to_csv` does not create folders itself, so make sure the target exists.
    out_dir.mkdir(parents=True, exist_ok=True)
    for name, df in data.items():
        df.to_csv(out_dir / f'{name}.csv', **kwargs)

In [49]:
# index=False is forwarded to `to_csv`, so the row index is not written to the files.
write_data(data, 'data/raw', index=False)

In [50]:
def read_local_data(group, kind, directory):
    """
    Load one previously saved CSV from disk as a DataFrame
    
    Parameters
    ----------
    group : "world" or "usa"
    
    kind : "deaths" or "cases"
    
    directory : string name of directory holding the saved files
        i.e. "data/raw"
    
    Returns
    -------
    DataFrame
    """
    # Files are named "<group>_<kind>.csv", e.g. "usa_cases.csv".
    csv_path = f'{directory}/{group}_{kind}.csv'
    return pd.read_csv(csv_path)

In [51]:
# Load one of the saved CSVs back from disk.
usa_cases = read_local_data('usa', 'cases', 'data/raw')

In [52]:
# Preview the first rows of the reloaded US cases table.
usa_cases.head()

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,2/19/22,2/20/22,2/21/22,2/22/22,2/23/22,2/24/22,2/25/22,2/26/22,2/27/22,2/28/22
0,84001001,US,USA,840,1001.0,Autauga,Alabama,US,32.539527,-86.644082,...,15420,15431,15436,15442,15451,15468,15479,15503,15509,15510
1,84001003,US,USA,840,1003.0,Baldwin,Alabama,US,30.72775,-87.722071,...,54734,54763,54784,54805,54837,54874,54904,54957,54972,54978
2,84001005,US,USA,840,1005.0,Barbour,Alabama,US,31.868263,-85.387129,...,5426,5429,5429,5430,5433,5436,5438,5440,5445,5445
3,84001007,US,USA,840,1007.0,Bibb,Alabama,US,32.996421,-87.125115,...,6351,6354,6355,6360,6364,6367,6369,6374,6375,6375
4,84001009,US,USA,840,1009.0,Blount,Alabama,US,33.982109,-86.567906,...,14663,14672,14682,14688,14706,14710,14734,14753,14757,14760


In [53]:
# Redefined here with the same values as earlier in the notebook.
GROUPS = ("world", "usa")
KINDS = ("deaths", "cases")

def run(directory='data/raw'):
    """
    Run all cleaning and transformation steps
    
    At present this only loads the saved CSV for every group/kind
    combination; no cleaning or transformation is applied to the frames.
    
    Parameters
    ----------
    directory : string name of directory holding the saved CSVs,
        defaults to "data/raw"
    
    Returns
    -------
    Dictionary of DataFrames keyed by "{group}_{kind}", e.g. "usa_deaths"
    """
    # Same iteration order as a nested loop: groups outer, kinds inner.
    return {
        f'{group}_{kind}': read_local_data(group, kind, directory)
        for group in GROUPS
        for kind in KINDS
    }

In [54]:
# Rebinds `data`: the downloaded frames are replaced by the copies read back from data/raw.
data = run()

In [55]:
# Show the US deaths table; pandas abbreviates the display for a frame this long.
data['usa_deaths']

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,2/19/22,2/20/22,2/21/22,2/22/22,2/23/22,2/24/22,2/25/22,2/26/22,2/27/22,2/28/22
0,84001001,US,USA,840,1001.0,Autauga,Alabama,US,32.539527,-86.644082,...,184,184,184,184,184,192,194,194,194,194
1,84001003,US,USA,840,1003.0,Baldwin,Alabama,US,30.727750,-87.722071,...,635,635,635,635,636,640,640,640,640,640
2,84001005,US,USA,840,1005.0,Barbour,Alabama,US,31.868263,-85.387129,...,92,92,92,92,92,92,93,93,93,93
3,84001007,US,USA,840,1007.0,Bibb,Alabama,US,32.996421,-87.125115,...,99,99,99,99,99,99,99,99,99,99
4,84001009,US,USA,840,1009.0,Blount,Alabama,US,33.982109,-86.567906,...,216,216,216,216,216,216,217,218,218,218
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3337,84056039,US,USA,840,56039.0,Teton,Wyoming,US,43.935225,-110.589080,...,15,15,15,16,16,16,16,16,16,16
3338,84056041,US,USA,840,56041.0,Uinta,Wyoming,US,41.287818,-110.547578,...,36,36,36,36,36,36,36,36,36,36
3339,84090056,US,USA,840,90056.0,Unassigned,Wyoming,US,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,18
3340,84056043,US,USA,840,56043.0,Washakie,Wyoming,US,43.904516,-107.680187,...,42,42,42,43,43,43,43,43,43,43
