In [1]:
import pandas as pd 
import numpy as np
import os
import re

In [2]:
# You should have the https://github.com/CSSEGISandData/COVID-19.git project
# cloned in the same parent folder as this project, so that the relative path
# below resolves.
reports_dir = '../COVID-19/csse_covid_19_data/csse_covid_19_daily_reports'

# Keep only the file names that end in ".csv", in sorted order.
reports = sorted(
    report for report in os.listdir(reports_dir)
    if re.search(r"\.csv$", report)
)

# Accumulate data: read every report into a list and concatenate once.
# Growing a DataFrame with DataFrame.append inside the loop re-copies all
# previously loaded rows on each iteration, and DataFrame.append no longer
# exists in pandas 2.0+.
# NOTE(review): reports[1:] skips the first report in sorted order. This is
# kept exactly as before -- confirm that it is intentional.
daily_reports = []
for report_name in reports[1:]:
    with open(os.path.join(reports_dir, report_name)) as report_file:
        daily_report = pd.read_csv(report_file)
    daily_reports.append(daily_report)

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# (which is what the append-based loop produced when nothing was loaded).
# sort=False leaves the columns in order of first appearance; the final column
# order is fixed later by an explicit reindex.
if daily_reports:
    raw_dataframe = pd.concat(daily_reports, ignore_index=True, sort=False)
else:
    raw_dataframe = pd.DataFrame()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


In [3]:
def blend_columns(row, column_name1, column_name2):
    """Return ``row[column_name1]``, falling back to ``row[column_name2]``.

    Args:
        row: Record indexable by column name (e.g. a DataFrame row or a dict).
        column_name1: Name of the preferred column.
        column_name2: Name of the column used when the preferred value is
            missing.

    Returns:
        ``row[column_name2]`` when ``row[column_name1]`` is missing according
        to ``pd.isna``; otherwise ``row[column_name1]``.
    """
    primary = row[column_name1]
    # An explicit conditional replaces the `(a and b) or c` idiom, which threw
    # away falsy fallback values: a fallback of 0.0 was returned as NaN.
    if pd.isna(primary):
        return row[column_name2]
    return primary

def to_int(value):
    """Convert a numeric ``value`` to ``int``, mapping NaN to 0.

    Non-NaN values go through ``int()``, so any fractional part is truncated.
    """
    return 0 if np.isnan(value) else int(value)
    
def observation_date(datestr_in):
    """Reduce a timestamp string to a ``month/day/year`` date string.

    Two input layouts are recognised:

    * ``1/23/20 17:00``       -> ``1/23/2020``
    * ``2020-03-01T23:33:03`` -> ``03/01/2020``

    The month and day are passed through as written (no zero padding is added
    or removed). The year is taken from the input; a two-digit year is
    expanded to ``20xx``.

    Args:
        datestr_in: Timestamp string in one of the layouts above.

    Returns:
        The date as ``month/day/year``, or ``None`` (after printing an error
        message) when the layout is not recognised.
    """
    # The date is everything before the first 'T', comma or whitespace.
    date_str = re.split(r'[T,\s]\s*', datestr_in)[0]

    date_l = date_str.split('/')  # Format is 1/23/20
    if len(date_l) == 3:
        month, day, year = date_l
        if len(year) == 2:
            # Two-digit years are taken to be in the 2000s.
            year = '20' + year
        return '/'.join([month, day, year])

    date_l = date_str.split('-')  # Format is 2020-04-09
    if len(date_l) == 3:
        year, month, day = date_l
        return '/'.join([month, day, year])

    # Report the unparsed value; this previously referenced an undefined name
    # and raised NameError instead of printing.
    print('ERROR, the format is not known', date_str, datestr_in)
    return None

In [4]:
# Clean data

# Each pair is (column that is kept, alternate column used as its fallback):
# blend_columns fills the kept column from the alternate one where the kept
# value is missing.
fallback_pairs = [
    ('Country_Region', 'Country/Region'),
    ('Last_Update', 'Last Update'),
    ('Province_State', 'Province/State'),
    ('Lat', 'Latitude'),
    ('Long_', 'Longitude'),
]
for kept_name, fallback_name in fallback_pairs:
    raw_dataframe[kept_name] = raw_dataframe.apply(
        blend_columns, axis=1, args=(kept_name, fallback_name))

# Count columns: NaN becomes 0 and every value becomes an int.
for count_name in ('Confirmed', 'Deaths', 'Recovered', 'Active'):
    raw_dataframe[count_name] = raw_dataframe.apply(
        lambda row: to_int(row[count_name]), axis=1)

# Derive the observation date from the blended update timestamp.
raw_dataframe['ObservationDate'] = raw_dataframe.apply(
    lambda row: observation_date(row['Last_Update']),
    axis=1,
)

# Final column selection and order.
column_names = [
    'Province_State', 'Country_Region', 'ObservationDate',
    'Lat', 'Long_', 'Confirmed', 'Deaths',
    'Recovered', 'Active', 'Combined_Key',
]

# Drop the alternate/unused columns, then fix the column order.
raw_dataframe = (
    raw_dataframe
    .drop(columns=[
        'Province/State', 'Last Update', 'Last_Update', 'Country/Region',
        'Latitude', 'Longitude', 'FIPS', 'Admin2',
    ])
    .reindex(columns=column_names)
)

In [5]:
# Apply to_int to the 'Deaths' column, row by row.
# NOTE(review): the cleaning cell above already performs this same conversion,
# so this pass looks redundant -- confirm before removing it.
raw_dataframe['Deaths'] = raw_dataframe.apply(
    lambda record: to_int(record['Deaths']),
    axis='columns',
)

In [6]:
# Write the cleaned frame to a CSV file; the relative path is resolved against
# the current working directory.
raw_dataframe.to_csv(path_or_buf='covid_19_aggregation.csv')