# Script to Consolidate Data and Create a Day Count

In [17]:
import glob
import os

import pandas as pd

In [18]:
# Root folder that contains the csse_covid_19_daily_reports directory and receives
# the transformed output. The default is the original machine-specific location;
# set the COVID_DATA_DIR environment variable to run the notebook on another
# machine without editing this cell.
path = os.environ.get("COVID_DATA_DIR", "D:/Dropbox/Data/R Shiny/20-03-20 Covid-19")

## Combine all the files together

In [19]:
# Collect every daily-report CSV. glob returns matches in arbitrary order, so the
# list is sorted to make the load order deterministic.
all_files = sorted(glob.glob(path + "/csse_covid_19_daily_reports/*.csv"))

# Fail early with a clear message instead of pd.concat's generic
# "No objects to concatenate" error when the folder is missing or empty.
if not all_files:
    raise FileNotFoundError(
        "No CSV files found in " + path + "/csse_covid_19_daily_reports"
    )

li = []

# Build one dataframe per daily report file
for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    # The date field is the file name up to its first '.', without the directory.
    # os.path.basename strips the directory using the separators of the current
    # platform; splitting on '\\' only worked for Windows-style paths and left
    # the whole path in the date field everywhere else.
    df['date'] = os.path.basename(filename).split('.')[0]
    # add data to list
    li.append(df)

# Concatenate all daily reports (union of their columns) into one dataframe
frame = pd.concat(li, axis=0, ignore_index=True, sort=False)

## Aggregate the numbers on a country level

In [20]:
# Aggregate the numbers per country and report date.
# numeric_only=True restricts the sum to numeric columns. Without it, pandas >= 2.0
# also tries to "sum" any non-numeric columns of the combined frame, which either
# concatenates strings or raises on mixed values.
country_df = (
    frame
    .groupby(['Country/Region', 'date'])
    .sum(numeric_only=True)
    .reset_index()
)

In [21]:
country_list = []

# Split the aggregated frame by country. The loop variable is deliberately not
# named country_df: reusing that name replaced the aggregated frame with the last
# country's rows, so running this cell a second time produced different results.
for country, country_rows in country_df.groupby('Country/Region'):
    # Order the rows chronologically using the parsed date. The date values are
    # MM-DD-YYYY strings, whose alphabetical order is only chronological within a
    # single calendar year. Values that do not parse become NaT and sort last;
    # the stable sort keeps their existing relative order.
    report_date = pd.to_datetime(country_rows['date'], format='%m-%d-%Y', errors='coerce')
    country_rows = (
        country_rows
        .assign(_report_date=report_date)
        .sort_values('_report_date', kind='mergesort')
        .drop(columns='_report_date')
    )

    # Number the days chronologically, starting at 1 for the country's first row.
    # reset_index() keeps the previous index as an 'index' column, which the
    # cleaning cell removes afterwards.
    country_rows = country_rows.reset_index()
    country_rows['Days'] = country_rows.index + 1

    # Day-over-day change of the cumulative counts; the first row of each
    # country has no predecessor and therefore gets NaN.
    country_rows['D_delta'] = country_rows['Deaths'].diff()
    country_rows['R_delta'] = country_rows['Recovered'].diff()
    country_rows['C_delta'] = country_rows['Confirmed'].diff()

    # add data to list
    country_list.append(country_rows)

# Concatenate all per-country frames (union) into the final dataframe
frame2 = pd.concat(country_list, axis=0, ignore_index=True, sort=False)

## Cleaning the data

In [22]:
# Remove the helper 'index' column created by reset_index().
# drop(..., errors='ignore') returns a new frame and does nothing when the column
# is already gone, so this cell can be re-run; `del frame2['index']` raised
# KeyError on a second run.
frame2 = frame2.drop(columns=['index'], errors='ignore')

# Rearrange fields for easier reading. .copy() makes final_table an independent
# frame rather than a selection tied to frame2.
final_table = frame2[
    ['date', 'Days', 'Country/Region',
     'Confirmed', 'C_delta',
     'Deaths', 'D_delta',
     'Recovered', 'R_delta']
].copy()

In [23]:
# Spot-check the transformed rows for a single country
final_table.loc[final_table['Country/Region'].eq('Canada')]

Unnamed: 0,date,Days,Country/Region,Confirmed,C_delta,Deaths,D_delta,Recovered,R_delta
529,01-26-2020,1,Canada,1.0,,0.0,,0.0,
530,01-27-2020,2,Canada,1.0,0.0,0.0,0.0,0.0,0.0
531,01-28-2020,3,Canada,2.0,1.0,0.0,0.0,0.0,0.0
532,01-29-2020,4,Canada,2.0,0.0,0.0,0.0,0.0,0.0
533,01-30-2020,5,Canada,3.0,1.0,0.0,0.0,0.0,0.0
534,01-31-2020,6,Canada,3.0,0.0,0.0,0.0,0.0,0.0
535,02-01-2020,7,Canada,4.0,1.0,0.0,0.0,0.0,0.0
536,02-02-2020,8,Canada,4.0,0.0,0.0,0.0,0.0,0.0
537,02-03-2020,9,Canada,4.0,0.0,0.0,0.0,0.0,0.0
538,02-04-2020,10,Canada,4.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Write the transformed table into the data folder, without the row index
final_table.to_csv(f"{path}/transformed.csv", index=False)