# Script to Consolidate Data and Create a Day Count

In [1]:
import glob
import os

import pandas as pd

In [2]:
# Root folder of the local data checkout. Later cells read the daily report
# CSVs from its "csse_covid_19_daily_reports" sub-folder and write
# "transformed.csv" back into it. Hard-coded: edit for your own machine.
path = "D:/Dropbox/Data/R Shiny/20-03-20 Covid-19"

## Combine all the files together

In [3]:
# Collect every daily report CSV. glob returns matches in arbitrary order, so
# sort them to make the load order (and the concatenated column order)
# reproducible from run to run.
all_files = sorted(glob.glob(path + "/csse_covid_19_daily_reports/*.csv"))

li = []

# for files in folder, create a dataframe out of them
for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    # create a date field from the file name (without folder or extension);
    # os.path handles the platform's separator, whereas splitting on a
    # backslash only works for Windows-style paths
    df['date'] = os.path.splitext(os.path.basename(filename))[0]
    # add data to list
    li.append(df)

# concatenate all data (union) into final dataframe
frame = pd.concat(li, axis=0, ignore_index=True)

# Change country names for consistency.
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the chained in-place form acts on a temporary object and
# does not update `frame` when pandas Copy-on-Write is active.
country_renames = {
    "occupied Palestinian territory": "Palestine",
    "Mainland China": "China",
    "Taiwan*": "Taiwan",
    "UK": "United Kingdom",
    "The Bahamas": "Bahamas, The",
    "Iran (Islamic Republic of)": "Iran",
    "Hong Kong SAR": "Hong Kong",
    "The Gambia": "Gambia, The",
    "Viet Nam": "Vietnam",
    "Korea, South": "South Korea",
}
frame["Country/Region"] = frame["Country/Region"].replace(country_renames)

In [4]:
# aggregate numbers by country and report date (one row per country per day)
# numeric_only=True restricts the sum to numeric columns; without it, pandas
# 2.x also tries to "sum" text columns instead of silently dropping them.
# NOTE(review): every numeric column is summed, including the Latitude /
# Longitude columns selected later, so a country with several rows per day
# ends up with summed coordinates - confirm that is acceptable downstream.
country_df = (
    frame.groupby(['Country/Region', 'date'])
    .sum(numeric_only=True)
    .reset_index()
)

## Aggregate the numbers on a country level

In [5]:
country_list = []

# split the dataframe by country
# The loop variable gets its own name so that `country_df` is not overwritten
# by the last country's rows (which would break re-running this cell).
for _country, country_rows in country_df.groupby('Country/Region'):
    # Order the rows chronologically before numbering them. The 'date' values
    # are strings taken from the file names, so their default sort order is
    # lexicographic and not necessarily chronological.
    # Assumes the files are named MM-DD-YYYY - TODO confirm. Values that do
    # not parse become NaT and keep their existing relative order at the end.
    parsed_dates = pd.to_datetime(
        country_rows['date'], format='%m-%d-%Y', errors='coerce'
    )
    chronological = parsed_dates.sort_values(kind='mergesort').index
    country_rows = country_rows.loc[chronological]

    # Add a day field and number it chronologically (1 = first reported day).
    # reset_index() keeps the old row labels in an 'index' column, which a
    # later cell removes.
    country_rows = country_rows.reset_index()
    country_rows['Days'] = country_rows.index + 1

    # Day-over-day deltas of the cumulative counts; the first day has no
    # predecessor, so diff() leaves NaN there.
    country_rows['New_Deaths'] = country_rows['Deaths'].diff()
    country_rows['New_Recovered'] = country_rows['Recovered'].diff()
    country_rows['New_Confirmed'] = country_rows['Confirmed'].diff()

    # add data to list
    country_list.append(country_rows)

# concatenate all data (union) into final dataframe
frame2 = pd.concat(country_list, axis=0, ignore_index=True)

In [8]:
# Ensure the daily "new" values are never negative: a day-over-day delta
# below zero is replaced by 0. NaN entries fail the `< 0` test and are kept.
import numpy as np

for delta_col in ('New_Confirmed', 'New_Deaths', 'New_Recovered'):
    deltas = frame2[delta_col]
    frame2[delta_col] = np.where(deltas < 0, 0, deltas)

## Cleaning the data

In [9]:
# Drop the helper 'index' column that reset_index() left behind in the
# per-country loop. errors='ignore' makes this cell safe to re-run once the
# column is already gone (a plain `del` raises KeyError the second time).
frame2.drop(columns=['index'], inplace=True, errors='ignore')

In [10]:
# Pick the output columns in a reader-friendly order and replace every NaN
# with 0 (this includes the first-day deltas produced by diff()).
# NOTE(review): missing Latitude/Longitude values also become 0 - confirm
# that downstream consumers expect that.
column_order = [
    'date', 'Days', 'Country/Region',
    'Confirmed', 'New_Confirmed',
    'Deaths', 'New_Deaths',
    'Recovered', 'New_Recovered',
    'Latitude', 'Longitude',
]
final_table = frame2[column_order].fillna(0)

# Active cases for the day = confirmed minus deaths minus recovered
final_table['Active_Cases'] = (
    final_table['Confirmed']
    .sub(final_table['Deaths'])
    .sub(final_table['Recovered'])
)

In [11]:
# On a country's first day (Days == 1) there is no previous day to diff
# against, so the "new" counts are set to that day's cumulative totals.
# Done with one boolean-mask assignment per column instead of iterating over
# every row with iterrows() and writing cell by cell.
first_day = final_table['Days'] == 1
for new_col, total_col in (
    ('New_Confirmed', 'Confirmed'),
    ('New_Deaths', 'Deaths'),
    ('New_Recovered', 'Recovered'),
):
    final_table.loc[first_day, new_col] = final_table.loc[first_day, total_col]

In [13]:
#final_table[final_table['Country/Region'] == 'US']

In [14]:
# Write the consolidated table next to the source data, without the row index
output_file = path + "/transformed.csv"
final_table.to_csv(output_file, index=False)