# Script to Consolidate Data and Create a Day Count

In [35]:
import pandas as pd
import glob

In [36]:
path = "D:/Dropbox/Data/R Shiny/20-03-20 Covid-19"

## Combine all the files together

In [37]:
all_files = glob.glob(path + "/csse_covid_19_daily_reports/*.csv")

li = []

# for files in folder, create a dataframe out of them
for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    # create a date field that is generated from the file name
    df['date'] = filename.split('\\')[-1].split('.')[0]
    # add data to list
    li.append(df)

# concatinate all data (union) into final dataframe    
frame = pd.concat(li, axis=0, ignore_index=True,sort=False)

## Aggregate the numbers on a country level

In [38]:
# aggregate numbers by country
country_df = frame.groupby(['Country/Region','date']).sum().reset_index()

In [39]:
country_list = []

# split the dataframe by country
for country, country_df in country_df.groupby('Country/Region'):
    # Add a day field and number it chronologically
    country_df = country_df.reset_index()
    country_df['Days'] = country_df.index + 1
    
    # Get deltas for deaths, recovered, and then new cases using pd.diff() function
    country_df['D_delta'] = country_df['Deaths'].diff()
    country_df['R_delta'] = country_df['Recovered'].diff()
    country_df['C_delta'] = country_df['Confirmed'].diff()
    
    
    # add data to list
    country_list.append(country_df)
    
# concatinate all data (union) into final dataframe    
frame2 = pd.concat(country_list, axis=0, ignore_index=True,sort=False)

## Cleaning the data

In [40]:
# delete newly created fields
del frame2['index']

# rearrange fields for easier reading
final_table = frame2[['date','Days','Country/Region','Confirmed','C_delta','Deaths','D_delta','Recovered','R_delta']]
# Get active cases for the day
final_table['Active_Cases'] = final_table['Confirmed'] - final_table['Deaths'] - final_table['Recovered']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [47]:
# for row in dataframe
for index, row in final_table.iterrows():
    # if Days == 1
    #print(row['Days'])
    if (row['Days'] == 1):
        # C_delta == Confirmed, D_delta == Deaths, R_delta == Recovered
        row['C_delta'] = row['Confirmed']
        row['D_delta'] = row['Deaths']
        row['R_delta'] = row['Recovered']

In [48]:
final_table[final_table['Country/Region'] == 'Canada']

Unnamed: 0,date,Days,Country/Region,Confirmed,C_delta,Deaths,D_delta,Recovered,R_delta,Active_Cases
529,01-26-2020,1,Canada,1.0,,0.0,,0.0,,1.0
530,01-27-2020,2,Canada,1.0,0.0,0.0,0.0,0.0,0.0,1.0
531,01-28-2020,3,Canada,2.0,1.0,0.0,0.0,0.0,0.0,2.0
532,01-29-2020,4,Canada,2.0,0.0,0.0,0.0,0.0,0.0,2.0
533,01-30-2020,5,Canada,3.0,1.0,0.0,0.0,0.0,0.0,3.0
534,01-31-2020,6,Canada,3.0,0.0,0.0,0.0,0.0,0.0,3.0
535,02-01-2020,7,Canada,4.0,1.0,0.0,0.0,0.0,0.0,4.0
536,02-02-2020,8,Canada,4.0,0.0,0.0,0.0,0.0,0.0,4.0
537,02-03-2020,9,Canada,4.0,0.0,0.0,0.0,0.0,0.0,4.0
538,02-04-2020,10,Canada,4.0,0.0,0.0,0.0,0.0,0.0,4.0


In [49]:
final_table.to_csv(path + "/transformed.csv", index=False)