# Preprocess Raw Data

In [21]:
library(reshape)
library(plyr)

In [22]:
# Convert a wide ("horizontal") time-series data frame into long ("vertical") format.
#
# horizontal.df:     data frame whose first four columns are Province.State,
#                    Country.Region, Lat and Long, followed by one column per day.
#                    The first day column is taken to be 2020-01-22 and every
#                    following column one day later.
# value_column_name: name given to the column that holds the daily values.
#
# Returns a data frame with the columns Province.State, Country.Region, Lat, Long,
# Date (class Date) and <value_column_name>, one row per input row and day.
horizontaldf2verticaldf <- function(horizontal.df,value_column_name){
    id_columns <- c("Province.State","Country.Region","Lat","Long")
    # Size the date sequence from the data frame that was passed in, not from a
    # global table, so that inputs with a different number of days work too.
    n_days <- ncol(horizontal.df) - length(id_columns)
    if (n_days < 1) {
        stop("horizontal.df must have at least one date column after the four id columns",
             call. = FALSE)
    }
    dates <- seq(as.Date("2020-01-22"), length.out = n_days, by = "days")
    # Use ISO date strings as column names so they can be parsed back directly
    # after melting, instead of relying on factor level codes and an origin.
    colnames(horizontal.df) <- c(id_columns, format(dates))
    vertical.df <- melt(horizontal.df, id.vars = id_columns)
    colnames(vertical.df) <- c(id_columns, "Date", value_column_name)
    vertical.df$Date <- as.Date(as.character(vertical.df$Date))
    return(vertical.df)
}

In [119]:
# Given a data frame df and a data frame of country / province pairs, append one
# placeholder row to df for every pair that is not yet present in df.
# This is used because deaths and confirmed cases are reported per region for some
# countries while recoveries are not, so the merge would otherwise discard them.
#
# df:                data frame with at least the columns Country.Region,
#                    Province.State and <value_column_name>.
# pairs:             data frame with the columns Country.Region and Province.State.
# value_column_name: name of the column that is set to NA in the added rows.
#
# Returns df with the placeholder rows appended. A placeholder is a copy of the
# first row of the same country in df with the province replaced and the value
# set to NA; if the country does not occur in df, all other columns are NA.
# NOTE(review): only a single row is added per missing pair, so the placeholder
# carries whatever the template row holds in the remaining columns - confirm
# this is sufficient for the later merge.
add_missing_rows <- function(df, pairs, value_column_name) {
    # seq_len() visits every pair (and none at all when pairs is empty)
    for (i in seq_len(nrow(pairs))) {
        country <- as.character(pairs[i, "Country.Region"])
        province <- as.character(pairs[i, "Province.State"])
        # which() ignores NA comparisons, which would otherwise select all-NA rows
        same_country <- which(df$Country.Region == country)
        pair_present <- any(df$Province.State[same_country] == province, na.rm = TRUE)
        if (!pair_present) {
            # same_country[1] is NA when the country is absent, giving an all-NA template
            new_row <- df[same_country[1], ]
            new_row[, value_column_name] <- NA
            new_row[, "Country.Region"] <- country
            new_row[, "Province.State"] <- province
            df <- rbind(df, new_row)
        }
    }
    return(df)
}

## Read data

In [23]:
# Load the Italian regional data set from the local data directory.
italy.df <- read.csv("data/Italy/dpc-covid19-ita-regioni-latest.csv")

In [24]:
# Load the three global time-series tables (confirmed, deaths, recovered).
# Each is in wide format: four identifying columns followed by one column per day.
global_confirmed.df <- read.csv("data/Global_JohnsHopkins/time_series_covid19_confirmed_global.csv")
global_deaths.df <- read.csv("data/Global_JohnsHopkins/time_series_covid19_deaths_global.csv")
global_recovered.df <- read.csv("data/Global_JohnsHopkins/time_series_covid19_recovered_global.csv")

## Check data structure

In [25]:
# Preview the first rows of the Italian data to check its columns and types.
head(italy.df)

Unnamed: 0_level_0,data,stato,codice_regione,denominazione_regione,lat,long,ricoverati_con_sintomi,terapia_intensiva,totale_ospedalizzati,isolamento_domiciliare,totale_attualmente_positivi,nuovi_attualmente_positivi,dimessi_guariti,deceduti,totale_casi,tamponi,note_it,note_en
Unnamed: 0_level_1,<fct>,<fct>,<int>,<fct>,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<lgl>,<lgl>
1,2020-03-27T17:00:00,ITA,13,Abruzzo,42.35122,13.39844,289,71,360,565,925,65,24,68,1017,6109,,
2,2020-03-27T17:00:00,ITA,17,Basilicata,40.63947,15.80515,22,15,37,110,147,14,1,3,151,1254,,
3,2020-03-27T17:00:00,ITA,4,P.A. Bolzano,46.49933,11.35662,249,45,294,539,833,42,110,60,1003,8520,,
4,2020-03-27T17:00:00,ITA,18,Calabria,38.90598,16.5944,103,22,125,344,469,97,7,18,494,6901,,
5,2020-03-27T17:00:00,ITA,15,Campania,40.83957,14.25085,456,113,569,723,1292,123,64,98,1454,9613,,
6,2020-03-27T17:00:00,ITA,8,Emilia Romagna,44.49437,11.34172,3461,308,3769,5592,9361,511,960,1267,11588,47798,,


In [26]:
# Preview the wide confirmed-cases table.
head(global_confirmed.df)

Unnamed: 0_level_0,Province.State,Country.Region,Lat,Long,X1.22.20,X1.23.20,X1.24.20,X1.25.20,X1.26.20,X1.27.20,⋯,X3.17.20,X3.18.20,X3.19.20,X3.20.20,X3.21.20,X3.22.20,X3.23.20,X3.24.20,X3.25.20,X3.26.20
Unnamed: 0_level_1,<fct>,<fct>,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,,Afghanistan,33.0,65.0,0,0,0,0,0,0,⋯,22,22,22,24,24,40,40,74,84,94
2,,Albania,41.1533,20.1683,0,0,0,0,0,0,⋯,55,59,64,70,76,89,104,123,146,174
3,,Algeria,28.0339,1.6596,0,0,0,0,0,0,⋯,60,74,87,90,139,201,230,264,302,367
4,,Andorra,42.5063,1.5218,0,0,0,0,0,0,⋯,39,39,53,75,88,113,133,164,188,224
5,,Angola,-11.2027,17.8739,0,0,0,0,0,0,⋯,0,0,0,1,2,2,3,3,3,4
6,,Antigua and Barbuda,17.0608,-61.7964,0,0,0,0,0,0,⋯,1,1,1,1,1,1,3,3,3,7


In [27]:
# Preview the wide deaths table.
head(global_deaths.df)

Unnamed: 0_level_0,Province.State,Country.Region,Lat,Long,X1.22.20,X1.23.20,X1.24.20,X1.25.20,X1.26.20,X1.27.20,⋯,X3.17.20,X3.18.20,X3.19.20,X3.20.20,X3.21.20,X3.22.20,X3.23.20,X3.24.20,X3.25.20,X3.26.20
Unnamed: 0_level_1,<fct>,<fct>,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,,Afghanistan,33.0,65.0,0,0,0,0,0,0,⋯,0,0,0,0,0,1,1,1,2,4
2,,Albania,41.1533,20.1683,0,0,0,0,0,0,⋯,1,2,2,2,2,2,4,5,5,6
3,,Algeria,28.0339,1.6596,0,0,0,0,0,0,⋯,4,7,9,11,15,17,17,19,21,25
4,,Andorra,42.5063,1.5218,0,0,0,0,0,0,⋯,0,0,0,0,0,1,1,1,1,3
5,,Angola,-11.2027,17.8739,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
6,,Antigua and Barbuda,17.0608,-61.7964,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


In [28]:
# Preview the wide recovered table.
head(global_recovered.df)

Unnamed: 0_level_0,Province.State,Country.Region,Lat,Long,X1.22.20,X1.23.20,X1.24.20,X1.25.20,X1.26.20,X1.27.20,⋯,X3.17.20,X3.18.20,X3.19.20,X3.20.20,X3.21.20,X3.22.20,X3.23.20,X3.24.20,X3.25.20,X3.26.20
Unnamed: 0_level_1,<fct>,<fct>,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,,Afghanistan,33.0,65.0,0,0,0,0,0,0,⋯,1,1,1,1,1,1,1,1,2,2
2,,Albania,41.1533,20.1683,0,0,0,0,0,0,⋯,0,0,0,0,2,2,2,10,17,17
3,,Algeria,28.0339,1.6596,0,0,0,0,0,0,⋯,12,12,32,32,32,65,65,24,65,29
4,,Andorra,42.5063,1.5218,0,0,0,0,0,0,⋯,1,1,1,1,1,1,1,1,1,1
5,,Angola,-11.2027,17.8739,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
6,,Antigua and Barbuda,17.0608,-61.7964,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


## Global
### Transform horizontal to vertical

In [128]:
# Melt each global table into long format; the value column is named after
# the metric the table holds.
global_confirmed_vertical.df <- horizontaldf2verticaldf(
    horizontal.df = global_confirmed.df,
    value_column_name = "Confirmed"
)
global_deaths_vertical.df <- horizontaldf2verticaldf(
    horizontal.df = global_deaths.df,
    value_column_name = "Deaths"
)
global_recovered_vertical.df <- horizontaldf2verticaldf(
    horizontal.df = global_recovered.df,
    value_column_name = "Recovered"
)

In [30]:
# Preview the long-format confirmed table (one row per region and date).
head(global_confirmed_vertical.df)

Unnamed: 0_level_0,Province.State,Country.Region,Lat,Long,Date,Confirmed
Unnamed: 0_level_1,<fct>,<fct>,<dbl>,<dbl>,<date>,<int>
1,,Afghanistan,33.0,65.0,2020-01-22,0
2,,Albania,41.1533,20.1683,2020-01-22,0
3,,Algeria,28.0339,1.6596,2020-01-22,0
4,,Andorra,42.5063,1.5218,2020-01-22,0
5,,Angola,-11.2027,17.8739,2020-01-22,0
6,,Antigua and Barbuda,17.0608,-61.7964,2020-01-22,0


In [31]:
# Preview the long-format recovered table.
head(global_recovered_vertical.df)

Unnamed: 0_level_0,Province.State,Country.Region,Lat,Long,Date,Recovered
Unnamed: 0_level_1,<fct>,<fct>,<dbl>,<dbl>,<date>,<int>
1,,Afghanistan,33.0,65.0,2020-01-22,0
2,,Albania,41.1533,20.1683,2020-01-22,0
3,,Algeria,28.0339,1.6596,2020-01-22,0
4,,Andorra,42.5063,1.5218,2020-01-22,0
5,,Angola,-11.2027,17.8739,2020-01-22,0
6,,Antigua and Barbuda,17.0608,-61.7964,2020-01-22,0


In [105]:
# Preview the long-format deaths table.
head(global_deaths_vertical.df)

Unnamed: 0_level_0,Province.State,Country.Region,Lat,Long,Date,Deaths
Unnamed: 0_level_1,<fct>,<fct>,<dbl>,<dbl>,<date>,<int>
218,Bermuda,United Kingdom,32.3078,-64.7505,2020-01-22,0
219,Cayman Islands,United Kingdom,19.3133,-81.2546,2020-01-22,0
220,Channel Islands,United Kingdom,49.3723,-2.3644,2020-01-22,0
221,Gibraltar,United Kingdom,36.1408,-5.3536,2020-01-22,0
222,Isle of Man,United Kingdom,54.2361,-4.5481,2020-01-22,0
223,Montserrat,United Kingdom,16.7425,-62.1874,2020-01-22,0
224,,United Kingdom,55.3781,-3.4360,2020-01-22,0
466,Bermuda,United Kingdom,32.3078,-64.7505,2020-01-23,0
467,Cayman Islands,United Kingdom,19.3133,-81.2546,2020-01-23,0
468,Channel Islands,United Kingdom,49.3723,-2.3644,2020-01-23,0


### Merge data

In [129]:
# Store the country and province keys as plain character vectors in all three
# long tables, so they can be compared and row-bound without factor levels
# getting in the way.
global_confirmed_vertical.df[c("Country.Region", "Province.State")] <-
    lapply(global_confirmed_vertical.df[c("Country.Region", "Province.State")], as.character)
global_deaths_vertical.df[c("Country.Region", "Province.State")] <-
    lapply(global_deaths_vertical.df[c("Country.Region", "Province.State")], as.character)
global_recovered_vertical.df[c("Country.Region", "Province.State")] <-
    lapply(global_recovered_vertical.df[c("Country.Region", "Province.State")], as.character)

In [130]:
# Collect every distinct country / province pair that occurs in any of the three
# global tables (a pair does not have to occur in all of them).
pairs <- unique(do.call(rbind, lapply(
    list(global_confirmed_vertical.df,
         global_deaths_vertical.df,
         global_recovered_vertical.df),
    function(vertical.df) vertical.df[,c("Country.Region", "Province.State")])))

# Give each table a placeholder row for the pairs it lacks.
global_confirmed_vertical.df <- add_missing_rows(df = global_confirmed_vertical.df,
                                                 pairs = pairs,
                                                 value_column_name = "Confirmed")
global_deaths_vertical.df <- add_missing_rows(df = global_deaths_vertical.df,
                                              pairs = pairs,
                                              value_column_name = "Deaths")
global_recovered_vertical.df <- add_missing_rows(df = global_recovered_vertical.df,
                                                 pairs = pairs,
                                                 value_column_name = "Recovered")

In [131]:
# Join the three long tables on province, country and date.
# The right fold evaluates merge(confirmed, merge(recovered, deaths)).
# NOTE(review): Lat and Long are not join keys, so the recovered and deaths
# copies end up as Lat.x / Long.x / Lat.y / Long.y next to the confirmed Lat / Long.
global_merged.df <- Reduce(
    function(left.df, right.df) {
        merge(left.df, right.df, by=c("Province.State","Country.Region","Date"))
    },
    list(global_confirmed_vertical.df,
         global_recovered_vertical.df,
         global_deaths_vertical.df),
    right = TRUE)

In [132]:
# Sanity check: is Canada still present in the merged table?
"Canada" %in% global_merged.df$Country.Region

## Italy

### Translate columns

In [34]:
# Translate the Italian column names to the English names used for the global data.
# Only the leading columns listed here are renamed; any further columns keep
# their original names instead of ending up with NA as their name.
# NOTE(review): "Confirmed" is given to the 11th source column and "Total" to the
# 15th - confirm that this matches the meaning of "Confirmed" in the global data.
italy_column_names <- c("Date","Country.Region","RegionCode","Province.State","Lat","Long","HospitalizedWSymptoms","ICU","TotalHospitalized","HomeIsolation","Confirmed","DailyConfirmed","Recovered","Deaths","Total","Tests")
colnames(italy.df)[seq_along(italy_column_names)] <- italy_column_names
# Replace the country code with the country name
italy.df$Country.Region <- "Italy"
# Keep only the calendar date of the timestamp
italy.df$Date <- as.Date(italy.df$Date)
head(italy.df)

Unnamed: 0_level_0,Date,Country.Region,RegionCode,Province.State,Lat,Long,HospitalizedWSymptoms,ICU,TotalHospitalized,HomeIsolation,Confirmed,DailyConfirmed,Recovered,Deaths,Total,Tests,NA,NA
Unnamed: 0_level_1,<date>,<chr>,<int>,<fct>,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<lgl>,<lgl>.1
1,2020-03-27,Italy,13,Abruzzo,42.35122,13.39844,289,71,360,565,925,65,24,68,1017,6109,,
2,2020-03-27,Italy,17,Basilicata,40.63947,15.80515,22,15,37,110,147,14,1,3,151,1254,,
3,2020-03-27,Italy,4,P.A. Bolzano,46.49933,11.35662,249,45,294,539,833,42,110,60,1003,8520,,
4,2020-03-27,Italy,18,Calabria,38.90598,16.5944,103,22,125,344,469,97,7,18,494,6901,,
5,2020-03-27,Italy,15,Campania,40.83957,14.25085,456,113,569,723,1292,123,64,98,1454,9613,,
6,2020-03-27,Italy,8,Emilia Romagna,44.49437,11.34172,3461,308,3769,5592,9361,511,960,1267,11588,47798,,


### Rearrange columns and drop useless columns

In [35]:
# Select the columns by name rather than by position: this puts the columns shared
# with the global data first, drops RegionCode and the trailing columns, and
# stays correct if the cell is run again or the source column order changes.
italy.df <- italy.df[,c("Province.State","Country.Region","Lat","Long","Date",
                        "Confirmed","Recovered","Deaths",
                        "HospitalizedWSymptoms","ICU","TotalHospitalized","HomeIsolation",
                        "DailyConfirmed","Total","Tests")]

### Merge with global dataset and fill with NA

In [133]:
# Stack the Italian rows on top of the merged global rows; rbind.fill (plyr)
# fills columns that exist in only one of the two tables with NA.
output.df <- rbind.fill(italy.df,global_merged.df)

### Sort data by country in alphabetic order and then by date

In [134]:
# Order the rows by country first and by date within each country, then preview.
output.df <- output.df[with(output.df, order(Country.Region, Date)), ]
head(output.df)

Unnamed: 0_level_0,Province.State,Country.Region,Lat,Long,Date,Confirmed,Recovered,Deaths,HospitalizedWSymptoms,ICU,TotalHospitalized,HomeIsolation,DailyConfirmed,Total,Tests,Lat.x,Long.x,Lat.y,Long.y
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<date>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>
22,,Afghanistan,33,65,2020-01-22,0,0,0,,,,,,,,33,65,33,65
23,,Afghanistan,33,65,2020-01-23,0,0,0,,,,,,,,33,65,33,65
24,,Afghanistan,33,65,2020-01-24,0,0,0,,,,,,,,33,65,33,65
25,,Afghanistan,33,65,2020-01-25,0,0,0,,,,,,,,33,65,33,65
26,,Afghanistan,33,65,2020-01-26,0,0,0,,,,,,,,33,65,33,65
27,,Afghanistan,33,65,2020-01-27,0,0,0,,,,,,,,33,65,33,65


In [139]:
# Write the combined table as a single serialized object (format version 2).
# Despite the .Rdata extension this file is written by saveRDS, so it has to be
# read back with readRDS() rather than load().
saveRDS(output.df, "COVID19_Global_Italy.Rdata", version = 2)

## Preprocess extra country data

In [205]:
# Load the per-country restriction records and the per-country background information.
restrictions.df <- read.csv("data/Kaggle_CountryInfo/restrictions.csv")
countryinfo.df <- read.csv("data/Kaggle_CountryInfo/covid19countryinfo.csv")

In [200]:
# Preview the restriction records.
head(restrictions.df)

Unnamed: 0_level_0,country_region,date,type,limit,mandatory,notes
Unnamed: 0_level_1,<fct>,<fct>,<fct>,<int>,<fct>,<fct>
1,Alabama,3/18/2020,Schools,0,Yes,
2,Alabama,3/20/2020,Public Places,0,Yes,
3,Alaska,3/16/2020,Schools,0,Yes,
4,Alaska,3/18/2020,Public Places,0,Yes,
5,Alaska,3/24/2020,Gatherings,10,Yes,
6,Argentina,3/20/2020,Stay at Home,0,Yes,


In [201]:
# Preview the country information table.
head(countryinfo.df)

Unnamed: 0_level_0,country,pop,tests,testpop,density,medianage,urbanpop,quarantine,schools,restrictions,⋯,sex0,sex14,sex25,sex54,sex64,sex65plus,sexratio,lung,femalelung,malelung
Unnamed: 0_level_1,<fct>,<fct>,<int>,<dbl>,<int>,<int>,<int>,<fct>,<fct>,<fct>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,Afghanistan,38928346,,,60,18,25,,,,⋯,1.05,1.03,1.03,1.03,0.97,0.85,1.03,37.62,36.31,39.33
2,Albania,2877797,,,105,36,63,,,,⋯,1.08,1.11,1.09,0.93,0.95,0.87,0.98,11.67,7.02,17.04
3,Algeria,43851044,,,18,29,73,,,,⋯,1.05,1.05,1.05,1.03,1.01,0.89,1.03,8.77,5.03,12.81
4,Andorra,77265,,,164,45,88,,,,⋯,1.07,1.05,1.08,1.05,1.15,1.02,1.06,,,
5,Antigua and Barbuda,97929,,,223,34,26,,,,⋯,1.05,1.03,0.99,0.84,0.82,0.76,0.9,11.76,7.67,18.78
6,Argentina,45195774,,,17,32,93,3/20/2020,,,⋯,1.05,1.06,1.05,1.0,0.94,0.71,0.98,29.27,20.16,42.59


### Rename columns for consistency

In [206]:
# Use the same key column name (Country.Region) as the case data, and
# capitalised names for the remaining restriction columns.
colnames(restrictions.df) <- c(
    "Country.Region",
    "Date",
    "Type",
    "Limit",
    "Mandatory",
    "Notes"
)
colnames(countryinfo.df)[1] <- "Country.Region"

### Remove unused columns
We will have this data after we merge it with restrictions.df

In [213]:
# Drop the columns that are not needed: the restriction summary columns of the
# country table and the detail columns of the restrictions table.
countryinfo.df <- countryinfo.df[
    !(names(countryinfo.df) %in% c("quarantine", "schools", "restrictions"))]
restrictions.df <- restrictions.df[
    !(names(restrictions.df) %in% c("Mandatory", "Notes", "Limit"))]

### Reshape the restrictions to wide format
We want one line per country

In [214]:
# Pivot the restrictions to one row per Country.Region, with the remaining
# column spread into one column per restriction Type.
# When a country has several rows of the same Type, reshape keeps only the
# first and warns about it.
restrictions_wide.df <- reshape(restrictions.df, idvar = c("Country.Region"), timevar = "Type", direction = "wide")

“multiple rows match for Type=Gatherings: first taken”


### Merge country extra data and lockdown dates

In [215]:
# Join the wide restriction dates with the country information on Country.Region.
# merge() with its default settings keeps only keys present in both tables.
country_info_output.df <- merge(restrictions_wide.df, countryinfo.df, by=c("Country.Region"))

Country.Region,Date.Schools,Date.Public Places,Date.Gatherings,Date.Stay at Home,Date.Lockdown,Date.Non-essential,pop,tests,testpop,⋯,sex0,sex14,sex25,sex54,sex64,sex65plus,sexratio,lung,femalelung,malelung
<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<int>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Alabama,3/18/2020,3/20/2020,,,,,,,,⋯,,,,,,,,,,
Alaska,3/16/2020,3/18/2020,3/24/2020,,,,,,,⋯,,,,,,,,,,
Argentina,,,,3/20/2020,,,45195774,,,⋯,1.05,1.06,1.05,1.00,0.94,0.71,0.98,29.27,20.16,42.59
Arizona,3/16/2020,3/20/2020,,,,,,,,⋯,,,,,,,,,,
Arkansas,3/17/2020,,,,,,,,,⋯,,,,,,,,,,
Atlanta,,3/19/2020,,,,,,,,⋯,,,,,,,,,,
Australia,,3/23/2020,,,,,25499884,31635,806.0656,⋯,1.06,1.06,1.09,0.99,0.93,0.86,0.99,18.79,15.90,22.16
Austria,,,,3/16/2020,,,9006398,10278,876.2792,⋯,1.05,1.05,1.04,1.00,0.98,0.76,0.96,17.02,13.02,22.14
Belgium,,,,3/18/2020,,,11589623,4225,2743.1060,⋯,1.05,1.05,1.04,1.02,0.98,0.76,0.97,27.11,21.20,34.98
California,3/12/2020,3/17/2020,,3/19/2020,,,,,,⋯,,,,,,,,,,


In [None]:
# Write the merged country information as a single serialized object (format
# version 2); it has to be read back with readRDS() despite the .Rdata extension.
saveRDS(country_info_output.df, "COVID19_Country_Info.Rdata", version = 2)