# Coronavirus - Global Data Cleaning

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os, re, pickle

In [2]:
# ``display`` and ``HTML`` belong to the public ``IPython.display`` API.
# Importing them from ``IPython.core.display`` has been deprecated since
# IPython 7.14 and no longer works on recent IPython releases.
from IPython.display import display, HTML

# Inject a CSS rule that makes the notebook container span the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [13]:
# Load (or reload) the autoreload extension and use mode 2, so edits to
# imported local modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## Load Data

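Load the data on the Covid-19 epidemic from the `CSSEGISandData` repository. The cell below reads the snapshots stored under `../data/raw/`, which can optionally be refreshed with the most recent data available from the repository.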

There are three reports that are interesting to us: *confirmed cases*, *fatalities* and *recovered cases*.
All the cases reported for a specific country/region (across its different provinces/states) are summed together, so that our data has country-level granularity.

In [6]:
# Set to True to re-download the latest reports from the CSSEGISandData
# repository and overwrite the local snapshots under ../data/raw/ before
# loading them. With False, only the existing local snapshots are read.
REFRESH_FROM_SOURCE = False

# URL template of the global time-series reports; ``{report}`` is the report
# name used by the repository ("confirmed", "deaths" or "recovered").
CSSE_TIME_SERIES_URL = (
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/"
    "csse_covid_19_data/csse_covid_19_time_series/"
    "time_series_covid19_{report}_global.csv"
)

# Path template of the local raw snapshots; ``{report}`` is the local report
# name ("confirmed", "death" or "recovered").
RAW_SNAPSHOT_PATH = "../data/raw/coronavirus_{report}_global.csv"


def load_global_report(remote_name, local_name, refresh=False):
    """Load one country-level cumulative report as a DataFrame.

    Parameters
    ----------
    remote_name : str
        Report name in the CSSEGISandData file name (e.g. ``"deaths"``).
    local_name : str
        Report name in the local snapshot file name (e.g. ``"death"``).
    refresh : bool, default False
        When True, download the report, drop the ``Lat``/``Long``/
        ``Province/State`` columns, sum the rows of each ``Country/Region``,
        transpose the result and overwrite the local snapshot with it.

    Returns
    -------
    pandas.DataFrame
        The local snapshot, read with its first column as the index.
    """
    raw_path = RAW_SNAPSHOT_PATH.format(report=local_name)
    if refresh:
        country_level = (
            pd.read_csv(CSSE_TIME_SERIES_URL.format(report=remote_name))
            .drop(["Lat", "Long", "Province/State"], axis=1)
            .groupby("Country/Region")
            .sum()
            .transpose()
        )
        country_level.to_csv(raw_path)
    # Always read back from disk so that the refreshed and the cached code
    # paths hand over exactly the same frame layout.
    return pd.read_csv(raw_path, index_col=0)


coronavirus_confirmed_df = load_global_report("confirmed", "confirmed", refresh=REFRESH_FROM_SOURCE)
coronavirus_death_df = load_global_report("deaths", "death", refresh=REFRESH_FROM_SOURCE)
coronavirus_recovered_df = load_global_report("recovered", "recovered", refresh=REFRESH_FROM_SOURCE)

## Remove Reclassifications

Within the data, there are inconsistencies where the cumulative number of cases drops from one day to the next due to re-classifications in the data. To address this, we correct the data by taking the cumulative minimum of cases in reverse chronological order, so as to ensure the monotonicity of all of our cumulative curves.

In [7]:
def verify_monotonicity(df):
    """Check that every column of ``df`` is monotonically non-decreasing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are checked independently, from first row to last.

    Returns
    -------
    bool-like
        True only when no column ever decreases from one row to the next.
    """
    # ``Series.is_monotonic`` was deprecated in pandas 1.5 and removed in 2.0;
    # ``is_monotonic_increasing`` is the equivalent, still-supported property.
    return df.apply(lambda column: column.is_monotonic_increasing).all()

def consider_reclassifications(df):
    """Make every column non-decreasing by capping values at later minima.

    The rows are walked from last to first while taking a running minimum,
    so each value is replaced by the smallest value found at or after its
    own row. The result is returned in the original row order; ``df``
    itself is left untouched.
    """
    last_row_first = df.iloc[::-1]
    capped_by_later_values = last_row_first.cummin()
    return capped_by_later_values.iloc[::-1]

#### Confirmed Cases

In [8]:
# Check whether every column of the confirmed-cases frame is already monotonic.
verify_monotonicity(coronavirus_confirmed_df)

False

In [9]:
# Apply the reverse cumulative-minimum correction to the confirmed cases,
# then re-run the monotonicity check on the corrected frame.
coronavirus_confirmed_df = consider_reclassifications(coronavirus_confirmed_df)
verify_monotonicity(coronavirus_confirmed_df)

True

#### Fatalities

In [10]:
# Check whether every column of the fatalities frame is already monotonic.
verify_monotonicity(coronavirus_death_df)

True

#### Recovered Cases

In [11]:
# Check whether every column of the recovered-cases frame is already monotonic.
verify_monotonicity(coronavirus_recovered_df)

False

In [12]:
# Apply the reverse cumulative-minimum correction to the recovered cases,
# then re-run the monotonicity check on the corrected frame.
coronavirus_recovered_df = consider_reclassifications(coronavirus_recovered_df)
verify_monotonicity(coronavirus_recovered_df)

True

## Save Clean Data

Now that we've appropriately treated and cleaned our data, we can store it in CSV files for further analysis. We will store the daily data (day-over-day differences) as opposed to the cumulative data, as our analysis will need the data in that format.

In [14]:
# Each cumulative frame paired with the CSV file that receives its daily values.
clean_outputs = [
    (coronavirus_confirmed_df, "../data/clean/coronavirus_confirmed_global.csv"),
    (coronavirus_death_df, "../data/clean/coronavirus_death_global.csv"),
    (coronavirus_recovered_df, "../data/clean/coronavirus_recovered_global.csv"),
]

for cumulative_df, clean_path in clean_outputs:
    # Row-to-row differences turn cumulative totals into per-row increments;
    # the first row has no predecessor, so its NaN is replaced by 0.
    daily_df = cumulative_df.diff(axis=0).fillna(0)
    daily_df.to_csv(clean_path)