# Coronavirus - Exploratory Analysis

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os, re, pickle

from pydemic import Pandemic, Outbreak

In [17]:
# `display` and `HTML` are part of the public IPython.display API; importing
# them from IPython.core.display is deprecated.
from IPython.display import display, HTML
# Inject CSS so that elements with the `.container` class use the full width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Default figure size and font settings for every plot in this notebook.
plt.rcParams['figure.figsize']=[32,18]
plt.rcParams['font.size']=22
plt.rcParams['font.weight']='bold'
plt.rcParams['axes.titlesize'] = 28
plt.rcParams['axes.labelsize'] = 24

# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# later removed the old names; use whichever name this installation provides.
whitegrid_style = 'seaborn-v0_8-whitegrid'
if whitegrid_style not in plt.style.available:
    whitegrid_style = 'seaborn-whitegrid'
plt.style.use(whitegrid_style)

In [3]:
# IPython magics: (re)load the autoreload extension, set autoreload mode 2,
# and render matplotlib figures inline in the notebook.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

## Construct Pandemic

In [4]:
# Read the three cleaned global CSV files (confirmed, deaths, recovered),
# using the first column of each file as the DataFrame index.
csv_options = {"index_col": 0}
coronavirus_confirmed_df = pd.read_csv(
    "../data/clean/coronavirus_confirmed_global.csv", **csv_options
)
coronavirus_death_df = pd.read_csv(
    "../data/clean/coronavirus_death_global.csv", **csv_options
)
coronavirus_recovered_df = pd.read_csv(
    "../data/clean/coronavirus_recovered_global.csv", **csv_options
)

# Build the Pandemic object from the three frames, in the order
# confirmed, deaths, recovered.
pandemic = Pandemic(
    "Coronavirus",
    coronavirus_confirmed_df,
    coronavirus_death_df,
    coronavirus_recovered_df,
)

### Save Outbreaks

In [5]:
# Call save() on every outbreak held by the pandemic.
# NOTE(review): where and in what format save() writes is defined in pydemic
# and is not visible in this notebook.
for outbreak in pandemic.outbreaks.values():
    outbreak.save()

## Analyze Data

### Top Regions Coverage

We have data available on 185 different regions (mostly representing countries). For a preliminary analysis, we don't necessarily want to consider all of them. We will therefore take a subset of the most affected countries that covers a large enough share of the global pandemic. As of today (22 April), the top 10 affected countries (excluding China and Iran) represent about 2/3 of cases and more than 3/4 of deaths. We consider this to be representative enough for the preliminary analysis and will proceed with this truncated dataset.

In [8]:
# Pick the top 10 regions, leaving out China and Iran, and print the share of
# cases, deaths and recoveries they cover.
# NOTE(review): the ranking criterion is defined inside get_top_regions.
top10_countries = pandemic.get_top_regions(top_n=10, exclude=["China", "Iran"])
pandemic.print_regions_coverage(top10_countries)

Regions: US, Spain, Italy, United Kingdom, France, Germany, Russia, Turkey, Brazil, Canada
Case coverage=66.26
Death coverage=78.92
Recovery coverage=58.60


In [9]:
# Same selection widened to the top 20 regions (China and Iran still
# excluded), followed by the coverage report for that set.
top20_countries = pandemic.get_top_regions(top_n=20, exclude=["China", "Iran"])
pandemic.print_regions_coverage(top20_countries)

Regions: US, Spain, Italy, United Kingdom, France, Germany, Russia, Turkey, Brazil, Canada, Peru, Belgium, India, Netherlands, Ecuador, Saudi Arabia, Switzerland, Mexico, Portugal, Sweden
Case coverage=79.77
Death coverage=92.21
Recovery coverage=77.69


In [10]:
# Top 50 regions with no exclusions (China and Iran are included here),
# followed by the coverage report for that set.
top50_countries = pandemic.get_top_regions(top_n=50)
pandemic.print_regions_coverage(top50_countries)

Regions: US, Spain, Italy, United Kingdom, France, Germany, Russia, Turkey, Brazil, Iran, China, Canada, Peru, Belgium, India, Netherlands, Ecuador, Saudi Arabia, Switzerland, Mexico, Portugal, Sweden, Pakistan, Chile, Ireland, Singapore, Belarus, Qatar, Israel, Austria, Japan, United Arab Emirates, Poland, Romania, Ukraine, Indonesia, Bangladesh, Korea, South, Denmark, Philippines, Serbia, Colombia, Dominican Republic, Norway, Czechia, South Africa, Panama, Egypt, Australia, Malaysia
Case coverage=95.04
Death coverage=98.02
Recovery coverage=94.17


In [14]:
# Fetch the outbreaks of the top-10 regions; the result is later iterated
# with .items() as (region, outbreak) pairs.
top10_outbreaks = pandemic.get_outbreaks(top10_countries)

### Smooth Curve Resampling

Let the standard sampling size be 3 days.

In [None]:
def plot_different_sampling_sizes(outbreak, ax, sampling_sizes=(1, 2, 3, 7, 14)):
    """Overlay the outbreak's smooth fatality curve for several sampling sizes.

    For each value in ``sampling_sizes`` the outbreak's
    ``smoothing_coefficient`` is set to that value and the resulting
    ``smooth_fatality_curve`` is drawn on ``ax`` with the label "<k> days".

    Args:
        outbreak: Object exposing a writable ``smoothing_coefficient`` and a
            ``smooth_fatality_curve`` attribute that has ``.values``.
        ax: Matplotlib axes the curves are drawn on.
        sampling_sizes: Iterable of smoothing coefficients to plot. An empty
            iterable draws nothing and leaves ``outbreak`` untouched.

    Note:
        ``outbreak.smoothing_coefficient`` is not restored afterwards; it is
        left at the last value of ``sampling_sizes``.
    """
    # Iterate the caller-supplied sizes (previously a hard-coded list was
    # used here, so the parameter had no effect).
    for k in sampling_sizes:
        outbreak.smoothing_coefficient = k

        sns.lineplot(data=outbreak.smooth_fatality_curve.values, label=f"{k} days", ax=ax)

In [None]:
# One subplot per top-10 outbreak, laid out on a 5x2 grid.
fig, axes = plt.subplots(nrows=5, ncols=2)

for i, (region, outbreak) in enumerate(top10_outbreaks.items()):
    # Fill the grid row by row: index i maps to (i // 2, i % 2).
    row, col = divmod(i, 2)
    ax = axes[row][col]
    ax.set_title(outbreak.region)
    plot_different_sampling_sizes(outbreak, ax)