# Import libraries and read data

In [None]:
import pandas as pd
import numpy as np

import plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.express as px

import plotly.graph_objects as go
from plotly.graph_objects import Bar

init_notebook_mode(connected=True)
plotly.offline.init_notebook_mode(connected=True)

from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
import squarify

sns.set()

In [None]:
# global data
confirmed_global_path = 'COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
deaths_global_path = 'COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
recovered_global_path = 'COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv'

# U.S. data
confirmed_us_path = 'COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv'
deaths_us_path = 'COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv'

In [None]:
# global data
confirmed_global_df = pd.read_csv(confirmed_global_path)
deaths_global_df = pd.read_csv(deaths_global_path)
recovered_global_df = pd.read_csv(recovered_global_path)

# U.S data
confirmed_us_df = pd.read_csv(confirmed_us_path)
deaths_us_df = pd.read_csv(deaths_us_path)

# Helper functions

In [None]:
def save_fig_as_div(fig_obj, file_name):
    """Render a Plotly figure as an HTML ``<div>`` string and save it under ``assets/``.

    Args:
        fig_obj: Figure (or figure data) passed to ``plotly.offline.plot``.
        file_name: Name of the file to write inside the ``assets`` directory.
    """
    # Render first: if rendering raises, the target file is never opened, so an
    # existing asset is not truncated and no empty file is left behind.
    fig_div_string = plotly.offline.plot(figure_or_data=fig_obj, output_type='div',
                                         include_plotlyjs='cdn')
    # Explicit encoding so the output does not depend on the platform default.
    with open(f'assets/{file_name}', 'w', encoding='utf-8') as fig_file:
        fig_file.write(fig_div_string)

In [None]:
def get_n_color_palette(palette_name, n_colors, as_hex=False):
    """Return a seaborn palette of ``n_colors`` colors in reversed order.

    Args:
        palette_name: Palette name passed to ``sns.color_palette``.
        n_colors: Number of colors to request.
        as_hex: When true, the palette is converted with ``as_hex()`` before
            being reversed.
    """
    colors = sns.color_palette(palette=palette_name, n_colors=n_colors)
    colors = colors.as_hex() if as_hex else colors
    # Reverse in place so the returned object keeps the palette type.
    colors.reverse()
    return colors

# Get to know the data:

## Globally confirmed cases:

In [None]:
confirmed_global_df.head()

In [None]:
confirmed_global_df.shape

## Globally deaths:

In [None]:
deaths_global_df.head()

In [None]:
deaths_global_df.shape

## Globally recovered cases:

In [None]:
recovered_global_df.head()

In [None]:
recovered_global_df.shape

## U.S. confirmed cases:

In [None]:
confirmed_us_df.head()

In [None]:
confirmed_us_df.shape

## U.S. deaths:

In [None]:
deaths_us_df.head()

In [None]:
deaths_us_df.shape

## Summary:

The three datasets share the same structure, and have the same fields, one for **confirmed** cases, one for **deaths** cases, and one for **recovered** cases.
Fields description:
* Province/State: China - province name; US/Canada/Australia/ - city name, state/province name; Others - name of the event (e.g., "Diamond Princess" cruise ship); other countries - blank.
* Country/Region: country/region name conforming to WHO (will be updated).
* Lat and Long: a coordinates reference for the user.
* Date fields: M/DD/YYYY (UTC), the *cumulative* number of cases up until this date.

It can be seen that the `U.S.` dataset differs from the `global` one, as it has these additional columns: UID, iso2, iso3, code3, FIPS (we'll get to them later)

# Data Preprocessing:

In [None]:
# global data
print(f'Globally confirmed cases: {confirmed_global_df.shape}')
print(f'Globally deaths: {deaths_global_df.shape}')
print(f'Globally recovered cases: {recovered_global_df.shape}')

# U.S. data
print(f'U.S. Confirmed cases: {confirmed_us_df.shape}')
print(f'U.S. deaths: {deaths_us_df.shape}')

Although the three global datasets have the same structure, they are inconsistent: the shape of the **recovered** dataset differs from the other two, so I think we should drop it.

## 1 - Drop irrelevant columns:
The `Lat` and `Long` columns specify the coordinates of each record; we won't need this information right now.

In [None]:
def drop_irrelevant_columns(df, irrelevant_columns):
    """Return a copy of ``df`` without the given columns.

    Args:
        df: Source DataFrame; it is not modified.
        irrelevant_columns: Column label or list of labels to drop.
    """
    return df.drop(columns=irrelevant_columns)

In [None]:
irrelevant_columns = ['Lat', 'Long']

In [None]:
confirmed_global_df = drop_irrelevant_columns(confirmed_global_df, irrelevant_columns)
deaths_global_df = drop_irrelevant_columns(deaths_global_df, irrelevant_columns)
recovered_global_df = drop_irrelevant_columns(recovered_global_df, irrelevant_columns)

In [None]:
confirmed_global_df.head()

In [None]:
deaths_global_df.head()

In [None]:
recovered_global_df.head()

Also, drop the columns `UID`, `iso2`, `iso3`, `code3`, `FIPS`, `Admin2`, `Lat`, `Long_`, and `Combined_Key` from the `U.S.` data:

In [None]:
irrelevant_columns = ['UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Admin2', 'Lat', 'Long_', 'Combined_Key']

In [None]:
confirmed_us_df = drop_irrelevant_columns(confirmed_us_df, irrelevant_columns)
deaths_us_df = drop_irrelevant_columns(deaths_us_df, irrelevant_columns)

In [None]:
confirmed_us_df.head()

In [None]:
deaths_us_df.head()

Only the deaths data has the `Population` column, but it should be the same for the confirmed data as well.

In [None]:
confirmed_us_df.insert(2, 'Population', deaths_us_df['Population'])

In [None]:
confirmed_us_df.head()

## 2 - Normalize columns' names:
Throughout this notebook, we will be querying the columns `Province/State` and `Country/Region` a lot, so it is better to rename them to something like `State` and `Country` respectively.

In [None]:
def rename_columns(df, columns_mapping):
    """Return a copy of ``df`` with columns renamed according to ``columns_mapping``.

    Args:
        df: Source DataFrame; it is not modified.
        columns_mapping: Dict of ``{old_name: new_name}``; keys that are not
            columns of ``df`` are ignored.
    """
    return df.rename(columns=columns_mapping)

In [None]:
columns_mapping = {
    'Province/State': 'State',
    'Province_State': 'State',
    'Country/Region': 'Country',
    'Country_Region': 'Country'
}

In [None]:
# global data
confirmed_global_df = rename_columns(confirmed_global_df, columns_mapping)
deaths_global_df = rename_columns(deaths_global_df, columns_mapping)
recovered_global_df = rename_columns(recovered_global_df, columns_mapping)

# U.S. data
confirmed_us_df = rename_columns(confirmed_us_df, columns_mapping)
deaths_us_df = rename_columns(deaths_us_df, columns_mapping)

### Global data:

In [None]:
confirmed_global_df.head()

In [None]:
deaths_global_df.head()

In [None]:
recovered_global_df.head()

### U.S. data:

In [None]:
confirmed_us_df.head()

In [None]:
deaths_us_df.head()

## 3 - Drop `State` column from the global data:
Most records of the global data don't have a `State` value, so it's better to drop the column.

In [None]:
confirmed_global_df = drop_irrelevant_columns(confirmed_global_df, 'State')
deaths_global_df = drop_irrelevant_columns(deaths_global_df, 'State')
recovered_global_df = drop_irrelevant_columns(recovered_global_df, 'State')

In [None]:
confirmed_global_df.head()

In [None]:
deaths_global_df.head()

In [None]:
recovered_global_df.head()

## 4 - Aggregate data by countries for global data:
Aggregate the number of confirmed cases, deaths, and recovered cases on each day for each country.

aggregate global data:

In [None]:
def apply_aggregation(df, by_column):
    """Sum the columns of ``df`` within each group of ``by_column``.

    Args:
        df: DataFrame to aggregate.
        by_column: Column label or list of labels to group by.

    Returns:
        A DataFrame with one row per group; the grouping keys are returned as
        regular columns rather than as the index.
    """
    grouped = df.groupby(by=by_column)
    totals = grouped.sum()
    return totals.reset_index()

In [None]:
by_column = 'Country'

In [None]:
# global
confirmed_global_agg_df = apply_aggregation(confirmed_global_df, by_column)
deaths_global_agg_df = apply_aggregation(deaths_global_df, by_column)
recovered_global_agg_df = apply_aggregation(recovered_global_df, by_column)

In [None]:
confirmed_global_agg_df.head()

In [None]:
deaths_global_agg_df.head()

In [None]:
recovered_global_agg_df.head()

aggregate U.S. data:

In [None]:
by_columns = ['Country', 'State']

In [None]:
confirmed_us_agg_df = apply_aggregation(confirmed_us_df, by_columns)
deaths_us_agg_df = apply_aggregation(deaths_us_df, by_columns)

In [None]:
confirmed_us_agg_df.head()

In [None]:
deaths_us_agg_df.head()

## 5 - create `Population` dataframe for the `U.S.` data:

Later, we will convert the data to time series structure, and the value of `Population` shouldn't be in a time series data, as it doesn't change over time, it's rather a characteristic of the state.

In [None]:
us_population_df = confirmed_us_agg_df[['Country', 'State', 'Population']]

In [None]:
confirmed_us_agg_df = drop_irrelevant_columns(confirmed_us_agg_df, ['Population'])
deaths_us_agg_df = drop_irrelevant_columns(deaths_us_agg_df, ['Population'])

In [None]:
confirmed_us_agg_df.head()

In [None]:
deaths_us_agg_df.head()

## 5 - Restructure the Data:
convert the data into an easier structure, add column `date`, and change the columns to rows (for easier manipulation)

In [None]:
# Collect the date-named columns (e.g. 1/22/20). The pattern is a raw string so
# that `\d` is not read as an (invalid) string escape sequence by Python.
date_columns = confirmed_global_agg_df.filter(regex=r'\d{1,2}/\d{1,2}/\d{1,4}').columns.values

In [None]:
def construct_date_df(df, date_column_name, copy_columns, value_column_name):
    """Build the long-format frame for a single date column of ``df``.

    Args:
        df: Wide DataFrame holding one column per date.
        date_column_name: Label of the date column to extract; it is also used
            as the constant value of the ``date`` column in the result.
        copy_columns: Labels of the columns copied over unchanged.
        value_column_name: Name given to the extracted values in the result.

    Returns:
        A DataFrame with columns ``date``, the copied columns, and
        ``value_column_name``.
    """
    columns = {
        'date': date_column_name,
        **{name: df[name] for name in copy_columns},
        value_column_name: df[date_column_name],
    }
    return pd.DataFrame(data=columns)

In [None]:
# global data
confirmed_global_date_frames = [construct_date_df(confirmed_global_agg_df, date_column, ['Country'], 'confirmed') for date_column in date_columns]
deaths_global_date_frames = [construct_date_df(deaths_global_agg_df, date_column, ['Country'], 'deaths') for date_column in date_columns]
recovered_global_date_frames = [construct_date_df(recovered_global_agg_df, date_column, ['Country'], 'recovered') for date_column in date_columns]

# U.S. data
confirmed_us_date_frames = [construct_date_df(confirmed_us_agg_df, date_column, ['Country', 'State'], 'confirmed') for date_column in date_columns]
deaths_us_date_frames = [construct_date_df(deaths_us_agg_df, date_column, ['Country', 'State'], 'deaths') for date_column in date_columns]

In [None]:
# global data
confirmed_global_time_series = pd.concat(confirmed_global_date_frames)
deaths_global_time_series = pd.concat(deaths_global_date_frames)
recovered_global_time_series = pd.concat(recovered_global_date_frames)

# U.S. data
confirmed_us_time_series = pd.concat(confirmed_us_date_frames)
deaths_us_time_series = pd.concat(deaths_us_date_frames)

In [None]:
confirmed_global_time_series.head()

In [None]:
deaths_global_time_series.head()

In [None]:
recovered_global_time_series.head()

In [None]:
confirmed_us_time_series.head()

In [None]:
deaths_us_time_series.head()

in the rest of the notebook, we will focus on the following five dataframes:
- `confirmed_global_time_series`: time series of the global confirmed cases.
- `deaths_global_time_series`: time series of the global deaths.
- `recovered_global_time_series`: time series of the global recovered cases.
- `confirmed_us_time_series`: time series of U.S. confirmed cases.
- `deaths_us_time_series`: time series of U.S. deaths.

A *time series* shows the change in a *statistical variable* over *time*.


Each row in the previous time series is a tuple of date, country, and measure, the measure represents the cumulative sum in the country, up until the date.

## 6 - Focus on *Outbreaks*:
In this notebook we will focus only on countries with high **death tolls**; countries that are barely affected by the pandemic will be discarded.

In [None]:
most_n_countries = 15

In [None]:
last_day = deaths_global_time_series['date'].values[-1]

In [None]:
most_affected_countries = deaths_global_time_series[deaths_global_time_series['date'] == last_day]\
                                                    .sort_values(by='deaths', ascending=False)\
                                                    [:most_n_countries]

In [None]:
most_affected_countries

In [None]:
most_affected_countries_names = most_affected_countries['Country'].values

Create time series for most affected countries:

In [None]:
# Keep only the most affected countries. .copy() makes this an independent
# frame: later cells overwrite its 'date' column and sort it in place, which
# on a filtered slice raises SettingWithCopyWarning.
most_deaths_time_series = deaths_global_time_series[
    deaths_global_time_series['Country'].isin(most_affected_countries_names)
].copy()

In [None]:
most_deaths_time_series.head()

In [None]:
# Keep only the most affected countries. .copy() makes this an independent
# frame: later cells overwrite its 'date' column and sort it in place, which
# on a filtered slice raises SettingWithCopyWarning.
most_confirmed_time_series = confirmed_global_time_series[
    confirmed_global_time_series['Country'].isin(most_affected_countries_names)
].copy()

In [None]:
most_confirmed_time_series.head()

In [None]:
# Keep only the most affected countries. .copy() makes this an independent
# frame: later cells overwrite its 'date' column and sort it in place, which
# on a filtered slice raises SettingWithCopyWarning.
most_recovered_time_series = recovered_global_time_series[
    recovered_global_time_series['Country'].isin(most_affected_countries_names)
].copy()

In [None]:
most_recovered_time_series.head()

In [None]:
most_recovered_time_series['date'] = pd.to_datetime(most_recovered_time_series['date'], utc=False)

In [None]:
most_recovered_time_series.sort_values(by='date', inplace=True)

In [None]:
most_recovered_time_series.head()

# Data Visualization:

## First infection date

In [None]:
most_confirmed_time_series['date'] = pd.to_datetime(most_confirmed_time_series['date'], utc=False)

In [None]:
most_confirmed_time_series.sort_values(by='date', inplace=True)

In [None]:
most_confirmed_time_series.head()

In [None]:
first_infection_date_df = most_confirmed_time_series[most_confirmed_time_series['confirmed'] > 0]\
                                                    .groupby('Country')\
                                                    .first()\
                                                    .reset_index()\
                                                    .sort_values(by='date')

In [None]:
first_infection_date_df

In [None]:
# Style the first-infection table and format the date column as YYYY-MM-DD.
first_infection_styler = first_infection_date_df.style.set_properties(**{'background-color': 'white',
                           'color': 'black',
                           'border-color': 'white',
                           'border-width': '350px'})\
                        .format({'date': "{:%Y-%m-%d}"})
# Styler.hide_index() was removed in pandas 2.0 in favour of Styler.hide(axis='index');
# call whichever one the installed pandas provides.
(first_infection_styler.hide(axis='index') if hasattr(first_infection_styler, 'hide')
 else first_infection_styler.hide_index())

In [None]:
# Style the first-infection table and format the date column as YYYY-MM-DD.
first_infection_styler = first_infection_date_df.style.set_properties(**{'background-color': 'white',
                           'color': 'black',
                           'border-color': 'white',
                           'border-width': '350px'})\
                        .format({'date': "{:%Y-%m-%d}"})
# Styler.hide_index() and Styler.render() were removed in pandas 2.0; prefer their
# replacements (hide(axis='index'), to_html()) and fall back on older pandas.
first_infection_styler = (first_infection_styler.hide(axis='index')
                          if hasattr(first_infection_styler, 'hide')
                          else first_infection_styler.hide_index())
html_string = (first_infection_styler.to_html()
               if hasattr(first_infection_styler, 'to_html')
               else first_infection_styler.render())

In [None]:
with open('assets/first_infection_date_table.html', 'w') as html_file:
    html_file.write(html_string)

## First death date

In [None]:
most_deaths_time_series['date'] = pd.to_datetime(most_deaths_time_series['date'], utc=False)

In [None]:
most_deaths_time_series.sort_values(by='date', inplace=True)

In [None]:
most_deaths_time_series.head()

In [None]:
first_death_date_df = most_deaths_time_series[most_deaths_time_series['deaths'] > 0]\
                                                .groupby(by='Country')\
                                                .first()\
                                                .reset_index()\
                                                .sort_values(by='date')

In [None]:
first_death_date_df

In [None]:
# Style the first-death table and format the date column as YYYY-MM-DD.
first_death_styler = first_death_date_df.style.set_properties(**{'background-color': 'white',
                           'color': 'black',
                           'border-color': 'white',
                           'border-width': '350px'})\
                        .format({'date': "{:%Y-%m-%d}"})
# Styler.hide_index() was removed in pandas 2.0 in favour of Styler.hide(axis='index');
# call whichever one the installed pandas provides.
(first_death_styler.hide(axis='index') if hasattr(first_death_styler, 'hide')
 else first_death_styler.hide_index())

In [None]:
# Style the first-death table and format the date column as YYYY-MM-DD.
first_death_styler = first_death_date_df.style.set_properties(**{'background-color': 'white',
                           'color': 'black',
                           'border-color': 'white',
                           'border-width': '350px'})\
                        .format({'date': "{:%Y-%m-%d}"})
# Styler.hide_index() and Styler.render() were removed in pandas 2.0; prefer their
# replacements (hide(axis='index'), to_html()) and fall back on older pandas.
first_death_styler = (first_death_styler.hide(axis='index')
                      if hasattr(first_death_styler, 'hide')
                      else first_death_styler.hide_index())
html_string = (first_death_styler.to_html()
               if hasattr(first_death_styler, 'to_html')
               else first_death_styler.render())

In [None]:
with open('assets/first_death_date_table.html', 'w') as html_file:
    html_file.write(html_string)

## Line Chart:

### Deaths trajectory

In [None]:
fig = px.line(data_frame=most_deaths_time_series, x='date', y='deaths', color='Country',
              line_group='Country')

fig.update_layout(autosize=False,
                  width=800, height=600,
                  title_text='<i><b>Deaths trajectory</b></i>',
                  xaxis_title="Date", yaxis_title="Number of deaths",
                  plot_bgcolor='rgba(0, 0, 0, 0)',
                  paper_bgcolor= 'rgba(0, 0, 0, 0)',
                  font={
                      'family': "Courier New, monospace",
                      'size': 14,
                      'color': "#eaeaea"
                  }
                 )

fig.show()

### Infections trajectory

In [None]:
fig = px.line(data_frame=most_confirmed_time_series, x='date', y='confirmed', color='Country',
              line_group='Country')

fig.update_layout(autosize=False,
                  width=800, height=600,
                  title_text='<i><b>Infections trajectory</b></i>',
                  xaxis_title="Date", yaxis_title="Number of infections",
                  plot_bgcolor='rgba(0, 0, 0, 0)',
                  paper_bgcolor= 'rgba(0, 0, 0, 0)',
                  font={
                      'family': "Courier New, monospace",
                      'size': 14,
                      'color': "#eaeaea"
                  }
                 )

fig.show()

### Recovery trajectory:

In [None]:
fig = px.line(data_frame=most_recovered_time_series, x='date', y='recovered', color='Country',
              line_group='Country')

fig.update_layout(autosize=False,
                  width=800, height=600,
                  title_text='<i><b>Recovery trajectory</b></i>',
                  xaxis_title="Date", yaxis_title="Number of recovered cases",
                  plot_bgcolor='rgba(0, 0, 0, 0)',
                  paper_bgcolor= 'rgba(0, 0, 0, 0)',
                  font={
                      'family': "Courier New, monospace",
                      'size': 14,
                      'color': "#eaeaea"
                  }
                 )

fig.show()

## Choropleth Map:

In [None]:
fig = px.choropleth(data_frame=deaths_global_time_series, locations='Country',
                    locationmode='country names', color='deaths',
                    hover_name='deaths', animation_frame='date',
                    color_continuous_scale=px.colors.sequential.Plasma)

fig.update_layout(autosize=False,
                  width=800, height=600,
                  title_text='<i><b>Deaths through time</b></i>',
                  xaxis_title="Measures", yaxis_title="Class",
                  plot_bgcolor='rgba(0, 0, 0, 0)',
                  paper_bgcolor= 'rgba(0, 0, 0, 0)',
                  font={
                      'family': "Courier New, monospace",
                      'size': 14,
                      'color': "#eaeaea"
                  }
                 )

fig.show()

## U.S. Choropleth:

In [None]:
# The original cell referenced an undefined name (`dea`) and raised NameError.
# NOTE(review): presumably the U.S. deaths time series was intended — confirm.
# That frame has one row per (date, Country, State), while this map is keyed on
# 'Country', so sum the states to get one value per country and date.
# sort=False keeps the dates in their original order (they are still strings here).
us_deaths_by_date = deaths_us_time_series.groupby(['date', 'Country'], as_index=False,
                                                  sort=False)['deaths'].sum()
fig = px.choropleth(data_frame=us_deaths_by_date, locations='Country',
                    locationmode='country names', color='deaths',
                    hover_name='deaths', animation_frame='date',
                    color_continuous_scale=px.colors.sequential.Plasma)

In [None]:
# lookup_df = pd.read_csv('COVID-19/csse_covid_19_data/us_states_abbrevations.csv')

In [None]:
# lookup_df.head()

In [None]:
# confirmed_us_time_series.head()

In [None]:
# temp_df = confirmed_us_time_series.merge(lookup_df[['State', 'Code']],
#                                 left_on='State',
#                                 right_on='State',
#                                 how='inner')

In [None]:
# temp_df.head()

In [None]:
# fig = px.choropleth(data_frame=temp_df,
#                     locations='Code',
#                     locationmode="USA-states",
#                     color='confirmed',
#                     animation_frame='date',
#                     scope="usa")
# fig.show()

## Stacked Area chart

In [None]:
countries_daily_deaths.head()

In [None]:
# Stacked area chart of daily deaths, one colored band per country.
# NOTE(review): `countries_daily_deaths` is only assigned in a later cell of this
# notebook (the pd.concat over get_daily_deaths_df); run that cell first, otherwise
# this cell raises NameError on a fresh kernel.
fig = px.area(data_frame=countries_daily_deaths, x='date', y='daily_deaths', color='Country')
fig.show()

## Histogram

In [None]:
import plotly.figure_factory as ff

In [None]:
help(ff.create_distplot)

In [None]:
china_daily_deaths = countries_daily_deaths[countries_daily_deaths['Country'] == 'China']['daily_deaths'].values

In [None]:
china_daily_deaths

In [None]:
# `hist_data` was never defined, and create_distplot also requires group_labels.
# Plot the China series computed above, mirroring the China/Italy call below.
ff.create_distplot(hist_data=[china_daily_deaths], group_labels=['China'])

In [None]:
my_list = [countries_daily_deaths[countries_daily_deaths['Country'] == country_name]['daily_deaths'] for country_name in most_affected_countries_names]

In [None]:
len(my_list)

In [None]:
china_daily_deaths

In [None]:
italy_daily_deaths = countries_daily_deaths[countries_daily_deaths['Country'] == 'Italy']['daily_deaths'].values

In [None]:
italy_daily_deaths

In [None]:
ff.create_distplot(hist_data=[china_daily_deaths, italy_daily_deaths], group_labels=['China', 'Italy'], bin_size=.2)

In [None]:
import plotly.express as px
df = px.data.tips()
fig = px.histogram(df, x="total_bill", histnorm='probability density')
fig.show()

In [None]:
px.histogram(data_frame=countries_daily_deaths, x='date', y='daily_deaths', color='Country', histnorm='probability density')

In [None]:
def get_daily_deaths(deaths_columns):
    """Convert a cumulative deaths series into per-day values.

    Args:
        deaths_columns: Sequence or array of cumulative counts, ordered by date.

    Returns:
        A NumPy array of the same length: the first element is the first
        cumulative value, each following element is the absolute difference
        between consecutive values. An empty input yields an empty array.
    """
    cumulative = np.asarray(deaths_columns)
    if cumulative.size == 0:
        # Nothing to difference; the original indexing would raise IndexError here.
        return np.array([], dtype=cumulative.dtype)
    # The absolute value is kept from the original implementation: a decrease in
    # the cumulative count is reported as a positive daily value.
    day_over_day = np.absolute(np.diff(cumulative))
    return np.append(cumulative[0], day_over_day)

In [None]:
def get_daily_deaths_df(df, country_name):
    """Return the rows of one country, sorted by date, with a ``daily_deaths`` column.

    Args:
        df: Time-series DataFrame with ``Country``, ``date`` and ``deaths`` columns.
        country_name: Value of ``Country`` to select.

    Returns:
        A new DataFrame (``df`` is left untouched) whose ``date`` column is
        converted with ``pd.to_datetime`` and whose ``daily_deaths`` column is
        computed by ``get_daily_deaths`` from the date-ordered ``deaths`` values.
    """
    # .copy() so the assignments below act on an independent frame instead of a
    # filtered slice of ``df`` (which triggers SettingWithCopyWarning).
    country_df = df[df['Country'] == country_name].copy()
    country_df['date'] = pd.to_datetime(country_df['date'], utc=False)
    # Sort before differencing: daily values only make sense in date order.
    country_df = country_df.sort_values(by='date')
    country_df['daily_deaths'] = get_daily_deaths(country_df['deaths'].values)
    return country_df

In [None]:
most_affected_countries_names

In [None]:
countries_daily_deaths = pd.concat([get_daily_deaths_df(most_deaths_time_series, country_name) for country_name in most_affected_countries_names])

In [None]:
countries_daily_deaths.sort_values(by='date', inplace=True)

In [None]:
countries_daily_deaths.head()

In [None]:
fig = px.line(data_frame=countries_daily_deaths, x='date', y='daily_deaths', color='Country',
              line_group='Country', title='Daily Deaths', height=600)
fig.show()

In [None]:
bar_chart_data = [go.Bar(name=country_name,
        x=countries_daily_deaths[countries_daily_deaths['Country'] == country_name]['date'],
        y=countries_daily_deaths[countries_daily_deaths['Country'] == country_name]['daily_deaths'])
 for country_name in most_affected_countries_names]

In [None]:
from plotly.subplots import make_subplots
fig = make_subplots(rows=5, cols=2)

In [None]:
fig.add_trace(bar_chart_data[0], row=1, col=1)
fig.add_trace(bar_chart_data[1], row=1, col=2)

fig.update_layout(height=600, width=1000, title_text="Stacked Subplots")
fig.show()

In [None]:
px.bar(data_frame=countries_daily_deaths[countries_daily_deaths['Country'] == 'China'],
      x = 'date', y='daily_deaths', title='China daily deaths')