# COVID-19: Exploratory Data Analysis & Forecast Number of Cases

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.graph_objects as go
from fbprophet import Prophet
import pycountry
import plotly.express as px
from collections import namedtuple
from bs4 import BeautifulSoup
import requests
import re 


# Data Import, Preprocessing and EDA

In [None]:


# Listing page of the JHU CSSE daily-report CSV files: url => str
url = "https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_daily_reports"

# Issue request: r => requests.models.Response
r = requests.get(url)
# Fail loudly on an HTTP error instead of parsing an error page into an empty url list.
r.raise_for_status()

# Extract text: html_doc => str
html_doc = r.text

# Parse the HTML with an explicit parser so the result does not depend on which
# optional parser libraries happen to be installed: soup => bs4.BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')

# Find all 'a' tags (which define hyperlinks): a_tags => bs4.element.ResultSet
a_tags = soup.find_all('a')

# Store a list of raw-content urls for every linked .csv file: urls => list
# Anchors without an href yield None from .get(); skip them so the '.csv'
# membership test cannot raise TypeError.
hrefs = [link.get('href') for link in a_tags]
urls = ['https://raw.githubusercontent.com' + re.sub('/blob', '', href)
        for href in hrefs if href and '.csv' in href]

# NOTE: `url` is rebound here from the listing-page string to a list of report urls.
# The first 246 links are skipped - presumably older reports that are not wanted
# here; TODO confirm the offset still matches the repository listing.
url = urls[246:]

# Read every selected daily report and stack them into a single frame.
dfs = [pd.read_csv(filename) for filename in url]
df = pd.concat(dfs, ignore_index=True)
dfx = df.drop(['FIPS','Admin2','Active','Combined_Key','Incidence_Rate','Case-Fatality_Ratio','Lat','Long_'], axis =1)

# Build MM/DD/YYYY date strings from the leading YYYY-MM-DD part of 'Last_Update'.
date = list(dfx['Last_Update'])
date_n = [each[5:7] + '/' + each[8:10] + '/' + each[:4] for each in date]
dfx = dfx.rename(columns={'Province_State':'Province/State', 'Country_Region':'Country', 'Last_Update':'Last Update'})

dfx['Date'] = date_n
dfy = dfx[['Date','Province/State','Country','Last Update','Confirmed','Deaths','Recovered']]

In [None]:
# Local dataset file; 'Last Update' is parsed to datetimes on load.
df2 = pd.read_csv('../input/novel-corona-virus-2019-dataset/covid_19_data.csv', parse_dates=['Last Update'])
df2 = df2.drop('SNo', axis=1)
df2.rename(columns={'ObservationDate':'Date', 'Country/Region':'Country'}, inplace=True)

# Global time-series tables. Each frame is read from the file whose name matches
# the variable (recovered <- ..._recovered_global.csv, deaths <- ..._deaths_global.csv).
df_confirmed = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv')
df_recovered = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv')
df_deaths = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv')

# Use the same 'Country' column name as df2.
df_confirmed.rename(columns={'Country/Region':'Country'}, inplace=True)
df_recovered.rename(columns={'Country/Region':'Country'}, inplace=True)
df_deaths.rename(columns={'Country/Region':'Country'}, inplace=True)

# Stack the local dataset rows and the scraped daily-report rows.
# NOTE: df2's 'Last Update' was parsed to datetimes above; whether dfy's column has
# the same type depends on how dfy was built - verify before grouping/sorting on it.
df_new = pd.concat([df2,dfy], ignore_index=True)

In [None]:
# Display the combined frame (df2 rows followed by dfy rows).
df_new

## By Country View (World)

In [None]:
# Sum the case counts per (Date, Country, Province/State).
# Only the count columns are selected: selecting the grouping keys as value columns
# too can make reset_index() fail with "already exists" on pandas versions whose
# sum() keeps string columns.
# NOTE: rows whose Province/State is missing are dropped by groupby's default.
df_country = df_new.groupby(["Date", "Country", "Province/State"])[['Confirmed', 'Deaths', 'Recovered']].sum().reset_index()

In [None]:
# Display the per-date, per-country, per-province totals.
df_country

In [None]:
# Sri Lanka rows only, with the three count columns summed per 'Last Update' value.
sl_rows = df_new.query('Country=="Sri Lanka"')
sl_totals = sl_rows.groupby("Last Update")[['Confirmed', 'Deaths', 'Recovered']].sum()
dataSL = sl_totals.reset_index()

## By Country (Sorted)

In [None]:
# Sum of the three count columns per country, shown with 'Country' as a regular column.
country_totals = df_new.groupby("Country")[['Confirmed', 'Deaths', 'Recovered']].sum()
country_totals.reset_index()

# Visualizations

In [None]:
# Per-date totals of the numeric columns. numeric_only=True keeps text and
# timestamp columns out of the sum, which newer pandas no longer drops implicitly.
df_new.groupby('Date').sum(numeric_only=True)

In [None]:
# Display the global confirmed-cases time-series table.
df_confirmed

## Summary Plot of Worldwide Cases - Confirmed, Deaths & Recovered

In [None]:
# Worldwide totals per date, one two-column frame (Date, <count>) per measure.
# The count columns are selected before summing so the aggregation runs once and
# never touches the text/timestamp columns of df_new.
daily_totals = df_new.groupby('Date')[['Confirmed', 'Deaths', 'Recovered']].sum()
confirmed = daily_totals[['Confirmed']].reset_index()
deaths = daily_totals[['Deaths']].reset_index()
recovered = daily_totals[['Recovered']].reset_index()

In [None]:
# Display the per-date confirmed totals.
confirmed

In [None]:
# Grouped bar chart of the worldwide per-date totals, one trace per measure.
fig = go.Figure()
for frame, column, colour in (
    (confirmed, 'Confirmed', 'blue'),
    (deaths, 'Deaths', 'Red'),
    (recovered, 'Recovered', 'Green'),
):
    fig.add_trace(go.Bar(x=frame['Date'],
                         y=frame[column],
                         name=column,
                         marker_color=colour))

fig.update_layout(
    title='Worldwide Corona Virus Cases - Confirmed, Deaths, Recovered (Bar Chart)',
    xaxis_tickfont_size=14,
    yaxis=dict(
        # Axis title font is set through title.font; the flat `titlefont`
        # attribute is deprecated in Plotly.
        title=dict(text='Number of Cases', font=dict(size=16)),
        tickfont_size=14,
    ),
    legend=dict(
        x=0,
        y=1.0,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)'
    ),
    barmode='group',
    bargap=0.15, # gap between bars of adjacent location coordinates.
    bargroupgap=0.1 # gap between bars of the same location coordinate.
)
fig.show()

## By Country View (Sri Lanka)

In [None]:
# Grouped bar chart of the Sri Lanka totals held in dataSL, one trace per measure.
# The x values come from dataSL's own 'Last Update' column so every bar is paired
# with the row it was computed from; the worldwide per-date frames are a different
# table and their dates do not line up with dataSL's rows.
fig = go.Figure()
for column, colour in (('Confirmed', 'blue'), ('Deaths', 'Red'), ('Recovered', 'Green')):
    fig.add_trace(go.Bar(x=dataSL['Last Update'],
                         y=dataSL[column],
                         name=column,
                         marker_color=colour))

fig.update_layout(
    title='Sri Lanka Corona Virus Cases - Confirmed, Deaths, Recovered (Bar Chart)',
    xaxis_tickfont_size=14,
    yaxis=dict(
        # Axis title font is set through title.font; the flat `titlefont`
        # attribute is deprecated in Plotly.
        title=dict(text='Number of Cases', font=dict(size=16)),
        tickfont_size=14,
    ),
    legend=dict(
        x=0,
        y=1.0,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)'
    ),
    barmode='group',
    bargap=0.15, # gap between bars of adjacent location coordinates.
    bargroupgap=0.1 # gap between bars of the same location coordinate.
)
fig.show()

In [None]:
# Keep only the location columns of the time-series table.
# NOTE: this rebinds df_confirmed to the four-column subset.
df_confirmed = df_confirmed[["Province/State","Lat","Long","Country"]]
df_temp = df_new.copy()
# Assign the result back instead of calling replace(inplace=True) on the selected
# column: under pandas copy-on-write the in-place form updates a temporary object
# and leaves df_temp unchanged.
df_temp['Country'] = df_temp['Country'].replace({'Mainland China': 'China'})
# Attach Lat/Long to every case row that has a matching (Country, Province/State).
df_latlong = pd.merge(df_temp, df_confirmed, on=["Country", "Province/State"])

In [None]:
# Animated density map of the merged rows, one animation frame per 'Date' value.
fig = px.density_mapbox(
    df_latlong,
    lat="Lat",
    lon="Long",
    hover_name="Province/State",
    hover_data=["Confirmed"],
    animation_frame="Date",
    color_continuous_scale="Portland",
    radius=7,
    zoom=0,
    height=700,
)
# Title/font, map style and margins applied in a single layout update.
fig.update_layout(
    title='Worldwide Corona Virus Cases Time Lapse - Confirmed, Deaths, Recovered',
    font=dict(family="Courier New, monospace", size=18, color="#7f7f7f"),
    mapbox_style="open-street-map",
    mapbox_center_lon=0,
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
)
fig.show()

## Analysis by Country

We use Plotly (https://plot.ly/python/bubble-maps/) for the visuals

### Latest Date in Data

In [None]:
# Totals per (Date, Country), one three-column frame (Date, Country, <count>) per measure.
# The count columns are selected before summing so the aggregation runs once and
# never touches the text/timestamp columns of df_new.
by_date_country = df_new.groupby(['Date', 'Country'])[['Confirmed', 'Deaths', 'Recovered']].sum()
confirmed = by_date_country[['Confirmed']].reset_index()
deaths = by_date_country[['Deaths']].reset_index()
recovered = by_date_country[['Recovered']].reset_index()

In [None]:
# 'Date' holds date strings (month first), so a plain max() would compare them as
# text and rank e.g. "12/31/2020" above "01/15/2021". Parse to datetimes to find the
# chronologically latest row, but keep the original string so it can still be
# compared against the 'Date' column.
latest_date = confirmed.loc[pd.to_datetime(confirmed['Date']).idxmax(), 'Date']
latest_date

In [None]:
# Keep only the rows for latest_date, reduced to the country and its count.
confirmed = confirmed.loc[confirmed['Date'] == latest_date, ['Country', 'Confirmed']]
deaths = deaths.loc[deaths['Date'] == latest_date, ['Country', 'Deaths']]
recovered = recovered.loc[recovered['Date'] == latest_date, ['Country', 'Recovered']]

### Countries/Regions Affected

In [None]:
# Distinct country names present on the latest date, printed as an indented list.
all_countries = confirmed['Country'].unique()
print(f"Number of countries/regions with cases: {len(all_countries)}")
print("Countries/Regions with cases: ")
for name in all_countries:
    print(f"    {name}")


We need to normalise some country names for this bubble plot: several countries are not matched in `pycountry.countries` even though they exist there, because our data lists them by acronym or with additional words (for example, Mainland China instead of China).

In [None]:
# Show every country name known to pycountry, for comparison with the data's names.
print([country.name for country in pycountry.countries])

In [None]:
# Work on copies so the latest-date frames keep their original country names.
confirmed2 = confirmed.copy()
deaths2 = deaths.copy()
recovered2 = recovered.copy()
bubble_plot_dfs = [confirmed2, deaths2, recovered2]

# Dataset spellings mapped to the names used for the pycountry lookup.
country_aliases = {
    'Mainland China': 'China',
    'UK': 'United Kingdom',
    'US': 'United States',
}
for df_ in bubble_plot_dfs:
    # Assign the result back instead of calling replace(inplace=True) on the
    # selected column: under pandas copy-on-write the in-place form updates a
    # temporary object and leaves the frame unchanged.
    df_["Country"] = df_["Country"].replace(country_aliases)

In [None]:
# Lookup from each pycountry name to its alpha_3 code.
countries = {entry.name: entry.alpha_3 for entry in pycountry.countries}

# Add the code column to each frame; names missing from the lookup get no code
# because dict.get returns None for them.
for frame in (confirmed2, deaths2, recovered2):
    frame["iso_alpha"] = frame["Country"].map(countries.get)

In [None]:
# Reduce each frame to the columns its bubble map reads.
plot_data_confirmed = confirmed2.loc[:, ["iso_alpha", "Confirmed", "Country"]]
plot_data_deaths = deaths2.loc[:, ["iso_alpha", "Deaths"]]
plot_data_recovered = recovered2.loc[:, ["iso_alpha", "Recovered"]]

In [None]:
# Bubble map of confirmed cases: located by iso_alpha, sized by 'Confirmed',
# coloured per country.
fig = px.scatter_geo(
    plot_data_confirmed,
    locations="iso_alpha",
    color="Country",
    hover_name="iso_alpha",
    size="Confirmed",
    projection="natural earth",
    title='Worldwide Confirmed Cases',
)
fig.show()

In [None]:
# Bubble map of deaths: located by iso_alpha, with 'Deaths' driving both the
# bubble size and its colour.
fig = px.scatter_geo(
    plot_data_deaths,
    locations="iso_alpha",
    color="Deaths",
    hover_name="iso_alpha",
    size="Deaths",
    projection="natural earth",
    title="Worldwide Death Cases",
)
fig.show()

In [None]:
# Bubble map of recoveries: located by iso_alpha, with 'Recovered' driving both
# the bubble size and its colour.
fig = px.scatter_geo(
    plot_data_recovered,
    locations="iso_alpha",
    color="Recovered",
    hover_name="iso_alpha",
    size="Recovered",
    projection="natural earth",
    title="Worldwide Recovered Cases",
)
fig.show()

## Transforming Data for Forecasting

In [None]:
# Rebuild the worldwide per-date totals, one two-column frame (Date, <count>) per
# measure, as input for the forecasts.
# The count columns are selected before summing so the aggregation runs once and
# never touches the text/timestamp columns of df_new.
daily_totals = df_new.groupby('Date')[['Confirmed', 'Deaths', 'Recovered']].sum()
confirmed = daily_totals[['Confirmed']].reset_index()
deaths = daily_totals[['Deaths']].reset_index()
recovered = daily_totals[['Recovered']].reset_index()

In [None]:
# Sri Lanka rows only, then per-date totals as one two-column frame per measure.
SL = pd.DataFrame(df_new.loc[df_new.Country == "Sri Lanka"])
# The count columns are selected before summing so the aggregation runs once and
# never touches the text/timestamp columns.
sl_daily = SL.groupby('Date')[['Confirmed', 'Deaths', 'Recovered']].sum()
confirmedSL = sl_daily[['Confirmed']].reset_index()
deathsSL = sl_daily[['Deaths']].reset_index()
recoveredSL = sl_daily[['Recovered']].reset_index()

In [None]:
# Rename the two columns to the 'ds'/'y' names passed to Prophet and parse the
# dates. Unpacking fails if the frame does not have exactly two columns.
ds_col, y_col = confirmed.columns
confirmed = confirmed.rename(columns={ds_col: 'ds', y_col: 'y'})
confirmed['ds'] = pd.to_datetime(confirmed['ds'])

In [None]:
# Preview the first rows of the worldwide confirmed series (columns 'ds' and 'y').
confirmed.head()

In [None]:
# Preview the first rows of the Sri Lanka confirmed series.
confirmedSL.head()

# Forecasting Total Number of Cases Worldwide

## Prophet

We use Prophet, a procedure for forecasting time series data based on an additive model where non-linear trends are fit with yearly, weekly, and daily seasonality, plus holiday effects. It works best with time series that have strong seasonal effects and several seasons of historical data. Prophet is robust to missing data and shifts in the trend, and typically handles outliers well. It is also an open source software released by Facebook’s Core Data Science team. It is available for download on CRAN and PyPI.

## Why Prophet?

Prophet is easy to customize and use, and it produces accurate forecasts which can be explained intuitively with supporting evidence such as forecast seasonality components. It allows the analyst to explain in an intuitive and convincing manner to higher management why the forecasts are as they are, and the plausible underlying factors that contribute to the result. Furthermore, it is also open-source! :)

## References 
- https://facebook.github.io/prophet/
- https://facebook.github.io/prophet/docs/
- https://github.com/facebook/prophet

## Forecasting Confirmed Cases Worldwide with Prophet (Baseline)

We perform a one-week-ahead forecast with Prophet, with 95% prediction intervals. Here, no tweaking of seasonality-related parameters is performed and no additional regressors are added.

In [None]:
# Fit a default Prophet model on the worldwide confirmed series, using a 0.95
# interval width (fit() returns the fitted model).
m = Prophet(interval_width=0.95).fit(confirmed)
# History extended by 7 future periods.
future = m.make_future_dataframe(periods=7)
# Independent copy kept for non-baseline predictions later on.
future_confirmed = future.copy()
future.tail()

In [None]:
# Predict over the history plus the future rows, then show the last rows of the
# point forecast with its lower/upper bounds.
forecast = m.predict(future)
forecast.loc[:, ['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()

In [None]:
# Plot the worldwide confirmed forecast with the fitted model's plot method.
confirmed_forecast_plot = m.plot(forecast)

## Forecasting Deaths Worldwide with Prophet (Baseline)

We perform a one-week-ahead forecast with Prophet, with 95% prediction intervals. Here, no tweaking of seasonality-related parameters is performed and no additional regressors are added.

In [None]:
# Rename the two columns to the 'ds'/'y' names passed to Prophet and parse the
# dates. Unpacking fails if the frame does not have exactly two columns.
ds_col, y_col = deaths.columns
deaths = deaths.rename(columns={ds_col: 'ds', y_col: 'y'})
deaths['ds'] = pd.to_datetime(deaths['ds'])

In [None]:
# Fit a default Prophet model on the worldwide deaths series, using a 0.95
# interval width (fit() returns the fitted model).
m = Prophet(interval_width=0.95).fit(deaths)
# History extended by 7 future periods.
future = m.make_future_dataframe(periods=7)
# Independent copy kept for non-baseline predictions later on.
future_deaths = future.copy()
future.tail()

In [None]:
# Predict over the history plus the future rows, then show the last rows of the
# point forecast with its lower/upper bounds.
forecast = m.predict(future)
forecast.loc[:, ['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()

In [None]:
# Plot the worldwide deaths forecast with the fitted model's plot method.
deaths_forecast_plot = m.plot(forecast)

## Forecasting Recovered Cases Worldwide with Prophet (Baseline)

We perform a one-week-ahead forecast with Prophet, with 95% prediction intervals. Here, no tweaking of seasonality-related parameters is performed and no additional regressors are added.

In [None]:
# Rename the two columns to the 'ds'/'y' names passed to Prophet and parse the
# dates. Unpacking fails if the frame does not have exactly two columns.
ds_col, y_col = recovered.columns
recovered = recovered.rename(columns={ds_col: 'ds', y_col: 'y'})
recovered['ds'] = pd.to_datetime(recovered['ds'])


In [None]:
# Fit a default Prophet model on the worldwide recovered series, using a 0.95
# interval width (fit() returns the fitted model).
m = Prophet(interval_width=0.95).fit(recovered)
# History extended by 7 future periods.
future = m.make_future_dataframe(periods=7)
# Independent copy kept for non-baseline predictions later on.
future_recovered = future.copy()
future.tail()

In [None]:
# Predict over the history plus the future rows, then show the last rows of the
# point forecast with its lower/upper bounds.
forecast = m.predict(future)
forecast.loc[:, ['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()

In [None]:
# Plot the worldwide recovered forecast with the fitted model's plot method.
recovered_forecast_plot = m.plot(forecast)

Seems pretty decent for a baseline Prophet model in the case of the number of recovered! :)

From the forecast component plots, it is clear that there exists an upward trend in the number of cases worldwide. In the weekly trends plot, interestingly, it is the **highest at the weekends**!

# **Forecasting Confirmed Cases For Sri Lanka**

In [None]:
# Rename the two columns to the 'ds'/'y' names passed to Prophet and parse the
# dates. Unpacking fails if the frame does not have exactly two columns.
ds_col, y_col = confirmedSL.columns
confirmedSL = confirmedSL.rename(columns={ds_col: 'ds', y_col: 'y'})
confirmedSL['ds'] = pd.to_datetime(confirmedSL['ds'])

In [None]:
# Fit a default Prophet model on the Sri Lanka confirmed series, using a 0.95
# interval width (fit() returns the fitted model).
m = Prophet(interval_width=0.95).fit(confirmedSL)
# History extended by 7 future periods.
futureSL = m.make_future_dataframe(periods=7)
# Independent copy kept for non-baseline predictions later on.
future_confirmedSL = futureSL.copy()
futureSL.tail()



In [None]:
# Predict over the history plus the future rows, then show the last rows of the
# point forecast with its lower/upper bounds.
forecastSL = m.predict(futureSL)
forecastSL.loc[:, ['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()


In [None]:
# Plot the Sri Lanka confirmed forecast.
# NOTE: this rebinds confirmed_forecast_plot, which the worldwide section also assigns.
confirmed_forecast_plot = m.plot(forecastSL)

# **Forecasting Deaths For Sri Lanka**

In [None]:
# Rename the two columns to the 'ds'/'y' names passed to Prophet and parse the
# dates. Unpacking fails if the frame does not have exactly two columns.
ds_col, y_col = deathsSL.columns
deathsSL = deathsSL.rename(columns={ds_col: 'ds', y_col: 'y'})
deathsSL['ds'] = pd.to_datetime(deathsSL['ds'])



In [None]:
# Fit a default Prophet model on the Sri Lanka deaths series, using a 0.95
# interval width (fit() returns the fitted model).
m = Prophet(interval_width=0.95).fit(deathsSL)
# History extended by 7 future periods.
futureSLd = m.make_future_dataframe(periods=7)
# Independent copy kept for non-baseline predictions later on.
future_deathsSL = futureSLd.copy()
futureSLd.tail()


In [None]:

# Predict over the history plus the future rows, then show the last rows of the
# point forecast with its lower/upper bounds.
forecastSLd = m.predict(futureSLd)
forecastSLd.loc[:, ['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()



In [None]:
# Plot the Sri Lanka deaths forecast.
# NOTE: this rebinds deaths_forecast_plot, which the worldwide section also assigns.
deaths_forecast_plot = m.plot(forecastSLd)

# **Forecasting Recovery Cases for Sri Lanka**

In [None]:
# Rename the two columns to the 'ds'/'y' names passed to Prophet and parse the
# dates. Unpacking fails if the frame does not have exactly two columns.
ds_col, y_col = recoveredSL.columns
recoveredSL = recoveredSL.rename(columns={ds_col: 'ds', y_col: 'y'})
recoveredSL['ds'] = pd.to_datetime(recoveredSL['ds'])



In [None]:
# Fit a default Prophet model on the Sri Lanka recovered series, using a 0.95
# interval width (fit() returns the fitted model).
m = Prophet(interval_width=0.95).fit(recoveredSL)
# History extended by 7 future periods.
futureSLr = m.make_future_dataframe(periods=7)
# Independent copy kept for non-baseline predictions later on.
future_recoveredSLr = futureSLr.copy()
futureSLr.tail()


In [None]:

# Predict over the history plus the future rows, then show the last rows of the
# point forecast with its lower/upper bounds.
forecastSLr = m.predict(futureSLr)
forecastSLr.loc[:, ['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()



In [None]:
# Plot the Sri Lanka recovered forecast.
# NOTE: this rebinds recovered_forecast_plot, which the worldwide section also assigns.
recovered_forecast_plot = m.plot(forecastSLr)