In [1]:
# essential libraries
import random
from datetime import timedelta

# sorting and analysis 
import numpy as np
import pandas as pd


#visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import calmap
import folium
import gt


In [2]:
# color pallete
cnf, dth, rec, act = '#393e46', '#ff2e63', '#21bf73', '#fe9801'

In [3]:
# converter
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

In [4]:
# hide warnings
import warnings
warnings.filterwarnings('ignore')

In [5]:
# html embedding
from IPython.display import Javascript 
from IPython.core.display import display
from IPython.core.display import HTML

In [6]:
# importing datasets 
full_table = pd.read_csv('covid_19_clean_complete.csv',
                        parse_dates=['Date'])
full_table.sample(6)

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered
7400,,Antigua and Barbuda,17.0608,-61.7964,2020-02-20,0,0,0
13774,,Angola,-11.2027,17.8739,2020-03-16,0,0,0
2675,,Guyana,5.0,-58.75,2020-02-01,0,0,0
9908,Cayman Islands,United Kingdom,19.3133,-81.2546,2020-02-29,0,0,0
5543,,Rwanda,-1.9403,29.8739,2020-02-12,0,0,0
2699,,Liechtenstein,47.14,9.55,2020-02-01,0,0,0


# Cleaning Data 

In [7]:
# cases
cases = ['Confirmed', 'Deaths', 'Recovered', 'Active']

# Active Case = confirmed - deaths - recovered
full_table['Active'] = full_table['Confirmed'] - full_table['Deaths'] - full_table['Recovered']

# replacing Mainland china with just China
full_table['Country/Region'] = full_table['Country/Region'].replace('Mainland China', 'China')

In [8]:
# filling missing values 
full_table[['Province/State']] = full_table[['Province/State']].fillna('')
full_table[cases] = full_table[cases].fillna(0)

# fixing datatypes
full_table['Recovered'] = full_table['Recovered'].astype(int)

full_table.sample(6)

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered,Active
16850,,Bangladesh,23.685,90.3563,2020-03-28,48,5,15,28
10791,Zhejiang,China,29.1832,120.0934,2020-03-04,1213,1,1114,98
7545,,Lithuania,55.1694,23.8813,2020-02-20,0,0,0,0
10958,Anguilla,United Kingdom,18.2206,-63.0686,2020-03-04,0,0,0,0
13247,,Mali,17.570692,-3.996166,2020-03-13,0,0,0,0
13723,,Tanzania,-6.369,34.8888,2020-03-15,0,0,0,0


In [9]:
# cases in the ships
ship = full_table[full_table['Province/State'].str.contains('Grand Princess')|full_table['Country/Region'].str.contains('Diamond Princess')]

# china and the row
china = full_table[full_table['Country/Region']=='China']
row = full_table[full_table['Country/Region']!='China']

# latest
full_latest = full_table[full_table['Date'] == max(full_table['Date'])].reset_index()
china_latest = full_latest[full_latest['Country/Region']=='China']
row_latest = full_latest[full_latest['Country/Region']!='China']

# latest condensed
full_latest_grouped = full_latest.groupby('Country/Region')['Confirmed', 'Deaths', 'Recovered', 'Active'].sum().reset_index()
china_latest_grouped = china_latest.groupby('Province/State')['Confirmed', 'Deaths', 'Recovered', 'Active'].sum().reset_index()
row_latest_grouped = row_latest.groupby('Country/Region')['Confirmed', 'Deaths', 'Recovered', 'Active'].sum().reset_index()

# Latest Data

In [10]:
temp = full_table.groupby(['Country/Region', 'Province/State'])['Confirmed', 'Deaths', 'Recovered', 'Active'].max()
# temp.style.background_gradient(cmap='Reds')

In [11]:
temp = full_table.groupby('Date')['Confirmed', 'Deaths', 'Recovered', 'Active'].sum().reset_index()
temp = temp[temp['Date']==max(temp['Date'])].reset_index(drop=True)
temp['Global Moratality'] = temp['Deaths']/temp['Confirmed']
temp['Deaths per 100 Confirmed Cases'] = temp['Global Moratality']*100
temp.style.background_gradient(cmap='Pastel1')

Unnamed: 0,Date,Confirmed,Deaths,Recovered,Active,Global Moratality,Deaths per 100 Confirmed Cases
0,2020-03-31 00:00:00,857487,42107,176442,638938,0.0491051,4.91051


In [12]:
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=True)

In [13]:
tm = temp.melt(id_vars="Date", value_vars=['Active', 'Deaths', 'Recovered'])
fig = px.treemap(tm, path=["variable"], values="value", height=400, width=600,
                 color_discrete_sequence=[act, rec, dth])
fig.data[0].textinfo = 'label+text+value'
fig.show()

# Country Wise Data

## In Each Country 

In [14]:
temp_f = full_latest_grouped.sort_values(by='Confirmed', ascending=False)
temp_f = temp_f[['Country/Region', 'Confirmed', 'Active', 'Deaths', 'Recovered']]
temp_f = temp_f.reset_index(drop=True)

temp_f.style.background_gradient(cmap="Blues", subset=['Confirmed', 'Active'])\
            .background_gradient(cmap="Greens", subset=['Recovered'])\
            .background_gradient(cmap="Reds", subset=['Deaths'])

Unnamed: 0,Country/Region,Confirmed,Active,Deaths,Recovered
0,US,188172,177275,3873,7024
1,Italy,105792,77635,12428,15729
2,Spain,95923,68200,8464,19259
3,China,82279,2764,3309,76206
4,Germany,71808,54933,775,16100
5,France,52827,39782,3532,9513
6,Iran,44605,27051,2898,14656
7,United Kingdom,25481,23509,1793,179
8,Switzerland,16605,14349,433,1823
9,Turkey,13531,13074,214,243


## Countries With Death Reported 

In [15]:
temp_flg = temp_f[temp_f['Deaths']>0][['Country/Region', 'Deaths']]
temp_flg['Deaths / 100 Cases'] = round((temp_f['Deaths']/temp_f['Confirmed'])*100, 2)
temp_flg.sort_values('Deaths', ascending=False).reset_index(drop=True).style.background_gradient(cmap='Reds')

Unnamed: 0,Country/Region,Deaths,Deaths / 100 Cases
0,Italy,12428,11.75
1,Spain,8464,8.82
2,US,3873,2.06
3,France,3532,6.69
4,China,3309,4.02
5,Iran,2898,6.5
6,United Kingdom,1793,7.04
7,Netherlands,1040,8.21
8,Germany,775,1.08
9,Belgium,705,5.52


# MAPS

## Across the world

In [16]:
# World wide

m = folium.Map(location=[0, 0], tiles='cartodbpositron',
               min_zoom=1, max_zoom=4, zoom_start=1)

for i in range(0, len(full_latest)):
    folium.Circle(
        location=[full_latest.iloc[i]['Lat'], full_latest.iloc[i]['Long']],
        color='crimson', 
        tooltip =   '<li><bold>Country : '+str(full_latest.iloc[i]['Country/Region'])+
                    '<li><bold>Province : '+str(full_latest.iloc[i]['Province/State'])+
                    '<li><bold>Confirmed : '+str(full_latest.iloc[i]['Confirmed'])+
                    '<li><bold>Deaths : '+str(full_latest.iloc[i]['Deaths']),
        radius=int(full_latest.iloc[i]['Confirmed'])**1.1).add_to(m)
m

In [17]:
# Confirmed
fig = px.choropleth(full_latest_grouped, locations="Country/Region", 
                    locationmode='country names', color=np.log(full_latest_grouped["Confirmed"]), 
                    hover_name="Country/Region", hover_data=['Confirmed'],
                    color_continuous_scale="Sunsetdark", 
                    title='Countries with Confirmed Cases')
fig.update(layout_coloraxis_showscale=False)
fig.show()

In [18]:
# Deaths
temp = full_latest_grouped[full_latest_grouped['Deaths']>0]
fig = px.choropleth(temp, 
                    locations="Country/Region", locationmode='country names',
                    color=np.log(temp["Deaths"]), hover_name="Country/Region", 
                    color_continuous_scale="Peach", hover_data=['Deaths'],
                    title='Countries with Deaths Reported')
fig.update(layout_coloraxis_showscale=False)
fig.show()

In [19]:
formated_gdf = full_table.groupby(['Date', 'Country/Region'])['Confirmed', 'Deaths'].max()
formated_gdf = formated_gdf.reset_index()
formated_gdf['Date'] = pd.to_datetime(formated_gdf['Date'])
formated_gdf['Date'] = formated_gdf['Date'].dt.strftime('%m/%d/%Y')
formated_gdf['size'] = formated_gdf['Confirmed'].pow(0.3)

fig = px.scatter_geo(formated_gdf, locations="Country/Region", locationmode='country names', 
                     color="Confirmed", size='size', hover_name="Country/Region", 
                     range_color= [0, max(formated_gdf['Confirmed'])+2], animation_frame="Date", 
                     title='Spread over time')
fig.update(layout_coloraxis_showscale=False)
fig.show()

# Ships

## Diamond Princess cruise ship
* On 20 January 2020, an 80-year-old passenger from Hong Kong embarked in Yokohama, later tested positive for COVID-19 on 1 February.
* On 4 February, the ship was in Japanese waters when 10 passengers were diagnosed with COVID-19.
* The ship was quarantined on 4 February 2020 in the Port of Yokohama in Japan.

* The infections included at least 138 from India (including 132 crew and 6 passengers), 35 Filipinos,
* 32 Canadians, 24 Australians, 13 Americans, 4 Indonesians, 4 Malaysians,and 2 Britons.
* Home countries arranged to evacuate their citizens and quarantine them further in their own countries.

* As of 5 March, at least 696 out of the 3,711 passengers and crew had tested positive for the virus.
* As of 7 March, there had been at least 7 deaths
## Grand Princess Ship
* Another Grand-class ship owned by Princess Cruises, also experienced an outbreak of the disease
* Two of its passengers fell ill after a cruise from San Francisco to Mexico in February 2020, and one of them died

In [20]:
# Latest cases from the ships
temp = ship[ship['Date']==max(ship['Date'])]
temp = temp[['Country/Region', 'Province/State', 'Confirmed', 'Deaths']].groupby(['Country/Region', 'Province/State']).sum().reset_index()
temp.style.background_gradient(cmap='Pastel1_r')

Unnamed: 0,Country/Region,Province/State,Confirmed,Deaths
0,Canada,Grand Princess,13,0
1,Diamond Princess,,712,10


# Cases over the time

In [21]:
temp = full_table.groupby('Date')['Recovered', 'Deaths', 'Active'].sum().reset_index()
temp = temp.melt(id_vars="Date", value_vars=['Recovered', 'Deaths', 'Active'],
                 var_name='Case', value_name='Count')
temp.head()

fig = px.area(temp, x="Date", y="Count", color='Case', height=800,
             title='Cases over time', color_discrete_sequence = [rec, dth, act])
fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()

In [22]:
temp = full_table.groupby('Date')['Confirmed', 'Deaths', 'Recovered'].sum().reset_index()
temp['No. of Deaths to 100 Confirmed Cases'] = round(temp['Deaths']/temp['Confirmed'], 3)*100
temp['No. of Recovered to 100 Confirmed Cases'] = round(temp['Recovered']/temp['Confirmed'], 3)*100

fig_1 = px.area(temp, x="Date", y="Confirmed", color_discrete_sequence = [act])
fig_2 = px.area(temp, x="Date", y="Deaths", color_discrete_sequence = [dth])
fig_3 = px.line(temp, x="Date", y="No. of Deaths to 100 Confirmed Cases",  color_discrete_sequence=['#333333'])
fig_4 = px.line(temp, x="Date", y="No. of Recovered to 100 Confirmed Cases",  color_discrete_sequence=['#0f4c75'])


In [23]:
spread = full_table[full_table['Confirmed']!=0].groupby('Date')['Country/Region'].unique().apply(len)
spread = pd.DataFrame(spread).reset_index()

fig_5 = px.line(spread, x='Date', y='Country/Region',  color_discrete_sequence=[dth])

In [24]:
temp = full_table.groupby(['Country/Region', 'Date', ])['Confirmed', 'Deaths']
temp = temp.sum().diff().reset_index()

mask = temp['Country/Region'] != temp['Country/Region'].shift(1)

temp.loc[mask, 'Confirmed'] = np.nan
temp.loc[mask, 'Deaths'] = np.nan

temp = temp.groupby('Date')['Confirmed'].sum().reset_index()

fig_6 = px.bar(temp, x="Date", y="Confirmed", color_discrete_sequence=['#2a7886'])
fig_6.update_layout(xaxis_rangeslider_visible=True)


In [25]:
fig = make_subplots(rows=3, cols=2, shared_xaxes=False, vertical_spacing=0.08, horizontal_spacing=0.1,
                    subplot_titles=('Confirmed cases', 'Deaths reported', 
                                    'No. of deaths to 100 Confirmed', 'No. of recovered cases to 100 Confirmed', 
                                    'No. of new countries everyday', 'No. of new cases everyday'))

fig.add_trace(fig_1['data'][0], row=1, col=1)
fig.add_trace(fig_2['data'][0], row=1, col=2)
fig.add_trace(fig_3['data'][0], row=2, col=1)
fig.add_trace(fig_4['data'][0], row=2, col=2)
fig.add_trace(fig_5['data'][0], row=3, col=1)
fig.add_trace(fig_6['data'][0], row=3, col=2)

fig.update_layout(height=1200)

# TOP 20 Countries

In [26]:
flg = full_latest_grouped.copy('deept')
# flg.head()

# mortality rate

flg['Mortality Rate'] = round((flg['Deaths']/flg['Confirmed'])*100, 2)
temp_m = flg[flg['Confirmed']>100]
temp_m = temp_m.sort_values('Mortality Rate', ascending=False)

# load population dataset
pop_clean_data = pd.read_csv("D:/Novel Coronavirus/population_total_long.csv")
pop = pop_clean_data[pop_clean_data['Year']==max(pop_clean_data['Year'])]
# select only the latest data
pop = pop.loc[:, ['Country Name', 'Count']]

# substitute for missing values
cols = ['French Guiana', 'Martinique', 'Palestine' , 'Republic of Ireland', 'Saint Barthelemy', 'Taiwan', 'Vatican City']
pops = [290691, 376480, 4750000, 4920455, 7122, 23476640, 1000]
for c, p in zip(cols, pops):
    pop.loc[pop['Country Name']== c, 'Count'] = p

# replace country names 
name = {
    'Egypt, Arab Rep.': 'Egypt',
    'Hong Kong SAR, China': 'Hong Kong',
    'Iran, Islamic Rep.': 'Iran (Islamic Republic of)',
    'Macao SAR, China': 'Macau',
    'Hong Kong SAR': 'Hong Kong',
    'Russian Federation': 'Russia',
    'Slovak Republic': 'Slovakia',
    'Korea, Rep.': 'Republic of Korea',
    'United Kingdom': 'UK',
    'United States': 'US' }
pop.loc[:, 'Country Name'] = pop.loc[: 'Country Name'].replace(name)

# merge dataframes
temp = pd.merge(full_latest_grouped, pop, how='left', right_on='Country Name', left_on='Country/Region')
# print(temp[temp['Country Name'].isna()])

temp = temp[['Country/Region', 'Confirmed', 'Deaths', 'Recovered', 'Active', 'Count']]
temp.columns = ['Country/Region', 'Confirmed', 'Deaths', 'Recovered', 'Active', 'Population']
    
# calculate Confirmed/Population
temp['Confirmed Per Million People'] = round(temp['Confirmed']/temp['Population']*1000000, 2)

# countries with population greater that 1 million only
temp_p = temp[temp['Population']>1000000].sort_values('Confirmed Per Million People', ascending=False).reset_index(drop=True)
# temp_p.head()

FileNotFoundError: [Errno 2] File b'D:/Novel Coronavirus/population_total_long.csv' does not exist: b'D:/Novel Coronavirus/population_total_long.csv'

In [None]:
# Confirmed ==================================================================================================

fig_c = px.bar(flg.sort_values('Confirmed').tail(20), x="Confirmed", y="Country/Region", text='Confirmed', orientation='h')
fig_c.update_traces(marker_color='#000839', opacity=0.8, textposition='auto')
fig_c.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')

# Deaths ===================================================================================================

fig_d = px.bar(flg.sort_values('Deaths').tail(20), x="Deaths", y="Country/Region",  text='Deaths', orientation='h')
fig_d.update_traces(marker_color='#ff1e56', opacity=0.8, textposition='auto')
fig_d.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')

# Recovered =================================================================================================

fig_r = px.bar(flg.sort_values('Recovered').tail(20), x="Recovered", y="Country/Region", text='Recovered', orientation='h')
fig_r.update_traces(marker_color=rec, opacity=0.8, textposition='auto')
fig_r.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')

# Active ====================================================================================================

fig_a = px.bar(flg.sort_values('Active').tail(20), x="Active", y="Country/Region", text='Active', orientation='h')
fig_a.update_traces(marker_color=act, opacity=0.8, textposition='auto')
fig_a.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')

# Moratality ================================================================================================

fig_m = px.bar(temp_m.sort_values('Mortality Rate').tail(20), x="Mortality Rate", y="Country/Region", text='Mortality Rate', orientation='h')
fig_m.update_traces(marker_color='#00bdaa', opacity=0.8, textposition='auto')
fig_m.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')

# Per Million People =================================================================================================

fig_p = px.bar(temp_p.sort_values('Confirmed Per Million People').tail(20), x='Confirmed Per Million People', y='Country/Region', orientation='h', text='Confirmed Per Million People')
fig_p.update_traces(marker_color='#2c003e', opacity=0.8, textposition='auto')
fig_p.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')

# ==========================================================================================================

fig = make_subplots(rows=3, cols=2, shared_xaxes=False, vertical_spacing=0.08, horizontal_spacing=0.1,
                    subplot_titles=("Confirmed", "Deaths", "Recovered", "Active", "Moratlity rate", "Confirmed cases per Million People"))
fig.add_trace(fig_c['data'][0], row=1, col=1)
fig.add_trace(fig_d['data'][0], row=1, col=2)
fig.add_trace(fig_r['data'][0], row=2, col=1)
fig.add_trace(fig_a['data'][0], row=2, col=2)
fig.add_trace(fig_m['data'][0], row=3, col=1)
fig.add_trace(fig_p['data'][0], row=3, col=2)
fig.update_layout(height=1200, title_text="Top 20")

In [None]:
fig = px.scatter(full_latest_grouped.sort_values('Deaths', ascending=False).iloc[:15, :], 
                 x='Confirmed', y='Deaths', color='Country/Region', size='Confirmed', height=800,
                 text='Country/Region', log_x=True, log_y=True, title='Deaths vs Confirmed')
fig.update_traces(textposition='top center')
fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()

# Data vs 

In [None]:
temp = full_table.groupby(['Country/Region', 'Date'])['Confirmed', 'Deaths'].sum()
temp = temp.reset_index()

fig = px.bar(temp, x="Date", y="Confirmed", color='Country/Region', orientation='v', height=600,
             title='Confirmed', color_discrete_sequence = px.colors.cyclical.mygbm)
fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()

# =========================================

fig = px.bar(temp, x="Date", y="Deaths", color='Country/Region', orientation='v', height=600,
             title='Deaths', color_discrete_sequence = px.colors.cyclical.mygbm)
fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()

# =========================================

temp = full_table.groupby(['Country/Region', 'Date', ])['Confirmed', 'Deaths']
temp = temp.sum().diff().reset_index()

mask = temp['Country/Region'] != temp['Country/Region'].shift(1)

temp.loc[mask, 'Confirmed'] = np.nan
temp.loc[mask, 'Deaths'] = np.nan

fig = px.bar(temp, x="Date", y="Confirmed", color='Country/Region',title='New cases')
fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()

# Composition of Cases 

In [None]:
fig = px.treemap(full_latest.sort_values(by='Confirmed', ascending=False).reset_index(drop=True), 
                 path=["Country/Region", "Province/State"], values="Confirmed", height=700,
                 title='Number of Confirmed Cases',
                 color_discrete_sequence = px.colors.qualitative.Prism)
fig.data[0].textinfo = 'label+text+value'
fig.show()

fig = px.sunburst(full_latest.sort_values(by='Deaths', ascending=False).reset_index(drop=True), 
                 path=["Country/Region", "Province/State"], values="Deaths", height=700,
                 title='Number of Deaths reported',
                 color_discrete_sequence = px.colors.qualitative.Prism)
fig.data[0].textinfo = 'label+text+value'
fig.show()

# Epidemic Span

In [None]:
# Note : In the graph, last day is shown as one day after the last time a new confirmed cases reported in the Country / Region

In [None]:
# first date
# ==========
first_date = full_table[full_table['Confirmed']>0]
first_date = first_date.groupby('Country/Region')['Date'].agg(['min']).reset_index()
# first_date.head()

# last date
# =========
last_date = full_table.groupby(['Country/Region', 'Date', ])['Confirmed', 'Deaths', 'Recovered']
last_date = last_date.sum().diff().reset_index()

mask = last_date['Country/Region'] != last_date['Country/Region'].shift(1)
last_date.loc[mask, 'Confirmed'] = np.nan
last_date.loc[mask, 'Deaths'] = np.nan
last_date.loc[mask, 'Recovered'] = np.nan

last_date = last_date[last_date['Confirmed']>0]
last_date = last_date.groupby('Country/Region')['Date'].agg(['max']).reset_index()
# last_date.head()

# first_last
# ==========
first_last = pd.concat([first_date, last_date[['max']]], axis=1)

# added 1 more day, which will show the next day as the day on which last case appeared
first_last['max'] = first_last['max'] + timedelta(days=1)

# no. of days
first_last['Days'] = first_last['max'] - first_last['min']

# task column as country
first_last['Task'] = first_last['Country/Region']

# rename columns
first_last.columns = ['Country/Region', 'Start', 'Finish', 'Days', 'Task']

# sort by no. of days
first_last = first_last.sort_values('Days')
# first_last.head()

# visualization
# =============

# produce random colors
clr = ["#"+''.join([random.choice('0123456789ABC') for j in range(6)]) for i in range(len(first_last))]

# plot
fig = ff.create_gantt(first_last, index_col='Country/Region', colors=clr, show_colorbar=False, 
                      bar_width=0.2, showgrid_x=True, showgrid_y=True, height=2500)
fig.show()

In [None]:
HTML('''<div class="flourish-embed flourish-bar-chart-race" data-src="visualisation/1571387"><script src="https://public.flourish.studio/resources/embed.js"></script></div>''')