# Covid-19 EDA

In [0]:
# !pip install folium
# !pip install plotly


In [0]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

import folium

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import math
import random
from datetime import timedelta

import warnings
warnings.filterwarnings('ignore')

# Colour palette shared by the plots below (hex strings).
cnf = '#393e46'  # presumably "confirmed"; not referenced by any visible cell
dth = '#ff2e63'  # used for death-related bars
rec = '#21bf73'  # used for recovered-related bars
act = '#fe9801'  # used for active/confirmed bars



In [0]:
def enable_plotly_in_cell():
    """Prepare the current output cell for offline plotly rendering.

    Call this in any cell that draws a plotly figure, or pass
    renderer='colab' to fig.show() instead.

    Injects the require.js script tag into the cell output and then
    initialises plotly's offline notebook mode (connected=False).
    """
    # Import display/HTML explicitly from the public IPython.display module
    # rather than relying on the implicit notebook global `display` and the
    # deprecated IPython.core.display path.
    from IPython.display import HTML, display
    from plotly.offline import init_notebook_mode

    display(HTML('''<script src="/static/components/requirejs/require.js"></script>'''))
    init_notebook_mode(connected=False)

In [0]:
# Load the four pre-cleaned CSV files (paths are relative to the notebook's working directory).
# 'Date' is parsed to datetime wherever the file is read with parse_dates.
df = pd.read_csv('datasets/covid_19_data_cleaned.csv', parse_dates=['Date'])
country_daywise = pd.read_csv('datasets/country_daywise.csv', parse_dates=['Date'])
# NOTE: the variable is spelled `countywise` (no "r"); later cells reference it by this name.
countywise = pd.read_csv('datasets/countrywise.csv')
daywise = pd.read_csv('datasets/daywise.csv', parse_dates=['Date'])


In [159]:
# Preview the first rows of the loaded case data.
df.head()

Unnamed: 0,Date,Province/State,Country,Lat,Long,Confirmed,Recovered,Deaths,Active
0,2020-01-22,,Afghanistan,33.0,65.0,0,0,0,0
1,2020-01-23,,Afghanistan,33.0,65.0,0,0,0,0
2,2020-01-24,,Afghanistan,33.0,65.0,0,0,0,0
3,2020-01-25,,Afghanistan,33.0,65.0,0,0,0,0
4,2020-01-26,,Afghanistan,33.0,65.0,0,0,0,0


In [160]:
# Worldwide daily totals, one two-column frame (Date, <metric>) per metric.
# The column is selected *before* aggregating so only that numeric column is
# summed; summing the whole frame also tries to aggregate the text columns,
# which is wasted work and fails or misbehaves on recent pandas versions.
by_date = df.groupby('Date')
confirmed = by_date['Confirmed'].sum().reset_index()
recovered = by_date['Recovered'].sum().reset_index()
deaths = by_date['Deaths'].sum().reset_index()
deaths.head()

Unnamed: 0,Date,Deaths
0,2020-01-22,17
1,2020-01-23,18
2,2020-01-24,26
3,2020-01-25,42
4,2020-01-26,56


In [161]:
# Count missing values per column.
df.isnull().sum()

Date                  0
Province/State    24696
Country               0
Lat                   0
Long                  0
Confirmed             0
Recovered             0
Deaths                0
Active                0
dtype: int64

In [162]:
# Replace missing province names with an empty string, then re-check the null counts.
df = df.fillna({'Province/State': ''})
df.isnull().sum()

Date              0
Province/State    0
Country           0
Lat               0
Long              0
Confirmed         0
Recovered         0
Deaths            0
Active            0
dtype: int64

## Worldwide Total Confirmed, Recovered, and Deaths

In [163]:
# Show the last few dates present in the daily totals.
confirmed['Date'].tail()

121   2020-05-22
122   2020-05-23
123   2020-05-24
124   2020-05-25
125   2020-05-26
Name: Date, dtype: datetime64[ns]

In [164]:
# One line+marker trace per worldwide daily total.
fig = go.Figure()
for totals, metric, colour in (
    (confirmed, 'Confirmed', "Orange"),
    (recovered, 'Recovered', "Green"),
    (deaths, 'Deaths', "Red"),
):
    fig.add_trace(
        go.Scatter(
            x=totals['Date'],
            y=totals[metric],
            mode='lines+markers',
            name=metric,
            line=dict(color=colour, width=2),
        )
    )

fig.update_layout(
    title='Worldwide Covid-19 Cases',
    xaxis_tickfont_size=14,
    yaxis=dict(title='Number of Cases'),
)

fig.show(renderer="colab")

## Cases Density Animation on World Map

In [165]:
# Inspect column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35406 entries, 0 to 35405
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Date            35406 non-null  datetime64[ns]
 1   Province/State  35406 non-null  object        
 2   Country         35406 non-null  object        
 3   Lat             35406 non-null  float64       
 4   Long            35406 non-null  float64       
 5   Confirmed       35406 non-null  int64         
 6   Recovered       35406 non-null  int64         
 7   Deaths          35406 non-null  int64         
 8   Active          35406 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(4), object(2)
memory usage: 2.4+ MB


In [166]:
# Cast Date to plain strings; the next cell uses this column as the animation frame.
# NOTE(review): this overwrites df['Date'] in place, so any cell that needs real
# datetimes must run after the later pd.to_datetime conversion.
df['Date'] = df['Date'].astype(str)
df.head()

Unnamed: 0,Date,Province/State,Country,Lat,Long,Confirmed,Recovered,Deaths,Active
0,2020-01-22,,Afghanistan,33.0,65.0,0,0,0,0
1,2020-01-23,,Afghanistan,33.0,65.0,0,0,0,0
2,2020-01-24,,Afghanistan,33.0,65.0,0,0,0,0
3,2020-01-25,,Afghanistan,33.0,65.0,0,0,0,0
4,2020-01-26,,Afghanistan,33.0,65.0,0,0,0,0


In [167]:
# Animated density map with one frame per date.
# The frame labels must be strings, so a string-dated copy is built here instead
# of depending on df['Date'] having been converted by an earlier cell.
density_frame = df.assign(Date=df['Date'].astype(str))

# NOTE(review): no `z` column is passed, so every row appears to carry the same
# weight and the heat reflects where rows are located rather than case counts —
# confirm whether z='Confirmed' was intended.
fig = px.density_mapbox(
    density_frame,
    lat='Lat',
    lon='Long',
    hover_name='Country',
    hover_data=['Confirmed', 'Recovered', 'Deaths'],
    animation_frame='Date',
    color_continuous_scale='Portland',
    radius=7,
    zoom=0,
    height=700,
)
fig.update_layout(
    title='Worldwide Covid-19 Cases with Time Lapse',  # fixed typo: was "Time Laps"
    mapbox_style='open-street-map',
    mapbox_center_lon=0,
)

fig.show(renderer='colab')

## Total Cases on Ships

In [168]:
# Convert Date back to datetime64 (it was cast to str for the map animation) so
# the date comparisons and date arithmetic in later cells work.
df['Date'] = pd.to_datetime(df['Date'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35406 entries, 0 to 35405
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Date            35406 non-null  datetime64[ns]
 1   Province/State  35406 non-null  object        
 2   Country         35406 non-null  object        
 3   Lat             35406 non-null  float64       
 4   Long            35406 non-null  float64       
 5   Confirmed       35406 non-null  int64         
 6   Recovered       35406 non-null  int64         
 7   Deaths          35406 non-null  int64         
 8   Active          35406 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(4), object(2)
memory usage: 2.4+ MB


In [169]:
# Cruise-ship records show up either in Province/State or in Country.
on_ship_province = df['Province/State'].str.contains('Grand Princess|Diamond Princess')
on_ship_country = df['Country'].str.contains('Grand Princess|Diamond Princess|MS Zaandam')
ship_rows = on_ship_province | on_ship_country

# Split the data: `ship` keeps the cruise-ship rows, `df` keeps everything else.
# NOTE: df is rebound here, so re-running this cell leaves `ship` empty.
ship = df.loc[ship_rows]
df = df.loc[~ship_rows]
ship

Unnamed: 0,Date,Province/State,Country,Lat,Long,Confirmed,Recovered,Deaths,Active
4662,2020-01-22,Grand Princess,Canada,37.6489,-122.6655,0,0,0,0
4663,2020-01-23,Grand Princess,Canada,37.6489,-122.6655,0,0,0,0
4664,2020-01-24,Grand Princess,Canada,37.6489,-122.6655,0,0,0,0
4665,2020-01-25,Grand Princess,Canada,37.6489,-122.6655,0,0,0,0
4666,2020-01-26,Grand Princess,Canada,37.6489,-122.6655,0,0,0,0
...,...,...,...,...,...,...,...,...,...
31747,2020-05-22,,MS Zaandam,0.0000,0.0000,9,0,2,7
31748,2020-05-23,,MS Zaandam,0.0000,0.0000,9,0,2,7
31749,2020-05-24,,MS Zaandam,0.0000,0.0000,9,0,2,7
31750,2020-05-25,,MS Zaandam,0.0000,0.0000,9,0,2,7


In [170]:
# Ship rows for the most recent date in the data.
# Series.max() is vectorised and skips missing dates, unlike the Python builtin
# max(), which iterates the column element by element.
ship_latest = ship[ship['Date'] == ship['Date'].max()]
ship_latest

Unnamed: 0,Date,Province/State,Country,Lat,Long,Confirmed,Recovered,Deaths,Active
4787,2020-05-26,Grand Princess,Canada,37.6489,-122.6655,13,0,0,13
11213,2020-05-26,,Diamond Princess,0.0,0.0,712,651,13,48
29231,2020-05-26,Diamond Princess,Canada,0.0,0.0,1,0,1,0
31751,2020-05-26,,MS Zaandam,0.0,0.0,9,0,2,7


In [178]:
# Display the latest ship rows with a background colour gradient ('Pastel1_r' colormap).
ship_latest.style.background_gradient(cmap = 'Pastel1_r')

Unnamed: 0,Date,Province/State,Country,Lat,Long,Confirmed,Recovered,Deaths,Active
4787,2020-05-26 00:00:00,Grand Princess,Canada,37.6489,-122.6655,13,0,0,13
11213,2020-05-26 00:00:00,,Diamond Princess,0.0,0.0,712,651,13,48
29231,2020-05-26 00:00:00,Diamond Princess,Canada,0.0,0.0,1,0,1,0
31751,2020-05-26 00:00:00,,MS Zaandam,0.0,0.0,9,0,2,7


## Latest Worldwide Active, Deaths, and Recovered Totals (Treemap)

In [188]:
# Worldwide totals per date, then keep only the most recent date.
# Columns are selected with a list: selecting several groupby columns with a
# bare tuple was deprecated in pandas 1.0 and later removed.
temp = df.groupby('Date')[['Confirmed', 'Deaths', 'Recovered', 'Active']].sum().reset_index()
temp = temp[temp['Date'] == temp['Date'].max()].reset_index(drop = True)

# Long format: one row per metric, so each metric becomes one treemap tile.
tm = temp.melt(id_vars = 'Date', value_vars = ['Active', 'Deaths', 'Recovered'])
# NOTE(review): the colour list order (act, rec, dth) differs from the melt order
# (Active, Deaths, Recovered) — confirm each tile gets its intended colour.
fig = px.treemap(tm, path = ['variable'], values = 'value', height = 250, width = 800, color_discrete_sequence=[act, rec, dth])
fig.data[0].textinfo = 'label+value'

fig.show(renderer='colab')

## Folium Map

In [190]:
# Cases across the world: one circle per location on the most recent date.
temp = df[df['Date'] == df['Date'].max()]

m1 = folium.Map(location=[0, 0], tiles='cartodbpositron', min_zoom=1, max_zoom=3, zoom_start=1)

# Iterate each row once instead of calling temp.iloc[i] six times per iteration.
for _, row in temp.iterrows():
    # <b> is the valid HTML bold element; the previous <bold> tag is not an HTML
    # element, so nothing was emphasised.
    tooltip = (
        '<li><b>Country:</b> ' + str(row['Country']) +
        '<li><b>Province:</b> ' + str(row['Province/State']) +
        '<li><b>Confirmed:</b> ' + str(row['Confirmed']) +
        '<li><b>Deaths:</b> ' + str(row['Deaths'])
    )
    folium.Circle(
        location=[row['Lat'], row['Long']],
        color='crimson',
        fill=True,  # boolean flag; the fill colour falls back to `color`
        tooltip=tooltip,
        radius=int(row['Confirmed']) ** 0.5,  # square root damps the spread between locations
    ).add_to(m1)

m1

## Choropleth Map

In [191]:
# Preview the per-country, per-day table used by the choropleth below.
country_daywise.head()

Unnamed: 0,Date,Country,Confirmed,Deaths,Recovered,Active,New Cases,New Deaths,New Recovered
0,2020-01-22,Afghanistan,0,0,0,0,0,0,0
1,2020-01-22,Albania,0,0,0,0,0,0,0
2,2020-01-22,Algeria,0,0,0,0,0,0,0
3,2020-01-22,Andorra,0,0,0,0,0,0,0
4,2020-01-22,Angola,0,0,0,0,0,0,0


In [196]:
# Animated world map coloured by log(confirmed cases), one frame per date.
# Zero counts are masked to NaN before the log: log(0) is -inf and raises a
# divide-by-zero warning (silenced by the global warnings filter), whereas NaN
# states explicitly that there is no value to colour for that country/date.
confirmed_positive = country_daywise['Confirmed'].where(country_daywise['Confirmed'] > 0)

fig = px.choropleth(country_daywise, locations= 'Country', locationmode='country names', color = np.log(confirmed_positive),
                   hover_name = 'Country', animation_frame=country_daywise['Date'].dt.strftime('%Y-%m-%d'),
                   title='Cases over time', color_continuous_scale=px.colors.sequential.Inferno)

fig.update(layout_coloraxis_showscale = True)
fig.show(renderer='colab')

## Confirmed and Death Cases with Static Colormap

In [209]:
def _log_choropleth(metric):
    """Return a world choropleth coloured by log(`metric`) from `countywise`.

    Only countries whose `metric` is positive are included, so the log is
    always finite. Each map is filtered on its own metric; previously the
    Recovered map reused the Deaths > 0 filter, which dropped countries with
    recoveries but no deaths and could still take log(0).
    """
    positive = countywise[countywise[metric] > 0]
    return px.choropleth(positive, locations='Country', locationmode='country names',
                         color=np.log(positive[metric]), hover_name='Country',
                         hover_data=[metric])


fig_conf = _log_choropleth('Confirmed')
fig_d = _log_choropleth('Deaths')
fig_r = _log_choropleth('Recovered')

# Stack the three maps vertically in a single figure.
fig = make_subplots(rows=3, cols=1, subplot_titles=['Confirmed', 'Deaths', 'Recovered'],
                    specs=[[{'type': 'choropleth'}], [{'type': 'choropleth'}], [{'type': 'choropleth'}]])

for row, panel in enumerate((fig_conf, fig_d, fig_r), start=1):
    fig.add_trace(panel['data'][0], row=row, col=1)

fig.show(renderer='colab')

## New Cases and Number of Countries

In [211]:
# Two side-by-side bar charts built from the day-wise table.
# NOTE(review): the left panel is titled "No. of New Cases per Day" but plots the
# 'Confirmed' column — confirm which column is intended.
fig_c = px.bar(daywise, x='Date', y='Confirmed', color_discrete_sequence=[act])
fig_d = px.bar(daywise, x='Date', y='No. of Countries', color_discrete_sequence=[dth])

fig = make_subplots(
    rows=1,
    cols=2,
    shared_xaxes=False,
    horizontal_spacing=0.1,
    subplot_titles=('No. of New Cases per Day', 'No. of Countries'),
)

# Copy the single bar trace of each express figure into its column.
for col, panel in enumerate((fig_c, fig_d), start=1):
    fig.add_trace(panel['data'][0], row=1, col=col)

fig.show(renderer='colab')

## Top 10 Countries Analysis

In [214]:
top = 10


def _top_bar(frame, column, color):
    """Return a horizontal bar chart of the `top` rows of `frame` with the largest `column` values.

    Rows are sorted ascending and the tail is taken, so the largest value is
    the last row of the plotted data. Bars are labelled with their value and
    drawn in the single colour `color`.
    """
    return px.bar(frame.sort_values(column).tail(top), x=column, y='Country',
                  text=column, orientation='h', color_discrete_sequence=[color])


fig_c = _top_bar(countywise, 'Confirmed', act)
fig_d = _top_bar(countywise, 'Deaths', dth)

fig_a = _top_bar(countywise, 'Active', '#434343')
fig_r = _top_bar(countywise, 'Recovered', rec)

fig_dc = _top_bar(countywise, 'Deaths / 100 Cases', '#f84351')
fig_rc = _top_bar(countywise, 'Recovered / 100 Cases', '#a45398')

fig_nc = _top_bar(countywise, 'New Cases', '#f04341')
# Per-capita ranking only considers countries with more than one million people.
temp = countywise[countywise['Population']>1000000]
fig_p = _top_bar(temp, 'Cases / Million People', '#b40398')

fig_wc = _top_bar(countywise, '1 week change', '#c04041')
# Percentage increase only considers countries with more than 100 confirmed cases.
temp = countywise[countywise['Confirmed']>100]
fig_wi = _top_bar(temp, '1 week % increase', '#b00398')


fig = make_subplots(rows = 5, cols = 2, shared_xaxes=False, horizontal_spacing=0.2,
                    vertical_spacing=.05,
                    subplot_titles=('Confirmed Cases', 'Deaths Reported', 'Recovered Cases', 'Active Cases',
                                    'Deaths / 100 Cases', 'Recovered / 100 Cases',
                                    'New Cases', 'Cases / Million People',
                                    '1 week change', '1 week % increase'))

# Grid layout mirrors the subplot titles above: one (left, right) pair per row.
panel_grid = (
    (fig_c, fig_d),
    (fig_r, fig_a),
    (fig_dc, fig_rc),
    (fig_nc, fig_p),
    (fig_wc, fig_wi),
)
for row, pair in enumerate(panel_grid, start=1):
    for col, panel in enumerate(pair, start=1):
        fig.add_trace(panel['data'][0], row=row, col=col)

fig.update_layout(height = 2000)
fig.show(renderer='colab')


## Bar Plots of New Cases and Confirmed Cases

In [243]:
# Confirmed cases per date, stacked by country.
fig = px.bar(
    country_daywise,
    x='Date',
    y='Confirmed',
    color='Country',
    height=600,
    title='Confirmed',
    color_discrete_sequence=px.colors.cyclical.mygbm,
)

fig.show(renderer='colab')

In [223]:
# New cases per date, stacked by country.
fig = px.bar(
    country_daywise,
    x='Date',
    y='New Cases',
    color='Country',
    height=600,
    title='New Cases',
    color_discrete_sequence=px.colors.cyclical.mygbm,
)
fig.show(renderer='colab')

## Growth Rate after 100 cases, 1000 cases and after 100000 cases

In [244]:
# Countries that ever exceeded 100 confirmed cases.
gt_100 = country_daywise.loc[country_daywise['Confirmed'] > 100, 'Country'].unique()

# Per-country daily totals, restricted to the days above the threshold.
temp = (
    df[df['Country'].isin(gt_100)]
    .groupby(['Country', 'Date'])['Confirmed']
    .sum()
    .reset_index()
)
temp = temp.loc[temp['Confirmed'] > 100]

# First date on which each country was above the threshold.
min_date = (
    temp.groupby('Country')['Date']
    .min()
    .reset_index()
    .rename(columns={'Date': 'Min Date'})
)

# Days elapsed since that first date, per country.
from_100th_case = temp.merge(min_date, on='Country')
from_100th_case['N days'] = (from_100th_case['Date'] - from_100th_case['Min Date']).dt.days

fig = px.line(
    from_100th_case,
    x='N days',
    y='Confirmed',
    color='Country',
    title='N days from 100 case',
    height=600,
)
fig.show(renderer='colab')

In [245]:
# Countries that ever exceeded 1000 confirmed cases.
gt_1000 = country_daywise.loc[country_daywise['Confirmed'] > 1000, 'Country'].unique()

# Per-country daily totals, restricted to the days above the threshold.
temp = (
    df[df['Country'].isin(gt_1000)]
    .groupby(['Country', 'Date'])['Confirmed']
    .sum()
    .reset_index()
)
temp = temp.loc[temp['Confirmed'] > 1000]

# First date on which each country was above the threshold.
min_date = (
    temp.groupby('Country')['Date']
    .min()
    .reset_index()
    .rename(columns={'Date': 'Min Date'})
)

# Days elapsed since that first date, per country.
from_1000th_case = temp.merge(min_date, on='Country')
from_1000th_case['N days'] = (from_1000th_case['Date'] - from_1000th_case['Min Date']).dt.days

fig = px.line(
    from_1000th_case,
    x='N days',
    y='Confirmed',
    color='Country',
    title='N days from 1000 case',
    height=600,
)
fig.show(renderer='colab')

In [248]:
# Countries that ever exceeded 100000 confirmed cases.
gt_100000 = country_daywise.loc[country_daywise['Confirmed'] > 100000, 'Country'].unique()

# Per-country daily totals, restricted to the days above the threshold.
temp = (
    df[df['Country'].isin(gt_100000)]
    .groupby(['Country', 'Date'])['Confirmed']
    .sum()
    .reset_index()
)
temp = temp.loc[temp['Confirmed'] > 100000]

# First date on which each country was above the threshold.
min_date = (
    temp.groupby('Country')['Date']
    .min()
    .reset_index()
    .rename(columns={'Date': 'Min Date'})
)

# Days elapsed since that first date, per country.
from_100000th_case = temp.merge(min_date, on='Country')
from_100000th_case['N days'] = (from_100000th_case['Date'] - from_100000th_case['Min Date']).dt.days

fig = px.line(
    from_100000th_case,
    x='N days',
    y='Confirmed',
    color='Country',
    title='N days from 100000 case',
    height=600,
)
fig.show(renderer='colab')

## Covid-19 vs Other Similar Epidemics

In [255]:
# Wikipedia Source

# `full_latest` is not defined in any visible cell, so a fresh kernel raised a
# NameError here. It is rebuilt as the rows of df for the most recent date.
# NOTE(review): df excludes the cruise-ship rows at this point — confirm that
# ship-excluded totals are what the COVID-19 row should report.
full_latest = df[df['Date'] == df['Date'].max()]

epidemics = pd.DataFrame({
    'epidemic' : ['COVID-19', 'SARS', 'EBOLA', 'MERS', 'H1N1'],
    'start_year' : [2019, 2002, 2013, 2012, 2009],
    'end_year' : [2020, 2004, 2016, 2020, 2010],
    'confirmed' : [full_latest['Confirmed'].sum(), 8422, 28646, 2519, 6724149],
    'deaths' : [full_latest['Deaths'].sum(), 813, 11323, 866, 19654]
})

# Deaths per 100 confirmed cases, rounded to two decimals.
epidemics['mortality'] = round((epidemics['deaths']/epidemics['confirmed'])*100, 2)

epidemics.head()

Unnamed: 0,epidemic,start_year,end_year,confirmed,deaths,mortality
0,COVID-19,2019,2020,5588891,350437,6.27
1,SARS,2002,2004,8422,813,9.65
2,EBOLA,2013,2016,28646,11323,39.53
3,MERS,2012,2020,2519,866,34.38
4,H1N1,2009,2010,6724149,19654,0.29


In [256]:
# Long format: one row per (epidemic, measure) so each measure gets its own facet.
temp = pd.melt(
    epidemics,
    id_vars='epidemic',
    value_vars=['confirmed', 'deaths', 'mortality'],
    var_name='Case',
    value_name='Value',
)

fig = px.bar(
    temp,
    x='epidemic',
    y='Value',
    color='epidemic',
    text='Value',
    facet_col='Case',
    color_discrete_sequence=px.colors.qualitative.Bold,
)

fig.update_traces(textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.update_yaxes(showticklabels=False)
# Unlink the y-axes of the second and third facets so each uses its own scale.
for facet_axis in (fig.layout.yaxis2, fig.layout.yaxis3):
    facet_axis.update(matches=None)
fig.show(renderer='colab')