<a href="https://colab.research.google.com/github/SriSatyaLokesh/AutoTimeTracker/blob/master/COVID19_Worldwide_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## COVID19 Exploratory Data Analysis
### World wide

#### follwing cells is for performing data analysis in google colab

In [0]:
# upload your kaggle API token (you can get that from your account) 
from google.colab import files
uploaded = files.upload()

Saving kaggle.json to kaggle.json


In [0]:
# Run this to create a kaggle environment
# !pip install -q kaggle
# !mkdir -p ~/.kaggle
# !cp kaggle.json ~/.kaggle/kaggle.json
# !chmod 600 /root/.kaggle/kaggle.json

# import numpy as np
# import pandas as pd
# import plotly.express as px

**```Let's perform exploratory data analysis on covid-19 data ```**
- I'm using data from kaggle and github
- Global covid-19 data https://www.kaggle.com/imdevskp/corona-virus-report/
- India covid-19 data https://www.kaggle.com/sudalairajkumar/novel-corona-virus-2019-dataset/
- Time series covid-19 data https://github.com/CSSEGISandData/COVID-19.git

In [0]:
# Get data from kaggle 
import zipfile
# Download data
!kaggle datasets download -d imdevskp/corona-virus-report/
!kaggle datasets download -d sudalairajkumar/novel-corona-virus-2019-dataset/

# UnZip data
zip_ref = zipfile.ZipFile("corona-virus-report.zip", 'r')
zip_ref.extractall()
zip_ref = zipfile.ZipFile("novel-corona-virus-2019-dataset.zip", 'r')
zip_ref.extractall()
zip_ref.close()

Downloading corona-virus-report.zip to /content
 43% 3.00M/6.90M [00:00<00:00, 23.9MB/s]
100% 6.90M/6.90M [00:00<00:00, 33.8MB/s]
Downloading novel-corona-virus-2019-dataset.zip to /content
  0% 0.00/713k [00:00<?, ?B/s]
100% 713k/713k [00:00<00:00, 47.1MB/s]


In [0]:
# Get data from github 

# Download data
!git clone https://github.com/CSSEGISandData/COVID-19.git

Cloning into 'COVID-19'...
remote: Enumerating objects: 18918, done.[K
remote: Total 18918 (delta 0), reused 0 (delta 0), pack-reused 18918[K
Receiving objects: 100% (18918/18918), 76.14 MiB | 32.19 MiB/s, done.
Resolving deltas: 100% (9737/9737), done.


In [0]:
!ls

corona-virus-report.zip		     sample_data
COVID-19			     time_series_covid_19_confirmed.csv
covid_19_clean_complete.csv	     time_series_covid_19_confirmed_US.csv
covid_19_data.csv		     time_series_covid_19_deaths.csv
COVID19_line_list_data.csv	     time_series_covid_19_deaths_US.csv
COVID19_open_line_list.csv	     time_series_covid_19_recovered.csv
kaggle.json			     usa_county_wise.csv
novel-corona-virus-2019-dataset.zip


In [0]:
#IMPORT required libraries
import numpy as np
import pandas as pd
import plotly.express as px

In [0]:
# load the data
data = pd.read_csv("covid_19_clean_complete.csv")

In [0]:
print(data.columns)
data.tail(5)

Index(['Province/State', 'Country/Region', 'Lat', 'Long', 'Date', 'Confirmed',
       'Deaths', 'Recovered'],
      dtype='object')


Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered
20092,Falkland Islands (Malvinas),United Kingdom,-51.7963,-59.5236,4/7/20,2,0,0
20093,Saint Pierre and Miquelon,France,46.8852,-56.3159,4/7/20,1,0,0
20094,,South Sudan,6.877,31.307,4/7/20,2,0,0
20095,,Western Sahara,24.2155,-12.8858,4/7/20,4,0,0
20096,,Sao Tome and Principe,0.18636,6.613081,4/7/20,4,0,0


In [0]:
data[data["Country/Region"]=="India"]["Province/State"]

131      NaN
392      NaN
653      NaN
914      NaN
1175     NaN
        ... 
18923    NaN
19184    NaN
19445    NaN
19706    NaN
19967    NaN
Name: Province/State, Length: 77, dtype: object

#### Even India doesn't have state specification so we should fill those values

In [0]:
# Replacing all the NaN values with Country/Region
data["Province/State"].fillna(data["Country/Region"], inplace=True)

In [0]:
data.tail(5)

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered
20092,Falkland Islands (Malvinas),United Kingdom,-51.7963,-59.5236,4/7/20,2,0,0
20093,Saint Pierre and Miquelon,France,46.8852,-56.3159,4/7/20,1,0,0
20094,South Sudan,South Sudan,6.877,31.307,4/7/20,2,0,0
20095,Western Sahara,Western Sahara,24.2155,-12.8858,4/7/20,4,0,0
20096,Sao Tome and Principe,Sao Tome and Principe,0.18636,6.613081,4/7/20,4,0,0


In [0]:
data[data["Country/Region"]=="India"]["Province/State"]

131      India
392      India
653      India
914      India
1175     India
         ...  
18923    India
19184    India
19445    India
19706    India
19967    India
Name: Province/State, Length: 77, dtype: object

###### We have filled all NaN values, we are ready to perform analysis

In [0]:
data["Date"].tail(5)

20092    4/7/20
20093    4/7/20
20094    4/7/20
20095    4/7/20
20096    4/7/20
Name: Date, dtype: object

In [0]:
#we need to form date with that specific format
from datetime import datetime as dt,date,timedelta
today = dt.now()-timedelta(days=3)
today = dt.strftime(today,"%-m/%-d/%y")
print(today)

4/7/20


In [0]:

latest_data = data.loc[data["Date"]==today][['Province/State',"Country/Region",'Lat','Long',"Confirmed","Deaths","Recovered"]]

### Let's find total active cases


In [0]:

#total active cases in every country
latest_data["Active"] = latest_data["Confirmed"] - latest_data["Deaths"] - latest_data["Recovered"]

In [0]:
latest_data.tail()

Unnamed: 0,Province/State,Country/Region,Lat,Long,Confirmed,Deaths,Recovered,Active
20092,Falkland Islands (Malvinas),United Kingdom,-51.7963,-59.5236,2,0,0,2
20093,Saint Pierre and Miquelon,France,46.8852,-56.3159,1,0,0,1
20094,South Sudan,South Sudan,6.877,31.307,2,0,0,2
20095,Western Sahara,Western Sahara,24.2155,-12.8858,4,0,0,4
20096,Sao Tome and Principe,Sao Tome and Principe,0.18636,6.613081,4,0,0,4


###### Aggregating the results specific to each country

In [0]:
latest_data_aggregated = latest_data.groupby("Country/Region",as_index=False)[["Country/Region","Confirmed","Deaths","Recovered","Active"]].sum()

In [0]:
latest_data_aggregated.tail()

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Active
179,Vietnam,249,0,123,126
180,West Bank and Gaza,261,1,42,218
181,Western Sahara,4,0,0,4
182,Zambia,39,1,7,31
183,Zimbabwe,11,2,0,9


Let's see how each country is affected with the COVID-19 **Confirmed** cases through the color from light to dark(heatmap)

- green color means least no of confirmed cases
- yellow color means mid range of confirmed cases
- red color means highest no of confirmed cases

We can see **US, Italy, China, Spain, Germany, France** are highly affected countries 

In [0]:
config = dict({'scrollZoom': False})
worldwide_confirmed_cases_fig = px.choropleth(latest_data_aggregated, locations="Country/Region", 
                    locationmode='country names', color="Confirmed", 
                    hover_name="Country/Region", range_color=[1,8000], 
                    # color_continuous_scale="peach", 
                    # color_continuous_scale="Inferno", 
                    # color_continuous_scale=px.colors.sequential.Cividis_r,
                    color_continuous_scale=["green", "yellow", "red"], 
                    title='Countries with Confirmed Cases - '+str(today))

worldwide_confirmed_cases_fig.show(config=config)

Let's see how each country is affected with the COVID-19 **Active** cases through the color from **green to red** (heatmap)

- green color means least no of active cases
- yellow color means mid range of active cases
- red color means highest no of active cases



In [0]:
worldwide_active_cases_fig = px.choropleth(latest_data_aggregated, locations="Country/Region", 
                    locationmode='country names', color="Active", 
                    hover_name="Country/Region", range_color=[1,8000], 
                    color_continuous_scale=["green", "yellow", "red"], 
                    title='Countries with Active Cases - '+str(today))

worldwide_active_cases_fig.show(config=config)

Let's see how each country is affected with the COVID-19 **Death** cases through the color from **green to red** (heatmap)

- green color means least no of death cases
- yellow color means mid range of death cases
- red color means highest no of death cases

We can see **Italy, Spain, China, Iran, France** have high Death rates

Least Death rate countries are **Guinea, Haiti, Rwanda, Qatar, Liberia**

In [0]:
worldwide_death_cases_fig = px.choropleth(latest_data_aggregated, locations="Country/Region", 
                    locationmode='country names', color="Deaths", 
                    hover_name="Country/Region", range_color=[1,2000], 
                    color_continuous_scale=["green", "yellow", "red"], 
                    title='Countries with Death Cases - '+str(today))

worldwide_death_cases_fig.show(config=config)

Let's see how each country is affected with the COVID-19 **Recovered** cases through the color from **green to red** (heatmap)

- red color means least no of recovered cases
- yellow color means mid range of recovered cases
- green color means highest no of recovered cases


In [0]:
worldwide_recovered_cases_fig = px.choropleth(latest_data_aggregated, locations="Country/Region", 
                    locationmode='country names', color="Recovered", 
                    hover_name="Country/Region", range_color=[1,8000], 
                    color_continuous_scale=["yellow", "green"], 
                    title='Countries with Recovered Cases - '+str(today))

worldwide_recovered_cases_fig.show(config=config)

### Let's see how this spread over time world wide

In [0]:
data["Active"] = data["Confirmed"] - data["Deaths"] - data["Recovered"]
world_over_time = data.groupby(['Date', 'Country/Region'],as_index=False)[['Confirmed', 'Deaths', "Recovered", "Active"]].max()
world_over_time = world_over_time.reset_index()
world_over_time['Date'] = pd.to_datetime(world_over_time['Date'])
world_over_time['Date'] = world_over_time['Date'].dt.strftime('%Y-%m-%d')
world_over_time = world_over_time.sort_values('Date')
world_over_time.tail()


Unnamed: 0,index,Date,Country/Region,Confirmed,Deaths,Recovered,Active
14047,14047,2020-04-07,Gambia,4,1,2,1
14048,14048,2020-04-07,Georgia,196,3,46,147
14049,14049,2020-04-07,Germany,107663,2016,36081,69566
14040,14040,2020-04-07,Estonia,1149,21,69,1059
14167,14167,2020-04-07,Zimbabwe,11,2,0,9


#####  1. This is how the confirmed cases grown over time

In [0]:

world_over_time['size'] = world_over_time['Confirmed'].pow(0.2) # will be used for the size of circrle
world_over_time.tail()

confirmed_cases_over_time_fig = px.scatter_geo(world_over_time, locations="Country/Region", locationmode='country names', 
                            color="Confirmed", size='size', hover_name="Country/Region", 
                            range_color= [0, 5000], 
                            projection="natural earth", animation_frame="Date", 
                            title='COVID-19: Confiremed cases Spread Over Time till - '+str(today), color_continuous_scale=["green","yellow","red"])
confirmed_cases_over_time_fig.show(config=config)


##### 2. This is how death cases spread over time

In [0]:
world_over_time['size'] = world_over_time['Deaths'].pow(0.2)

deaths_over_time_fig = px.scatter_geo(world_over_time, locations="Country/Region", locationmode='country names', 
                     color="Deaths", size='size', hover_name="Country/Region", 
                     range_color= [0, 2000], 
                    #  projection="", 
                     animation_frame="Date", 
                     title='COVID-19: Deaths cases Spread Over Time', color_continuous_scale=["yellow","red"])
deaths_over_time_fig.show(config=config)

##### 3. recovered cases over time

In [0]:
world_over_time['size'] = world_over_time['Recovered'].pow(0.2)

recovered_cases_fig = px.scatter_geo(world_over_time, locations="Country/Region", locationmode='country names', 
                     color="Recovered", size='size', hover_name="Country/Region", 
                     range_color= [0, 2000], 
                     projection="natural earth", 
                     animation_frame="Date", 
                     title='COVID-19: Recovered cases Spread Over Time', color_continuous_scale=["orange","yellow","green"])
recovered_cases_fig.show(config=config)

### Let's see how each continent is affected

###### 1. Europe

In [0]:
europe_fig = px.choropleth(latest_data_aggregated, locations="Country/Region", 
                    locationmode='country names', color="Active", 
                    hover_name="Country/Region", range_color=[1,8000], 
                    color_continuous_scale=["green","yellow","red"], 
                    title='European Countries with Active Cases',scope='europe', height=600)
europe_fig.show(config=config)

###### 2. Asia

This plot helps us to understand the active cases in the european countries through the color from light to dark(heatmap)


- green color means least no of active cases
- yellow color means least no of active cases
- red color means highest no of active cases



In [0]:
asia_fig = px.choropleth(latest_data_aggregated, locations="Country/Region", 
                    locationmode='country names', color="Active", 
                    hover_name="Country/Region", range_color=[1,6000], 
                    color_continuous_scale=["green","yellow","red"], 
                    title='Asian Countries with Active Cases',scope='asia', height=600)
asia_fig.show(config=config)

### graphs for Top 15 countries world wide for all below factors -

*   Confirmed cases
*   Death cases
*   Recovered cases
*   Active cases



In [0]:
from plotly.subplots import make_subplots
# Confirmed cases
fig_c = px.bar(latest_data_aggregated.sort_values('Confirmed', ascending=False)[:15][::-1], 
             x='Confirmed', y='Country/Region',text='Confirmed', orientation='h',
             color="Confirmed",range_color=[1,80000],  )
#fig.show()

# Death cases
fig_d = px.bar(latest_data_aggregated.sort_values('Deaths', ascending=False)[:15][::-1], 
             x='Deaths', y='Country/Region', text='Deaths', orientation='h',
             color="Deaths",range_color=[1,8000],  )
#fig.show()

# Recovered cases
fig_r = px.bar(latest_data_aggregated.sort_values('Recovered', ascending=False)[:15][::-1], 
             x='Recovered', y='Country/Region', text='Recovered', orientation='h',
              color="Recovered",range_color=[1,50000],  )
#fig.show()

# Active cases
fig_a = px.bar(latest_data_aggregated.sort_values('Active', ascending=False)[:15][::-1], 
             x='Active', y='Country/Region', text='Active', orientation='h',
             color="Active",range_color=[1,80000], )
#fig.show()


fig = make_subplots(rows=2, cols=2, shared_xaxes=False, vertical_spacing=0.08, horizontal_spacing=0.1,
                    subplot_titles=("Confirmed", "Deaths", "Recovered", "Active"))
fig.add_trace(fig_c['data'][0], row=1, col=1)
fig.add_trace(fig_d['data'][0], row=1, col=2)
fig.add_trace(fig_r['data'][0], row=2, col=1)
fig.add_trace(fig_a['data'][0], row=2, col=2)
fig.update_layout(height=800, title_text="Top 15 Countries",)

#### Let's see ASIA continent specific graphs

In [0]:
! pip install datapackage

In [0]:
# to get list of asian countries
from datapackage import Package

package = Package('https://datahub.io/JohnSnowLabs/country-and-continent-codes-list/datapackage.json')

# print list of all resources:
# print(package.resource_names)
countries = package.get_resource("country-and-continent-codes-list-csv_csv")
pd_countries = pd.DataFrame(countries.read())
asian_countries = list(pd_countries[pd_countries[0]=="Asia"][2])
asian_countries = [country.split(',')[0] for country in asian_countries]



In [0]:
#get asian countries in data
asia_latest_data_aggregated = latest_data_aggregated[latest_data_aggregated['Country/Region'].isin(asian_countries)]
asia_latest_data_aggregated

In [106]:
from plotly.subplots import make_subplots
# Confirmed cases
fig_c = px.bar(asia_latest_data_aggregated.sort_values('Confirmed', ascending=False)[:15][::-1], 
             x='Confirmed', y='Country/Region',text='Confirmed', orientation='h')
#fig.show()

# Death cases
fig_d = px.bar(asia_latest_data_aggregated.sort_values('Deaths', ascending=False)[:15][::-1], 
             x='Deaths', y='Country/Region', text='Deaths', orientation='h')
#fig.show()

# Recovered cases
fig_r = px.bar(asia_latest_data_aggregated.sort_values('Recovered', ascending=False)[:15][::-1], 
             x='Recovered', y='Country/Region', text='Recovered', orientation='h')
#fig.show()

# Active cases
fig_a = px.bar(asia_latest_data_aggregated.sort_values('Active', ascending=False)[:15][::-1], 
             x='Active', y='Country/Region', text='Active', orientation='h')
#fig.show()


fig = make_subplots(rows=2, cols=2, shared_xaxes=False, vertical_spacing=0.08, horizontal_spacing=0.1,
                    subplot_titles=("Confirmed", "Deaths", "Recovered", "Active"))
fig.add_trace(fig_c['data'][0], row=1, col=1)
fig.add_trace(fig_d['data'][0], row=1, col=2)
fig.add_trace(fig_r['data'][0], row=2, col=1)
fig.add_trace(fig_a['data'][0], row=2, col=2)
fig.update_layout(height=800, title_text="Top 15 countries in Asia")

##### 2. Europe specific

In [0]:
europe = list(['Austria','Belgium','Bulgaria','Croatia','Cyprus','Czechia','Denmark','Estonia','Finland','France','Germany','Greece','Hungary','Ireland',
               'Italy', 'Latvia','Luxembourg','Lithuania','Malta','Norway','Netherlands','Poland','Portugal','Romania','Slovakia','Slovenia',
               'Spain', 'Sweden', 'United Kingdom', 'Iceland', 'Russia', 'Switzerland', 'Serbia', 'Ukraine', 'Belarus',
               'Albania', 'Bosnia and Herzegovina', 'Kosovo', 'Moldova', 'Montenegro', 'North Macedonia'])

europe_latest_data_aggregated = latest_data_aggregated[latest_data_aggregated['Country/Region'].isin(europe)]

In [105]:
from plotly.subplots import make_subplots
# Confirmed cases
fig_c = px.bar(europe_latest_data_aggregated.sort_values('Confirmed', ascending=False)[:15][::-1], 
             x='Confirmed', y='Country/Region',text='Confirmed', orientation='h')
#fig.show()

# Death cases
fig_d = px.bar(europe_latest_data_aggregated.sort_values('Deaths', ascending=False)[:15][::-1], 
             x='Deaths', y='Country/Region', text='Deaths', orientation='h')
#fig.show()

# Recovered cases
fig_r = px.bar(europe_latest_data_aggregated.sort_values('Recovered', ascending=False)[:15][::-1], 
             x='Recovered', y='Country/Region', text='Recovered', orientation='h')
#fig.show()

# Active cases
fig_a = px.bar(europe_latest_data_aggregated.sort_values('Active', ascending=False)[:15][::-1], 
             x='Active', y='Country/Region', text='Active', orientation='h')
#fig.show()


fig = make_subplots(rows=2, cols=2, shared_xaxes=False, vertical_spacing=0.08, horizontal_spacing=0.1,
                    subplot_titles=("Confirmed", "Deaths", "Recovered", "Active"))
fig.add_trace(fig_c['data'][0], row=1, col=1)
fig.add_trace(fig_d['data'][0], row=1, col=2)
fig.add_trace(fig_r['data'][0], row=2, col=1)
fig.add_trace(fig_a['data'][0], row=2, col=2)
fig.update_layout(height=800, title_text="Top 15 countries in Europe")

### lets see how these are over time world wide with line charts 

In [109]:
grouped_data = data.groupby('Date')[['Date', 'Confirmed', 'Deaths', 'Recovered', 'Active']].sum().reset_index()
grouped_data = grouped_data.reset_index()
grouped_data['Date'] = pd.to_datetime(grouped_data['Date'])
grouped_data['Date'] = grouped_data['Date'].dt.strftime('%m/%d/%Y')
grouped_data = grouped_data.sort_values('Date')
grouped_data

Unnamed: 0,index,Date,Confirmed,Deaths,Recovered,Active
0,0,01/22/2020,555,17,28,510
1,1,01/23/2020,654,18,30,606
2,2,01/24/2020,941,26,36,879
3,3,01/25/2020,1434,42,39,1353
4,4,01/26/2020,2118,56,52,2010
...,...,...,...,...,...,...
72,72,04/03/2020,1095915,58787,223621,813507
73,73,04/04/2020,1197403,64606,243572,889225
74,74,04/05/2020,1272113,69374,256997,945742
75,75,04/06/2020,1345099,74565,273256,997278


In [111]:
# confirmed cases
fig_c = px.line(grouped_data, x="Date", y="Confirmed", 
              title="Worldwide Confirmed Cases Over Time")

# Death cases
fig_d = px.line(grouped_data, x="Date", y="Deaths", 
              title="Worldwide Death Cases Over Time")

# Recovered cases
fig_r = px.line(grouped_data, x="Date", y="Recovered", 
              title="Worldwide Recovered Cases Over Time")

# Active cases
fig_a = px.line(grouped_data, x="Date", y="Active", 
              title="Worldwide Active Cases Over Time")

fig = make_subplots(rows=2, cols=2, shared_xaxes=False, vertical_spacing=0.2, horizontal_spacing=0.1,
                    subplot_titles=("Confirmed", "Deaths", "Recovered", "Active"))
fig.add_trace(fig_c['data'][0], row=1, col=1)
fig.add_trace(fig_d['data'][0], row=1, col=2)
fig.add_trace(fig_r['data'][0], row=2, col=1)
fig.add_trace(fig_a['data'][0], row=2, col=2)
fig.update_layout(height=800, title_text="World wide data over time")

#### lets see how these are over time world wide with line charts(Logarithmic Scale)

In [112]:
# confirmed cases
fig_c = px.line(grouped_data, x="Date", y="Confirmed", 
              title="Worldwide Confirmed Cases Over Time", log_y=True)
fig_c.show()

# Death cases
fig_d = px.line(grouped_data, x="Date", y="Deaths", 
              title="Worldwide Death Cases Over Time", log_y=True)
fig_d.show()

# Recovered cases
fig_r = px.line(grouped_data, x="Date", y="Recovered", 
              title="Worldwide Recovered Cases Over Time", log_y=True)
fig_r.show()

# Active cases
fig_a = px.line(grouped_data, x="Date", y="Active", 
              title="Worldwide Active Cases Over Time", log_y=True)
fig_a.show()