# Group 1 - Data Project - Covid-19  

**Imports and set magics:**

In [36]:
import pandas as pd
import matplotlib.pyplot as plt
import ipywidgets as widgets
import folium
import plotly.express as px
import requests
import numpy as np 
import ipywidgets as widgets
from matplotlib_venn import venn2 # install with pip install matplotlib-venn
from ipywidgets import interact, interactive, fixed, interact_manual
from datetime import datetime
from IPython.core.display import display, HTML
from plotly.subplots import make_subplots
import plotly.graph_objects as go


# autoreload modules when code is run
%load_ext autoreload
%autoreload 2

# local modules
import dataproject

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Read and clean data

## Covid-19 data retrieved from The Humanitarian Data Exchange, collected by Johns Hopkins University. We use data on confirmed Covid-19 cases, deaths due to Covid-19, patients who have recovered from Covid-19, and summary data for each individual country. ##

**Read the CSSEGIS data** on Covid-19, retrieved from the official data repository for the 2019 Novel Coronavirus Visual Dashboard operated by the Johns Hopkins University Center for Systems Science and Engineering (JHU CSSE). The repository is also supported by the ESRI Living Atlas Team and the Johns Hopkins University Applied Physics Lab (JHU APL). The data is available at https://github.com/CSSEGISandData/COVID-19 and https://data.humdata.org/dataset/novel-coronavirus-2019-ncov-cases. The data is **cleaned** by removing and renaming columns:

In [37]:
# Columns of the global time series that the analysis does not use
drop_these = ['Province/State', 'Lat', 'Long']

# a. Load each global time series, b. rename 'Country/Region' to 'Country',
# c. drop the unused columns -- one chain per frame, no in-place mutation
death = (
    pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv')
    .rename(columns={'Country/Region': 'Country'})
    .drop(drop_these, axis=1)
)
confirmed = (
    pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv')
    .rename(columns={'Country/Region': 'Country'})
    .drop(drop_these, axis=1)
)
recovered = (
    pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv')
    .rename(columns={'Country/Region': 'Country'})
    .drop(drop_these, axis=1)
)

# d. Per-country summary table: only the country column is renamed, nothing is dropped
country = (
    pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/web-data/data/cases_country.csv')
    .rename(columns={'Country_Region': 'Country'})
)

# Visualising the worst-hit countries in terms of confirmed cases


In [38]:
# Per-country summary ordered from most to fewest confirmed cases;
# the charts further down take their top-n rows from this frame.
sorted_country = country.sort_values('Confirmed', ascending= False)

def highlight_col(x):
    """Build the CSS table for ``Styler.apply(highlight_col, axis=None)``.

    The case-count columns are looked up by name rather than by position, so
    the colouring stays attached to the right column if the source table gains,
    loses or reorders columns. A column that is absent is simply skipped.

    Parameters
    ----------
    x : pandas.DataFrame
        The frame being styled.

    Returns
    -------
    pandas.DataFrame
        Same index and columns as ``x``. Cells hold a ``background-color``
        rule for the 'Confirmed', 'Deaths' and 'Recovered' columns and an
        empty string everywhere else.
    """
    colours = {
        'Confirmed': 'background-color: darkblue',
        'Deaths': 'background-color: blue',
        'Recovered': 'background-color: green',
    }
    df1 = pd.DataFrame('', index=x.index, columns=x.columns)
    for column, css in colours.items():
        # Guard the assignment: setting a missing column would add it and
        # make the result's shape differ from the frame being styled.
        if column in df1.columns:
            df1[column] = css
    return df1

def show_latest_cases(n):
    """Return a styled table of the ``n`` countries with the most confirmed cases.

    The rows come from the pre-sorted ``sorted_country`` frame instead of
    re-sorting the global ``country`` on every widget event. Besides saving the
    repeated sort, this keeps the callback working whatever ``country`` is
    bound to at the moment the widget fires.

    Parameters
    ----------
    n : str or int
        Number of rows to show; the text box passes a string, so it is
        converted with ``int``.

    Returns
    -------
    pandas.io.formats.style.Styler
        The top ``n`` rows with the case-count columns coloured by
        ``highlight_col``.
    """
    n = int(n)
    return sorted_country.head(n).style.apply(highlight_col, axis=None)

interact(show_latest_cases, n='10')

Unnamed: 0,Country,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active
17,US,2020-04-06 10:54:02,40.0,-100.0,337933,9653,17582,0
159,Spain,2020-04-06 10:53:42,40.4637,-3.74922,135032,13055,40437,81540
10,Italy,2020-04-06 10:53:42,41.8719,12.5674,128948,15887,21815,91246
7,Germany,2020-04-06 10:53:42,51.1657,10.4515,100132,1584,28700,69848
6,France,2020-04-06 10:53:42,46.2276,2.2137,93780,8093,16354,69333
3,China,2020-04-06 09:37:01,30.5928,114.305,82665,3335,77310,2020
89,Iran,2020-04-06 10:53:42,32.4279,53.688,60500,3739,24236,32525
16,United Kingdom,2020-04-06 10:53:42,55.0,-3.0,48451,4943,229,43279
171,Turkey,2020-04-06 10:53:42,38.9637,35.2433,27069,574,1042,25453
15,Switzerland,2020-04-06 10:53:42,46.8182,8.2275,21282,734,7298,13250


<function __main__.show_latest_cases(n)>

**A more visually intuitive visualisation of the above table**

In [39]:
def bubble_chart(n):
    """Show a bubble chart of the ``n`` countries with the most confirmed cases.

    Each country is one bubble; both the height and the area of the bubble
    encode the number of confirmed cases.

    Parameters
    ----------
    n : int
        Number of countries to include, taken from the top of
        ``sorted_country``.
    """
    fig = px.scatter(sorted_country.head(n), x="Country", y="Confirmed", size="Confirmed", color="Country",
               hover_name="Country", size_max=60)
    fig.update_layout(
    title=str(n) +" Worst hit countries",
    xaxis_title="Countries",
    yaxis_title="Confirmed Cases",
    width = 700
    )
    fig.show()

# An explicit slider starting at 1: the `n=10` shorthand builds a slider from
# -10 to 30, and for n <= 0 `head(n)` no longer means "the top n countries".
interact(bubble_chart, n=widgets.IntSlider(value=10, min=1, max=30, step=1))

<function __main__.bubble_chart(n)>

The plot shows the worst-hit countries in terms of confirmed cases. It is evident that the US is experiencing the highest number of cases. To fully grasp which countries are worst hit, one would need to look at the numbers relative to the size of each country's population.

# Visualisation of worst affected countries in terms of deaths

In [40]:
# Deaths in the ten countries with the most confirmed cases
# (sorted_country is ordered by 'Confirmed', not by 'Deaths').
px.bar(
    data_frame=sorted_country.head(10),
    x="Country",
    y="Deaths",
    title="10 Countries most affected by Covid-19",  # figure title
    color_discrete_sequence=["blue"],
    width=800,
    height=500,
)

It is evident that among the most affected countries the US, Spain, Italy and France are the countries that are worst hit by the pandemic in terms of deaths caused by covid-19. 

In [41]:
# Confirmed cases: worldwide total per day
# Date columns (m/d/yy) included in the timeline plot.
# NOTE(review): the list stops at 3/3/20 even if the downloaded data contains
# later dates -- confirm whether the plot is meant to end there.
timeline = ['1/22/20', '1/23/20',
       '1/24/20', '1/25/20', '1/26/20', '1/27/20', '1/28/20', '1/29/20',
       '1/30/20', '1/31/20', '2/1/20', '2/2/20', '2/3/20', '2/4/20', '2/5/20',
       '2/6/20', '2/7/20', '2/8/20', '2/9/20', '2/10/20', '2/11/20', '2/12/20',
       '2/13/20', '2/14/20', '2/15/20', '2/16/20', '2/17/20', '2/18/20',
       '2/19/20', '2/20/20', '2/21/20', '2/22/20', '2/23/20', '2/24/20',
       '2/25/20', '2/26/20', '2/27/20', '2/28/20', '2/29/20', '3/1/20',
       '3/2/20', '3/3/20']

# Not read in this cell; kept in case a later cell uses it.
col_value = list(confirmed.columns)

# The name `country` is deliberately not reassigned here: it holds the
# per-country summary DataFrame loaded earlier, and rebinding it to a list
# would break every cell and widget callback that still reads that frame.

# x values: the column labels parsed into datetimes
time = [datetime.strptime(day, '%m/%d/%y') for day in timeline]
# y values: each date column summed over all rows of `confirmed`
value = [confirmed[day].sum() for day in timeline]

new_confirmed = pd.DataFrame({'Timeline':time,'Covid-19 impact':value})

In [45]:
# Deaths: worldwide total per day
# Date columns (m/d/yy) included in the timeline plot.
# NOTE(review): the list stops at 3/3/20 even if the downloaded data contains
# later dates -- confirm whether the plot is meant to end there.
timeline = ['1/22/20', '1/23/20',
       '1/24/20', '1/25/20', '1/26/20', '1/27/20', '1/28/20', '1/29/20',
       '1/30/20', '1/31/20', '2/1/20', '2/2/20', '2/3/20', '2/4/20', '2/5/20',
       '2/6/20', '2/7/20', '2/8/20', '2/9/20', '2/10/20', '2/11/20', '2/12/20',
       '2/13/20', '2/14/20', '2/15/20', '2/16/20', '2/17/20', '2/18/20',
       '2/19/20', '2/20/20', '2/21/20', '2/22/20', '2/23/20', '2/24/20',
       '2/25/20', '2/26/20', '2/27/20', '2/28/20', '2/29/20', '3/1/20',
       '3/2/20', '3/3/20']

# Not read in this cell; kept in case a later cell uses it.
col_value = list(death.columns)

# The name `country` is deliberately not reassigned here: it holds the
# per-country summary DataFrame loaded earlier, and rebinding it to a list
# would break every cell and widget callback that still reads that frame.

# x values: the column labels parsed into datetimes
time = [datetime.strptime(day, '%m/%d/%y') for day in timeline]
# y values: each date column summed over all rows of `death`
value = [death[day].sum() for day in timeline]

new_death = pd.DataFrame({'Timeline':time,'Covid-19 impact':value})

In [43]:
# Recovered patients: worldwide total per day
# Date columns (m/d/yy) included in the timeline plot.
# NOTE(review): the list stops at 3/3/20 even if the downloaded data contains
# later dates -- confirm whether the plot is meant to end there.
timeline = ['1/22/20', '1/23/20',
       '1/24/20', '1/25/20', '1/26/20', '1/27/20', '1/28/20', '1/29/20',
       '1/30/20', '1/31/20', '2/1/20', '2/2/20', '2/3/20', '2/4/20', '2/5/20',
       '2/6/20', '2/7/20', '2/8/20', '2/9/20', '2/10/20', '2/11/20', '2/12/20',
       '2/13/20', '2/14/20', '2/15/20', '2/16/20', '2/17/20', '2/18/20',
       '2/19/20', '2/20/20', '2/21/20', '2/22/20', '2/23/20', '2/24/20',
       '2/25/20', '2/26/20', '2/27/20', '2/28/20', '2/29/20', '3/1/20',
       '3/2/20', '3/3/20']

# Not read in this cell; kept in case a later cell uses it.
col_value = list(recovered.columns)

# The name `country` is deliberately not reassigned here: it holds the
# per-country summary DataFrame loaded earlier, and rebinding it to a list
# would break every cell and widget callback that still reads that frame.

# x values: the column labels parsed into datetimes
time = [datetime.strptime(day, '%m/%d/%y') for day in timeline]
# y values: each date column summed over all rows of `recovered`
value = [recovered[day].sum() for day in timeline]

new_recovered = pd.DataFrame({'Timeline':time,'Covid-19 impact':value})

In [47]:
fig = make_subplots()

# One line per series, added in legend order: infected, deaths, recovery.
for series, label in (
    (new_confirmed, 'Infected'),
    (new_death, 'Deaths'),
    (new_recovered, 'Recovery'),
):
    fig.add_trace(
        go.Scatter(x=series["Timeline"], y=series["Covid-19 impact"], name=label)
    )

fig.update_xaxes(title_text="Timeline")
fig.update_layout(title_text="Timeline of COVID-19", width=800, height=500)
fig.show()