In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import chart_studio.plotly as py
import plotly.graph_objs as go
import plotly.express as px
import os

from plotly.offline import download_plotlyjs, init_notebook_mode, iplot

init_notebook_mode(connected=True)

In [2]:
# List the dataset directory. os.listdir() returns entries in arbitrary order,
# so sort them to make the listing (and any positional indexing into it) deterministic.
files = sorted(os.listdir(r"C:\Users\ruben\Desktop\DataPortfolio\Datasets\Covid-19"))
files

['country_wise_latest.csv',
 'covid_19_clean_complete.csv',
 'day_wise.csv',
 'full_grouped.csv',
 'usa_country_wise.csv',
 'worldometer_data.csv']

In [9]:
# Build a helper function that reads a CSV file into a DataFrame, so each dataset can be loaded with a single call

In [3]:
# NOTE: absolute, machine-specific directory that holds the Covid-19 CSV files.
path = r"C:\Users\ruben\Desktop\DataPortfolio\Datasets\Covid-19"

def read_data(path, filename):
    """Read one CSV file from a directory into a DataFrame.

    Parameters
    ----------
    path : str or os.PathLike
        Directory that contains the file.
    filename : str
        Name of the CSV file inside ``path``.

    Returns
    -------
    pandas.DataFrame
        The result of ``pd.read_csv`` on the joined path.
    """
    # os.path.join inserts the platform's separator and, unlike string
    # concatenation, also accepts pathlib.Path objects for ``path``.
    return pd.read_csv(os.path.join(path, filename))

In [4]:
# Load worldometer_data.csv into world_df; the country-level charts below read from this frame.
world_df = read_data(path, 'worldometer_data.csv')

In [13]:
# Build the remaining DataFrames from the CSV files in the `files` listing above

In [5]:
# Load the day-wise file by name instead of by position in `files`:
# os.listdir() order is not guaranteed, so files[2] may not always be day_wise.csv.
dayWise_df = read_data(path, 'day_wise.csv')
dayWise_df.head(3)

Unnamed: 0,Date,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,No. of countries
0,2020-01-22,555,17,28,510,0,0,0,3.06,5.05,60.71,6
1,2020-01-23,654,18,30,606,99,1,2,2.75,4.59,60.0,8
2,2020-01-24,941,26,36,879,287,8,6,2.76,3.83,72.22,9


In [6]:
# Load each dataset by its filename rather than by its position in `files`,
# because the directory listing order (and therefore files[n]) is not guaranteed
# and shifts whenever a file is added to the folder.
group_df = read_data(path, 'full_grouped.csv')
usa_df = read_data(path, 'usa_country_wise.csv')
province_df = read_data(path, 'covid_19_clean_complete.csv')

In [16]:
# Dimensions of province_df as (rows, columns).
province_df.shape

(49068, 10)

In [17]:
# QUESTION: Which are the top 20 countries with the most:
# 1. total cases
# 2. deaths
# 3. recovered
# 4. active cases

In [19]:
# List the columns available in world_df before choosing which ones to rank by.
world_df.columns

Index(['Country/Region', 'Continent', 'Population', 'TotalCases', 'NewCases',
       'TotalDeaths', 'NewDeaths', 'TotalRecovered', 'NewRecovered',
       'ActiveCases', 'Serious,Critical', 'Tot Cases/1M pop', 'Deaths/1M pop',
       'TotalTests', 'Tests/1M pop', 'WHO Region'],
      dtype='object')

In [None]:
# Build a for loop that generates one treemap per metric listed in the question above

In [18]:
columns = ['TotalCases', 'TotalDeaths', 'TotalRecovered', 'ActiveCases']

# One treemap per metric: each rectangle is a country (the `path` argument),
# sized by the metric's value, restricted to the 20 largest rows for that metric.
for metric in columns:
    sorted_df = world_df.sort_values(by=metric, ascending=False)
    fig = px.treemap(
        sorted_df.head(20),
        path=['Country/Region'],
        values=metric,
        title=f'Top 20 countries with most {metric}',
    )
    fig.show()

In [25]:
# QUESTION: What is the trend over time in:
# 1. confirmed cases
# 2. deaths
# 3. recovered
# 4. active cases

In [26]:
# Preview the first three rows of dayWise_df to see which columns can be plotted against 'Date'.
dayWise_df.head(3)

Unnamed: 0,Date,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,No. of countries
0,2020-01-22,555,17,28,510,0,0,0,3.06,5.05,60.71,6
1,2020-01-23,654,18,30,606,99,1,2,2.75,4.59,60.0,8
2,2020-01-24,941,26,36,879,287,8,6,2.76,3.83,72.22,9


In [8]:
# Draw the four day-wise series as separate lines over the 'Date' column.
px.line(
    dayWise_df,
    x='Date',
    y=['Confirmed', 'Deaths', 'Recovered', 'Active'],
    template='plotly_dark',
    title='Trend per date',
)

In [None]:
# QUESTION: Analyse the ratio of tests to population (tests per inhabitant)

In [29]:
# calculate the test ratio for the top 20 most tested countries

In [33]:
# Keep the 20 rows with the largest 'TotalTests', then divide tests by population
# to get the number of tests per inhabitant for each of those rows.
sorted_df = world_df.sort_values(by='TotalTests', ascending=False).head(20)
popu_test_ratio = sorted_df['TotalTests'].div(sorted_df['Population'])
popu_test_ratio

0     0.190640
3     0.203623
2     0.016035
11    0.257873
1     0.062085
18    0.102452
15    0.117443
9     0.151087
38    0.531470
16    0.060191
67    0.181419
23    0.114339
19    0.061147
12    0.104277
4     0.053044
10    0.031068
6     0.075521
44    0.062752
25    0.115099
13    0.009304
dtype: float64

In [35]:
# Rebuild the plotted rows from the index of `popu_test_ratio` instead of reading
# `sorted_df`, which other cells reassign; countries and ratios therefore stay
# aligned even if cells are re-run out of order. Storing the ratio as a named
# column also gives the y axis a meaningful label.
top_tested = world_df.loc[popu_test_ratio.index].assign(**{'Tests per capita': popu_test_ratio})
px.bar(top_tested,
       x='Country/Region',
       y='Tests per capita',
       color='Country/Region',
       # The rows are the 20 countries with the most tests, in descending order of
       # total tests -- they are not ordered by population size.
       title='Tests Ratio - Top 20 countries by total tests')

In [None]:
# ANSWER: In the UAE, roughly 1 test has been carried out for every 2 inhabitants (ratio ≈ 0.53)

In [34]:
#QUESTION: Top 20 countries most affected by the virus

In [42]:
# Show world_df's column names again for reference while picking the severity metrics.
world_df.columns

Index(['Country/Region', 'Continent', 'Population', 'TotalCases', 'NewCases',
       'TotalDeaths', 'NewDeaths', 'TotalRecovered', 'NewRecovered',
       'ActiveCases', 'Serious,Critical', 'Tot Cases/1M pop', 'Deaths/1M pop',
       'TotalTests', 'Tests/1M pop', 'WHO Region'],
      dtype='object')

In [32]:
# Rank by total deaths first, with serious/critical cases as the tie-breaker,
# then stack both metrics per country for the first 20 rows.
sorted_df = world_df.sort_values(by=['TotalDeaths', 'Serious,Critical'], ascending=False)
px.bar(sorted_df.iloc[0:20], 
       x='Country/Region', y=['TotalDeaths', 'Serious,Critical'], 
       barmode='stack',
       # Every other chart in this notebook carries a title; add one so this figure stands alone.
       title='Top 20 countries most affected (deaths and serious/critical cases)')

In [10]:
#QUESTION: Top 20 countries by total cases

In [38]:
# Horizontal bars, one per country, for the 20 rows with the highest 'TotalCases';
# the bar colour encodes the same value.
sorted_df = world_df.sort_values(by='TotalCases', ascending=False)
px.bar(
    sorted_df.head(20),
    x='TotalCases',
    y='Country/Region',
    color='TotalCases',
    title='Top 20 countries by total cases',
)

In [15]:
#QUESTION: Top 20 countries by Max Deaths

In [39]:
# Order countries from most to fewest deaths and chart the first 20 as horizontal bars,
# coloured by the death count.
sorted_df = world_df.sort_values(by='TotalDeaths', ascending=False)
px.bar(
    sorted_df.head(20),
    x='TotalDeaths',
    y='Country/Region',
    color='TotalDeaths',
    title='Top 20 countries by deaths',
)

In [40]:
#QUESTION: Top 20 countries by Active Cases

In [41]:
# Order countries by 'ActiveCases' (largest first) and chart the first 20 as horizontal bars,
# coloured by the active-case count.
sorted_df = world_df.sort_values(by='ActiveCases', ascending=False)
px.bar(
    sorted_df.head(20),
    x='ActiveCases',
    y='Country/Region',
    color='ActiveCases',
    title='Top 20 countries by active cases',
)

In [42]:
#QUESTION: Top 20 countries by Recovered Cases

In [44]:
# Order countries by 'TotalRecovered' (largest first) and chart the first 20 as horizontal bars,
# coloured by the recovered count.
sorted_df = world_df.sort_values(by='TotalRecovered', ascending=False)
px.bar(
    sorted_df.head(20),
    x='TotalRecovered',
    y='Country/Region',
    color='TotalRecovered',
    title='Top 20 countries by recovered cases',
)

In [48]:
#QUESTION: Pie chart with top 10 countries most affected by the virus

In [56]:
columns = ['TotalDeaths', 'Serious,Critical']

# One donut chart per metric, built from the 10 rows with the largest value of that metric.
for metric in columns:
    sorted_df = world_df.sort_values(by=metric, ascending=False)
    fig = px.pie(
        sorted_df.head(10),
        names='Country/Region',
        values=metric,
        hole=0.3,  # leaves an empty circle in the centre of the pie
        title=f'Top 10 {metric} % per country',
    )
    fig.show()

In [57]:
# Ratio of deaths to total cases, and of deaths to recovered cases, for the 20 most populous countries

In [58]:
# Show world_df's column names again for reference while building the death ratios.
world_df.columns

Index(['Country/Region', 'Continent', 'Population', 'TotalCases', 'NewCases',
       'TotalDeaths', 'NewDeaths', 'TotalRecovered', 'NewRecovered',
       'ActiveCases', 'Serious,Critical', 'Tot Cases/1M pop', 'Deaths/1M pop',
       'TotalTests', 'Tests/1M pop', 'WHO Region'],
      dtype='object')

In [69]:
columns = ['TotalCases', 'TotalRecovered']

# The 20 most populous rows do not depend on the loop variable, so select them
# once instead of re-sorting on every iteration. (.head(20) == .iloc[0:20])
sorted_df = world_df.sort_values(by='Population', ascending=False).head(20)

for i in columns:
    # Deaths divided by the current denominator column (total cases, then recovered).
    death_ratio = sorted_df['TotalDeaths'] / sorted_df[i]
    # Plot the ratio as a named column so the y axis shows what was divided by what
    # rather than a generic label.
    ratio_label = 'TotalDeaths / {}'.format(i)

    fig = px.bar(sorted_df.assign(**{ratio_label: death_ratio}),
           x='Country/Region',
           y=ratio_label,
           color='Country/Region',
           title='Death Ratio by {}'.format(i)
           )
    fig.show()

In [63]:
# ANSWER 1: In the UK, roughly 1 death is recorded for every 7 confirmed infections
# ANSWER 2: In Mexico, roughly 1 death is recorded for every 7 recoveries