# **Covid-19 Data Analysis**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import plotly.offline as py
import gc
import warnings
from bs4 import BeautifulSoup as soup
from datetime import date, datetime
from urllib.request import Request, urlopen

## **Web Scraping**

In [2]:
url = "https://www.worldometers.info/coronavirus/#countrieshttps"
req = Request(url, headers={'User-Agent':"Mozilla/5.0"})

webpage =  urlopen(req)
page_soup = soup(webpage, "html.parser")

In [3]:
today = datetime.now()
yesterday_str = "%s %d,%d" % (
    date.today().strftime("%b"), today.day-1, today.year)

yesterday_str


'Jan 19,2022'

In [4]:
table = page_soup.findAll("table", id="main_table_countries_yesterday")
container = table[0].findAll("tr",{"style":""})
title = container[0]
del container[0]
all_data = []
clean = True

for country in container:
    country_data = []
    country_container = country.findAll("td")

    if country_container[1].text == "China":
        continue
    for i in range(1,len(country_container)):
        final_feature = country_container[i].text
        if clean and i not in [1, len(country_container) - 1]:
            final_feature = final_feature.replace(",","")
            if final_feature.find('+') != -1:
                final_feature = final_feature.replace("+","")
                final_feature = float(final_feature)
            elif final_feature.find("-") != -1:
                final_feature = final_feature.replace("-", "")
                final_feature = float(final_feature) * -1
        if final_feature == 'N/A':
            final_feature = 0
        elif final_feature in ["", " "]:
            final_feature = -1
        country_data.append(final_feature)
    all_data.append(country_data)  

all_data

[['World',
  '339326561',
  3578927.0,
  '5583490',
  8972.0,
  '273085082',
  1871420.0,
  '60657989',
  '96921',
  '43532',
  '716.3',
  -1,
  -1,
  -1,
  'All',
  '\n',
  -1,
  -1,
  -1,
  -1,
  -1],
 ['USA',
  '69808350',
  710928.0,
  '880976 ',
  2374.0,
  '43892277',
  347360.0,
  '25035097',
  '26283',
  '209001',
  '2638',
  '871652953',
  '2609670',
  '334008890 ',
  'North America',
  '5',
  '379',
  '0',
  '2128',
  '7',
  '74,953'],
 ['India',
  '38218773',
  317532.0,
  '487719 ',
  493.0,
  '35807029',
  223990.0,
  '1924025',
  '8944',
  '27279',
  '348',
  '707421650',
  '504936',
  '1401013398 ',
  'Asia',
  '37',
  '2873',
  '2',
  '227',
  '0.4',
  '1,373'],
 ['Brazil',
  '23420861',
  205310.0,
  '621927 ',
  349.0,
  '21848301',
  75216.0,
  '950633',
  '8318',
  '108985',
  '2894',
  '63776166',
  '296772',
  '214899473 ',
  'South America',
  '9',
  '346',
  '3',
  '955',
  '2',
  '4,424'],
 ['UK',
  '15506750',
  108069.0,
  '152872 ',
  359.0,
  '11738323',
  

In [5]:
df = pd.DataFrame(all_data)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,World,339326561,3578927.0,5583490,8972.0,273085082,1871420.0,60657989,96921,43532,...,-1,-1,-1,All,\n,-1,-1,-1,-1.0,-1
1,USA,69808350,710928.0,880976,2374.0,43892277,347360.0,25035097,26283,209001,...,871652953,2609670,334008890,North America,5,379,0,2128,7.0,74953
2,India,38218773,317532.0,487719,493.0,35807029,223990.0,1924025,8944,27279,...,707421650,504936,1401013398,Asia,37,2873,2,227,0.4,1373
3,Brazil,23420861,205310.0,621927,349.0,21848301,75216.0,950633,8318,108985,...,63776166,296772,214899473,South America,9,346,3,955,2.0,4424
4,UK,15506750,108069.0,152872,359.0,11738323,121292.0,3615555,703,226580,...,437010764,6385466,68438346,Europe,4,448,0,1579,5.0,52829


## **Data Preprocessing**

In [6]:
df = pd.DataFrame(all_data)
df.drop([15,16,17,18,19,20], inplace = True, axis = 1)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,World,339326561,3578927.0,5583490,8972.0,273085082,1871420.0,60657989,96921,43532,716.3,-1,-1,-1,All
1,USA,69808350,710928.0,880976,2374.0,43892277,347360.0,25035097,26283,209001,2638,871652953,2609670,334008890,North America
2,India,38218773,317532.0,487719,493.0,35807029,223990.0,1924025,8944,27279,348,707421650,504936,1401013398,Asia
3,Brazil,23420861,205310.0,621927,349.0,21848301,75216.0,950633,8318,108985,2894,63776166,296772,214899473,South America
4,UK,15506750,108069.0,152872,359.0,11738323,121292.0,3615555,703,226580,2234,437010764,6385466,68438346,Europe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212,Solomon Islands,33,1.0,-1,-1.0,20,-1.0,13,-1,46,-1,4500,6312,712935,Australia/Oceania
213,Vatican City,29,-1.0,-1,-1.0,27,-1.0,2,-1,36070,-1,-1,-1,804,Europe
214,Western Sahara,10,-1.0,1,-1.0,8,-1.0,1,-1,16,2,-1,-1,620053,Africa
215,Marshall Islands,7,-1.0,-1,-1.0,4,-1.0,3,-1,117,-1,-1,-1,59809,Australia/Oceania


In [7]:
column_labels = ["Country", "Total Cases", "New Cases", "Total Deaths", "New Deaths", "Total Recovered", "New Recovered", "Active Cases",
                 "Serious/Critical", "Total Cases/1M", "Deaths/1M", "Total Tests", "Tests/1M", "Population", "Continent"]
df.columns = column_labels
df.head()


Unnamed: 0,Country,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,New Recovered,Active Cases,Serious/Critical,Total Cases/1M,Deaths/1M,Total Tests,Tests/1M,Population,Continent
0,World,339326561,3578927.0,5583490,8972.0,273085082,1871420.0,60657989,96921,43532,716.3,-1,-1,-1,All
1,USA,69808350,710928.0,880976,2374.0,43892277,347360.0,25035097,26283,209001,2638.0,871652953,2609670,334008890,North America
2,India,38218773,317532.0,487719,493.0,35807029,223990.0,1924025,8944,27279,348.0,707421650,504936,1401013398,Asia
3,Brazil,23420861,205310.0,621927,349.0,21848301,75216.0,950633,8318,108985,2894.0,63776166,296772,214899473,South America
4,UK,15506750,108069.0,152872,359.0,11738323,121292.0,3615555,703,226580,2234.0,437010764,6385466,68438346,Europe


In [8]:
for label in df.columns:
    if label not in ["Country", "Continent"]:
        df[label] = pd.to_numeric(df[label])

In [9]:
df["% Inc Cases"] = df["New Cases"]/df["Total Cases"] * 100
df["% Inc Deaths"] = df["New Deaths"]/df["Total Deaths"] * 100
df["% Inc Recovered"] = df["New Recovered"]/df["Total Recovered"] * 100
df.head()

Unnamed: 0,Country,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,New Recovered,Active Cases,Serious/Critical,Total Cases/1M,Deaths/1M,Total Tests,Tests/1M,Population,Continent,% Inc Cases,% Inc Deaths,% Inc Recovered
0,World,339326561,3578927.0,5583490,8972.0,273085082,1871420.0,60657989,96921,43532.0,716.3,-1,-1,-1,All,1.054715,0.160688,0.685288
1,USA,69808350,710928.0,880976,2374.0,43892277,347360.0,25035097,26283,209001.0,2638.0,871652953,2609670,334008890,North America,1.0184,0.269474,0.791392
2,India,38218773,317532.0,487719,493.0,35807029,223990.0,1924025,8944,27279.0,348.0,707421650,504936,1401013398,Asia,0.830827,0.101083,0.625548
3,Brazil,23420861,205310.0,621927,349.0,21848301,75216.0,950633,8318,108985.0,2894.0,63776166,296772,214899473,South America,0.876612,0.056116,0.344265
4,UK,15506750,108069.0,152872,359.0,11738323,121292.0,3615555,703,226580.0,2234.0,437010764,6385466,68438346,Europe,0.696916,0.234837,1.033299


## **Exploratory Data Analysis**

In [10]:
cases = df[["Total Recovered", "Active Cases", "Total Deaths"]].loc[0]

cases_df = pd.DataFrame(cases).reset_index()
cases_df.columns = ["Type", "Total"]

cases_df["Percentage"] = np.round(100*cases_df["Total"]/np.sum(cases_df["Total"]),2)
cases_df

Unnamed: 0,Type,Total,Percentage
0,Total Recovered,273085082,80.48
1,Active Cases,60657989,17.88
2,Total Deaths,5583490,1.65


In [11]:
fig = px.bar(cases_df, x=["Total Recovered", "Active Cases",
             "Total Deaths"], y="Percentage", color="Type", color_discrete_sequence=["Green", "Blue","Red"])

fig.show()

In [12]:
continent_df = df.groupby("Continent").sum().drop("All")
continent_df.index


Index(['Africa', 'Asia', 'Australia/Oceania', 'Europe', 'North America',
       'South America'],
      dtype='object', name='Continent')

In [13]:
fig = px.bar(continent_df, x = continent_df.index, y=["Total Recovered", "Active Cases",
             "Total Deaths"])

fig.show()


In [14]:
df = df.drop([len(df)-1])
country_df = df.drop([0])
country_df

Unnamed: 0,Country,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,New Recovered,Active Cases,Serious/Critical,Total Cases/1M,Deaths/1M,Total Tests,Tests/1M,Population,Continent,% Inc Cases,% Inc Deaths,% Inc Recovered
1,USA,69808350,710928.0,880976,2374.0,43892277,347360.0,25035097,26283,209001.0,2638.0,871652953,2609670,334008890,North America,1.018400,0.269474,0.791392
2,India,38218773,317532.0,487719,493.0,35807029,223990.0,1924025,8944,27279.0,348.0,707421650,504936,1401013398,Asia,0.830827,0.101083,0.625548
3,Brazil,23420861,205310.0,621927,349.0,21848301,75216.0,950633,8318,108985.0,2894.0,63776166,296772,214899473,South America,0.876612,0.056116,0.344265
4,UK,15506750,108069.0,152872,359.0,11738323,121292.0,3615555,703,226580.0,2234.0,437010764,6385466,68438346,Europe,0.696916,0.234837,1.033299
5,France,15175464,436167.0,127869,231.0,9612962,205868.0,5434633,3881,231696.0,1952.0,216918555,3311873,65497250,Europe,2.874159,0.180654,2.141567
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211,Falkland Islands,85,-1.0,-1,-1.0,0,0.0,0,-1,23371.0,-1.0,8528,2344790,3637,South America,-1.176471,100.000000,
212,Solomon Islands,33,1.0,-1,-1.0,20,-1.0,13,-1,46.0,-1.0,4500,6312,712935,Australia/Oceania,3.030303,100.000000,-5.000000
213,Vatican City,29,-1.0,-1,-1.0,27,-1.0,2,-1,36070.0,-1.0,-1,-1,804,Europe,-3.448276,100.000000,-3.703704
214,Western Sahara,10,-1.0,1,-1.0,8,-1.0,1,-1,16.0,2.0,-1,-1,620053,Africa,-10.000000,-100.000000,-12.500000


In [15]:
LOOK_AT = 5
country = country_df.columns[1:14]

fig = go.Figure()
c = 0

for i in country_df.index:
    if c < LOOK_AT:
        fig.add_trace(go.Bar(name = country_df['Country'][i], x = country, y = country_df.loc[i][1:14]))
    else:
        break
    c += 1

fig.update_layout(title = {"text":f'Top {LOOK_AT} affected countries'}, yaxis_type = "log")
fig.show()
