# **Covid-19 Data Analysis**

In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import plotly.offline as py
import gc
import warnings
from bs4 import BeautifulSoup as soup
from datetime import date, datetime
from urllib.request import Request, urlopen

## **Web Scraping**

In [31]:
url = "https://www.worldometers.info/coronavirus/#countrieshttps"
req = Request(url, headers={'User-Agent':"Mozilla/5.0"})

webpage =  urlopen(req)
page_soup = soup(webpage, "html.parser")

page_soup


<!DOCTYPE html>

<!--[if IE 8]> <html lang="en" class="ie8"> <![endif]-->
<!--[if IE 9]> <html lang="en" class="ie9"> <![endif]-->
<!--[if !IE]><!-->
<html lang="en">
<!--<![endif]-->
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<title>COVID Live Update: 216,542,909 Cases and 4,504,507 Deaths from the Coronavirus - Worldometer</title>
<meta content="Live statistics and coronavirus news tracking the number of confirmed cases, recovered patients, tests, and death toll due to the COVID-19 coronavirus from Wuhan, China. Coronavirus counter with new cases, deaths, and number of tests per 1 Million population. Historical data and info. Daily charts, graphs, news and updates" name="description"/>
<link href="/favicon/favicon.ico" rel="shortcut icon" type="image/x-icon"/>
<link href="/favicon/apple-icon-57x57.png" rel="apple-touch-icon" sizes="57x57"/>
<link href="/favicon/apple-icon

In [32]:
today = datetime.now()
yesterday_str = "%s %d,%d" % (
    date.today().strftime("%b"), today.day-1, today.year)

yesterday_str


'Aug 28,2021'

In [33]:
table = page_soup.findAll("table", id="main_table_countries_yesterday")
container = table[0].findAll("tr",{"style":""})
title = container[0]
del container[0]
all_data = []
clean = True

for country in container:
    country_data = []
    country_container = country.findAll("td")

    if country_container[1].text == "China":
        continue
    for i in range(1,len(country_container)):
        final_feature = country_container[i].text
        if clean:
            if i != 1 and i != len(country_container)-1:
                final_feature = final_feature.replace(",","")
                if final_feature.find('+') != -1:
                    final_feature = final_feature.replace("+","")
                    final_feature = float(final_feature)
                elif final_feature.find("-") != -1:
                    final_feature = final_feature.replace("-", "")
                    final_feature = float(final_feature) * -1
        if final_feature == 'N/A':
            final_feature = 0
        elif final_feature == "" or final_feature == " ":
            final_feature = -1
        country_data.append(final_feature) 
    all_data.append(country_data)  

all_data

[['World',
  '216164164',
  711926.0,
  '4497766',
  9969.0,
  '193150447',
  530509.0,
  '18515951',
  '112758',
  '27732',
  '577.0',
  -1,
  -1,
  -1,
  'All',
  '\n',
  -1,
  -1,
  -1,
  -1,
  -1],
 ['USA',
  '39540401',
  190370.0,
  '653405 ',
  1304.0,
  '30786368',
  54248.0,
  '8100628',
  '24809',
  '118654',
  '1961',
  '577474238',
  '1732901',
  '333241357 ',
  'North America',
  '8',
  '510',
  '1',
  '571',
  '4',
  '24,309'],
 ['India',
  '32649130',
  46805.0,
  '437403 ',
  514.0,
  '31845313',
  31341.0,
  '366414',
  '8944',
  '23394',
  '313',
  '514954309',
  '368973',
  '1395642554 ',
  'Asia',
  '43',
  '3191',
  '3',
  '34',
  '0.4',
  '263'],
 ['Brazil',
  '20703906',
  27345.0,
  '578396 ',
  791.0,
  '19629675',
  20172.0,
  '495835',
  '8318',
  '96611',
  '2699',
  '56580445',
  '264022',
  '214301783 ',
  'South America',
  '10',
  '371',
  '4',
  '128',
  '4',
  '2,314'],
 ['Russia',
  '6844049',
  19509.0,
  '180041 ',
  798.0,
  '6112035',
  19217.0,
 

In [34]:
df = pd.DataFrame(all_data)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,World,216164164,711926.0,4497766,9969.0,193150447,530509.0,18515951,112758,27732,...,-1,-1,-1,All,\n,-1,-1,-1,-1.0,-1
1,USA,39540401,190370.0,653405,1304.0,30786368,54248.0,8100628,24809,118654,...,577474238,1732901,333241357,North America,8,510,1,571,4.0,24309
2,India,32649130,46805.0,437403,514.0,31845313,31341.0,366414,8944,23394,...,514954309,368973,1395642554,Asia,43,3191,3,34,0.4,263
3,Brazil,20703906,27345.0,578396,791.0,19629675,20172.0,495835,8318,96611,...,56580445,264022,214301783,South America,10,371,4,128,4.0,2314
4,Russia,6844049,19509.0,180041,798.0,6112035,19217.0,551973,2300,46875,...,177100000,1212958,146006695,Europe,21,811,1,134,5.0,3780


## **Data Preprocessing**

In [35]:
df = pd.DataFrame(all_data)
df.drop([15,16,17,18,19,20], inplace = True, axis = 1)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,World,216164164,711926.0,4497766,9969.0,193150447,530509.0,18515951,112758,27732,577.0,-1,-1,-1,All
1,USA,39540401,190370.0,653405,1304.0,30786368,54248.0,8100628,24809,118654,1961,577474238,1732901,333241357,North America
2,India,32649130,46805.0,437403,514.0,31845313,31341.0,366414,8944,23394,313,514954309,368973,1395642554,Asia
3,Brazil,20703906,27345.0,578396,791.0,19629675,20172.0,495835,8318,96611,2699,56580445,264022,214301783,South America
4,Russia,6844049,19509.0,180041,798.0,6112035,19217.0,551973,2300,46875,1233,177100000,1212958,146006695,Europe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
208,Macao,63,-1.0,-1,-1.0,60,1.0,3,-1,96,-1,4778,7245,659530,Asia
209,Montserrat,26,-1.0,1,-1.0,20,-1.0,5,-1,5205,200,1408,281882,4995,North America
210,Western Sahara,10,-1.0,1,-1.0,8,-1.0,1,-1,16,2,-1,-1,614217,Africa
211,Palau,2,-1.0,-1,-1.0,-1,-1.0,2,-1,110,-1,8221,451877,18193,Australia/Oceania


In [36]:
column_labels = ["Country", "Total Cases", "New Cases", "Total Deaths", "New Deaths", "Total Recovered", "New Recovered", "Active Cases",
                 "Serious/Critical", "Total Cases/1M", "Deaths/1M", "Total Tests", "Tests/1M", "Population", "Continent"]
df.columns = column_labels
df.head()


Unnamed: 0,Country,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,New Recovered,Active Cases,Serious/Critical,Total Cases/1M,Deaths/1M,Total Tests,Tests/1M,Population,Continent
0,World,216164164,711926.0,4497766,9969.0,193150447,530509.0,18515951,112758,27732,577.0,-1,-1,-1,All
1,USA,39540401,190370.0,653405,1304.0,30786368,54248.0,8100628,24809,118654,1961.0,577474238,1732901,333241357,North America
2,India,32649130,46805.0,437403,514.0,31845313,31341.0,366414,8944,23394,313.0,514954309,368973,1395642554,Asia
3,Brazil,20703906,27345.0,578396,791.0,19629675,20172.0,495835,8318,96611,2699.0,56580445,264022,214301783,South America
4,Russia,6844049,19509.0,180041,798.0,6112035,19217.0,551973,2300,46875,1233.0,177100000,1212958,146006695,Europe


In [37]:
for label in df.columns:
    if label != "Country" and label != "Continent":
        df[label] = pd.to_numeric(df[label])

In [38]:
df["% Inc Cases"] = df["New Cases"]/df["Total Cases"] * 100
df["% Inc Deaths"] = df["New Deaths"]/df["Total Deaths"] * 100
df["% Inc Recovered"] = df["New Recovered"]/df["Total Recovered"] * 100
df.head()

Unnamed: 0,Country,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,New Recovered,Active Cases,Serious/Critical,Total Cases/1M,Deaths/1M,Total Tests,Tests/1M,Population,Continent,% Inc Cases,% Inc Deaths,% Inc Recovered
0,World,216164164,711926.0,4497766,9969.0,193150447,530509.0,18515951,112758,27732.0,577.0,-1,-1,-1,All,0.329345,0.221643,0.274661
1,USA,39540401,190370.0,653405,1304.0,30786368,54248.0,8100628,24809,118654.0,1961.0,577474238,1732901,333241357,North America,0.481457,0.19957,0.176208
2,India,32649130,46805.0,437403,514.0,31845313,31341.0,366414,8944,23394.0,313.0,514954309,368973,1395642554,Asia,0.143358,0.117512,0.098416
3,Brazil,20703906,27345.0,578396,791.0,19629675,20172.0,495835,8318,96611.0,2699.0,56580445,264022,214301783,South America,0.132077,0.136758,0.102763
4,Russia,6844049,19509.0,180041,798.0,6112035,19217.0,551973,2300,46875.0,1233.0,177100000,1212958,146006695,Europe,0.285051,0.443232,0.314412


## **Exploratory Data Analysis**

In [39]:
cases = df[["Total Recovered", "Active Cases", "Total Deaths"]].loc[0]

cases_df = pd.DataFrame(cases).reset_index()
cases_df.columns = ["Type", "Total"]

cases_df["Percentage"] = np.round(100*cases_df["Total"]/np.sum(cases_df["Total"]),2)
cases_df

Unnamed: 0,Type,Total,Percentage
0,Total Recovered,193150447,89.35
1,Active Cases,18515951,8.57
2,Total Deaths,4497766,2.08


In [40]:
fig = px.bar(cases_df, x=["Total Recovered", "Active Cases",
             "Total Deaths"], y="Percentage", color="Type", color_discrete_sequence=["Green", "Blue","Red"])

fig.show()

In [41]:
continent_df = df.groupby("Continent").sum().drop("All")
continent_df.index


Index(['Africa', 'Asia', 'Australia/Oceania', 'Europe', 'North America',
       'South America'],
      dtype='object', name='Continent')

In [42]:
fig = px.bar(continent_df, x = continent_df.index, y=["Total Recovered", "Active Cases",
             "Total Deaths"])

fig.show()


In [43]:
df = df.drop([len(df)-1])
country_df = df.drop([0])
country_df

Unnamed: 0,Country,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,New Recovered,Active Cases,Serious/Critical,Total Cases/1M,Deaths/1M,Total Tests,Tests/1M,Population,Continent,% Inc Cases,% Inc Deaths,% Inc Recovered
1,USA,39540401,190370.0,653405,1304.0,30786368,54248.0,8100628,24809,118654.0,1961.0,577474238,1732901,333241357,North America,0.481457,0.199570,0.176208
2,India,32649130,46805.0,437403,514.0,31845313,31341.0,366414,8944,23394.0,313.0,514954309,368973,1395642554,Asia,0.143358,0.117512,0.098416
3,Brazil,20703906,27345.0,578396,791.0,19629675,20172.0,495835,8318,96611.0,2699.0,56580445,264022,214301783,South America,0.132077,0.136758,0.102763
4,Russia,6844049,19509.0,180041,798.0,6112035,19217.0,551973,2300,46875.0,1233.0,177100000,1212958,146006695,Europe,0.285051,0.443232,0.314412
5,France,6711268,18249.0,114083,95.0,6170720,25432.0,426465,2270,102556.0,1743.0,121518857,1856945,65440194,Europe,0.271916,0.083273,0.412140
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207,Falkland Islands,66,-1.0,-1,-1.0,63,-1.0,3,-1,18354.0,-1.0,7409,2060345,3596,South America,-1.515152,100.000000,-1.587302
208,Macao,63,-1.0,-1,-1.0,60,1.0,3,-1,96.0,-1.0,4778,7245,659530,Asia,-1.587302,100.000000,1.666667
209,Montserrat,26,-1.0,1,-1.0,20,-1.0,5,-1,5205.0,200.0,1408,281882,4995,North America,-3.846154,-100.000000,-5.000000
210,Western Sahara,10,-1.0,1,-1.0,8,-1.0,1,-1,16.0,2.0,-1,-1,614217,Africa,-10.000000,-100.000000,-12.500000


In [1]:
LOOK_AT = 5
country = country_df.columns[1:14]

fig = go.Figure()
c = 0

for i in country_df.index:
    if c < LOOK_AT:
        fig.add_trace(go.Bar(name = country_df['Country'][i], x = country, y = country_df.loc[i][1:14]))
    else:
        break
    c += 1

fig.update_layout(title = {"text":f'Top {LOOK_AT} affected countries'}, yaxis_type = "log")
fig.show()


NameError: name 'country_df' is not defined