In [113]:
from bs4 import BeautifulSoup as soup #Beautiful Soup is a Python library for pulling data out of HTML and XML files.
import requests #for request to the server 

In [114]:
import pandas as pd
import numpy as np
import gc #Python has an automated garbage collection. It has an algorithm to deallocate objects which are no longer needed.
import plotly.graph_objects as go
import plotly.express as px

## Data Collection
### Web Scrapping:

In [115]:
#lxml is a Python library which allows for easy handling of XML and HTML files, and can also be used for web scraping.
url = "https://www.worldometers.info/coronavirus/#countries"
res = requests.get(url)
page = soup(res.text, 'lxml')

In [116]:
table = page.select('#main_table_countries_yesterday')
row_data = table[0].findAll("tr",{"style":""})
title = row_data[0]
del row_data[0]
data = []
clean = True
for country in row_data:
    country_data = []
    country_container = country.select("td")
    
    if country_container[1].text == "China":
        continue
    for i in range(1, len(country_container)):
        final_feature = country_container[i].text
        if clean:
            if i != 1 and i != len(country_container)-1:
                final_feature = final_feature.replace(",","")
                if final_feature.find('+') != -1:
                    final_feature = final_feature.replace("+","")
                    final_feature = float(final_feature)
            
                elif final_feature.find("-") != -1:
                    final_feature = final_feature.replace("-","")
                    final_feature = float(final_feature)*-1
        if final_feature == "N/A":
            final_feature = 0
        elif final_feature == "" or final_feature == " ":
            final_feature = -1
            
        country_data.append(final_feature)
        
    data.append(country_data)

## Data PreProcessing

In [117]:
import pandas as pd
df = pd.DataFrame(data)

In [118]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,World,685610942,57123.0,6842383,156.0,658326986,60386.0,20441573,39507,87957,...,-1,-1,-1,All,\n,-1,-1,-1,-1.0,-1
1,USA,106464679,1098.0,1158158,8.0,104311698,9159.0,994823,1636,317990,...,1175946071,3512328,334805269,North America,3,289,0,3,0.02,2971
2,India,44818115,10093.0,531114,23.0,44229459,6248.0,57542,0,31862,...,923877832,656801,1406631776,Asia,31,2648,2,7,0.02,41
3,France,39896203,8595.0,166024,-1.0,39592740,6103.0,137439,869,608317,...,271490188,4139547,65584518,Europe,2,395,0,131,-1.0,2096
4,Germany,38382386,-1.0,171992,-1.0,38149200,1200.0,61194,0,457567,...,122332384,1458359,83883596,Europe,2,488,1,-1,-1.0,730


In [119]:
column_labels = ["Country", "Total_Cases", "New_Cases", "Total_Deaths", "New_Deaths", "Total_Recovered", "New_Recovered", "Active_Cases", "Serious", "Tot Cases/ 1M ppl", "Deaths/ 1M ppl", "Total_Tests", "Tests/ 1M ppl", "Population", "Continent"]
len(column_labels)

15

In [120]:
df1 = df

In [121]:
df1.drop([15,16,17,18,19,20], inplace=True, axis=1)

In [122]:
df1.columns = column_labels

In [123]:
df1.head()

Unnamed: 0,Country,Total_Cases,New_Cases,Total_Deaths,New_Deaths,Total_Recovered,New_Recovered,Active_Cases,Serious,Tot Cases/ 1M ppl,Deaths/ 1M ppl,Total_Tests,Tests/ 1M ppl,Population,Continent
0,World,685610942,57123.0,6842383,156.0,658326986,60386.0,20441573,39507,87957,877.8,-1,-1,-1,All
1,USA,106464679,1098.0,1158158,8.0,104311698,9159.0,994823,1636,317990,3459.0,1175946071,3512328,334805269,North America
2,India,44818115,10093.0,531114,23.0,44229459,6248.0,57542,0,31862,378.0,923877832,656801,1406631776,Asia
3,France,39896203,8595.0,166024,-1.0,39592740,6103.0,137439,869,608317,2531.0,271490188,4139547,65584518,Europe
4,Germany,38382386,-1.0,171992,-1.0,38149200,1200.0,61194,0,457567,2050.0,122332384,1458359,83883596,Europe


#### Change Type of columns:

In [124]:
for label in df1.columns:
    if label != 'Country' and label != 'Continent':
        df1[label] = pd.to_numeric(df1[label])

#### Add New Columns:

In [125]:
df1['% Inc_Cases'] = df1['New_Cases']/df1['Total_Cases']*100
df1['% Inc_Deaths'] = df1['New_Deaths']/df1['Total_Deaths']*100
df1['% Inc_Recovered'] = df1['New_Recovered']/df1['Total_Recovered']*100

## EDA

### Cases:

In [126]:
cases = df1[['Total_Recovered','Active_Cases','Total_Deaths']].loc[0]
# DataFrame.loc attribute access a group of rows and columns by label(s) or a boolean array in the given DataFrame.
# here loc[0] means to access the data from 0th row

In [127]:
cases

Total_Recovered    658326986
Active_Cases        20441573
Total_Deaths         6842383
Name: 0, dtype: int64

In [128]:
cases_df = pd.DataFrame(cases).reset_index()
cases_df

Unnamed: 0,index,0
0,Total_Recovered,658326986
1,Active_Cases,20441573
2,Total_Deaths,6842383


In [129]:
cases_df.columns = ['Type','Total']
cases_df

Unnamed: 0,Type,Total
0,Total_Recovered,658326986
1,Active_Cases,20441573
2,Total_Deaths,6842383


In [130]:
cases_df['Percentage'] = np.round((cases_df['Total']/np.sum(cases_df['Total']))*100,2)

In [131]:
cases_df['Virus'] = ["COVID-19" for i in range(len(cases_df))]
cases_df

Unnamed: 0,Type,Total,Percentage,Virus
0,Total_Recovered,658326986,96.02,COVID-19
1,Active_Cases,20441573,2.98,COVID-19
2,Total_Deaths,6842383,1.0,COVID-19


#### Visualize Total Cases:

In [132]:
fig = px.bar(cases_df, x = "Virus", y = "Percentage", color = "Type", hover_data=["Total"])
fig.show()

#### Visualize New Cases:

In [133]:
Newdata = df1[['New_Cases','New_Recovered','New_Deaths']].loc[0]
newcasedf = pd.DataFrame(Newdata).reset_index()
newcasedf.columns = ['Type','Total']
newcasedf['Percentage'] = np.round((newcasedf['Total']/np.sum(newcasedf['Total']))*100,2)
newcasedf['Virus'] = ['COVID-19' for i in range(len(newcasedf))]
fig = px.bar(newcasedf, x="Virus", y="Percentage", color="Type", hover_data=["Total"])
fig.show()

In [134]:
df1.columns

Index(['Country', 'Total_Cases', 'New_Cases', 'Total_Deaths', 'New_Deaths',
       'Total_Recovered', 'New_Recovered', 'Active_Cases', 'Serious',
       'Tot Cases/ 1M ppl', 'Deaths/ 1M ppl', 'Total_Tests', 'Tests/ 1M ppl',
       'Population', 'Continent', '% Inc_Cases', '% Inc_Deaths',
       '% Inc_Recovered'],
      dtype='object')

In [135]:
per = np.round(df[["% Inc_Cases", "% Inc_Deaths", "% Inc_Recovered"]].loc[0], 2)
per_df = pd.DataFrame(per)
per_df.columns = ['Percentage']
fig = go.Figure()
fig.add_trace(go.Bar(x=per_df.index,y=per_df['Percentage'], marker_color=["Yellow", "blue", "red"]))
fig.show()

### Visualize Continents:

In [136]:
continent_df = df1.groupby('Continent').sum().drop("All")
# drop "All" row

In [137]:
continent_df = continent_df.reset_index()
continent_df

Unnamed: 0,Continent,Country,Total_Cases,New_Cases,Total_Deaths,New_Deaths,Total_Recovered,New_Recovered,Active_Cases,Serious,Tot Cases/ 1M ppl,Deaths/ 1M ppl,Total_Tests,Tests/ 1M ppl,Population,% Inc_Cases,% Inc_Deaths,% Inc_Recovered
0,Africa,South AfricaMoroccoTunisiaEgyptLibyaEthiopiaRé...,12805489,6.0,258577,-56.0,10954562,66.0,377871,516,2488825.0,18491.0,110838308,10906548,1402440339,-0.204448,84.913851,-50.136054
1,Asia,IndiaJapanS. KoreaTurkeyVietnamTaiwanIranIndon...,210611304,34137.0,1534227,27.0,178159095,20958.0,13570800,1637,7625593.0,34567.0,2225039342,92907035,3217529895,0.246528,-8.670678,0.275596
2,Australia/Oceania,AustraliaNew ZealandNew CaledoniaFrench Polyne...,14084808,-17.0,26937,-20.0,13843889,-15.0,86121,58,4639745.0,8847.0,88580174,21050084,43470408,-20.249721,92.14747,199.504774
3,Europe,FranceGermanyItalyRussiaSpainNetherlandsPoland...,223885190,19048.0,1825455,27.0,219752683,26677.0,2025568,2678,18332202.0,122650.0,2306362772,206717923,679045131,0.131406,-10.471199,0.322138
4,North America,USAMexicoCanadaGuatemalaCosta RicaCubaPanamaDo...,125972055,3142.0,1625744,-10.0,121501105,11790.0,1941259,1989,8712322.0,57675.0,1313452474,99161788,598087045,-0.208785,-110.557878,-0.31075
5,South America,BrazilColombiaChilePeruBoliviaEcuadorUruguayPa...,58405310,630.0,1222894,-12.0,55766970,792.0,544992,1483,1831633.0,30188.0,207917156,10727908,391680670,0.009221,-0.431678,0.006716


In [138]:
def continent_visual(c_list):
    for label in c_list:
        c_df = continent_df[['Continent',label]]
        c_df['Percentage'] = np.round(c_df[label]/ np.sum(c_df[label]), 2)
        c_df['Virus'] = ["Covid-19" for i in range(len(c_df))]
        
        fig = px.bar(c_df, x="Virus", y="Percentage", color="Continent", hover_data=[label])
        fig.update_layout(title={"text":f"{label}"})
        fig.show()
        gc.collect()

In [139]:
cases_list = ["Total_Cases", "Active_Cases", "New_Cases", "Serious", "Tot Cases/ 1M ppl"]
death_list = ["Total_Deaths", "New_Deaths", "Deaths/ 1M ppl"]
recovered_list = ["Total_Recovered", "New_Recovered", "% Inc_Recovered"]

In [140]:
continent_visual(cases_list)

### Visualize Countries:
#### For all Countries:

In [141]:
df_countries = df1.groupby("Country").sum().drop('World')
df_countries = df_countries.reset_index()

In [142]:
df_countries.columns

Index(['Country', 'Total_Cases', 'New_Cases', 'Total_Deaths', 'New_Deaths',
       'Total_Recovered', 'New_Recovered', 'Active_Cases', 'Serious',
       'Tot Cases/ 1M ppl', 'Deaths/ 1M ppl', 'Total_Tests', 'Tests/ 1M ppl',
       'Population', 'Continent', '% Inc_Cases', '% Inc_Deaths',
       '% Inc_Recovered'],
      dtype='object')

In [143]:
def continent_visual(c_list):
    for label in c_list:
        c_df = df_countries[["Country",label]]
        c_df['Percentage'] = np.round(c_df[label]/ np.sum(c_df[label]), 2)
        c_df['Virus'] = ["Covid-19" for i in range(len(c_df))]
        
        fig = px.bar(c_df, x="Virus", y="Percentage", color="Country", hover_data=[label])
        fig.update_layout(title={"text":f"{label}"})
        fig.show()
        gc.collect()

In [144]:
cases_list = ["Total_Cases", "Active_Cases", "New_Cases", "Serious", "Tot Cases/ 1M ppl"]
death_list = ["Total_Deaths", "New_Deaths", "Deaths/ 1M ppl"]
recovered_list = ["Total_Recovered", "New_Recovered", "% Inc_Recovered"]

In [145]:
continent_visual(cases_list)

In [146]:
df_countries.columns

Index(['Country', 'Total_Cases', 'New_Cases', 'Total_Deaths', 'New_Deaths',
       'Total_Recovered', 'New_Recovered', 'Active_Cases', 'Serious',
       'Tot Cases/ 1M ppl', 'Deaths/ 1M ppl', 'Total_Tests', 'Tests/ 1M ppl',
       'Population', 'Continent', '% Inc_Cases', '% Inc_Deaths',
       '% Inc_Recovered'],
      dtype='object')

#### For Top 5 or 10 Countries:

In [147]:
df2 = df1.drop([len(df1)-1]) #drop last row "Total".
country_df = df2.drop([0])

In [148]:
country_df.head()

Unnamed: 0,Country,Total_Cases,New_Cases,Total_Deaths,New_Deaths,Total_Recovered,New_Recovered,Active_Cases,Serious,Tot Cases/ 1M ppl,Deaths/ 1M ppl,Total_Tests,Tests/ 1M ppl,Population,Continent,% Inc_Cases,% Inc_Deaths,% Inc_Recovered
1,USA,106464679,1098.0,1158158,8.0,104311698,9159.0,994823,1636,317990.0,3459.0,1175946071,3512328,334805269,North America,0.001031,0.000691,0.00878
2,India,44818115,10093.0,531114,23.0,44229459,6248.0,57542,0,31862.0,378.0,923877832,656801,1406631776,Asia,0.02252,0.004331,0.014126
3,France,39896203,8595.0,166024,-1.0,39592740,6103.0,137439,869,608317.0,2531.0,271490188,4139547,65584518,Europe,0.021543,-0.000602,0.015414
4,Germany,38382386,-1.0,171992,-1.0,38149200,1200.0,61194,0,457567.0,2050.0,122332384,1458359,83883596,Europe,-3e-06,-0.000581,0.003146
5,Brazil,37358092,-1.0,700811,-1.0,36249161,-1.0,408120,0,173473.0,3254.0,63776166,296146,215353593,South America,-3e-06,-0.000143,-3e-06


In [149]:
LOOK_AT = 5
country = country_df.columns[1:14]
fig = go.Figure()
c = 0
for i in country_df.index:
    if c < LOOK_AT:
        fig.add_trace(go.Bar(name = country_df['Country'][i], x = country, y = country_df.loc[i][1:14]))
    else:
        break
    c +=1
fig.update_layout(title = {"text":f'top {LOOK_AT} countries affected '}, yaxis_type = "log")
fig.show()