In [25]:
import requests                    # library used to schedule requests 
from bs4 import BeautifulSoup      # class used to do web scraping
import pandas as pd                # library used to create data frames
import numpy as np 

In [26]:
# Address of the page that is downloaded and scraped for the live coronavirus table.
base_url = 'https://www.worldometers.info/coronavirus/'

In [27]:
# One accumulator list per scraped table column. Every name receives its own
# independent empty list (the generator creates a fresh [] for each target).
(
    country_list,              # country names
    total_cases,               # total cases
    new_cases,                 # new cases
    total_deaths,              # total deaths
    new_deaths,                # new deaths
    total_recovered,           # total recovered cases
    active_cases,              # active cases
    serious_cases,             # serious cases
    total_cases_per_million,   # total cases per million population
    total_deaths_per_million,  # total deaths per million population
    total_tests,               # total tests done
    total_tests_per_million,   # total tests done per million population
) = ([] for _ in range(12))

In [28]:
# Download the statistics page.
# - timeout: without one, requests waits indefinitely on a stalled connection.
# - raise_for_status(): surfaces a 4xx/5xx answer right here instead of letting
#   an error page reach the parser, where it would fail later with an
#   IndexError when no matching table is found.
response = requests.get(base_url, timeout=30)
response.raise_for_status()

In [29]:
# Parse the downloaded HTML with Python's built-in "html.parser" backend.
soup = BeautifulSoup(response.text, "html.parser")

# Find the live corona table: take the first div carrying the class
# "main_table_countries_div" and collect every <table> inside it.
countries_div = soup.find_all("div", class_="main_table_countries_div")[0]
tables = countries_div.find_all("table")

# Extract the <tbody> sections of the first of those tables.
first_table = tables[0]
bodys = first_table.find_all("tbody")

In [30]:
# Number of <tbody> sections found in the first table of the countries div.
len(bodys)

3

In [31]:
# Accumulator lists in the same order as the <td> cells of a table row:
# cell 0 -> country_list, cell 1 -> total_cases, ... cell 11 -> total_tests_per_million.
columns = (
    country_list,
    total_cases,
    new_cases,
    total_deaths,
    new_deaths,
    total_recovered,
    active_cases,
    serious_cases,
    total_cases_per_million,
    total_deaths_per_million,
    total_tests,
    total_tests_per_million,
)

# Empty the accumulators first so that re-running this cell does not append
# every row a second time (on a fresh run the lists are already empty).
for column in columns:
    column.clear()

for body in bodys:
    # extracting all the table rows (tr) of this tbody
    for tr in body.find_all("tr"):
        # stripped text of every table data cell (td) in the row
        cells = [td.text.strip() for td in tr.find_all("td")]

        # A row with fewer cells than columns cannot be spread over the
        # accumulators; skip it instead of failing with an IndexError.
        if len(cells) < len(columns):
            continue

        # zip stops after the last accumulator, so any extra trailing cells
        # are ignored, exactly as fixed indices 0..11 would ignore them.
        for column, cell in zip(columns, cells):
            column.append(cell)

country_list

['North America',
 'Europe',
 'Asia',
 'South America',
 'Oceania',
 'Africa',
 '',
 'World',
 'USA',
 'Spain',
 'Italy',
 'UK',
 'France',
 'Germany',
 'Turkey',
 'Russia',
 'Iran',
 'Brazil',
 'Canada',
 'Belgium',
 'Peru',
 'Netherlands',
 'India',
 'Switzerland',
 'Ecuador',
 'Portugal',
 'Saudi Arabia',
 'Sweden',
 'Ireland',
 'Mexico',
 'Pakistan',
 'Singapore',
 'Chile',
 'Israel',
 'Austria',
 'Belarus',
 'Japan',
 'Qatar',
 'Poland',
 'UAE',
 'Romania',
 'Ukraine',
 'S. Korea',
 'Indonesia',
 'Denmark',
 'Serbia',
 'Philippines',
 'Bangladesh',
 'Norway',
 'Czechia',
 'Dominican Republic',
 'Colombia',
 'Australia',
 'Panama',
 'Malaysia',
 'South Africa',
 'Egypt',
 'Finland',
 'Morocco',
 'Argentina',
 'Kuwait',
 'Algeria',
 'Moldova',
 'Luxembourg',
 'Kazakhstan',
 'Bahrain',
 'Thailand',
 'Hungary',
 'Greece',
 'Oman',
 'Afghanistan',
 'Nigeria',
 'Iraq',
 'Armenia',
 'Uzbekistan',
 'Croatia',
 'Ghana',
 'Azerbaijan',
 'Cameroon',
 'Iceland',
 'Bosnia and Herzegovina',
 'E

In [32]:
# Strip the first 7 and the last 8 records from every column list, as they
# are not required. The tuple of current lists is built before any name is
# rebound, so each slice is taken from the untrimmed list.
(
    country_list,
    total_cases,
    new_cases,
    total_deaths,
    new_deaths,
    total_recovered,
    active_cases,
    serious_cases,
    total_cases_per_million,
    total_deaths_per_million,
    total_tests,
    total_tests_per_million,
) = (
    records[7:-8]
    for records in (
        country_list,
        total_cases,
        new_cases,
        total_deaths,
        new_deaths,
        total_recovered,
        active_cases,
        serious_cases,
        total_cases_per_million,
        total_deaths_per_million,
        total_tests,
        total_tests_per_million,
    )
)

In [33]:
# Column headers of the data frame, in the same order as the value lists below.
column_names = [
    "Country",
    "Total Cases",
    "New Cases",
    "Total Deaths",
    "New Deaths",
    "Total Recovered",
    "Active Cases",
    "Serious Cases",
    "Total Cases Per Million",
    "Total Deaths Per Million",
    "Total Tests",
    "Total Tests Per Million",
]

# Scraped value lists, one per column.
column_values = [
    country_list,
    total_cases,
    new_cases,
    total_deaths,
    new_deaths,
    total_recovered,
    active_cases,
    serious_cases,
    total_cases_per_million,
    total_deaths_per_million,
    total_tests,
    total_tests_per_million,
]

# Pair the lists up element by element: each tuple produced is one table row.
zipped = zip(*column_values)

# creating data frame by using the zipped data
df = pd.DataFrame(list(zipped), columns=column_names)
df

Unnamed: 0,Country,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,Active Cases,Serious Cases,Total Cases Per Million,Total Deaths Per Million,Total Tests,Total Tests Per Million
0,World,3402018,+3545,239622,+174,1083901,2078495,51366,436,30.7,,
1,USA,1131492,+462,65776,+23,161563,904153,16481,3418,199,6699878,20241
2,Spain,242988,,24824,,142450,75714,2500,5197,531,1528833,32699
3,Italy,207428,,28236,,78249,100943,1578,3431,467,2053425,33962
4,UK,177454,,27510,,,149600,1559,2614,405,1023824,15082
...,...,...,...,...,...,...,...,...,...,...,...,...
210,Western Sahara,6,,,,5,1,,10,,,
211,Anguilla,3,,,,3,0,,200,,,
212,Comoros,1,,,,,1,,1,,,
213,Saint Pierre Miquelon,1,,,,,1,,173,,,


In [34]:
# Columns in which an empty (or whitespace-only) cell is replaced by "0".
blank_to_zero_columns = [
    "Total Cases",
    "New Cases",
    "Total Deaths",
    "New Deaths",
    "Total Recovered",
    "Active Cases",
]

# regex=True is passed explicitly: since pandas 2.0 Series.str.replace treats
# the pattern as a literal string by default, in which case r"^\s*$" would
# never match and the blank cells would be left untouched.
for column in blank_to_zero_columns:
    df[column] = df[column].str.replace(r"^\s*$", "0", regex=True)

In [35]:
# Preview the first five rows to check the blank -> "0" replacement.
df.head(5)

Unnamed: 0,Country,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,Active Cases,Serious Cases,Total Cases Per Million,Total Deaths Per Million,Total Tests,Total Tests Per Million
0,World,3402018,3545,239622,174,1083901.0,2078495,51366,436,30.7,,
1,USA,1131492,462,65776,23,161563.0,904153,16481,3418,199.0,6699878.0,20241.0
2,Spain,242988,0,24824,0,142450.0,75714,2500,5197,531.0,1528833.0,32699.0
3,Italy,207428,0,28236,0,78249.0,100943,1578,3431,467.0,2053425.0,33962.0
4,UK,177454,0,27510,0,,149600,1559,2614,405.0,1023824.0,15082.0


In [36]:
# Columns in which an empty (or whitespace-only) cell is replaced by the
# marker string "-1".
blank_to_minus_one_columns = [
    "Serious Cases",
    "Total Cases Per Million",
    "Total Deaths Per Million",
    "Total Tests",
    "Total Tests Per Million",
]

# regex=True is passed explicitly: since pandas 2.0 Series.str.replace treats
# the pattern as a literal string by default, in which case r"^\s*$" would
# never match and the blank cells would be left untouched.
for column in blank_to_minus_one_columns:
    df[column] = df[column].str.replace(r"^\s*$", "-1", regex=True)

In [37]:
# Preview the first five rows to check the blank -> "-1" replacement.
df.head()

Unnamed: 0,Country,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,Active Cases,Serious Cases,Total Cases Per Million,Total Deaths Per Million,Total Tests,Total Tests Per Million
0,World,3402018,3545,239622,174,1083901.0,2078495,51366,436,30.7,-1,-1
1,USA,1131492,462,65776,23,161563.0,904153,16481,3418,199.0,6699878,20241
2,Spain,242988,0,24824,0,142450.0,75714,2500,5197,531.0,1528833,32699
3,Italy,207428,0,28236,0,78249.0,100943,1578,3431,467.0,2053425,33962
4,UK,177454,0,27510,0,,149600,1559,2614,405.0,1023824,15082


In [38]:
# Overwrite, in place, every cell whose value is "N/A" with the marker string "-2".
df.replace(to_replace="N/A", value="-2", inplace=True)

In [39]:
# Preview the first five rows to check the "N/A" -> "-2" replacement.
df.head()

Unnamed: 0,Country,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,Active Cases,Serious Cases,Total Cases Per Million,Total Deaths Per Million,Total Tests,Total Tests Per Million
0,World,3402018,3545,239622,174,1083901,2078495,51366,436,30.7,-1,-1
1,USA,1131492,462,65776,23,161563,904153,16481,3418,199.0,6699878,20241
2,Spain,242988,0,24824,0,142450,75714,2500,5197,531.0,1528833,32699
3,Italy,207428,0,28236,0,78249,100943,1578,3431,467.0,2053425,33962
4,UK,177454,0,27510,0,-2,149600,1559,2614,405.0,1023824,15082


In [40]:
# Delete every comma from the text of all columns except "Country".
comma_columns = [
    "Total Cases",
    "New Cases",
    "Total Deaths",
    "New Deaths",
    "Total Recovered",
    "Active Cases",
    "Serious Cases",
    "Total Cases Per Million",
    "Total Deaths Per Million",
    "Total Tests",
    "Total Tests Per Million",
]

for column in comma_columns:
    df[column] = df[column].str.replace(",", "")

In [41]:
# Preview the first five rows to check that the commas are gone.
df.head(5)

Unnamed: 0,Country,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,Active Cases,Serious Cases,Total Cases Per Million,Total Deaths Per Million,Total Tests,Total Tests Per Million
0,World,3402018,3545,239622,174,1083901,2078495,51366,436,30.7,-1,-1
1,USA,1131492,462,65776,23,161563,904153,16481,3418,199.0,6699878,20241
2,Spain,242988,0,24824,0,142450,75714,2500,5197,531.0,1528833,32699
3,Italy,207428,0,28236,0,78249,100943,1578,3431,467.0,2053425,33962
4,UK,177454,0,27510,0,-2,149600,1559,2614,405.0,1023824,15082


In [42]:
# Remove the "+" from the new-case and new-death figures.
# "+" is a regex quantifier, so regex=False states explicitly that it is a
# literal character. The result then no longer depends on the pandas
# version's default for `regex` or on its single-character special case.
for column in ("New Cases", "New Deaths"):
    df[column] = df[column].str.replace("+", "", regex=False)

In [43]:
# Preview the first five rows to check that the "+" signs are gone.
df.head()

Unnamed: 0,Country,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,Active Cases,Serious Cases,Total Cases Per Million,Total Deaths Per Million,Total Tests,Total Tests Per Million
0,World,3402018,3545,239622,174,1083901,2078495,51366,436,30.7,-1,-1
1,USA,1131492,462,65776,23,161563,904153,16481,3418,199.0,6699878,20241
2,Spain,242988,0,24824,0,142450,75714,2500,5197,531.0,1528833,32699
3,Italy,207428,0,28236,0,78249,100943,1578,3431,467.0,2053425,33962
4,UK,177454,0,27510,0,-2,149600,1559,2614,405.0,1023824,15082


In [44]:
# Summarise the columns (non-null counts and dtypes) before any dtype
# conversion; at this point every value is still scraped text.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Country                   215 non-null    object
 1   Total Cases               215 non-null    object
 2   New Cases                 215 non-null    object
 3   Total Deaths              215 non-null    object
 4   New Deaths                215 non-null    object
 5   Total Recovered           215 non-null    object
 6   Active Cases              215 non-null    object
 7   Serious Cases             215 non-null    object
 8   Total Cases Per Million   215 non-null    object
 9   Total Deaths Per Million  215 non-null    object
 10  Total Tests               215 non-null    object
 11  Total Tests Per Million   215 non-null    object
dtypes: object(12)
memory usage: 20.3+ KB


In [45]:

# Mapping of column name -> target data type, built in data frame column order.
# "Country" stays text; the columns listed in float_columns become float and
# every other column becomes int.
numeric_columns = [
    "Total Cases",
    "New Cases",
    "Total Deaths",
    "New Deaths",
    "Total Recovered",
    "Active Cases",
    "Serious Cases",
    "Total Cases Per Million",
    "Total Deaths Per Million",
    "Total Tests",
    "Total Tests Per Million",
]
float_columns = {
    "Total Cases Per Million",
    "Total Deaths Per Million",
    "Total Tests Per Million",
}

convert_dict = {"Country": str}
for column_name in numeric_columns:
    convert_dict[column_name] = float if column_name in float_columns else int

In [46]:
# Convert every column to the data type listed for it in convert_dict.
df = df.astype(convert_dict)

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() only appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Country                   215 non-null    object 
 1   Total Cases               215 non-null    int32  
 2   New Cases                 215 non-null    int32  
 3   Total Deaths              215 non-null    int32  
 4   New Deaths                215 non-null    int32  
 5   Total Recovered           215 non-null    int32  
 6   Active Cases              215 non-null    int32  
 7   Serious Cases             215 non-null    int32  
 8   Total Cases Per Million   215 non-null    float64
 9   Total Deaths Per Million  215 non-null    float64
 10  Total Tests               215 non-null    int32  
 11  Total Tests Per Million   215 non-null    float64
dtypes: float64(3), int32(8), object(1)
memory usage: 13.6+ KB
None


In [47]:
# Preview the first ten rows after the dtype conversion.
df.head(10)

Unnamed: 0,Country,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,Active Cases,Serious Cases,Total Cases Per Million,Total Deaths Per Million,Total Tests,Total Tests Per Million
0,World,3402018,3545,239622,174,1083901,2078495,51366,436.0,30.7,-1,-1.0
1,USA,1131492,462,65776,23,161563,904153,16481,3418.0,199.0,6699878,20241.0
2,Spain,242988,0,24824,0,142450,75714,2500,5197.0,531.0,1528833,32699.0
3,Italy,207428,0,28236,0,78249,100943,1578,3431.0,467.0,2053425,33962.0
4,UK,177454,0,27510,0,-2,149600,1559,2614.0,405.0,1023824,15082.0
5,France,167346,0,24594,0,50212,92540,3878,2564.0,377.0,1100228,16856.0
6,Germany,164077,0,6736,0,129000,28341,2189,1958.0,80.0,2547052,30400.0
7,Turkey,122392,0,3258,0,53808,65326,1480,1451.0,39.0,1075048,12747.0
8,Russia,114431,0,1169,0,13220,100042,2300,784.0,8.0,3700000,25354.0
9,Iran,95646,0,6091,0,76318,13237,2899,1139.0,73.0,475023,5656.0
