In [1]:
from bs4 import BeautifulSoup

import requests

import pandas as pd

import re

In [2]:
wiki_url = 'https://en.wikipedia.org/wiki/COVID-19_pandemic_by_country_and_territory'
table_id = 'thetable'
response=requests.get(wiki_url)
soup = BeautifulSoup(response.text, 'html.parser')

In [3]:
covidtable = soup.find('table', attrs={'id':table_id})
df = pd.read_html(str(covidtable))
df = pd.DataFrame(df[0])

In [4]:
print(df.head())

         Location[a]                       Cases[b] Deaths[c]  Recov.[d]  \
  Unnamed: 0_level_1           World[e] 102,303,716 2,212,694 56,584,997   
0                NaN   United States[f]    26208291    442046   11166500   
1                NaN              India    10733131    154147   10409160   
2                NaN             Brazil     9130790    222926    7960643   
3                NaN          Russia[g]     3832080     72697    3837550   
4                NaN  United Kingdom[h]     3796088    105571    No data   

       Ref.  
        [4]  
0      [16]  
1      [17]  
2  [18][19]  
3      [20]  
4      [22]  


In [5]:
#now we clean the data. First by removing the Reference column then by removing the "NaN" value which resulted from the images on Wikipedia
del df["Ref."]

df = df.drop([238,239])
df = df.dropna(1)

print(df.head())

         Location[a]    Cases[b] Deaths[c]  Recov.[d]
            World[e] 102,303,716 2,212,694 56,584,997
0   United States[f]    26208291    442046   11166500
1              India    10733131    154147   10409160
2             Brazil     9130790    222926    7960643
3          Russia[g]     3832080     72697    3837550
4  United Kingdom[h]     3796088    105571    No data


In [6]:
#the references (little letters in brackets) were a little annoying to remove, but it was possible to do so by using regular expressions (regex)
df = df.replace("\[(.*?]*)\]", "", regex=True)
print(df.head())

      Location[a]    Cases[b] Deaths[c]  Recov.[d]
         World[e] 102,303,716 2,212,694 56,584,997
0   United States    26208291    442046   11166500
1           India    10733131    154147   10409160
2          Brazil     9130790    222926    7960643
3          Russia     3832080     72697    3837550
4  United Kingdom     3796088    105571    No data


In [7]:
#now we rename the column names manually since references were not removed from them
df.columns = ['Location', 'Cases', 'Deaths', 'Recovery']

#While renaming, the "world" row vanished. But I deduced that it was because it actually part of the column name string since the "[e]" in "World" was not removed and because it did not have a row ID

df.head()

Unnamed: 0,Location,Cases,Deaths,Recovery
0,United States,26208291,442046,11166500
1,India,10733131,154147,10409160
2,Brazil,9130790,222926,7960643
3,Russia,3832080,72697,3837550
4,United Kingdom,3796088,105571,No data


In [8]:
pd.option_context('display.max_rows', None, 'display.max_columns', None)
print(df)

                           Location     Cases   Deaths  Recovery
0                     United States  26208291   442046  11166500
1                             India  10733131   154147  10409160
2                            Brazil   9130790   222926   7960643
3                            Russia   3832080    72697   3837550
4                    United Kingdom   3796088   105571   No data
..                              ...       ...      ...       ...
233                  American Samoa         4        0         3
234                           Samoa         2        0         2
235  Federated States of Micronesia         1        0         0
236                         Vanuatu         1        0         1
237                        Tanzania   No data  No data   No data

[238 rows x 4 columns]
