In [25]:
library(httr)

url <- "https://en.wikipedia.org/wiki/COVID-19_testing"
response <- GET(url)
response


Response [https://en.wikipedia.org/wiki/COVID-19_testing]
  Date: 2026-02-03 00:30
  Status: 200
  Content-Type: text/html; charset=UTF-8
  Size: 1.39 MB
<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-fea...
<head>
<meta charset="UTF-8">
<title>COVID-19 testing - Wikipedia</title>
<script>(function(){var className="client-js vector-feature-language-in-heade...
RLSTATE={"ext.globalCssJs.user.styles":"ready","site.styles":"ready","user.st...
<script>(RLQ=window.RLQ||[]).push(function(){mw.loader.impl(function(){return...
}];});});</script>
<link rel="stylesheet" href="/w/load.php?lang=en&amp;modules=ext.cite.styles%...
...

In [26]:
library(rvest)

html_page <- read_html(response)
tables <- html_page %>% html_table(fill = TRUE)
length(tables)


In [27]:
for (i in seq_along(tables)) {
  cat("\nTable", i, ":\n")
  print(colnames(tables[[i]]))
}



Table 1 :
[1] "X1"

Table 2 :
[1] "Samples source" "Positive rate" 

Table 3 :
[1] "Country or region"       "Date[a]"                
[3] "Tested"                  "Units[b]"               
[5] "Confirmed(cases)"        "Confirmed /tested,%"    
[7] "Tested /population,%"    "Confirmed /population,%"
[9] "Ref."                   

Table 4 :
  [1] "vteCOVID-19 pandemic" "vteCOVID-19 pandemic" ""                    
  [4] ""                     ""                     ""                    
  [7] ""                     ""                     ""                    
 [10] ""                     ""                     ""                    
 [13] ""                     ""                     ""                    
 [16] ""                     ""                     ""                    
 [19] ""                     ""                     ""                    
 [22] ""                     ""                     ""                    
 [25] ""                     ""                     "" 

In [45]:
covid_df <- tables[[3]]
head(covid_df)


Country or region,Date[a],Tested,Units[b],Confirmed(cases),"Confirmed /tested,%","Tested /population,%","Confirmed /population,%",Ref.
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
Afghanistan,17 Dec 2020,154767,samples,49621,32.1,0.4,0.13,[248]
Albania,18 Feb 2021,428654,samples,96838,22.6,15.0,3.4,[249]
Algeria,2 Nov 2020,230553,samples,58574,25.4,0.53,0.13,[250][251]
Andorra,23 Feb 2022,300307,samples,37958,12.6,387.0,49.0,[252]
Angola,2 Feb 2021,399228,samples,20981,5.3,1.3,0.067,[253]
Antigua and Barbuda,6 Mar 2021,15268,samples,832,5.4,15.9,0.86,[254]


In [46]:
library(dplyr)
library(stringr)

covid_df_clean <- covid_df %>%
  rename(country = `Country or region`) %>%
  mutate(
    across(where(is.character), ~str_trim(.)),
    across(where(is.character), ~gsub(",", "", .))
  )



In [47]:
write.csv(covid_df_clean, "covid_cleaned.csv", row.names = FALSE)


In [48]:
covid_subset <- covid_df_clean %>%
  select(
    country,
    Tested,
    `Confirmed(cases)`,
    `Tested /population,%`,
    `Confirmed /population,%`
  )

head(covid_subset)



country,Tested,Confirmed(cases),"Tested /population,%","Confirmed /population,%"
<chr>,<chr>,<chr>,<chr>,<chr>
Afghanistan,154767,49621,0.4,0.13
Albania,428654,96838,15.0,3.4
Algeria,230553,58574,0.53,0.13
Andorra,300307,37958,387.0,49.0
Angola,399228,20981,1.3,0.067
Antigua and Barbuda,15268,832,15.9,0.86


In [49]:
covid_subset <- covid_df_clean %>%
  select(
    country,
    Tested,
    `Confirmed(cases)`,
    `Tested /population,%`,
    `Confirmed /population,%`
  )

head(covid_subset)


country,Tested,Confirmed(cases),"Tested /population,%","Confirmed /population,%"
<chr>,<chr>,<chr>,<chr>,<chr>
Afghanistan,154767,49621,0.4,0.13
Albania,428654,96838,15.0,3.4
Algeria,230553,58574,0.53,0.13
Andorra,300307,37958,387.0,49.0
Angola,399228,20981,1.3,0.067
Antigua and Barbuda,15268,832,15.9,0.86


In [50]:
sort(unique(covid_df_clean$country))


In [51]:
grep("^United", covid_df_clean$country, value = TRUE)


In [52]:
covid_df_clean %>%
  filter(country %in% c("United States", "India")) %>%
  select(
    country,
    Tested,
    `Confirmed(cases)`,
    `Confirmed /population,%`
  )


country,Tested,Confirmed(cases),"Confirmed /population,%"
<chr>,<chr>,<chr>,<chr>
India,866177937,43585554,31.7
United States,929349291,90749469,27.4


In [53]:
covid_df_clean %>%
  arrange(desc(`Confirmed /population,%`)) %>%
  slice(1) %>%
  select(country, `Confirmed /population,%`)


country,"Confirmed /population,%"
<chr>,<chr>
Nepal,Formatting error: invalid input when rounding


In [54]:
covid_df_clean %>%
  filter(`Confirmed /population,%` < 1) %>%
  select(country)


country
<chr>
Afghanistan
Algeria
Angola
Antigua and Barbuda
Bangladesh
Benin
Brunei
Burkina Faso
Burundi
Cambodia
