In [3]:
### Wikipedia allows scraping of its website for demo purposes. So I will be scraping its page on list of countries and dependencies by population. 
### I will extract country name and population.


In [4]:
### First, import necessary libraries

import requests
from bs4 import BeautifulSoup
import pandas as pd


In [5]:
### Second, list the target website's url 

url = "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population"

In [9]:
### Next, will be to extract the content of the above page

import requests
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population"
response = requests.get(url)

if response.status_code == 200:
    soup = BeautifulSoup(response.text, "html.parser")
else:
    print("Request failed:", response.status_code)
    soup = None


Request failed: 403


In [11]:
### Encountered a block from Wikipedia which thinks my code is a bot command.
### To resolve this, I add a user agent to my commands

In [12]:
import requests
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/120.0.0.0 Safari/537.36"
}

response = requests.get(url, headers=headers)

if response.status_code == 200:
    soup = BeautifulSoup(response.text, "html.parser")
    print("Page fetched successfully ✅")
else:
    print("Request failed:", response.status_code)
    soup = None


Page fetched successfully ✅


In [13]:
### Page was fetched successfully.

In [15]:
### Next, 

table = soup.find("table", {"class": "wikitable"})

In [17]:
### To extract rows,
rows = table.find_all("tr")


In [18]:
data = []
for row in rows[1:]:  # skip header
    cols = row.find_all(["td", "th"])
    cols = [col.get_text(strip=True) for col in cols]
    
    if len(cols) >= 3:
        country = cols[1]
        population = cols[2]
        data.append({"Country": country, "Population": population})

In [19]:
### To save my work into a datframe;
df = pd.DataFrame(data)

In [20]:
### Finally, save work in json, csv files
df.to_csv("wikipedia_population.csv", index=False)
df.to_json("wikipedia_population.json", orient="records", indent=2)


In [21]:
### To check the result (the first 10 lines)
print(df.head(10))

         Country Population
0  8,232,000,000       100%
1  1,417,492,000      17.3%
2  1,408,280,000      17.2%
3    340,110,988       4.1%
4    284,438,782       3.5%
5    241,499,431       2.9%
6    223,800,000       2.7%
7    213,421,037       2.6%
8    169,828,911       2.1%
9    146,028,325       1.8%


In [23]:
### This wikipedia site has been scrapped successfully 