In [75]:
%pip install beautifulsoup4


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [76]:
# Import the necessary libraries
## Extracting and processing country indicators from https://data.un.org/en/index.html
import requests #for extracting web content
import pandas as pd, numpy as np #for data manipulation
from bs4 import BeautifulSoup #for parsing web content

In [77]:
#import country names data
# The default NA parsing of read_csv turns the literal text "NA" into NaN, and "NA"
# is also the ISO alpha-2 code of Namibia. Restricting the NA markers to the empty
# string keeps such a code as text, while blank fields are still read as missing.
# NOTE(review): other markers ("NULL", "N/A", ...) are now kept as text in every
# column - confirm Countries1.csv does not rely on them.
country_names = pd.read_csv("Countries1.csv", keep_default_na=False, na_values=[""])

#extract the Alpha2 column, converting values to lower case
country_codes = country_names['Alpha2'].str.lower()

In [78]:
#define target fields to extract
# Maps the indicator label to the short field name used for it in the scraped records.
# The GDP entry is spelled "gdp" (lower case) so that it agrees with the "gdp" key
# used by extract_target_indicators and by the cleaning step.
TARGET_FIELDS = {
    "Region": "region",
    "Population (000, 2025)": "population_th",
    "GDP: Gross domestic product (million current US$)": "gdp",
    "Life expectancy at birth (females/males, years)": "life_expectancy"
}

In [79]:
def _iter_row_cells(table, min_cells):
    """Yield the stripped <td> texts of each row of `table` having at least `min_cells` cells."""
    for row in table.find_all("tr"):
        cells = [td.get_text(strip=True) for td in row.find_all("td")]
        if len(cells) >= min_cells:
            yield cells


def extract_target_indicators(tables):
    """Pull region, population, GDP and life expectancy out of a country's tables.

    Parameters
    ----------
    tables : sequence of table elements
        Objects exposing ``find_all("tr")`` whose rows expose ``find_all("td")``.
        Position 0 is read as the general-info table, position 1 as the economic
        table and position 2 as the social table.

    Returns
    -------
    dict
        Keys ``region``, ``population_th``, ``gdp`` and ``life_expectancy``.
        Values are the raw cell text (no numeric conversion), or ``None`` when
        no matching row was found or the corresponding table is absent.

    Notes
    -----
    Rows are matched by a case-insensitive substring of the first cell; when
    several rows match, the last one wins. The value is always taken from the
    third cell (index 2).
    """
    result = {
        "region": None,
        "population_th": None,
        "gdp": None,
        "life_expectancy": None
    }

    # Pad with None so that a page exposing fewer than three tables produces a
    # partial record instead of raising IndexError and aborting the whole scrape.
    padded = list(tables or []) + [None, None, None]
    general_table, econ_table, social_table = padded[:3]

    # ---- GENERAL INFO TABLE ----
    if general_table is not None:
        for cells in _iter_row_cells(general_table, 3):
            label = cells[0].lower()

            if "region" in label:
                result["region"] = cells[2]

            if "population" in label:
                result["population_th"] = cells[2]

    # ---- ECONOMIC TABLE ----
    if econ_table is not None:
        for cells in _iter_row_cells(econ_table, 4):
            if "gross domestic product" in cells[0].lower():
                # third cell; noted as the 2020 column by the original author
                result["gdp"] = cells[2]

    # ---- SOCIAL TABLE ----
    if social_table is not None:
        for cells in _iter_row_cells(social_table, 4):
            if "life expectancy" in cells[0].lower():
                # third cell; noted as the 2020 column by the original author
                result["life_expectancy"] = cells[2]

    return result


In [80]:
def extract_tables(url):
    """Download `url` and return every <table> element carrying the "pure-table" class.

    The request is sent with a browser-like User-Agent and a 10 second timeout.
    Returns None, after printing a message, when the response status is not 200
    or when any exception is raised while fetching or parsing; otherwise returns
    the result of BeautifulSoup's find_all, which may contain no tables.
    """
    request_headers = {"User-Agent": "Mozilla/5.0"}

    try:
        page = requests.get(url, headers=request_headers, timeout=10)

        if page.status_code == 200:
            parsed = BeautifulSoup(page.text, "html.parser")
            return parsed.find_all("table", class_="pure-table")

        # Every non-200 status is reported with the same "not found" wording.
        print(f"Skipping (not found): {url}")
        return None

    except Exception as err:
        # Best effort: a failing page must not stop the caller's loop.
        print(f"Error loading {url}: {err}")
        return None


In [81]:
# One record (dict) per country whose page yielded at least one table.
all_country_data = []

# The page URLs are built from the lower-cased two-letter codes.
iso_codes_lower = country_names["Alpha2"].astype(str).str.lower()

for country_name, iso_code in zip(country_names["Country"], iso_codes_lower):
    print(f"Scraping {country_name}...")

    url = f"https://data.un.org/en/iso/{iso_code}.html"
    tables = extract_tables(url)

    # None (failed request) and an empty result are both skipped.
    if tables:
        indicators = extract_target_indicators(tables)
        indicators.update(country=country_name, iso_alpha2=iso_code.upper())
        all_country_data.append(indicators)

# Build the frame once from the collected records.
all_country_data_df = pd.DataFrame(all_country_data)

Scraping Afghanistan...
Scraping Albania...
Scraping Algeria...
Scraping American Samoa...
Scraping Andorra...
Scraping Angola...
Scraping Anguilla...
Scraping Antigua and Barbuda...
Scraping Argentina...
Scraping Armenia...
Scraping Aruba...
Scraping Australia...
Scraping Austria...
Scraping Azerbaijan...
Scraping Bahamas...
Scraping Bahrain...
Scraping Bangladesh...
Scraping Barbados...
Scraping Belarus...
Scraping Belgium...
Scraping Belize...
Scraping Benin...
Scraping Bermuda...
Scraping Bhutan...
Scraping Bolivia (Plurinational State of)...
Scraping Bonaire, Sint Eustatius and Saba...
Scraping Bosnia and Herzegovina...
Scraping Botswana...
Scraping Brazil...
Scraping British Virgin Islands...
Scraping Brunei Darussalam...
Scraping Bulgaria...
Scraping Burkina Faso...
Scraping Burundi...
Scraping Cabo Verde...
Scraping Cambodia...
Scraping Cameroon...
Scraping Canada...
Scraping Cayman Islands...
Scraping Central African Republic...
Scraping Chad...
Scraping Channel Islands...
Scr

In [84]:
# Preview the first rows of all_country_data_df.
# NOTE(review): this cell carries execution count 84 while the cleaning cell further
# down carries 83, and the saved output already shows the life_expectancy_female /
# life_expectancy_male columns that only the cleaning cell creates. It was therefore
# last run after cleaning; on a top-to-bottom run it prints the uncleaned frame.
print(all_country_data_df.head())

            region  population_th       gdp         country iso_alpha2  \
0    Southern Asia        43844.0   19983.0     Afghanistan         AF   
1  Southern Europe         2772.0   15163.0         Albania         AL   
2  Northern Africa        47435.0  158975.0         Algeria         DZ   
3        Polynesia           46.0       NaN  American Samoa         AS   
4  Southern Europe           83.0    2891.0         Andorra         AD   

   life_expectancy_female  life_expectancy_male  
0                    64.7                  58.5  
1                    80.3                  75.4  
2                    74.9                  71.7  
3                    75.7                  70.0  
4                    83.8                  75.8  


In [83]:
#Clean and save the data
import re
from pathlib import Path


def _to_number(raw, keep):
    """Parse scraped text as numbers.

    raw:  Series of scraped values; converted to str before cleaning.
    keep: body of a regex character class naming the characters to retain
          (e.g. "0-9."); every other character, spaces included, is deleted.
    Returns a numeric Series in which unparseable values are NaN.
    """
    stripped = raw.astype(str).str.replace(f"[^{keep}]", "", regex=True)
    return pd.to_numeric(stripped, errors="coerce")


# Build a derived frame and rebind the name only at the end, so a failure part-way
# through cannot leave all_country_data_df half-cleaned.
# The population pattern keeps "." like the GDP one: dropping it turned "0.5" into 5
# and, when this cell was run a second time, the already numeric 43844.0 into 438440.
cleaned_df = all_country_data_df.assign(
    population_th=_to_number(all_country_data_df["population_th"], "0-9."),
    gdp=_to_number(all_country_data_df["gdp"], "0-9.-"),
)

# Split the "female/male" text into two numeric columns. The guard makes the cell
# re-runnable after "life_expectancy" has already been replaced.
if "life_expectancy" in cleaned_df.columns:
    life_split = (
        cleaned_df["life_expectancy"]
        .astype(str)
        .str.split("/", expand=True)
        .reindex(columns=[0, 1])  # both halves exist even if no value contains "/"
    )

    cleaned_df = cleaned_df.assign(
        life_expectancy_female=_to_number(life_split[0], "0-9."),
        life_expectancy_male=_to_number(life_split[1], "0-9."),
    ).drop(columns=["life_expectancy"])

all_country_data_df = cleaned_df


print(all_country_data_df.head())

# to_csv does not create missing folders, so make sure the target directory exists.
output_path = Path("data/cleaned_country_xlsx/all_country_data.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
all_country_data_df.to_csv(output_path, index=False)


            region  population_th       gdp         country iso_alpha2  \
0    Southern Asia        43844.0   19983.0     Afghanistan         AF   
1  Southern Europe         2772.0   15163.0         Albania         AL   
2  Northern Africa        47435.0  158975.0         Algeria         DZ   
3        Polynesia           46.0       NaN  American Samoa         AS   
4  Southern Europe           83.0    2891.0         Andorra         AD   

   life_expectancy_female  life_expectancy_male  
0                    64.7                  58.5  
1                    80.3                  75.4  
2                    74.9                  71.7  
3                    75.7                  70.0  
4                    83.8                  75.8  
