In [106]:
import pandas as pd
import requests

In [108]:
# load country longitude and latitude coordinates from csv
# https://developers.google.com/public-data/docs/canonical/countries_csv 
countries = "countries_coordinates.csv"
df_countries = pd.read_csv(countries)
df_countries

Unnamed: 0,country_abbr,latitude,longitude,country_name
0,AD,42.546245,1.601554,Andorra
1,AE,23.424076,53.847818,United Arab Emirates
2,AF,33.939110,67.709953,Afghanistan
3,AG,17.060816,-61.796428,Antigua and Barbuda
4,AI,18.220554,-63.068615,Anguilla
...,...,...,...,...
239,YE,15.552727,48.516388,Yemen
240,YT,-12.827500,45.166244,Mayotte
241,ZA,-30.559482,22.937506,South Africa
242,ZM,-13.133897,27.849332,Zambia


In [110]:
# check if the latitude and longitude values from df_countries are correctly passed to the get_air_quality function
# Modify the apply() function to print the values before they are passed:

df_countries[['avg_pm10', 'avg_pm2_5', 'avg_no2', 'avg_so2', 'avg_o3']] = df_countries.apply(
    lambda row: print(f"Fetching data for: {row['country_name']} - Lat: {row['latitude']}, Lon: {row['longitude']}") or get_air_quality(row['latitude'], row['longitude']), 
    axis=1, result_type='expand'
)

Fetching data for: Andorra - Lat: 42.546245, Lon: 1.601554
API request for coordinates: Lat=42.546245, Lon=1.601554
Received data for Lat=42.546245, Lon=1.601554: {'latitude': 42.5, 'longitude': 1.6000004, 'generationtime_ms': 0.0020265579223632812, 'utc_offset_seconds': 3600, 'timezone': 'Europe/Andorra', 'timezone_abbreviation': 'GMT+1', 'elevation': 1943.0}
Fetching data for: United Arab Emirates - Lat: 23.424076, Lon: 53.847818
API request for coordinates: Lat=23.424076, Lon=53.847818
Received data for Lat=23.424076, Lon=53.847818: {'latitude': 23.400002, 'longitude': 53.800003, 'generationtime_ms': 0.0026226043701171875, 'utc_offset_seconds': 14400, 'timezone': 'Asia/Dubai', 'timezone_abbreviation': 'GMT+4', 'elevation': 155.0}
Fetching data for: Afghanistan - Lat: 33.93911, Lon: 67.709953
API request for coordinates: Lat=33.93911, Lon=67.709953
Received data for Lat=33.93911, Lon=67.709953: {'latitude': 33.9, 'longitude': 67.7, 'generationtime_ms': 0.0025033950805664062, 'utc_off

In [104]:
# Retrieve average air quality from https://air-quality-api.open-meteo.com per country for 2023

def get_air_quality(lat, lon):
    """Fetches air quality data for each countries latitude and longitude."""
    print(f"API request for coordinates: Lat={lat}, Lon={lon}")  # Debugging statement
    url1 = "https://air-quality-api.open-meteo.com/v1/air-quality"
    params = {
        "latitude": lat,
        "longitude": lon,
        "start_date": "2023-01-01",
        "end_date": "2023-12-31",
        "daily": ["pm10", "pm2_5", "no2", "so2", "o3"],  # Sadly, every country has a different timely resolution. How can I solve this issue?
        "timezone": "auto"
    }
    response = requests.get(url1, params=params)
    
    if response.status_code == 200:
        data = response.json()
        print(f"Received data for Lat={lat}, Lon={lon}: {data}")  # Print API response
        daily_data = data.get("daily", {})

        # Ensure pollutant data is available before computing averages
        # pollutants = ["pm10", "pm2_5", "no2", "so2", "o3"]
        # averages = []
        # for pollutant in pollutants:
        #    values = daily_data.get(pollutant, [])
        #    avg_value = sum(values) / len(values) if values else None
        #    averages.append(avg_value)
        # return tuple(averages)
        
        # Calculate annual averages for each pollutant
        avg_pm10 = sum(daily_data["pm10"]) / len(daily_data["pm10"]) if "pm10" in daily_data else None
        avg_pm2_5 = sum(daily_data["pm2_5"]) / len(daily_data["pm2_5"]) if "pm2_5" in daily_data else None
        avg_no2 = sum(daily_data["no2"]) / len(daily_data["no2"]) if "no2" in daily_data else None
        avg_so2 = sum(daily_data["so2"]) / len(daily_data["so2"]) if "so2" in daily_data else None
        avg_o3 = sum(daily_data["o3"]) / len(daily_data["o3"]) if "o3" in daily_data else None
        
        return avg_pm10, avg_pm2_5, avg_no2, avg_so2, avg_o3
        
    else:
        print(f"Error fetching data for lat={lat}, lon={lon}: {response.status_code}")
        return None, None, None, None, None

    # Log Missing or Empty Data:
    if not daily_data:
        print(f"No air quality data for lat={lat}, lon={lon}")
        
# Add air quality data to the countries dataframe
df_countries[['avg_pm10', 'avg_pm2_5', 'avg_no2', 'avg_so2', 'avg_o3']] = df_countries.apply(
    lambda row: get_air_quality(row['latitude'], row['longitude']), axis=1, result_type='expand'
)

# Save results to CSV
df_countries.to_csv("countries_air_quality_2023.csv", index=False)

print("Data collection complete. Results saved to air_quality_2023.csv")

API request for coordinates: Lat=42.546245, Lon=1.601554
Received data for Lat=42.546245, Lon=1.601554: {'latitude': 42.5, 'longitude': 1.6000004, 'generationtime_ms': 0.0030994415283203125, 'utc_offset_seconds': 3600, 'timezone': 'Europe/Andorra', 'timezone_abbreviation': 'GMT+1', 'elevation': 1943.0}
API request for coordinates: Lat=23.424076, Lon=53.847818
Received data for Lat=23.424076, Lon=53.847818: {'latitude': 23.400002, 'longitude': 53.800003, 'generationtime_ms': 0.0022649765014648438, 'utc_offset_seconds': 14400, 'timezone': 'Asia/Dubai', 'timezone_abbreviation': 'GMT+4', 'elevation': 155.0}
API request for coordinates: Lat=33.93911, Lon=67.709953
Received data for Lat=33.93911, Lon=67.709953: {'latitude': 33.9, 'longitude': 67.7, 'generationtime_ms': 0.0030994415283203125, 'utc_offset_seconds': 16200, 'timezone': 'Asia/Kabul', 'timezone_abbreviation': 'GMT+4:30', 'elevation': 3777.0}
API request for coordinates: Lat=17.060816, Lon=-61.796428
Received data for Lat=17.060816

In [122]:
# Retrieve average air quality from https://air-quality-api.open-meteo.com per country for 2023

def get_air_quality(lat, lon):
    """Fetches air quality data for given latitude and longitude dynamically handling different resolutions."""
    print(f"Fetching air quality data for: Lat={lat}, Lon={lon}")

    url = "https://air-quality-api.open-meteo.com/v1/air-quality"
    
    params_daily = {
        "latitude": lat,
        "longitude": lon,
        "start_date": "2023-01-01",
        "end_date": "2023-12-31",
        "daily": ["pm10", "pm2_5", "no2", "so2", "o3"],
        "timezone": "UTC"
    }
    
    response = requests.get(url, params=params_daily)
    data = response.json()

    # Check if "daily" data exists
    if "daily" in data and any(data["daily"].get(p) for p in ["pm10", "pm2_5", "no2", "so2", "o3"]):
        print("✅ Using daily data")
        daily_data = data["daily"]
        pollutants = ["pm10", "pm2_5", "no2", "so2", "o3"]
        averages = [sum(daily_data.get(p, [])) / len(daily_data.get(p, [])) if daily_data.get(p) else None for p in pollutants]
        return tuple(averages)

    # If no daily data, try hourly
    print("⚠️ No daily data. Trying hourly data instead...")
    params_hourly = {
        "latitude": lat,
        "longitude": lon,
        "start_date": "2023-01-01",
        "end_date": "2023-12-31",
        "hourly": ["pm10", "pm2_5", "no2", "so2", "o3"],
        "timezone": "UTC"
    }
    
    response = requests.get(url, params=params_hourly)
    data = response.json()

    if "hourly" in data and any(data["hourly"].get(p) for p in ["pm10", "pm2_5", "no2", "so2", "o3"]):
        print("✅ Using hourly data")
        hourly_data = data["hourly"]
        pollutants = ["pm10", "pm2_5", "no2", "so2", "o3"]
        
        # Compute daily averages from hourly data
        averages = []
        for pollutant in pollutants:
            values = hourly_data.get(pollutant, [])
            avg_value = sum(values) / len(values) if values else None
            averages.append(avg_value)
        
        return tuple(averages)

    print(f"❌ No air quality data found for Lat={lat}, Lon={lon}")
    return None, None, None, None, None

# Example: Processing all countries in the dataframe
df_countries[['avg_pm10', 'avg_pm2_5', 'avg_no2', 'avg_so2', 'avg_o3']] = df_countries.apply(
    lambda row: get_air_quality(row['latitude'], row['longitude']), axis=1, result_type='expand'
)

# Save results to CSV
df_countries.to_csv("air_quality_2023.csv", index=False)
print("Data collection complete. Results saved to air_quality_2023.csv")


Fetching air quality data for: Lat=42.546245, Lon=1.601554
⚠️ No daily data. Trying hourly data instead...
❌ No air quality data found for Lat=42.546245, Lon=1.601554
Fetching air quality data for: Lat=23.424076, Lon=53.847818
⚠️ No daily data. Trying hourly data instead...
❌ No air quality data found for Lat=23.424076, Lon=53.847818
Fetching air quality data for: Lat=33.93911, Lon=67.709953
⚠️ No daily data. Trying hourly data instead...
❌ No air quality data found for Lat=33.93911, Lon=67.709953
Fetching air quality data for: Lat=17.060816, Lon=-61.796428
⚠️ No daily data. Trying hourly data instead...
❌ No air quality data found for Lat=17.060816, Lon=-61.796428
Fetching air quality data for: Lat=18.220554, Lon=-63.068615
⚠️ No daily data. Trying hourly data instead...
❌ No air quality data found for Lat=18.220554, Lon=-63.068615
Fetching air quality data for: Lat=41.153332, Lon=20.168331
⚠️ No daily data. Trying hourly data instead...
❌ No air quality data found for Lat=41.153332, 

In [118]:
# Wait 2 seconds between each request
import time
time.sleep(2)

In [120]:
# Retrieve average air quality from https://air-quality-api.open-meteo.com per country for 2023

def get_air_quality(lat, lon):
    """Fetches air quality data, ensuring we capture any missing or partial data issues."""
    print(f"Fetching data for: Lat={lat}, Lon={lon}")

    url = "https://air-quality-api.open-meteo.com/v1/air-quality"

    params = {
        "latitude": lat,
        "longitude": lon,
        "start_date": "2023-01-01",
        "end_date": "2023-12-31",
        "daily": ["pm10", "pm2_5", "no2", "so2", "o3"],
        "timezone": "UTC"
    }

    response = requests.get(url, params=params)
    if response.status_code != 200:
        print(f"❌ Error {response.status_code}: No data for {lat}, {lon}")
        return None, None, None, None, None

    data = response.json()
    print(f"📊 Full response for {lat}, {lon}: {data}")

    if "daily" in data and any(p in data["daily"] for p in ["pm10", "pm2_5", "no2", "so2", "o3"]):
        print(f"✅ Using daily data for {lat}, {lon}")
        daily_data = data["daily"]
        pollutants = ["pm10", "pm2_5", "no2", "so2", "o3"]
        averages = [sum(daily_data.get(p, [])) / len(daily_data.get(p, [])) if daily_data.get(p) else None for p in pollutants]
        return tuple(averages)
    
    print(f"⚠️ No daily data for {lat}, {lon}. Trying hourly...")
    params["hourly"] = ["pm10", "pm2_5", "no2", "so2", "o3"]
    del params["daily"]

    response = requests.get(url, params=params)
    data = response.json()
    print(f"📊 Full hourly response for {lat}, {lon}: {data}")

    if "hourly" in data and any(p in data["hourly"] for p in ["pm10", "pm2_5", "no2", "so2", "o3"]):
        print(f"✅ Using hourly data for {lat}, {lon}")
        hourly_data = data["hourly"]
        pollutants = ["pm10", "pm2_5", "no2", "so2", "o3"]
        averages = [sum(hourly_data.get(p, [])) / len(hourly_data.get(p, [])) if hourly_data.get(p) else None for p in pollutants]
        return tuple(averages)

    print(f"❌ No air quality data for {lat}, {lon}")
    return None, None, None, None, None


In [98]:
df_countries

Unnamed: 0,country_abbr,latitude,longitude,country_name,avg_pm10,avg_pm2_5,avg_no2,avg_so2,avg_o3
0,AD,42.546245,1.601554,Andorra,,,,,
1,AE,23.424076,53.847818,United Arab Emirates,,,,,
2,AF,33.939110,67.709953,Afghanistan,,,,,
3,AG,17.060816,-61.796428,Antigua and Barbuda,,,,,
4,AI,18.220554,-63.068615,Anguilla,,,,,
...,...,...,...,...,...,...,...,...,...
239,YE,15.552727,48.516388,Yemen,,,,,
240,YT,-12.827500,45.166244,Mayotte,,,,,
241,ZA,-30.559482,22.937506,South Africa,,,,,
242,ZM,-13.133897,27.849332,Zambia,,,,,


In [124]:
# Verify If the API Is Providing Data for the Specified Dates
# Some locations may not have complete air quality records for all of 2023. Try a shorter date range.

In [128]:
# This approach does not work at all, because OpenMeteo gives AQI values for the specific location with in the country.
# Rather use Webscraping of IQAIR homepage, which gives a list of averaged data per country for 2023.

import requests
from bs4 import BeautifulSoup
import csv

url2 = "https://www.iqair.com/us/world-most-polluted-countries"

In [133]:
# Function for web scraping IQAIR for air pollution values:

def scrape_iqair_pm25():
    url2 = "https://www.iqair.com/us/world-most-polluted-countries"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
    
    response = requests.get(url2, headers=headers)
    if response.status_code != 200:
        print(f"Failed to retrieve data: {response.status_code}")
        return
    
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find the table containing pollution data
    table = soup.find("table")
    if not table:
        print("Could not find the data table.")
        return
    
    # Extract table headers (optional)
    headers = [header.text.strip() for header in table.find_all("th")]
    
    # Extract table rows
    rows = table.find_all("tr")[1:]  # Skip header row
    
    data = []
    for row in rows:
        cols = row.find_all("td")
        if len(cols) >= 2:  # Ensure at least country and PM2.5 value exist
            country = cols[1].text.strip()
            pm25_value = cols[2].text.strip()
            data.append([country, pm25_value])
    
    # Save to CSV
    with open("iqair_pm25_2023.csv", "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["Country/Region", "PM2.5 (µg/m³)"])
        writer.writerows(data)
    
    print("Data successfully scraped and saved to iqair_pm25_2023.csv")

if __name__ == "__main__":
    scrape_iqair_pm25()


Data successfully scraped and saved to iqair_pm25_2023.csv


In [137]:
# Obtain Population density data:

# URL of the website
url3 = 'https://database.earth/population/density/2023'

# Send a GET request to the website
response = requests.get(url3)
response.raise_for_status()  # Check for request errors

# Parse the HTML content
soup = BeautifulSoup(response.text, 'html.parser')

# Find the table containing the data
table = soup.find('table')

# Initialize a list to store the data
data = []

# Extract table headers
headers = ['Country', 'Population Density (people/km²)']

# Iterate over table rows
for row in table.find_all('tr')[1:]:  # Skip the header row
    cols = row.find_all('td')
    if len(cols) >= 3:
        country = cols[1].text.strip()
        density = cols[2].text.strip()
        data.append([country, density])

# Save the data to a CSV file
csv_filename = 'population_density_2023.csv'
with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(headers)  # Write headers
    writer.writerows(data)  # Write data

print(f'Data saved to {csv_filename}')

Data saved to population_density_2023.csv


In [145]:
# Urbanization Rate - Web scraping:

# URL of the website
url4 = "https://worldpopulationreview.com/country-rankings/most-urbanized-countries"

# Send GET request
headers = {"User-Agent": "Mozilla/5.0"}  # Adding headers to mimic a browser request
response = requests.get(url4, headers=headers)

# Check if request was successful
if response.status_code == 200:
    # Parse the page content
    soup = BeautifulSoup(response.text, "html.parser")
    
    # Find the table containing the data
    table = soup.find("table")
    
    # Extract table headers
    headers = [header.text.strip() for header in table.find_all("th")]
    
    # Extract table rows
    data = []
    for row in table.find_all("tr")[1:]:  # Skip header row
        cols = row.find_all("td")
        if cols:
            country = cols[0].text.strip()
            urbanization_rate = cols[1].text.strip().replace("%", "")  # Remove percentage sign
            data.append([country, float(urbanization_rate)])
    
    # Convert to DataFrame
    df = pd.DataFrame(data, columns=["Country", "Urbanization Rate (%)"])
    
    # Display the first few rows
    print(df.head())
    
    # Save to CSV (optional)
    df.to_csv("urbanization_rates_2023.csv", index=False)
    
else:
    print("Failed to retrieve data, status code:", response.status_code)

# Does not work, cause there is no table, but a map, which cannot be scraped

IndexError: list index out of range