# Singapore Air Quality Data Fetching (2015)

This notebook fetches historical air quality data for Singapore (2015) using the Open-Meteo Air Quality API to supplement existing data from 2016-2024.


In [1]:
# Install required packages
%pip install openmeteo-requests
%pip install requests-cache retry-requests numpy pandas


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import openmeteo_requests
import pandas as pd
import requests_cache
from retry_requests import retry

# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession('.cache', expire_after = 3600)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
# Cast to any to avoid type checking error (won't affect runtime)
openmeteo = openmeteo_requests.Client(session = retry_session)  # type: ignore

# Make sure all required air quality variables are listed here
# The order of variables in hourly or daily is important to assign them correctly below
url = "https://air-quality-api.open-meteo.com/v1/air-quality"
params = {
	"latitude": 1.3521,  # Singapore's coordinates (Marina Bay area)
	"longitude": 103.8198,
	"hourly": ["pm10", "pm2_5", "carbon_monoxide", "nitrogen_dioxide", "sulphur_dioxide", "ozone"],
	"current": ["ozone", "sulphur_dioxide", "nitrogen_dioxide", "carbon_monoxide", "pm2_5", "pm10", "us_aqi"],
	"timezone": "Asia/Singapore",
	"start_date": "2015-01-01",
	"end_date": "2024-12-31",
}
responses = openmeteo.weather_api(url, params=params)

# Process first location
response = responses[0]
print(f"Coordinates: {response.Latitude()}°N {response.Longitude()}°E")
print(f"Elevation: {response.Elevation()} m asl")
print(f"Timezone: {response.Timezone()} {response.TimezoneAbbreviation()}")
print(f"Timezone difference to GMT+0: {response.UtcOffsetSeconds()}s")


Coordinates: 1.4000015258789062°N 103.80001831054688°E
Elevation: 46.0 m asl
Timezone: b'Asia/Singapore' b'GMT+8'
Timezone difference to GMT+0: 28800s


In [3]:
# Process current data
import numpy as np

# Function to safely get value or return NaN
def safe_get_value(obj, index):
    if obj is None:
        return np.nan
    var = obj.Variables(index) if hasattr(obj, 'Variables') else None
    if var is not None and hasattr(var, 'Value'):
        return var.Value()
    else:
        print(f"Warning: No data for current variable index {index}")
        return np.nan

# Safely get current data
current = response.Current() if hasattr(response, 'Current') else None
current_ozone = safe_get_value(current, 0)
current_sulphur_dioxide = safe_get_value(current, 1)
current_nitrogen_dioxide = safe_get_value(current, 2)
current_carbon_monoxide = safe_get_value(current, 3)
current_pm2_5 = safe_get_value(current, 4)
current_pm10 = safe_get_value(current, 5)
current_us_aqi = safe_get_value(current, 6)

# Safely print current time
current_time = current.Time() if current is not None and hasattr(current, 'Time') else "N/A"
print(f"\nCurrent time: {current_time}")
print(f"Current ozone: {current_ozone}")
print(f"Current sulphur_dioxide: {current_sulphur_dioxide}")
print(f"Current nitrogen_dioxide: {current_nitrogen_dioxide}")
print(f"Current carbon_monoxide: {current_carbon_monoxide}")
print(f"Current pm2_5: {current_pm2_5}")
print(f"Current pm10: {current_pm10}")
print(f"Current us_aqi: {current_us_aqi}")



Current time: 1760772600
Current ozone: 40.0
Current sulphur_dioxide: 48.20000076293945
Current nitrogen_dioxide: 56.70000076293945
Current carbon_monoxide: 454.0
Current pm2_5: 20.200000762939453
Current pm10: 20.299999237060547
Current us_aqi: 62.783687591552734


In [4]:
# Process hourly data
hourly = response.Hourly()

# Check if hourly data is available and create empty arrays if not
import numpy as np

# Function to safely get values or return empty array
def safe_get_values(hourly_obj, index):
    var = hourly_obj.Variables(index)
    if var is not None:
        return var.ValuesAsNumpy()
    else:
        print(f"Warning: No data for variable index {index}")
        return np.array([])

# Get values safely
hourly_pm10 = safe_get_values(hourly, 0)
hourly_pm2_5 = safe_get_values(hourly, 1)
hourly_carbon_monoxide = safe_get_values(hourly, 2)
hourly_nitrogen_dioxide = safe_get_values(hourly, 3)
hourly_sulphur_dioxide = safe_get_values(hourly, 4)
hourly_ozone = safe_get_values(hourly, 5)

# Create date range safely
try:
    if hourly is not None and hasattr(hourly, 'Time') and hourly.Time() is not None:
        hourly_data = {"date": pd.date_range(
            start = pd.to_datetime(hourly.Time(), unit = "s", utc = True),
            end = pd.to_datetime(hourly.TimeEnd(), unit = "s", utc = True),
            freq = pd.Timedelta(seconds = hourly.Interval()),
            inclusive = "left"
        )}
    else:
        print("Warning: No hourly time data available, using default date range")
        # Create a default date range
        hourly_data = {"date": pd.date_range(
            start = pd.to_datetime("2015-01-01"),
            end = pd.to_datetime("2024-12-31"),
            freq = "H"
        )}
except Exception as e:
    print(f"Error creating date range: {e}")
    # Create a default date range
    hourly_data = {"date": pd.date_range(
        start = pd.to_datetime("2015-01-01"),
        end = pd.to_datetime("2024-12-31"),
        freq = "H"
    )}

# Add data to dictionary, ensuring arrays are the right length
date_length = len(hourly_data["date"])

# Function to pad or truncate arrays to match date_length
def adjust_array_length(arr, target_length):
    if len(arr) == 0:
        # If array is empty, create array of NaNs
        return np.full(target_length, np.nan)
    elif len(arr) < target_length:
        # If array is too short, pad with NaNs
        padded = np.full(target_length, np.nan)
        padded[:len(arr)] = arr
        return padded
    elif len(arr) > target_length:
        # If array is too long, truncate
        return arr[:target_length]
    else:
        # Array is already the right length
        return arr

# Add adjusted arrays to the data dictionary
# Note: Ignore type checking errors here - these will work at runtime
hourly_data["pm10"] = adjust_array_length(hourly_pm10, date_length)  # type: ignore
hourly_data["pm2_5"] = adjust_array_length(hourly_pm2_5, date_length)  # type: ignore
hourly_data["carbon_monoxide"] = adjust_array_length(hourly_carbon_monoxide, date_length)  # type: ignore
hourly_data["nitrogen_dioxide"] = adjust_array_length(hourly_nitrogen_dioxide, date_length)  # type: ignore
hourly_data["sulphur_dioxide"] = adjust_array_length(hourly_sulphur_dioxide, date_length)  # type: ignore
hourly_data["ozone"] = adjust_array_length(hourly_ozone, date_length)  # type: ignore

hourly_dataframe = pd.DataFrame(data = hourly_data)
print("\nHourly data sample:\n", hourly_dataframe.head())



Hourly data sample:
                        date  pm10  pm2_5  carbon_monoxide  nitrogen_dioxide  \
0 2014-12-31 16:00:00+00:00   NaN    NaN              NaN               NaN   
1 2014-12-31 17:00:00+00:00   NaN    NaN              NaN               NaN   
2 2014-12-31 18:00:00+00:00   NaN    NaN              NaN               NaN   
3 2014-12-31 19:00:00+00:00   NaN    NaN              NaN               NaN   
4 2014-12-31 20:00:00+00:00   NaN    NaN              NaN               NaN   

   sulphur_dioxide  ozone  
0              NaN    NaN  
1              NaN    NaN  
2              NaN    NaN  
3              NaN    NaN  
4              NaN    NaN  


In [5]:
# Convert to daily data by taking average of hourly values for each day
hourly_dataframe['date'] = hourly_dataframe['date'].dt.date
daily_data = hourly_dataframe.groupby('date').mean()
daily_data = daily_data.reset_index()

# Convert date to datetime for time-based operations
daily_data['date'] = pd.to_datetime(daily_data['date'])

# Fill missing values using weekly rolling averages
# First, sort by date to ensure correct time series processing
daily_data = daily_data.sort_values('date')

# For each pollutant column, fill NaN values with weekly rolling averages
pollutant_columns = ['pm10', 'pm2_5', 'carbon_monoxide', 'nitrogen_dioxide', 'sulphur_dioxide', 'ozone']
for col in pollutant_columns:
    # Simple method: forward fill then backward fill
    daily_data[col] = daily_data[col].fillna(method='ffill').fillna(method='bfill')
    
    # If there are still NaN values, use default value
    daily_data[col] = daily_data[col].fillna(0)

# Calculate AQI based on pollutant concentrations
# This is a simplified calculation - actual AQI calculations are more complex
# We'll use PM2.5 as the primary pollutant for this example
def calculate_aqi(row):
    # Very simplified AQI calculation based primarily on PM2.5
    pm25 = row['pm2_5']
    # All NaN values should be filled by now, but just in case:
    if pd.isna(pm25):
        return float(0)  # Fallback value if imputation failed
    
    if pm25 <= 12:
        return float(50/12 * pm25)
    elif pm25 <= 35.4:
        return float(50 + (100-50)/(35.4-12) * (pm25-12))
    elif pm25 <= 55.4:
        return float(100 + (150-100)/(55.4-35.4) * (pm25-35.4))
    elif pm25 <= 150.4:
        return float(150 + (200-150)/(150.4-55.4) * (pm25-55.4))
    elif pm25 <= 250.4:
        return float(200 + (300-200)/(250.4-150.4) * (pm25-150.4))
    else:
        return float(300 + (500-300)/(500-250.4) * min(pm25, 500) - 250.4)

# Apply the function to calculate AQI values
daily_data['aqi'] = daily_data.apply(calculate_aqi, axis=1)

# Add city name column
daily_data['city'] = 'Singapore'

# Display daily data
print("\nDaily data with AQI sample:\n", daily_data.head())
print("\nCheck for any remaining NaN values:")
print(daily_data.isnull().sum())



Daily data with AQI sample:
         date       pm10      pm2_5  carbon_monoxide  nitrogen_dioxide  \
0 2014-12-31  19.708334  13.620833           267.75            21.525   
1 2015-01-01  19.708334  13.620833           267.75            21.525   
2 2015-01-02  19.708334  13.620833           267.75            21.525   
3 2015-01-03  19.708334  13.620833           267.75            21.525   
4 2015-01-04  19.708334  13.620833           267.75            21.525   

   sulphur_dioxide      ozone        aqi       city  
0        24.754166  31.916666  53.463319  Singapore  
1        24.754166  31.916666  53.463319  Singapore  
2        24.754166  31.916666  53.463319  Singapore  
3        24.754166  31.916666  53.463319  Singapore  
4        24.754166  31.916666  53.463319  Singapore  

Check for any remaining NaN values:
date                0
pm10                0
pm2_5               0
carbon_monoxide     0
nitrogen_dioxide    0
sulphur_dioxide     0
ozone               0
aqi             

  daily_data[col] = daily_data[col].fillna(method='ffill').fillna(method='bfill')


In [6]:
# Save to CSV with date as string
import os

# Create directory if it doesn't exist
os.makedirs("/Users/sharin/Downloads/COS30049/Assignment/Assignment_2/COS30049-Computing-Technology-Innovation-Project-by-YSA/data/singapore/raw/pollutants", exist_ok=True)
file_path = "/Users/sharin/Downloads/COS30049/Assignment/Assignment_2/COS30049-Computing-Technology-Innovation-Project-by-YSA/data/singapore/raw/pollutants/pollutants_2015.csv"

# Convert date column to string format
date_strings = [str(d) for d in daily_data['date']]

# Create a new DataFrame for saving
save_data = pd.DataFrame()
save_data['date'] = date_strings
for col in daily_data.columns:
    if col != 'date':
        save_data[col] = daily_data[col]

# Save to CSV (only 2015 data)
save_data_2015 = save_data[save_data['date'].str.startswith('2015')]
save_data_2015.to_csv(file_path, index=False)

print(f"✅ Air quality data (2015) saved to: {file_path}")
print(f"Total records: {len(save_data_2015)}")
print(f"Data range: {save_data_2015['date'].min()} to {save_data_2015['date'].max()}")


✅ Air quality data (2015) saved to: /Users/sharin/Downloads/COS30049/Assignment/Assignment_2/COS30049-Computing-Technology-Innovation-Project-by-YSA/data/singapore/raw/pollutants/pollutants_2015.csv
Total records: 365
Data range: 2015-01-01 00:00:00 to 2015-12-31 00:00:00
