# Notebook 1: Data Loading and Preprocessing
## Introduction
# Loads PM2.5 data from OpenAQ API and weather from Open-Meteo. Merges, cleans (handles missing/outliers), and saves CSV.
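# Justification: Official APIs provide reliable data; the two sources are merged on timestamp for hourly alignment. PM2.5 is capped at 150.5 µg/m³ (the upper edge of the 'Unhealthy' AQI band in the pre-2024 US EPA breakpoints) to limit the influence of rare extreme values while keeping those rows, supporting SDG 11 (urban air quality).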

In [1]:
import pandas as pd
import numpy as np
import requests
import time
import os

# Configuration
# Site identifiers and credentials used by the fetch/merge step below; the
# sensor id and location name also determine the cache file name.
sensors_id = "12178556"  # Singapore PM2.5 Sensor ID
latitude = 1.3521
longitude = 103.8198
location_name = "Singapore"
# Prefer a key supplied through the OPENAQ_API_KEY environment variable so the
# secret does not have to live in the notebook. The literal is kept only as a
# backward-compatible fallback.
# NOTE(review): a key committed to source should be treated as leaked - rotate
# it and remove the fallback once the environment variable is in place.
openaq_api_key = (
    os.environ.get("OPENAQ_API_KEY")
    or "4f1e60991c483fd961169d77137baa593d9568f4fa71585725860294b370bc43"
)
# Output/cache path of the processed dataset (relative to the working directory).
base_processed_file = f'sensor_{sensors_id}_{location_name}_pm25_weather_hourly_data_processed_capped.csv'

# Load-or-build step: reuse the processed CSV when it already exists, otherwise
# fetch hourly PM2.5 (OpenAQ) and weather (Open-Meteo), clean, merge and cache.
# Either branch leaves the result in `merged_df`, indexed by 'timestamp'.
if os.path.exists(base_processed_file):
    print(f"Loading existing processed data from {base_processed_file}")
    merged_df = pd.read_csv(base_processed_file, index_col='timestamp', parse_dates=True)
else:
    # Without a timeout, requests waits indefinitely on a stalled connection.
    REQUEST_TIMEOUT_S = 60

    # --- Fetch PM2.5, one page at a time, until an empty page or an error ---
    all_pm25_records = []
    page = 1
    limit_per_page = 1000
    date_from_str = '2022-05-01'
    date_to_str = '2024-04-30'
    # The URL and headers do not change between pages, so build them once.
    base_url = f"https://api.openaq.org/v3/sensors/{sensors_id}/hours"
    headers = {"X-API-Key": openaq_api_key}
    while True:
        # NOTE(review): confirm the date filter parameter names against the
        # OpenAQ v3 reference for this endpoint; an unrecognised query
        # parameter may be ignored rather than rejected - TODO verify.
        params = {"date_from": date_from_str, "date_to": date_to_str, "limit": limit_per_page, "page": page}
        response = requests.get(base_url, params=params, headers=headers, timeout=REQUEST_TIMEOUT_S)
        if response.status_code != 200:
            # Best effort: keep whatever pages were already collected.
            print(f"API error: {response.text}")
            break
        data = response.json()
        results = data.get('results', [])
        if not results:
            break
        all_pm25_records.extend(
            {'timestamp': result['period']['datetimeFrom']['utc'], 'pm25_value': result['value']}
            for result in results
        )
        page += 1
        time.sleep(2)  # pause between pages (presumably to respect API rate limits)

    if not all_pm25_records:
        # An empty frame has no 'timestamp' column; fail with a clear message
        # instead of an opaque KeyError further down.
        raise RuntimeError(f"No PM2.5 records were fetched for sensor {sensors_id}; cannot build the dataset.")

    # --- Clean PM2.5: UTC index, numeric values, no missing or negative readings ---
    aq_df_raw = pd.DataFrame(all_pm25_records)
    aq_df_raw['timestamp'] = pd.to_datetime(aq_df_raw['timestamp'], utc=True)
    aq_df_raw.set_index('timestamp', inplace=True)
    aq_df_raw.sort_index(inplace=True)
    aq_df_raw['pm25_value'] = pd.to_numeric(aq_df_raw['pm25_value'], errors='coerce')
    aq_df_raw.dropna(inplace=True)
    # .copy() so the capping assignment below writes to an independent frame
    # rather than to a filtered view of the original.
    aq_df_raw = aq_df_raw[aq_df_raw['pm25_value'] >= 0].copy()
    if aq_df_raw.empty:
        # The weather date range below is derived from this index.
        raise RuntimeError(f"No valid PM2.5 values remain for sensor {sensors_id} after cleaning.")

    # Cap outliers
    # Justification (original author): ties the cap to the AQI 'Unhealthy' band
    # so rare high values do not skew the model.
    # NOTE(review): 150.5 looks like the band's upper edge in the pre-2024
    # US EPA breakpoints - confirm.
    PM25_CAP = 150.5
    aq_df_raw['pm25_value'] = aq_df_raw['pm25_value'].clip(upper=PM25_CAP)

    # --- Fetch weather for exactly the date span covered by the PM2.5 data ---
    weather_url = "https://archive-api.open-meteo.com/v1/archive"
    weather_params = {
        "latitude": latitude, "longitude": longitude,
        "start_date": aq_df_raw.index.min().strftime('%Y-%m-%d'),
        "end_date": aq_df_raw.index.max().strftime('%Y-%m-%d'),
        "hourly": "temperature_2m,relative_humidity_2m,wind_speed_10m,wind_direction_10m,precipitation",
        "timezone": "UTC"
    }
    response_weather = requests.get(weather_url, params=weather_params, timeout=REQUEST_TIMEOUT_S)
    # Surface HTTP failures directly instead of a KeyError on 'hourly'.
    response_weather.raise_for_status()
    weather_data = response_weather.json()['hourly']
    weather_df = pd.DataFrame(weather_data)
    weather_df['time'] = pd.to_datetime(weather_df['time'], utc=True)
    weather_df.set_index('time', inplace=True)
    # 'precipitation' already has its final name, so it needs no entry here.
    weather_df.rename(columns={'temperature_2m': 'temp', 'relative_humidity_2m': 'humidity', 'wind_speed_10m': 'wind_speed', 'wind_direction_10m': 'wind_dir'}, inplace=True)

    # Merge and clean
    # Inner join keeps only hours present in both sources.
    merged_df = pd.merge(aq_df_raw, weather_df, left_index=True, right_index=True, how='inner')
    # The two indexes have different names ('timestamp' vs 'time'), so the
    # joined index may come back unnamed. Name it explicitly so the CSV gets
    # a 'timestamp' column that the cache-loading branch above can read.
    merged_df.index.name = 'timestamp'
    merged_df.interpolate(method='linear', limit_direction='both', inplace=True)
    merged_df.dropna(inplace=True)
    merged_df.to_csv(base_processed_file)
    print(f"Saved processed data to {base_processed_file}")


Saved processed data to sensor_12178556_Singapore_pm25_weather_hourly_data_processed_capped.csv
