# Historical dataset

In [14]:
import os
import requests
import pandas as pd
from datetime import datetime, timedelta, timezone
from google.colab import userdata

# -------------------------------
# CONFIGURATION
# -------------------------------
LAT, LON = 24.8607, 67.0011  # Karachi
API_KEY = userdata.get('OPENWEATHER_API_KEY')

if not API_KEY:
    raise ValueError("‚ö†Ô∏è API key not found. Make sure you added it to Colab Secrets.")
print("API key loaded ‚úÖ")


# -------------------------------
# HELPER FUNCTIONS
# -------------------------------
def to_unix(date_str):
    """Convert 'YYYY-MM-DD' to UNIX timestamp."""
    dt = datetime.strptime(date_str, "%Y-%m-%d")
    return int(dt.replace(tzinfo=timezone.utc).timestamp())


API key loaded ‚úÖ


# Open-Meteo API (weather data)

In [None]:
def fetch_weather_data(start_date, end_date):
    """
    Fetch hourly historical weather data (temperature, humidity, wind speed)
    from Open-Meteo API.
    """
    url = (
        f"https://archive-api.open-meteo.com/v1/archive?"
        f"latitude={LAT}&longitude={LON}"
        f"&start_date={start_date}&end_date={end_date}"
        f"&hourly=temperature_2m,relative_humidity_2m,wind_speed_10m"
    )

    response = requests.get(url, timeout=15)
    data = response.json()

    if "hourly" not in data or not data["hourly"]["time"]:
        raise ValueError("‚ö†Ô∏è No weather data returned. Check date range.")

    weather_df = pd.DataFrame({
        "datetime": data["hourly"]["time"],
        "temp": data["hourly"]["temperature_2m"],
        "humidity": data["hourly"]["relative_humidity_2m"],
        "wind_speed": data["hourly"]["wind_speed_10m"],
    })

    weather_df["datetime"] = pd.to_datetime(weather_df["datetime"])
    return weather_df


# OpenWeatherMap API (aqi + pollutants)

In [11]:
def fetch_aqi_data(start_date, end_date):
    """
    Fetch historical AQI + pollutant data from OpenWeatherMap API.
    """
    start_ts = to_unix(start_date)
    end_ts = to_unix(end_date)

    url = (
        f"http://api.openweathermap.org/data/2.5/air_pollution/history"
        f"?lat={LAT}&lon={LON}&start={start_ts}&end={end_ts}&appid={API_KEY}"
    )

    response = requests.get(url, timeout=15)
    data = response.json()

    if "list" not in data or not data["list"]:
        raise ValueError("‚ö†Ô∏è No air pollution data returned. Check date range or API key.")

    records = []
    for entry in data["list"]:
        ts = entry["dt"]
        air = entry["main"]
        comp = entry["components"]
        date_time = datetime.fromtimestamp(ts, tz=timezone.utc)


        records.append({
            "datetime": date_time,
            "aqi": air["aqi"],
            "pm2_5": comp.get("pm2_5"),
            "pm10": comp.get("pm10"),
            "co": comp.get("co"),
            "no2": comp.get("no2"),
            "so2": comp.get("so2"),
            "o3": comp.get("o3"),
            # "nh3": comp.get("nh3")
        })

    aqi_df = pd.DataFrame(records)
    return aqi_df

# Combined API's

In [12]:
def combine_weather_aqi(start_date, end_date, filename="historical_data.csv"):
    """
    Combine weather and AQI data by datetime (nearest hour) and save to CSV.
    """
    print("üå§Ô∏è Fetching weather data...")
    weather_df = fetch_weather_data(start_date, end_date)

    print("üí® Fetching air quality data...")
    aqi_df = fetch_aqi_data(start_date, end_date)

      # Ensure both datetime columns are timezone-aware (UTC)
    weather_df["datetime"] = pd.to_datetime(weather_df["datetime"], utc=True)
    aqi_df["datetime"] = pd.to_datetime(aqi_df["datetime"], utc=True)

    # Merge based on nearest timestamp (within 1 hour)
    combined_df = pd.merge_asof(
        aqi_df.sort_values("datetime"),
        weather_df.sort_values("datetime"),
        on="datetime",
        direction="nearest",
        tolerance=pd.Timedelta("1h")
    )

    combined_df.insert(1, "city", "Karachi")
    combined_df.to_csv(filename, index=False)

    print(f"‚úÖ Combined data saved to '{filename}' ({len(combined_df)} records)")
    return combined_df


In [13]:
# -------------------------------
# RUN THE SCRIPT
# -------------------------------
if __name__ == "__main__":
    # You can set any custom dates here (must be in 'YYYY-MM-DD' format)
    start_date = "2025-01-01"
    end_date = "2025-10-12"


    df = combine_weather_aqi(start_date, end_date)
    print(df.head())


üå§Ô∏è Fetching weather data...
üí® Fetching air quality data...
‚úÖ Combined data saved to 'historical_data.csv' (6577 records)
                   datetime     city  aqi   pm2_5    pm10       co    no2  \
0 2025-01-01 00:00:00+00:00  Karachi  5.0  121.64  170.55  1321.79  24.33   
1 2025-01-01 01:00:00+00:00  Karachi  5.0  120.37  166.26  1295.09  25.02   
2 2025-01-01 02:00:00+00:00  Karachi  5.0  128.43  174.78  1482.01  33.59   
3 2025-01-01 03:00:00+00:00  Karachi  5.0  158.69  214.13  2189.64  61.69   
4 2025-01-01 04:00:00+00:00  Karachi  5.0  178.27  247.68  2777.10  82.25   

     so2     o3  temp  humidity  wind_speed  
0  19.31  73.67  16.0        47        10.1  
1  19.79  75.82  15.5        48        10.5  
2  22.65  72.96  15.3        49         9.7  
3  29.09  54.36  15.3        49         9.5  
4  34.33  43.27  16.8        45         8.4  
