In [3]:
# ---------------------------------------
# ‚úÖ Historical Data Fetch Script (6 months, daily averages)
# ‚úÖ Notebook-ready: works inside /notebooks
# ---------------------------------------

import os
import requests
import pandas as pd
from datetime import datetime, timedelta, timezone
import time
from dotenv import load_dotenv

# ---------------------------------------
# 1Ô∏è‚É£ Load .env (notebook-friendly)
# ---------------------------------------
env_path = os.path.abspath("../.env")  # go up from /notebooks to project root
load_dotenv(env_path)

OPENWEATHER_API_KEY = os.getenv("OPENWEATHER_API_KEY")
if not OPENWEATHER_API_KEY:
    raise ValueError("‚ùå OpenWeather API key not found in .env")

print("‚úÖ OpenWeather API key loaded.")

# ---------------------------------------
# 2Ô∏è‚É£ Define South Asian capitals
# ---------------------------------------
cities = {
    "Kabul": {"lat": 34.5553, "lon": 69.2075},
    "Dhaka": {"lat": 23.8103, "lon": 90.4125},
    "Thimphu": {"lat": 27.4712, "lon": 89.6339},
    "New Delhi": {"lat": 28.6139, "lon": 77.2090},
    "Mal√©": {"lat": 4.1755, "lon": 73.5093},
    "Kathmandu": {"lat": 27.7172, "lon": 85.3240},
    "Islamabad": {"lat": 33.6844, "lon": 73.0479},
    "Colombo": {"lat": 6.9271, "lon": 79.8612},
}

# ---------------------------------------
# 3Ô∏è‚É£ Define time range (6 months)
# ---------------------------------------
end_date = datetime.now(timezone.utc).date()
start_date = end_date - timedelta(days=180)

print(f"Fetching data from {start_date} to {end_date}")

# ---------------------------------------
# 4Ô∏è‚É£ Helper function: Fetch AQ in 5-day chunks + aggregate daily
# ---------------------------------------
def fetch_openweather_aq(lat, lon, start_dt, end_dt):
    aq_records = []
    current_start = start_dt
    while current_start < end_dt:
        current_end = min(current_start + timedelta(days=5), end_dt)
        start_unix = int(datetime.combine(current_start, datetime.min.time(), tzinfo=timezone.utc).timestamp())
        end_unix = int(datetime.combine(current_end, datetime.min.time(), tzinfo=timezone.utc).timestamp())
        
        url = (
            f"http://api.openweathermap.org/data/2.5/air_pollution/history?"
            f"lat={lat}&lon={lon}&start={start_unix}&end={end_unix}&appid={OPENWEATHER_API_KEY}"
        )
        r = requests.get(url)
        if r.status_code == 200:
            data = r.json()
            for item in data.get("list", []):
                dt = datetime.utcfromtimestamp(item["dt"]).date()
                record = {"date": dt}
                # Map components to OpenAQ-style names
                mapping = {
                    "pm2_5": "pm25",
                    "pm10": "pm10",
                    "no2": "no2",
                    "so2": "so2",
                    "co": "co",
                    "o3": "o3",
                    "nh3": "nh3"
                }
                for k, v in item["components"].items():
                    if k in mapping:
                        record[mapping[k]] = v
                aq_records.append(record)
        else:
            print(f"‚ö†Ô∏è AQ API failed for {lat},{lon} chunk {current_start} ‚Üí {current_end}")
        current_start = current_end
        time.sleep(1)  # Respect API limits

    # Convert to DataFrame
    aq_df = pd.DataFrame(aq_records)
    if not aq_df.empty:
        # Group by date to get daily averages
        aq_daily = aq_df.groupby('date')[["pm25","pm10","no2","so2","co","o3","nh3"]].mean().reset_index()
        return aq_daily
    else:
        return pd.DataFrame()  # empty

# ---------------------------------------
# 5Ô∏è‚É£ Loop through cities and fetch data
# ---------------------------------------
all_data = []

for city, coords in cities.items():
    print(f"\nüìç Fetching data for {city} ...")
    lat, lon = coords["lat"], coords["lon"]

    # --- Weather Data (Open-Meteo) ---
    weather_url = (
        f"https://archive-api.open-meteo.com/v1/archive?"
        f"latitude={lat}&longitude={lon}"
        f"&start_date={start_date}&end_date={end_date}"
        f"&daily=temperature_2m_max,temperature_2m_min,precipitation_sum,"
        f"windspeed_10m_max,relative_humidity_2m_max,relative_humidity_2m_min"
        f"&timezone=UTC"
    )
    w = requests.get(weather_url)
    if w.status_code != 200:
        print(f"‚ùå Weather data failed for {city}")
        continue
    w_json = w.json()
    weather_df = pd.DataFrame({
        "date": w_json["daily"]["time"],
        "temp_max": w_json["daily"]["temperature_2m_max"],
        "temp_min": w_json["daily"]["temperature_2m_min"],
        "humidity_max": w_json["daily"]["relative_humidity_2m_max"],
        "humidity_min": w_json["daily"]["relative_humidity_2m_min"],
        "precipitation": w_json["daily"]["precipitation_sum"],
        "wind_speed": w_json["daily"]["windspeed_10m_max"]
    })
    weather_df["date"] = pd.to_datetime(weather_df["date"]).dt.date

    # --- Air Quality (OpenWeather) + daily average ---
    aq_df = fetch_openweather_aq(lat, lon, start_date, end_date)
    if aq_df.empty:
        print(f"‚ö†Ô∏è No AQ data for {city}, only weather will be saved.")
        merged = weather_df.copy()
    else:
        merged = pd.merge(weather_df, aq_df, on="date", how="left")

    merged["city"] = city
    all_data.append(merged)
    print(f"‚úÖ Done for {city} ({len(merged)} records)")

# ---------------------------------------
# 6Ô∏è‚É£ Combine all cities and save CSV
# ---------------------------------------
if all_data:
    final_df = pd.concat(all_data, ignore_index=True)
    save_path = "../data/south_asia_6months_data.csv"
    final_df.to_csv(save_path, index=False)
    print("\n‚úÖ All data saved to:", save_path)
else:
    print("‚ùå No data collected.")


‚úÖ OpenWeather API key loaded.
Fetching data from 2025-05-11 to 2025-11-07

üìç Fetching data for Kabul ...


  dt = datetime.utcfromtimestamp(item["dt"]).date()


‚úÖ Done for Kabul (181 records)

üìç Fetching data for Dhaka ...


  dt = datetime.utcfromtimestamp(item["dt"]).date()


‚úÖ Done for Dhaka (181 records)

üìç Fetching data for Thimphu ...


  dt = datetime.utcfromtimestamp(item["dt"]).date()


‚úÖ Done for Thimphu (181 records)

üìç Fetching data for New Delhi ...


  dt = datetime.utcfromtimestamp(item["dt"]).date()


‚úÖ Done for New Delhi (181 records)

üìç Fetching data for Mal√© ...


  dt = datetime.utcfromtimestamp(item["dt"]).date()


‚úÖ Done for Mal√© (181 records)

üìç Fetching data for Kathmandu ...


  dt = datetime.utcfromtimestamp(item["dt"]).date()


‚úÖ Done for Kathmandu (181 records)

üìç Fetching data for Islamabad ...


  dt = datetime.utcfromtimestamp(item["dt"]).date()


‚úÖ Done for Islamabad (181 records)

üìç Fetching data for Colombo ...


  dt = datetime.utcfromtimestamp(item["dt"]).date()


‚úÖ Done for Colombo (181 records)

‚úÖ All data saved to: ../data/south_asia_6months_data.csv
