In [1]:
# Target location for this notebook. The city label is stored as proper
# Unicode ("ä"), not the mis-decoded "Ã¤" byte sequence.
CITY = "Hagsätra, Stockholm"
# Coordinates of the location in decimal degrees (latitude, longitude).
LAT, LON = 59.2371, 17.9819
# Identifier of the AQICN monitoring station — presumably the sensor whose
# historical export is loaded later; confirm against the data source.
AQICN_STATION_ID = "A59356"

In [4]:
import os
import datetime as dt
from typing import Dict, Any

import pandas as pd
import requests
import hopsworks

In [7]:
# Path of the air-quality CSV; the AQ_CSV_PATH environment variable, when set,
# takes precedence over the bundled default location.
AQ_CSV_PATH = os.environ.get("AQ_CSV_PATH", "data/aqicn_historical.csv")

# Backfill window size — presumably counted in days; confirm where it is used.
BACKFILL_DAYS = 400

In [9]:
# Open a Hopsworks session and keep a handle to the logged-in project.
# NOTE(review): credentials are not passed explicitly here — presumably they
# come from the environment or an interactive prompt; confirm before automating.
project = hopsworks.login() 
# Feature-store handle obtained from the project.
fs = project.get_feature_store()

2025-11-01 18:44:09,641 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-11-01 18:44:09,646 INFO: Initializing external client
2025-11-01 18:44:09,647 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-11-01 18:44:10,932 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1278100


In [10]:
# Sanity check: show which project the session is attached to.
print(f"Connected to project: {project.name}")

Connected to project: airqualityapp


## HELPERS

In [13]:
def _get_open_meteo_archive(lat: float, lon: float, start: str, end: str) -> Dict[str, Any]:
    # Fetch daily weather  for a date range (inclusive).
    url = (
        "https://archive-api.open-meteo.com/v1/era5"
        f"?latitude={lat}&longitude={lon}"
        "&daily=wind_speed_10m_max,wind_direction_10m_dominant,wind_gusts_10m_max,"
        "temperature_2m_max,temperature_2m_min,precipitation_sum"
        f"&start_date={start}&end_date={end}&timezone=UTC"
    )
    r = requests.get(url, timeout=60)
    r.raise_for_status()
    return r.json()

In [14]:
def _to_utc_midnight(series: pd.Series) -> pd.Series:

    dt_index = pd.to_datetime(series, utc=True, errors="coerce")
    # Normalize to 00:00:00 UTC for daily event_time consistency
    return dt_index.normalize()


def _weather_payload_to_df(payload: Dict[str, Any], city: str) -> pd.DataFrame:
    daily = payload["daily"]
    df = pd.DataFrame(
        {
            "date": _to_utc_midnight(daily["time"]),
            "city": city,
            "temp_max": daily["temperature_2m_max"],
            "temp_min": daily["temperature_2m_min"],
            "precip_sum": daily["precipitation_sum"],
            "wind_speed_max": daily["wind_speed_10m_max"],
            "wind_gusts_max": daily["wind_gusts_10m_max"],
            "wind_direction_dominant": daily["wind_direction_10m_dominant"],
        }
    )
    # Enforce schema/order
    cols = [
        "city",
        "date",
        "temp_max",
        "temp_min",
        "precip_sum",
        "wind_speed_max",
        "wind_gusts_max",
        "wind_direction_dominant",
    ]
    return df[cols].sort_values("date").dropna(subset=["date"])

In [16]:
def _read_aq_csv(path: str, city: str) -> pd.DataFrame:
    # Read AQICN exported CSV -> DataFrame with (city, date, pm2_5).
    if not os.path.exists(path):
        raise FileNotFoundError(
            f"Air-quality CSV not found at '{path}'. "
            "Place a file with columns: date, pm2_5 (YYYY-MM-DD)."
        )
    df = pd.read_csv(path)
    # Flexible column casing
    col_map = {c.lower(): c for c in df.columns}
    date_col = col_map.get("date")
    pm_col = col_map.get("pm2_5") or col_map.get("pm2.5") or col_map.get("pm25")
    if not date_col or not pm_col:
        raise ValueError(
            f"CSV must contain 'date' and 'pm2_5' columns. Found: {list(df.columns)}"
        )

    out = pd.DataFrame(
        {
            "city": city,
            "date": _to_utc_midnight(df[date_col]),
            "pm2_5": pd.to_numeric(df[pm_col], errors="coerce"),
        }
    )
    out = out.dropna(subset=["date"]).sort_values("date")
    # Remove obvious duplicates
    out = out.drop_duplicates(subset=["city", "date"])
    return out

## Building Weather BACKFILL