In [1]:
import sys
from pathlib import Path
import warnings

# Silence warnings whose originating module name matches "IPython"; warnings from
# other modules are still shown at this point.
warnings.filterwarnings("ignore", module="IPython")


def is_google_colab() -> bool:
    """Report whether the notebook is running inside Google Colab.

    The check looks for the substring ``google.colab`` in the string form of
    the active IPython shell object returned by ``get_ipython()``.

    Returns:
        True when the shell's string representation mentions ``google.colab``,
        False otherwise.
    """
    shell_description = str(get_ipython())
    return "google.colab" in shell_description


def clone_repository() -> None:
    """Clone the mlfs-book repository and make it the working directory.

    The body consists of IPython magics, so this function only works when the
    cell is executed by an IPython kernel. It is called from the Google Colab
    branch of this cell.
    """
    # `!` hands the command to the system shell.
    # NOTE(review): re-running this after a successful clone will attempt the clone again
    # from inside the already-cloned directory — confirm whether that matters for Colab use.
    !git clone https://github.com/featurestorebook/mlfs-book.git
    # `%cd` changes the kernel's working directory, so the relative paths used afterwards
    # (pyproject.toml, Path().absolute()) refer to the cloned repository.
    %cd mlfs-book


def install_dependencies() -> None:
    """Install `uv` and then use it to install the project's dependencies.

    Both steps are IPython `!` shell escapes. `pyproject.toml` is given as a
    relative path, so it is resolved against the current working directory.
    """
    # Install (or upgrade) the `uv` tool itself with pip.
    !pip install --upgrade uv
    # Install the requirements declared in pyproject.toml, including all optional extras,
    # into the system Python environment (--system) rather than a virtual environment.
    !uv pip install --all-extras --system --requirement pyproject.toml


# Work out the project root directory, depending on where the notebook is running.
if is_google_colab():
    # On Colab the project is fetched first; clone_repository() also changes the working
    # directory into the clone, so the current directory is the project root afterwards.
    clone_repository()
    install_dependencies()
    root_dir = str(Path().absolute())
    print("Google Colab environment")
else:
    root_path = Path().absolute()
    # Strip subdirectories from PYTHONPATH if the notebook was started in one of them.
    # Walk upwards for as long as the last path component is a known project subdirectory,
    # so every nesting is handled (notebooks/airquality, src/airquality, src, ...), not
    # only the orderings that a fixed sequence of one-shot checks happens to cover.
    while root_path.name in ("src", "airquality", "notebooks"):
        root_path = root_path.parent
    root_dir = str(root_path)
    print("Local environment")

print(f"Root dir: {root_dir}")

# Add the root directory to the `PYTHONPATH`
if root_dir not in sys.path:
    sys.path.append(root_dir)
    print(f"Added the following directory to the PYTHONPATH: {root_dir}")

# Set the environment variables from the file <root_dir>/.env
# This import has to come after the sys.path update above, otherwise the `src`
# package may not be importable when the notebook is started from a subdirectory.
from src import config

settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")

Local environment
Root dir: /Users/filipsjostrand/School/id2223/pm25-forecast-openmeteo-aqicn
Added the following directory to the PYTHONPATH: /Users/filipsjostrand/School/id2223/pm25-forecast-openmeteo-aqicn
HopsworksSettings initialized!


In [2]:
import datetime
import time
import requests
import pandas as pd
import hopsworks
from src.airquality import util
from src import config
import json
import os
import warnings
import pandas as pd


# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/hopsworks — confirm
# that a blanket filter is intended rather than a targeted one.
warnings.filterwarnings("ignore")

# Setup

In [3]:
# Log in to Hopsworks and obtain handles to the feature store and the secrets API.
project = hopsworks.login(engine="python")
fs = project.get_feature_store()
secrets = hopsworks.get_secrets_api()

# The AQICN API key is kept as a Hopsworks secret rather than in the notebook.
AQICN_API_KEY = secrets.get_secret("AQICN_API_KEY").value

# Retrieve feature groups
air_quality_fg = fs.get_feature_group(name="air_quality_all", version=1)
weather_fg = fs.get_feature_group(name="weather_all", version=1)

# Sensor locations are stored as a JSON document in a secret; decode it into Python objects.
locations_str = secrets.get_secret("SENSOR_LOCATIONS_JSON").value
locations = json.loads(locations_str)

# Date passed to the air quality lookup for every sensor in this run.
today = datetime.date.today()

2025-11-11 16:51:19,888 INFO: Initializing external client
2025-11-11 16:51:19,889 INFO: Base URL: https://c.app.hopsworks.ai:443






2025-11-11 16:51:21,480 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1279172


# Methods

In [4]:
def get_daily_weather_forecast(city, latitude, longitude):
    """Reduce the hourly weather forecast for a location to one row per day.

    The hourly forecast comes from ``util.get_hourly_weather_forecast``. Only
    rows whose time of day lies between 11:59 and 12:01 are kept, their
    ``date`` values are truncated to the calendar day (stored as datetimes),
    and a ``city`` column is set to the given city.

    Args:
        city: City name; forwarded to the forecast helper and written to the
            ``city`` column.
        latitude: Forwarded to the forecast helper.
        longitude: Forwarded to the forecast helper.

    Returns:
        DataFrame with the selected rows, ``date`` as a regular column.
    """
    hourly_forecast = util.get_hourly_weather_forecast(city, latitude, longitude)

    # between_time works on the index, so index by timestamp, filter, then restore the column.
    noon_rows = (
        hourly_forecast.set_index("date")
        .between_time("11:59", "12:01")
        .reset_index()
    )

    # Drop the time-of-day component, then convert back to a datetime column.
    calendar_days = pd.to_datetime(noon_rows["date"]).dt.date
    noon_rows["date"] = pd.to_datetime(calendar_days)
    noon_rows["city"] = city
    return noon_rows


def fetch_data_for_location(location):
    """Fetch today's air quality and the daily weather forecast for one location.

    Args:
        location: Mapping with the keys ``country``, ``city``, ``street``,
            ``aqicn_url``, ``latitude`` and ``longitude``.

    Returns:
        A 2-tuple: the result of ``util.get_pm25`` for the notebook-level
        ``today`` and ``AQICN_API_KEY``, followed by the result of
        ``get_daily_weather_forecast``.

    Raises:
        KeyError: If any of the required keys is missing from ``location``.
    """
    # Read every required field up front so a missing key fails before any request is made.
    country, city, street, aqicn_url, latitude, longitude = (
        location[field]
        for field in ("country", "city", "street", "aqicn_url", "latitude", "longitude")
    )

    return (
        util.get_pm25(aqicn_url, country, city, street, today, AQICN_API_KEY),
        get_daily_weather_forecast(city, latitude, longitude),
    )

# Script

In [5]:
# Collect one air quality frame and one weather forecast frame per sensor.
aqs, weathers = [], []
for sensor, location in locations.items():
    aq_today_df, weather_daily_forecast_df = fetch_data_for_location(location)

    # Air quality FG shape: tag the reading with the sensor's identity and location,
    # and make sure `date` is a datetime column.
    aq_today_df = aq_today_df.assign(
        sensor_id=str(sensor),
        street=location["street"],
        city=location["city"],
        country=location["country"],
        feed_url=location["aqicn_url"],
        date=lambda frame: pd.to_datetime(frame["date"]),
    )

    # Weather FG shape: tag the forecast with the sensor and its coordinates,
    # and make sure `date` is a datetime column.
    weather_daily_forecast_df = weather_daily_forecast_df.assign(
        sensor_id=str(sensor),
        city=location["city"],
        latitude=location["latitude"],
        longitude=location["longitude"],
        date=lambda frame: pd.to_datetime(frame["date"]),
    )

    # Append only after both frames were built successfully.
    aqs.append(aq_today_df)
    weathers.append(weather_daily_forecast_df)

Coordinates 57.75°N 12.0°E
Elevation 20.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
Coordinates 57.75°N 12.0°E
Elevation 12.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
Coordinates 57.75°N 12.0°E
Elevation 76.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
Coordinates 57.75°N 12.0°E
Elevation 51.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
Coordinates 57.75°N 12.0°E
Elevation 21.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
Coordinates 57.5°N 12.0°E
Elevation 55.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
Coordinates 57.75°N 11.75°E
Elevation 7.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
Coordinates 57.75°N 12.0°E
Elevation 24.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
Coordinates 57.75°N 12.0°E
Elevation 14.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
Coordinates 57.75°N 12.0°E
Elevation 72.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
C

In [None]:
# Stack the per-sensor frames, force `pm25` to float64 (values that cannot be parsed
# become NaN via errors="coerce"), ensure `date` is a datetime column, and select the
# columns in the order handed to the feature group insert.
aq_df = pd.concat(aqs).assign(
    pm25=lambda frame: pd.to_numeric(frame["pm25"], errors="coerce").astype("float64"),
    date=lambda frame: pd.to_datetime(frame["date"]),
)[["date", "sensor_id", "pm25", "street", "city", "country", "feed_url"]]
air_quality_fg.insert(aq_df)

2025-11-11 16:51:48,861 INFO: 	1 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279172/fs/1265787/fg/1668688


Uploading Dataframe: 100.00% |██████████| Rows 15/15 | Elapsed Time: 00:00 | Remaining Time: 00:00


(Job('air_quality_all_1_offline_fg_materialization', 'SPARK'),
 {
   "success": true,
   "results": [
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_column_min_to_be_between",
         "kwargs": {
           "column": "pm25",
           "min_value": -0.1,
           "max_value": 500.0,
           "strict_min": true
         },
         "meta": {
           "expectationId": 739396
         }
       },
       "result": {
         "observed_value": 3.0,
         "element_count": 15,
         "missing_count": null,
         "missing_percent": null
       },
       "meta": {
         "ingestionResult": "INGESTED",
         "validationTime": "2025-11-11T03:51:48.000861Z"
       },
       "exception_info": {
         "raised_exception": false,
         "exception_message": null,
         "exception_traceback": null
       }
     }
   ],
   "evaluation_parameters": {},
   "statistics": {
     "evaluated_expectations": 1,
     "successful_expe

In [7]:
# Stack the per-sensor forecasts, ensure `date` is a datetime column, and insert
# the combined frame into the weather feature group.
weather_df = pd.concat(weathers).assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)
weather_fg.insert(weather_df)

2025-11-11 16:51:56,713 INFO: 	2 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279172/fs/1265787/fg/1668689


Uploading Dataframe: 100.00% |██████████| Rows 105/105 | Elapsed Time: 00:00 | Remaining Time: 00:00


(Job('weather_all_1_offline_fg_materialization', 'SPARK'),
 {
   "success": true,
   "results": [
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_column_min_to_be_between",
         "kwargs": {
           "column": "precipitation_sum",
           "min_value": -0.1,
           "max_value": 1000.0,
           "strict_min": true
         },
         "meta": {
           "expectationId": 739398
         }
       },
       "result": {
         "observed_value": 0.0,
         "element_count": 105,
         "missing_count": null,
         "missing_percent": null
       },
       "meta": {
         "ingestionResult": "INGESTED",
         "validationTime": "2025-11-11T03:51:56.000713Z"
       },
       "exception_info": {
         "raised_exception": false,
         "exception_message": null,
         "exception_traceback": null
       }
     },
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_c