In [None]:
import pandas as pd
import joblib
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np

## Settings

In [None]:
# --- Run configuration -------------------------------------------------------
# TARGET is interpolated into the input file names and into the name of the
# prediction column ("<TARGET>_inference").
TARGET = "NO2"
# OSM_ID and MAP_HEX_SIZE are only used to build the dataset file names here.
OSM_ID = 8269826
MAP_HEX_SIZE = 7
COMMENT = None  # String or None
# Line/fill colours for the ground-station and the model-inference series.
TARGET_COLOR = "blue"
INFERENCE_COLOR = "firebrick"

# Optional "_<COMMENT>" suffix shared by every input file; empty when COMMENT
# is None or "". Building it once keeps the three paths below in sync instead
# of maintaining two hand-written copies of each.
FILE_SUFFIX = f"_{COMMENT}" if COMMENT else ""
DATASET_TAG = f"osm_{OSM_ID}_hex_{MAP_HEX_SIZE}{FILE_SUFFIX}"

ML_TEST_DATA_FILE = f"../data/{TARGET}_test_dataset_{DATASET_TAG}.csv"
ML_MODEL = f"../data/random_forest_{TARGET}_gios{FILE_SUFFIX}.pkl"
TEST_TARGET_DATA = f"../data/{TARGET}_excluded_dataset_{DATASET_TAG}.csv"

# Feature columns passed to the model, in this exact order.
SELECTED_PARAMETERS = [
    "tree_cover",
    "grassland",
    "population_density",
    "low_vegetation",
    "medium_vegetation",
    "high_vegetation",
    "road",
    "residential_1",
    "residential_2",
    "residential_3",
    "residential_4",
    "non-residential_1",
    "non-residential_2",
    "non-residential_3",
    "non-residential_4",
    "temperature",
    "temperature_trend_3h",
    "temperature_trend_6h",
    "temperature_anomaly",
    "relative_humidity",
    "relative_humidity_trend_3h",
    "relative_humidity_trend_6h",
    "pressure",
    "pressure_trend_3h",
    "pressure_trend_6h",
    "precipitation",
    "precipitation_trend_3h",
    "precipitation_trend_6h",
    "wind_u",
    "wind_v",
    "traffic_daily_fraction",
    "traffic_yearly_mean_fraction",
    "no2_anomaly",
]
# NOTE(review): SELECTED_DATES is not referenced in the cells visible here;
# presumably it is consumed further down the notebook — confirm.
SELECTED_DATES = [
    "2023-02-28",
    "2023-03-22",
    "2023-08-22",
    "2023-10-18",
]

## Set default font for graphs

In [None]:
# Use "Palatino Linotype" for all text in the figures produced by this notebook.
# NOTE(review): the font has to be installed on the machine running the
# notebook; otherwise matplotlib presumably falls back to its default font and
# emits a "font family not found" warning — confirm on the target environment.
mpl.rcParams["font.family"] = "Palatino Linotype"

## Test model inference

In [None]:
# Read the test dataset, keep only the two identifier columns plus the model
# features (in SELECTED_PARAMETERS order) and discard rows with any missing value.
inference_columns = ["timestamp_utc", "h3_index", *SELECTED_PARAMETERS]
df_inference = (
    pd.read_csv(ML_TEST_DATA_FILE)
    .loc[:, inference_columns]
    .dropna()
)
df_inference.head()

In [None]:
# Separate the row identifiers (kept for the output table) from the feature
# matrix handed to the model.
# NOTE: df_inference is rebound without the identifier columns, so re-running
# this cell without re-running the loading cell first raises a KeyError.
id_columns = ["timestamp_utc", "h3_index"]
df_output = df_inference.loc[:, id_columns].copy()
df_inference = df_inference.drop(columns=id_columns).copy()

In [None]:
# Load the serialized model from ML_MODEL; it is used for prediction below.
# SECURITY: joblib files are pickle-based, so loading one can execute arbitrary
# code — only load model files that come from a trusted source.
rf_model = joblib.load(ML_MODEL)

In [None]:
# Run the model on the feature frame and store its predictions next to the
# timestamp_utc / h3_index identifiers.
predictions = rf_model.predict(df_inference)
df_output[f"{TARGET}_inference"] = predictions
df_output.head()

In [None]:
# Read the excluded-target dataset that the predictions are compared against.
df_test = pd.read_csv(TEST_TARGET_DATA)
# Preview only the first rows, consistent with the other cells of this
# notebook, instead of rendering the whole frame.
df_test.head()

In [None]:
# Pair every reference row with the prediction made for the same timestamp and
# H3 cell; rows present on only one side are dropped (inner join).
merge_keys = ["timestamp_utc", "h3_index"]
df_corr = df_test.merge(df_output, how="inner", on=merge_keys)
df_corr.head()

In [None]:
# Pearson correlation matrix between the ground-station values ("no2_gios")
# and the model predictions.
compared_columns = ["no2_gios", f"{TARGET}_inference"]
df_corr.loc[:, compared_columns].corr(method="pearson")

In [None]:
from sklearn.metrics import root_mean_squared_error

In [None]:
# Root-mean-squared error of the predictions against the ground-station values.
observed_values = df_corr["no2_gios"]
predicted_values = df_corr[f"{TARGET}_inference"]
root_mean_squared_error(observed_values, predicted_values)

In [None]:
# Per-calendar-month mean and standard deviation of the ground-station values
# and of the model predictions. The result is indexed by month number and has
# two-level columns: (series, "mean" | "std").
monthly_stats = ["mean", "std"]
df_corr_monthly = (
    df_corr
    .assign(month=lambda frame: pd.to_datetime(frame["timestamp_utc"]).dt.month)
    .groupby(by="month")
    .agg({
        "no2_gios": monthly_stats,
        f"{TARGET}_inference": monthly_stats,
    })
)
df_corr_monthly

In [None]:
# Monthly mean ± standard deviation of NO2: ground stations in panel a),
# model inference in panel b). Both panels share the same styling, so they are
# drawn by one loop instead of two copy-pasted blocks.
x = df_corr_monthly.index.to_numpy(dtype=int)

# One entry per panel: (panel title, aggregated column, legend label, colour).
panels = [
    ("a)", "no2_gios", "Ground stations", TARGET_COLOR),
    ("b)", f"{TARGET}_inference", "ML model inference", INFERENCE_COLOR),
]

# Every month is a major tick, so there are no minor ticks to configure
# (the previous minor-tick list was always empty).
major_ticks = list(range(1, 13))
major_ticks_labels = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]

fig, axes = plt.subplots(1, 2, figsize=(18, 8), sharey=False, dpi=300)

for ax, (panel_title, column, series_label, color) in zip(axes, panels):
    mean = df_corr_monthly[column]["mean"].to_numpy(dtype=float)
    std = df_corr_monthly[column]["std"].to_numpy(dtype=float)

    ax.plot(x, mean, label=series_label, color=color)
    ax.fill_between(x, mean - std, mean + std, alpha=0.2, label="± std", color=color)
    ax.set_title(panel_title, loc="left", fontsize=16, fontweight="bold")
    ax.set_xlabel("Month", fontsize=16)
    ax.set_ylabel("Mean NO₂ concentration (μg/m³)", fontsize=16)
    ax.set_xlim(1, 12)
    ax.set_xticks(major_ticks)
    ax.set_xticklabels(major_ticks_labels, fontsize=16, rotation=45)
    # Fixed y-range so both panels can be compared by eye.
    ax.set_ylim(0, 45)
    ax.tick_params(axis="both", which="major", labelsize=16, length=7)
    ax.legend(fontsize=16)
    ax.grid(True, which="major", linestyle="--", alpha=0.8)

plt.tight_layout(rect=[0, 0, 1, 0.95])

if COMMENT:
    figure_path = f"inference_test_month_{TARGET}_osm_{OSM_ID}_hex_{MAP_HEX_SIZE}_{COMMENT}.png"
else:
    # NOTE(review): this name has a double underscore ("month__") unlike the
    # COMMENT variant above — looks like a typo. Kept unchanged so anything
    # that already references the existing file name keeps working.
    figure_path = f"inference_test_month__{TARGET}_osm_{OSM_ID}_hex_{MAP_HEX_SIZE}.png"
fig.savefig(figure_path)

plt.show()

In [None]:
# Per-hour-of-day mean and standard deviation of the ground-station values and
# of the model predictions. The result is indexed by hour and has two-level
# columns: (series, "mean" | "std").
hourly_stats = ["mean", "std"]
df_corr_hourly = (
    df_corr
    .assign(hour=lambda frame: pd.to_datetime(frame["timestamp_utc"]).dt.hour)
    .groupby(by="hour")
    .agg({
        "no2_gios": hourly_stats,
        f"{TARGET}_inference": hourly_stats,
    })
)
df_corr_hourly

In [None]:
# Hour-of-day mean ± standard deviation of NO2: ground stations in panel a),
# model inference in panel b). Both panels share the same styling, so they are
# drawn by one loop instead of two copy-pasted blocks.
x = df_corr_hourly.index.to_numpy(dtype=int)

# One entry per panel: (panel title, aggregated column, legend label, colour).
panels = [
    ("a)", "no2_gios", "Ground stations", TARGET_COLOR),
    ("b)", f"{TARGET}_inference", "ML model inference", INFERENCE_COLOR),
]

# Labelled tick every 4 hours, unlabelled minor tick on every other hour.
major_ticks = list(range(0, 24, 4))
minor_ticks = [hour for hour in range(24) if hour not in major_ticks]

fig, axes = plt.subplots(1, 2, figsize=(18, 8), sharey=False, dpi=300)

for ax, (panel_title, column, series_label, color) in zip(axes, panels):
    mean = df_corr_hourly[column]["mean"].to_numpy(dtype=float)
    std = df_corr_hourly[column]["std"].to_numpy(dtype=float)

    ax.plot(x, mean, label=series_label, color=color)
    ax.fill_between(x, mean - std, mean + std, alpha=0.2, label="± std", color=color)
    ax.set_title(panel_title, loc="left", fontsize=16, fontweight="bold")
    ax.set_xlabel("Hour", fontsize=16)
    ax.set_ylabel("Mean NO₂ concentration (μg/m³)", fontsize=16)
    ax.set_xlim(0, 23)
    ax.set_xticks(major_ticks)
    ax.set_xticks(minor_ticks, minor=True)
    # Fixed y-range so both panels can be compared by eye.
    ax.set_ylim(0, 50)
    ax.tick_params(axis="both", which="major", labelsize=16, length=7)
    ax.tick_params(axis="both", which="minor", labelsize=0, length=4)
    ax.legend(fontsize=16)
    ax.grid(True, which="major", linestyle="--", alpha=0.8)
    ax.grid(True, which="minor", linestyle=":", alpha=0.4)

plt.tight_layout(rect=[0, 0, 1, 0.95])

if COMMENT:
    figure_path = f"inference_test_hour_{TARGET}_osm_{OSM_ID}_hex_{MAP_HEX_SIZE}_{COMMENT}.png"
else:
    # NOTE(review): this name has a double underscore ("hour__") unlike the
    # COMMENT variant above — looks like a typo. Kept unchanged so anything
    # that already references the existing file name keeps working.
    figure_path = f"inference_test_hour__{TARGET}_osm_{OSM_ID}_hex_{MAP_HEX_SIZE}.png"
fig.savefig(figure_path)

plt.show()