In [None]:
import pandas as pd
import joblib
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import numpy as np

## Settings

In [None]:
# Run configuration: everything a reader might want to tune lives in this cell.
TARGET = "NO2"
OSM_ID = 8269826
MAP_HEX_SIZE = 7 # 7 or 9
SELECTED_YEARS = (2023, 2024)
years_str = "_".join(str(year) for year in SELECTED_YEARS)
COMMENT = "artificial_S5P_scaled_wind_shift" # String or None
TARGET_COLOR = "blue"

# Plot colour of the model-inference curves, one per supported hex size.
# Fail here, with a clear message, instead of with a NameError in a plotting
# cell much further down when an unsupported size is configured.
_INFERENCE_COLORS = {7: "forestgreen", 9: "firebrick"}
if MAP_HEX_SIZE not in _INFERENCE_COLORS:
    raise ValueError(
        f"Unsupported MAP_HEX_SIZE={MAP_HEX_SIZE!r}; expected one of {sorted(_INFERENCE_COLORS)}"
    )
INFERENCE_COLOR = _INFERENCE_COLORS[MAP_HEX_SIZE]

# Input files: the test dataset and the pickled model (optionally tagged with COMMENT).
ML_TEST_DATA_FILE = f"../data/{TARGET}_test_dataset_osm_{OSM_ID}_hex_{MAP_HEX_SIZE}_year_{years_str}.csv"
if COMMENT:
    ML_MODEL = f"../data/random_forest_{TARGET}_gios_{COMMENT}.pkl"
else:
    ML_MODEL = f"../data/random_forest_{TARGET}_gios.pkl"

# Columns of the test dataset that are passed to the model as features.
SELECTED_PARAMETERS = [
    "tree_cover",
    "grassland",
    "population_density",
    "low_vegetation",
    "medium_vegetation",
    "high_vegetation",
    "road",
    "residential_1",
    "residential_2",
    "residential_3",
    "residential_4",
    "non-residential_1",
    "non-residential_2",
    "non-residential_3",
    "non-residential_4",
    "temperature",
    "temperature_trend_3h",
    "temperature_trend_6h",
    "temperature_anomaly",
    "relative_humidity",
    "relative_humidity_trend_3h",
    "relative_humidity_trend_6h",
    "pressure",
    "pressure_trend_3h",
    "pressure_trend_6h",
    "precipitation",
    "wind_speed",
    "traffic_mean_count",
    "day_of_year_sin",
    "day_of_year_cos",
    "working_day",
]

## Set default font for graphs

In [None]:
# Font family applied to every figure created after this cell.
DEFAULT_FONT_FAMILY = "Palatino Linotype"
mpl.rcParams["font.family"] = DEFAULT_FONT_FAMILY

## Helper functions

In [None]:
from typing import Tuple

def rounded_range(data: pd.Series, resolution: int = 10) -> Tuple[int, int]:
    """Return the smallest ``resolution``-aligned interval that contains ``data``.

    Args:
        data: Values whose minimum and maximum define the interval.
        resolution: Step the limits are aligned to.

    Returns:
        ``(bottom, top)`` where ``bottom`` is the minimum rounded down and
        ``top`` is the maximum rounded up to a multiple of ``resolution``.
        A limit that already is an exact multiple is returned unchanged.

    Raises:
        ValueError: If the minimum or maximum is NaN (e.g. empty ``data``).
    """
    lowest, highest = data.min(), data.max()

    # Floor division gives a true floor; the previous ``round(x - 0.5)`` used
    # round-half-to-even and treated exact multiples inconsistently.
    bottom = int(lowest // resolution) * resolution
    # Ceiling expressed as the negated floor of the negated value.
    top = -int(-highest // resolution) * resolution

    return (bottom, top)

## Test model inference

In [None]:
# Read the test dataset and keep only rows without any missing value.
df_test = pd.read_csv(ML_TEST_DATA_FILE).dropna()
df_test.head()

In [None]:
# Independent copy of the join keys plus the "no2_gios" column that the
# predictions are compared against later on.
target_columns = ["h3_index", "timestamp_utc", "no2_gios"]
df_target = df_test[target_columns].copy()
df_target.head()

In [None]:
# Keep the join keys aside; the predictions are attached to this frame later.
key_columns = ["timestamp_utc", "h3_index"]
df_output = df_test[key_columns].copy()

# NOTE: from here on df_test holds only the feature columns, so the cells
# above that read "h3_index" / "timestamp_utc" from it cannot be re-run
# without reloading the CSV first.
df_test = df_test[SELECTED_PARAMETERS]

In [None]:
# The two EDM transformer files are named after the model file, with the
# "random_forest" part of the path swapped for a longer prefix.
# NOTE(review): str.replace substitutes every occurrence, so a COMMENT that
# itself contains "random_forest" would alter these paths — confirm naming.
qt_model_path = ML_MODEL.replace("random_forest", "random_forest_EDM_qt_model")
qt_obs_path = ML_MODEL.replace("random_forest", "random_forest_EDM_qt_obs")

# joblib.load unpickles its input: only load files from a trusted source.
rf_model = joblib.load(ML_MODEL)
qt_model = joblib.load(qt_model_path)
qt_obs = joblib.load(qt_obs_path)

In [None]:
# Raw model prediction for every row of the feature table.
rf_pred = rf_model.predict(df_test)

# The transformers are fed a single-column 2-D array.
y_pred_val = np.asarray(rf_pred).reshape(-1, 1)

# EDM correction: forward transform with qt_model, inverse transform with
# qt_obs, then flatten back to 1-D.
# NOTE(review): presumably both are quantile transformers (model vs.
# observed distribution) — confirm against the training notebook.
pred_transformed = qt_model.transform(y_pred_val)
rf_pred_edm = qt_obs.inverse_transform(pred_transformed).ravel()

In [None]:
# Attach both prediction variants to the key columns; values are assigned
# positionally, in the row order of the feature table.
inference_col = f"{TARGET}_inference"
df_output[inference_col] = rf_pred
df_output[f"{inference_col}_edm"] = rf_pred_edm

In [None]:
# Parse the timestamp column of both frames so the join key has the same dtype.
for frame in (df_output, df_target):
    frame["timestamp_utc"] = pd.to_datetime(frame["timestamp_utc"])

In [None]:
# Pair each measured value with its predictions via the (cell, time) key.
# NOTE(review): if a key occurs more than once, the inner join multiplies
# those rows — presumably keys are unique; verify on the dataset.
merge_keys = ["h3_index", "timestamp_utc"]
df_corr = pd.merge(left=df_target, right=df_output, how="inner", on=merge_keys)
df_corr.head()

In [None]:
# Pearson correlation matrix of measured, raw-predicted and EDM-corrected values.
compared_columns = ["no2_gios", f"{TARGET}_inference", f"{TARGET}_inference_edm"]
df_corr[compared_columns].corr(method="pearson")

In [None]:
# Add the hour of day of each record as its own column.
timestamps = pd.to_datetime(df_corr["timestamp_utc"])
df_corr["timestamp_utc"] = timestamps
df_corr["hour"] = timestamps.dt.hour

In [None]:
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from scipy.stats import wasserstein_distance

In [None]:
from sklearn.metrics import r2_score

# Error metrics of the raw model prediction against the measured values.
y_true = df_corr["no2_gios"]
y_pred = df_corr[f"{TARGET}_inference"]

rmse = root_mean_squared_error(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)
wd = wasserstein_distance(y_true, y_pred)
r_squared = r2_score(y_true, y_pred)

print("Model performance for validation data:")
print(f"Root mean Squared Error: {rmse:.2f}")
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Wasserstein Distance: {wd:.2f}")
print(f"R-squared value: {r_squared:.2f}")

In [None]:
from sklearn.metrics import r2_score

# Same error metrics, computed for the EDM-corrected prediction.
y_true = df_corr["no2_gios"]
y_pred_edm = df_corr[f"{TARGET}_inference_edm"]

rmse_edm = root_mean_squared_error(y_true, y_pred_edm)
mae_edm = mean_absolute_error(y_true, y_pred_edm)
wd_edm = wasserstein_distance(y_true, y_pred_edm)
r_squared_edm = r2_score(y_true, y_pred_edm)

print("Model performance for validation data after EDM correction:")
print(f"Root mean Squared Error: {rmse_edm:.2f}")
print(f"Mean Absolute Error: {mae_edm:.2f}")
print(f"Wasserstein Distance: {wd_edm:.2f}")
print(f"R-squared value: {r_squared_edm:.2f}")

In [None]:
# Scatter plot of measured vs. predicted values (raw and EDM-corrected) with
# the regression line of the EDM-corrected prediction and the 1:1 line.
measured = df_corr["no2_gios"]
raw_prediction = df_corr[f"{TARGET}_inference"]
edm_prediction = df_corr[f"{TARGET}_inference_edm"]

# Shared upper limit of both axes. Every plotted series is taken into account
# (including the raw prediction) so that no point falls outside the square
# plotting area.
top_y = rounded_range(measured, resolution=10)[1]
top_rf_pred = rounded_range(raw_prediction, resolution=10)[1]
top_rf_pred_edm = rounded_range(edm_prediction, resolution=10)[1]

axis_min = 0
axis_max = max(top_y, top_rf_pred, top_rf_pred_edm)

plt.figure(figsize=(10, 8), dpi=300)

sns.scatterplot(x=measured, y=raw_prediction, color="blue", label="Predictions on validation dataset")
sns.scatterplot(x=measured, y=edm_prediction, color="green", label="EDM-corrected prediction")
sns.regplot(x=measured, y=edm_prediction, scatter=False, color="green", label="Regression line (validation)")

# Fit the regression on the data itself rather than on the points of the drawn
# line: slope and intercept are the same least-squares estimates, while r, p
# and sterr now describe the data instead of a perfect straight line.
slope, intercept, r, p, sterr = scipy.stats.linregress(x=measured, y=edm_prediction)

plt.plot([axis_min, axis_max], [axis_min, axis_max], "r--", label="Line 1:1")

plt.xlabel("Measured NO₂ concentration (μg/m³)", fontsize=16)
plt.ylabel("Predicted NO₂ concentration (μg/m³)", fontsize=16)
plt.title(f"EDM corrected random forest regression prediction of NO₂ concentration\nRMSE: {rmse_edm:.2f}, MAE: {mae_edm:.2f}, R²: {r_squared_edm:.2f}, y = {slope:.3f} x + {intercept:.3f}", fontsize=16, fontweight="bold")
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.legend(fontsize=16)
plt.grid(True)
plt.xlim(axis_min, axis_max)
plt.ylim(axis_min, axis_max)
plt.gca().set_aspect("equal", adjustable="box")
plt.tight_layout()

# Output file name; the COMMENT variant previously contained the typo "tets".
if COMMENT:
    prediction_test_file_name = f"ML_rf_test_data_EDM_{TARGET}_osm_{OSM_ID}_hex_{MAP_HEX_SIZE}_{COMMENT}.png"
else:
    prediction_test_file_name = f"ML_rf_test_data_EDM_{TARGET}_osm_{OSM_ID}_hex_{MAP_HEX_SIZE}.png"

plt.savefig(prediction_test_file_name)
plt.show()

In [None]:
# Residual = measured value minus prediction, for the raw and the
# EDM-corrected model output. The prediction columns are addressed through
# TARGET, like everywhere else in the notebook, instead of a hard-coded "NO2".
df_corr["residuals"] = df_corr["no2_gios"] - df_corr[f"{TARGET}_inference"]
df_corr["residuals_edm"] = df_corr["no2_gios"] - df_corr[f"{TARGET}_inference_edm"]

In [None]:
# Summary statistics of the raw-prediction residuals.
# The series is bound to a name first: reusing double quotes inside a
# double-quoted f-string is a syntax error on Python versions before 3.12.
residuals = df_corr["residuals"]

print("Residuals statistics:")
print(f"Mean: {float(np.mean(residuals)):.2f}")
print(f"MAE: {float(np.mean(np.abs(residuals))):.2f}")
print(f"RMSE: {float(np.sqrt(np.mean(residuals**2))):.2f}")
print(f"Skew: {float(scipy.stats.skew(residuals, bias=False)):.2f}")

In [None]:
# Summary statistics of the EDM-corrected residuals.
# The series is bound to a name first: reusing double quotes inside a
# double-quoted f-string is a syntax error on Python versions before 3.12.
residuals_edm = df_corr["residuals_edm"]

print("EDM corrected residuals statistics:")
print(f"Mean: {float(np.mean(residuals_edm)):.2f}")
print(f"MAE: {float(np.mean(np.abs(residuals_edm))):.2f}")
print(f"RMSE: {float(np.sqrt(np.mean(residuals_edm**2))):.2f}")
print(f"Skew: {float(scipy.stats.skew(residuals_edm, bias=False)):.2f}")

In [None]:
# Density histogram of the raw-prediction residuals.
residual_values = df_corr["residuals"]
lowest_residual = residual_values.min()

# Bins are 2 units wide; the first edge is the smallest residual rounded down
# to a multiple of 2 and the last edge lies above the largest residual.
bin_edges = np.arange(lowest_residual - (lowest_residual % 2), residual_values.max() + 2, 2)

plt.figure(figsize=(10, 8), dpi=300)
plt.hist(residual_values, bins=bin_edges, density=True, color="blue", rwidth=0.99)

plt.title("Random forest - histogram of residuals", fontsize=16, fontweight="bold")
plt.xlabel("Residual", fontsize=16)
plt.ylabel("Density", fontsize=16)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# Fixed axis limits; anything outside them is not shown.
plt.xlim(-50, 50)
plt.ylim(0, 0.08)
plt.grid(visible=True, axis="y")
plt.tight_layout()

# The file name carries COMMENT only when one is configured.
residuals_file_name = (
    f"rf_residuals_hist_{COMMENT}_hex_{MAP_HEX_SIZE}.png"
    if COMMENT
    else f"rf_residuals_hist_hex_{MAP_HEX_SIZE}.png"
)

plt.savefig(residuals_file_name)
plt.show()

In [None]:
# Density histogram of the EDM-corrected residuals.
residual_values_edm = df_corr["residuals_edm"]
lowest_residual_edm = residual_values_edm.min()

# Bins are 2 units wide; the first edge is the smallest residual rounded down
# to a multiple of 2 and the last edge lies above the largest residual.
bin_edges = np.arange(lowest_residual_edm - (lowest_residual_edm % 2), residual_values_edm.max() + 2, 2)

plt.figure(figsize=(10, 8), dpi=300)
plt.hist(residual_values_edm, bins=bin_edges, density=True, color="blue", rwidth=0.99)

plt.title("Random forest - histogram of EDM corrected residuals", fontsize=16, fontweight="bold")
plt.xlabel("Residual", fontsize=16)
plt.ylabel("Density", fontsize=16)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# Fixed axis limits; anything outside them is not shown.
plt.xlim(-50, 50)
plt.ylim(0, 0.08)
plt.grid(visible=True, axis="y")
plt.tight_layout()

# The file name carries COMMENT only when one is configured.
residuals_file_name = (
    f"rf_residuals_hist_EDM_{COMMENT}_hex_{MAP_HEX_SIZE}.png"
    if COMMENT
    else f"rf_residuals_hist_EDM_hex_{MAP_HEX_SIZE}.png"
)

plt.savefig(residuals_file_name)
plt.show()

In [None]:
# Per-calendar-month mean, median and standard deviation of the measured and
# the raw-predicted values (all years pooled into the same 12 months).
monthly_stats = ["mean", "median", "std"]

df_corr_monthly = df_corr.copy()
df_corr_monthly["month"] = pd.to_datetime(df_corr_monthly["timestamp_utc"]).dt.month
df_corr_monthly = (
    df_corr_monthly
    .drop(columns=["h3_index", "timestamp_utc"])
    .groupby(by="month")
    .agg({
        "no2_gios": monthly_stats,
        f"{TARGET}_inference": monthly_stats,
    })
)
df_corr_monthly

In [None]:
# Two-panel figure: monthly mean ± std of the measured values (left) and of
# the raw model inference (right).
x = np.array(df_corr_monthly.index.tolist(), dtype=int)
y1 = df_corr_monthly[("no2_gios", "mean")].values.astype(float)
y1_std = df_corr_monthly[("no2_gios", "std")].values.astype(float)
y2 = df_corr_monthly[(f"{TARGET}_inference", "mean")].values.astype(float)
y2_std = df_corr_monthly[(f"{TARGET}_inference", "std")].values.astype(float)

fig, axes = plt.subplots(1, 2, figsize=(18, 8), sharey=False, dpi=300)

major_ticks = list(range(1, 13))
major_ticks_labels = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# Every month is a major tick, so this list is empty.
minor_ticks = [month for month in range(1, 13) if month not in major_ticks]

# One entry per panel: axes, central curve, spread, legend label, colour.
panels = (
    (axes[0], y1, y1_std, "Ground stations", TARGET_COLOR),
    (axes[1], y2, y2_std, "ML model inference", INFERENCE_COLOR),
)
for ax, center, spread, series_label, series_color in panels:
    ax.plot(x, center, label=series_label, color=series_color)
    ax.fill_between(x, center - spread, center + spread, alpha=0.2, label="± std", color=series_color)
    ax.set_xlabel("Month", fontsize=16)
    ax.set_ylabel("Mean NO₂ concentration (μg/m³)", fontsize=16)
    ax.set_xlim(1, 12)
    ax.set_xticks(major_ticks)
    ax.set_xticklabels(major_ticks_labels, fontsize=16, rotation=45)
    ax.set_xticks(minor_ticks, minor=True)
    ax.set_ylim(0, 50)
    ax.tick_params(axis="both", which="major", labelsize=16, length=7)
    ax.tick_params(axis="both", which="minor", labelsize=0, length=4)
    ax.legend(fontsize=16)
    ax.grid(True, which="major", linestyle="--", alpha=0.8)
    ax.grid(True, which="minor", linestyle=":", alpha=0.4)

plt.tight_layout(rect=[0, 0, 1, 0.95])

# The file name carries COMMENT only when one is configured.
plt.savefig(
    f"inference_test_month_mean_{TARGET}_osm_{OSM_ID}_hex_{MAP_HEX_SIZE}_{COMMENT}.png"
    if COMMENT
    else f"inference_test_month_mean_{TARGET}_osm_{OSM_ID}_hex_{MAP_HEX_SIZE}.png"
)

plt.show()

In [None]:
# Two-panel figure: monthly median of the measured values (left) and of the
# raw model inference (right); the shaded band is median ± std.
x = np.array(df_corr_monthly.index.tolist(), dtype=int)
y1 = df_corr_monthly[("no2_gios", "median")].values.astype(float)
y1_std = df_corr_monthly[("no2_gios", "std")].values.astype(float)
y2 = df_corr_monthly[(f"{TARGET}_inference", "median")].values.astype(float)
y2_std = df_corr_monthly[(f"{TARGET}_inference", "std")].values.astype(float)

fig, axes = plt.subplots(1, 2, figsize=(18, 8), sharey=False, dpi=300)

major_ticks = list(range(1, 13))
major_ticks_labels = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# Every month is a major tick, so this list is empty.
minor_ticks = [month for month in range(1, 13) if month not in major_ticks]

# One entry per panel: axes, central curve, spread, legend label, colour.
panels = (
    (axes[0], y1, y1_std, "Ground stations", TARGET_COLOR),
    (axes[1], y2, y2_std, "ML model inference", INFERENCE_COLOR),
)
for ax, center, spread, series_label, series_color in panels:
    ax.plot(x, center, label=series_label, color=series_color)
    ax.fill_between(x, center - spread, center + spread, alpha=0.2, label="± std", color=series_color)
    ax.set_xlabel("Month", fontsize=16)
    ax.set_ylabel("Median NO₂ concentration (μg/m³)", fontsize=16)
    ax.set_xlim(1, 12)
    ax.set_xticks(major_ticks)
    ax.set_xticklabels(major_ticks_labels, fontsize=16, rotation=45)
    ax.set_xticks(minor_ticks, minor=True)
    ax.set_ylim(0, 50)
    ax.tick_params(axis="both", which="major", labelsize=16, length=7)
    ax.tick_params(axis="both", which="minor", labelsize=0, length=4)
    ax.legend(fontsize=16)
    ax.grid(True, which="major", linestyle="--", alpha=0.8)
    ax.grid(True, which="minor", linestyle=":", alpha=0.4)

plt.tight_layout(rect=[0, 0, 1, 0.95])

# The file name carries COMMENT only when one is configured.
plt.savefig(
    f"inference_test_month_median_{TARGET}_osm_{OSM_ID}_hex_{MAP_HEX_SIZE}_{COMMENT}.png"
    if COMMENT
    else f"inference_test_month_median_{TARGET}_osm_{OSM_ID}_hex_{MAP_HEX_SIZE}.png"
)

plt.show()

In [None]:
# Per-hour-of-day median and standard deviation of the measured and the
# raw-predicted values.
hourly_stats = ["median", "std"]

df_corr_hourly = df_corr.copy()
df_corr_hourly["hour"] = pd.to_datetime(df_corr_hourly["timestamp_utc"]).dt.hour
df_corr_hourly = (
    df_corr_hourly
    .drop(columns=["h3_index", "timestamp_utc"])
    .groupby(by="hour")
    .agg({
        "no2_gios": hourly_stats,
        f"{TARGET}_inference": hourly_stats,
    })
)
df_corr_hourly

In [None]:
# Two-panel figure: hourly median of the measured values (panel a) and of the
# raw model inference (panel b); the shaded band is median ± std.
x = np.array(df_corr_hourly.index.tolist(), dtype=int)
y1 = df_corr_hourly["no2_gios"]["median"].values.astype(float)
y1_std = df_corr_hourly["no2_gios"]["std"].values.astype(float)
y2 = df_corr_hourly[f"{TARGET}_inference"]["median"].values.astype(float)
y2_std = df_corr_hourly[f"{TARGET}_inference"]["std"].values.astype(float)

fig, axes = plt.subplots(1, 2, figsize=(18, 8), sharey=False, dpi=300)

# Labelled tick every 4 hours, unlabelled minor tick on every other hour.
major_ticks = list(range(0, 24, 4))
minor_ticks = [hour for hour in range(0, 24) if hour not in major_ticks]

# Both panels share the same layout, so they are drawn by one loop.
# One entry per panel: axes, panel tag, central curve, spread, legend label, colour.
panels = (
    (axes[0], "a)", y1, y1_std, "Ground stations", TARGET_COLOR),
    (axes[1], "b)", y2, y2_std, "ML model inference", INFERENCE_COLOR),
)
for ax, panel_tag, center, spread, series_label, series_color in panels:
    ax.plot(x, center, label=series_label, color=series_color)
    ax.fill_between(x, center - spread, center + spread, alpha=0.2, label="± std", color=series_color)
    ax.set_title(panel_tag, loc="left", fontsize=16, fontweight="bold")
    ax.set_xlabel("Hour", fontsize=16)
    ax.set_ylabel("Median NO₂ concentration (μg/m³)", fontsize=16)
    ax.set_xlim(0, 23)
    ax.set_xticks(major_ticks)
    ax.set_xticks(minor_ticks, minor=True)
    ax.set_ylim(0, 50)
    ax.tick_params(axis="both", which="major", labelsize=16, length=7)
    ax.tick_params(axis="both", which="minor", labelsize=0, length=4)
    ax.legend(fontsize=16)
    ax.grid(True, which="major", linestyle="--", alpha=0.8)
    ax.grid(True, which="minor", linestyle=":", alpha=0.4)

plt.tight_layout(rect=[0, 0, 1, 0.95])

# Both file-name variants now share the prefix "inference_test_hour_"; the
# variant without COMMENT previously had a doubled underscore after "hour".
if COMMENT:
    plt.savefig(f"inference_test_hour_{TARGET}_osm_{OSM_ID}_hex_{MAP_HEX_SIZE}_{COMMENT}.png")
else:
    plt.savefig(f"inference_test_hour_{TARGET}_osm_{OSM_ID}_hex_{MAP_HEX_SIZE}.png")

plt.show()