In [None]:
from pathlib import Path
from urllib.parse import quote

import h3
import pandas as pd
from sqlalchemy import create_engine, text

import db_login

## Settings

In [None]:
# OSM_ID is matched against the osm_id column of the h3_maps.hex_<size> tables;
# MAP_HEX_SIZE is both the H3 resolution and the suffix of those table names.
OSM_ID = 8269826
MAP_HEX_SIZE = 7

# Hexes whose target measurements are kept out of the train dataset.
# List of H3 hex IDs or None
EXCLUDED_TARGET_HEXES = [
    h3.latlng_to_cell(lat, lng, MAP_HEX_SIZE)
    for lat, lng in (
        (50.2646, 18.975),
        (50.3291, 19.2312),
    )
]

# Suffix appended to the exported file names.
COMMENT = "artificial_min" # String or None

# List of dates for inference in the "YYYY-mm-DD" format or None
SELECTED_DATES = ["2023-02-27", "2023-03-23", "2023-08-22", "2023-10-19"]

# Hexes that receive synthetic target rows equal to the per-timestamp minimum
# of the measured NO2 values.
ARTIFICIAL_MIN_HEXES = [
    h3.latlng_to_cell(lat, lng, MAP_HEX_SIZE)
    for lat, lng in (
        (50.2444, 18.4376),
        (50.0394, 19.1668),
    )
]

## Database connection

In [None]:
# Build the SQLAlchemy URL from the credentials kept in the local db_login module.
# User and password are percent-encoded: reserved URL characters in them
# (e.g. "@", "/", ":") would otherwise be parsed as part of the URL structure.
# NOTE(review): assumes db_login holds the raw, not already URL-encoded, values.
conn_template = "mysql+pymysql://{user}:{password}@{host}:{port}/{database}".format
conn = conn_template(
    user=quote(str(db_login.user), safe=""),
    password=quote(str(db_login.passwd), safe=""),
    host=db_login.host,
    port=int(db_login.port),
    database=db_login.db,
)
engine = create_engine(conn)

## Get meteo data

In [None]:
# Meteorological variables; the table is read without any filter.
query_meteo = """
SELECT 
    timestamp_utc,
    temperature,
    relative_humidity,
    pressure,
    dew_point,
    precipitation,
    wind_u,
    wind_v
FROM meteo.ERA5
"""

df_meteo = pd.read_sql_query(sql=query_meteo, con=engine)
df_meteo.head()

## Get traffic data

In [None]:
query_traffic = """
SELECT timestamp_utc, daily_fraction, yearly_mean_fraction
FROM traffic.SCPR
"""

# Prefix the fraction columns with "traffic_" so their origin stays visible
# once they are merged with the other temporal features.
traffic_column_names = {
    "daily_fraction": "traffic_daily_fraction",
    "yearly_mean_fraction": "traffic_yearly_mean_fraction",
}
df_traffic = pd.read_sql_query(sql=query_traffic, con=engine).rename(columns=traffic_column_names)
df_traffic.head()

## Get tree cover density

In [None]:
# Tree cover density restricted to the hexes of the selected area.
# Both placeholders are filled from the `params` passed to read_sql_query.
# Note that %(hex_size)s completes a table name (h3_maps.hex_<size>), not a value.
# NOTE(review): this presumably relies on the driver rendering an int unquoted - confirm.
query_tree_cover = """
SELECT tcd.h3_index, tcd.tree_cover_density
FROM clms.tree_cover_density tcd
INNER JOIN (
    SELECT h3_index
    FROM h3_maps.hex_%(hex_size)s
    WHERE osm_id = %(osm_id)s
) AS hex_map
ON tcd.h3_index=hex_map.h3_index;
"""

In [None]:
df_tree_cover = pd.read_sql_query(
    sql=query_tree_cover,
    con=engine,
    params=dict(hex_size=MAP_HEX_SIZE, osm_id=OSM_ID),
)
df_tree_cover.head()

In [None]:
# Scale the density by 1/100 (presumably percent -> fraction; confirm against the
# source table) and keep only the scaled column.
df_tree_cover = df_tree_cover.assign(
    tree_cover=df_tree_cover["tree_cover_density"].div(100).round(3)
).drop(columns="tree_cover_density")
df_tree_cover.head()

## Get grassland

In [None]:
# Grassland / other values for the hexes of the selected area.
# %(hex_size)s completes the h3_maps table name and %(osm_id)s filters the area;
# both are filled from the `params` passed to read_sql_query.
query_grassland = """
SELECT grs.h3_index, grs.grassland, grs.other
FROM clms.grassland grs
INNER JOIN (
    SELECT h3_index
    FROM h3_maps.hex_%(hex_size)s
    WHERE osm_id = %(osm_id)s
) AS hex_map
ON grs.h3_index=hex_map.h3_index;
"""

In [None]:
df_grassland = pd.read_sql_query(
    sql=query_grassland,
    con=engine,
    params=dict(hex_size=MAP_HEX_SIZE, osm_id=OSM_ID),
)
df_grassland.head()

In [None]:
# Row total over every column except the hex id; used as the normalisation base.
cols_to_sum = df_grassland.columns.difference(["h3_index"])
row_total = df_grassland[cols_to_sum].sum(axis="columns")
df_grassland = df_grassland.assign(sum=row_total)
df_grassland.head()

In [None]:
# Turn each value column into its share of the row total (4 decimals), then
# drop the helper total and the residual "other" class.
cols_to_normalize = df_grassland.columns.difference(["h3_index", "sum"])
grassland_shares = df_grassland[cols_to_normalize].div(df_grassland["sum"], axis="index").round(4)
df_grassland[cols_to_normalize] = grassland_shares
df_grassland = df_grassland.drop(columns=["sum", "other"])
df_grassland.head()

## Get population

In [None]:
# Population per hex for the selected area.
# %(hex_size)s completes the h3_maps table name and %(osm_id)s filters the area;
# both are filled from the `params` passed to read_sql_query.
query_population = """
SELECT pop.h3_index, pop.population
FROM ghsl.population pop
INNER JOIN (
    SELECT h3_index
    FROM h3_maps.hex_%(hex_size)s
    WHERE osm_id = %(osm_id)s
) AS hex_map
ON pop.h3_index=hex_map.h3_index;
"""

In [None]:
df_population = pd.read_sql_query(
    sql=query_population,
    con=engine,
    params=dict(hex_size=MAP_HEX_SIZE, osm_id=OSM_ID),
)
df_population.head()

In [None]:
def calc_pop_density(population, h3_index):
    """Return the population per unit area of an H3 cell.

    Args:
        population: Population value assigned to the cell.
        h3_index: H3 cell id; passed unchanged to ``h3.cell_area``.

    Returns:
        ``population`` divided by the cell area. ``h3.cell_area`` is called with
        its default unit (km^2 in h3 v4 - confirm for the installed version).
    """
    return population / h3.cell_area(h3_index)

In [None]:
# Density is computed row by row because the cell area depends on each hex id.
population_density = df_population.apply(
    lambda record: calc_pop_density(record["population"], record["h3_index"]),
    axis="columns",
)
df_population["population_density"] = population_density.round(2)
df_population = df_population.drop(columns=["population"])
df_population.head()

## Get built-up characteristic

In [None]:
# Built-up characteristic classes per hex for the selected area.
# The hyphenated non-residential_* column names are quoted with backticks.
# %(hex_size)s completes the h3_maps table name and %(osm_id)s filters the area;
# both are filled from the `params` passed to read_sql_query.
query_builtup = """
SELECT
    bup.h3_index,
    bup.low_vegetation,
    bup.medium_vegetation,
    bup.high_vegetation,
    bup.water,
    bup.road,
    bup.residential_1,
    bup.residential_2,
    bup.residential_3,
    bup.residential_4,
    bup.residential_5,
    bup.`non-residential_1`,
    bup.`non-residential_2`,
    bup.`non-residential_3`,
    bup.`non-residential_4`,
    bup.`non-residential_5`,
    bup.other
FROM ghsl.builtup_c bup
INNER JOIN (
    SELECT h3_index
    FROM h3_maps.hex_%(hex_size)s
    WHERE osm_id = %(osm_id)s
) AS hex_map
ON bup.h3_index=hex_map.h3_index;
"""

In [None]:
# Give the generic "other" column a table-specific name right after loading.
df_builtup = pd.read_sql_query(
    sql=query_builtup,
    con=engine,
    params=dict(hex_size=MAP_HEX_SIZE, osm_id=OSM_ID),
).rename(columns={"other": "other_builtup"})
df_builtup.head()

In [None]:
# Row total over every class column; used as the normalisation base.
cols_to_sum = df_builtup.columns.difference(["h3_index"])
row_total = df_builtup[cols_to_sum].sum(axis="columns")
df_builtup = df_builtup.assign(sum=row_total)
df_builtup.head()

In [None]:
# Turn each class column into its share of the row total (4 decimals), then
# drop the helper total and the residual "other_builtup" class.
cols_to_normalize = df_builtup.columns.difference(["h3_index", "sum"])
builtup_shares = df_builtup[cols_to_normalize].div(df_builtup["sum"], axis="index").round(4)
df_builtup[cols_to_normalize] = builtup_shares
df_builtup = df_builtup.drop(columns=["sum", "other_builtup"])
df_builtup.head()

## Get heat island data

In [None]:
# Heat island value per hex; temperature_mean is exposed as temperature_anomaly.
# %(hex_size)s completes the h3_maps table name and %(osm_id)s filters the area;
# both are filled from the `params` passed to read_sql_query.
query_heat = """
SELECT hi.h3_index, hi.temperature_mean AS temperature_anomaly
FROM meteo.heat_islands hi
INNER JOIN (
    SELECT h3_index
    FROM h3_maps.hex_%(hex_size)s
    WHERE osm_id = %(osm_id)s
) AS hex_map
ON hi.h3_index=hex_map.h3_index;
"""

In [None]:
df_heat = pd.read_sql_query(
    sql=query_heat,
    con=engine,
    params=dict(hex_size=MAP_HEX_SIZE, osm_id=OSM_ID),
)
df_heat.head()

## Get NO2 concentrations from Sentinel-5P

In [None]:
# Sentinel-5P NO2 values (hex, timestamp, value) for the hexes of the selected area.
# %(hex_size)s completes the h3_maps table name and %(osm_id)s filters the area;
# both are filled from the `params` passed to read_sql_query.
query_no2s5p = """
SELECT s5p.h3_index, s5p.timestamp_utc, s5p.value
FROM air_quality.NO2_S5P s5p
INNER JOIN (
    SELECT h3_index
    FROM h3_maps.hex_%(hex_size)s
    WHERE osm_id = %(osm_id)s
) AS hex_map
ON s5p.h3_index=hex_map.h3_index;
"""

In [None]:
df_no2s5p = pd.read_sql_query(
    sql=query_no2s5p,
    con=engine,
    params=dict(hex_size=MAP_HEX_SIZE, osm_id=OSM_ID),
).rename(columns={"value": "no2_s5p"})
# Parse the timestamps and move them forward by 12 hours.
df_no2s5p["timestamp_utc"] = pd.to_datetime(df_no2s5p["timestamp_utc"]) + pd.Timedelta(hours=12)
df_no2s5p.head()

In [None]:
# Anomaly = value of a hex minus the mean over all rows sharing its timestamp.
mean_per_timestamp = df_no2s5p.groupby("timestamp_utc")["no2_s5p"].transform("mean")
df_no2s5p["no2_anomaly"] = df_no2s5p["no2_s5p"].sub(mean_per_timestamp)
df_no2s5p.head()

In [None]:
# Only the anomaly is kept; it is averaged per hex in the next step.
df_no2s5p = df_no2s5p.drop(columns=["timestamp_utc", "no2_s5p"])

In [None]:
# Collapse to one row per hex: the mean of the remaining columns.
df_no2s5p = df_no2s5p.groupby("h3_index", as_index=False).mean()

## Get NO2 concentrations from GIOŚ

In [None]:
# NO2 measurements of GIOŚ stations lying in the hexes of the selected area.
# Stations are mapped to hexes through the hex_<size> column of gios_metadata, so
# %(hex_size)s completes both a table name and a column name here.
# The placeholders are filled from the `params` passed to read_sql_query.
query_no2gios = """
SELECT stations.h3_index, gios.timestamp_utc, gios.value
FROM air_quality.NO2_GIOS gios
INNER JOIN (
	SELECT giosmeta.station_id, hex_map.h3_index
	FROM air_quality.gios_metadata giosmeta
	INNER JOIN (
	    SELECT h3_index
	    FROM h3_maps.hex_%(hex_size)s
	    WHERE osm_id = %(osm_id)s
	) AS hex_map
	ON giosmeta.hex_%(hex_size)s=hex_map.h3_index
) AS stations
ON gios.station_id=stations.station_id;
"""

In [None]:
df_no2gios = pd.read_sql_query(
    sql=query_no2gios,
    con=engine,
    params=dict(hex_size=MAP_HEX_SIZE, osm_id=OSM_ID),
).rename(columns={"value": "no2_gios"})
df_no2gios.head()

In [None]:
df_no2gios["timestamp_utc"] = pd.to_datetime(df_no2gios["timestamp_utc"])
# Fill missing measurements with the mean of all values recorded on the same
# calendar day. The helper "date" column exists only inside this expression.
df_no2gios["no2_gios"] = (
    df_no2gios.assign(date=df_no2gios["timestamp_utc"].dt.date)
    .groupby("date")["no2_gios"]
    .transform(lambda day_values: day_values.fillna(day_values.mean()))
)
df_no2gios.head()

In [None]:
# Remove rows that still contain a missing value after the daily-mean fill.
df_no2gios = df_no2gios.dropna()

In [None]:
# Measurements of the hexes held out from training.
# EXCLUDED_TARGET_HEXES may be None (see Settings); isin(None) raises a TypeError,
# so None is treated as an empty list and selects no rows.
df_no2gios_excl = df_no2gios[df_no2gios["h3_index"].isin(EXCLUDED_TARGET_HEXES or [])]

In [None]:
# Keep only the measurements of hexes that are not held out from training.
# EXCLUDED_TARGET_HEXES may be None (see Settings); isin(None) raises a TypeError,
# so None is treated as an empty list and nothing is removed.
df_no2gios = df_no2gios[~df_no2gios["h3_index"].isin(EXCLUDED_TARGET_HEXES or [])]

In [None]:
# Truthiness test instead of len(): an empty list and None both skip the step
# (len(None) would raise a TypeError).
if ARTIFICIAL_MIN_HEXES:
    # Lowest measured value per timestamp among the rows left in df_no2gios.
    min_values = df_no2gios.groupby("timestamp_utc")["no2_gios"].min().reset_index()

    # One copy of the per-timestamp minima for every artificial hex, in the
    # column order of df_no2gios.
    artificial_data = pd.concat(
        [min_values.assign(h3_index=hex_id) for hex_id in ARTIFICIAL_MIN_HEXES],
        ignore_index=True,
    )[["h3_index", "timestamp_utc", "no2_gios"]]

    df_no2gios = pd.concat([df_no2gios, artificial_data], ignore_index=True)

## Merge data

### Spatial data

In [None]:
# Outer-join all per-hex feature tables on the hex id; a hex missing from a
# table gets 0 for that table's columns.
df_spatial = df_tree_cover
for spatial_part in (df_grassland, df_population, df_builtup, df_heat, df_no2s5p):
    df_spatial = df_spatial.merge(spatial_part, on="h3_index", how="outer")
df_spatial = df_spatial.fillna(0)
df_spatial.head()

In [None]:
spatial_csv_path = f"../data/spatial_dataset_osm_{OSM_ID}_hex_{MAP_HEX_SIZE}.csv"
df_spatial.to_csv(spatial_csv_path, index=False)

### Temporal data

In [None]:
# Outer-join meteo and traffic on the timestamp; values missing on either side become 0.
df_temporal = df_meteo.merge(df_traffic, on="timestamp_utc", how="outer").fillna(0)
df_temporal.head()

### Train dataset

Feature parameters for all dates and hexes with target data (without the ones excluded for testing).

In [None]:
# Attach the per-hex features to every target measurement; unmatched hexes get 0.
df_train_NO2 = df_no2gios.merge(df_spatial, on="h3_index", how="left").fillna(0)
df_train_NO2.head()

In [None]:
# Attach the per-timestamp features; timestamps without temporal data get 0.
df_train_NO2 = df_train_NO2.merge(df_temporal, on="timestamp_utc", how="left").fillna(0)
df_train_NO2.head()

In [None]:
# Trend features: change of a meteo variable relative to its value 3 h and 6 h earlier.
# The differences are taken on the time axis of df_temporal and then looked up by
# timestamp. A positional shift() on df_train_NO2 itself is not equivalent: its
# rows interleave several hexes and are not guaranteed to be ordered in time, so
# "3 rows back" is not "3 hours back".
trend_decimals = {"temperature": 1, "relative_humidity": 1, "pressure": 1, "precipitation": 3}
meteo_by_time = (
    df_temporal.drop_duplicates(subset="timestamp_utc")
    .set_index("timestamp_utc")
    .sort_index()
)
for variable, decimals in trend_decimals.items():
    for hours in (3, 6):
        # Value of the same variable `hours` earlier; NaN where that timestamp is absent.
        earlier = meteo_by_time[variable].reindex(meteo_by_time.index - pd.Timedelta(hours=hours))
        trend = (meteo_by_time[variable] - earlier.to_numpy()).round(decimals)
        df_train_NO2[f"{variable}_trend_{hours}h"] = df_train_NO2["timestamp_utc"].map(trend)
# Rows without an earlier reference value get a trend of 0.
df_train_NO2.fillna(0, inplace=True)
df_train_NO2.head()

### Test NO2 dataset

Feature parameters for all dates, but only for the hexes excluded from training (the test hexes).

In [None]:
# Spatial features of the hexes held out from training.
# EXCLUDED_TARGET_HEXES may be None (see Settings); isin(None) raises a TypeError,
# so None is treated as an empty list and selects no rows.
df_spatial_excl = df_spatial[df_spatial["h3_index"].isin(EXCLUDED_TARGET_HEXES or [])]
df_spatial_excl

In [None]:
# Every held-out hex paired with every timestamp.
df_test_NO2 = pd.merge(df_spatial_excl, df_temporal, how="cross")
df_test_NO2.head()

In [None]:
# Trend features: change of a meteo variable relative to its value 3 h and 6 h earlier.
# The differences are taken on the time axis of df_temporal and then looked up by
# timestamp. A positional shift() on the cross-joined frame is not equivalent: the
# first rows of each hex would be compared with the last rows of the previous hex.
trend_decimals = {"temperature": 1, "relative_humidity": 1, "pressure": 1, "precipitation": 3}
meteo_by_time = (
    df_temporal.drop_duplicates(subset="timestamp_utc")
    .set_index("timestamp_utc")
    .sort_index()
)
for variable, decimals in trend_decimals.items():
    for hours in (3, 6):
        # Value of the same variable `hours` earlier; NaN where that timestamp is absent.
        earlier = meteo_by_time[variable].reindex(meteo_by_time.index - pd.Timedelta(hours=hours))
        trend = (meteo_by_time[variable] - earlier.to_numpy()).round(decimals)
        df_test_NO2[f"{variable}_trend_{hours}h"] = df_test_NO2["timestamp_utc"].map(trend)
# Rows without an earlier reference value get a trend of 0.
df_test_NO2.fillna(0, inplace=True)
df_test_NO2.head()

### Inference dataset

Feature parameters for all hexes in the area but limited to selected dates.

In [None]:
# Restrict the temporal features to the dates chosen for inference.
# SELECTED_DATES may be None (see Settings); isin(None) raises a TypeError.
# NOTE(review): None is interpreted here as "no date filter" (all timestamps are
# kept) - confirm that this is the intended meaning.
if SELECTED_DATES is None:
    df_temporal_selected = df_temporal
else:
    df_temporal_selected = df_temporal[df_temporal["timestamp_utc"].dt.strftime("%Y-%m-%d").isin(SELECTED_DATES)]

In [None]:
# Every hex of the area paired with every selected timestamp.
df_inference = pd.merge(df_spatial, df_temporal_selected, how="cross")

In [None]:
# Trend features: change of a meteo variable relative to its value 3 h and 6 h earlier.
# The differences are taken on the full time axis of df_temporal and then looked up
# by timestamp. A positional shift() on df_inference is not equivalent: the frame
# holds only the selected dates, repeated for every hex, so the first hours of a
# date would be compared with another date or another hex.
trend_decimals = {"temperature": 1, "relative_humidity": 1, "pressure": 1, "precipitation": 3}
meteo_by_time = (
    df_temporal.drop_duplicates(subset="timestamp_utc")
    .set_index("timestamp_utc")
    .sort_index()
)
for variable, decimals in trend_decimals.items():
    for hours in (3, 6):
        # Value of the same variable `hours` earlier; NaN where that timestamp is absent.
        earlier = meteo_by_time[variable].reindex(meteo_by_time.index - pd.Timedelta(hours=hours))
        trend = (meteo_by_time[variable] - earlier.to_numpy()).round(decimals)
        df_inference[f"{variable}_trend_{hours}h"] = df_inference["timestamp_utc"].map(trend)
# Rows without an earlier reference value get a trend of 0.
df_inference.fillna(0, inplace=True)
df_inference.head()

## Check for missing data

In [None]:
# Rows of the train dataset that contain at least one missing value.
df_train_NO2.loc[df_train_NO2.isna().any(axis="columns")]

In [None]:
# Rows of the held-out measurements that contain at least one missing value.
df_no2gios_excl.loc[df_no2gios_excl.isna().any(axis="columns")]

In [None]:
# Rows of the test dataset that contain at least one missing value.
df_test_NO2.loc[df_test_NO2.isna().any(axis="columns")]

In [None]:
# Rows of the inference dataset that contain at least one missing value.
df_inference.loc[df_inference.isna().any(axis="columns")]

## Export data to files

In [None]:
# Common path ending shared by all exported datasets; a non-empty COMMENT is
# appended before the extension.
comment_suffix = f"_{COMMENT}" if COMMENT else ""
base_filename = f"../data/osm_{OSM_ID}_hex_{MAP_HEX_SIZE}{comment_suffix}.csv"

In [None]:
# base_filename includes the "../data/" directory, so the dataset prefix has to be
# put in front of the file name only; prefixing the whole string would point into
# a directory named "NO2_train_dataset_..".
train_csv_path = Path(base_filename).with_name(f"NO2_train_dataset_{Path(base_filename).name}")
df_train_NO2.to_csv(train_csv_path, index=False)

In [None]:
# base_filename includes the "../data/" directory, so the dataset prefix has to be
# put in front of the file name only; prefixing the whole string would point into
# a directory named "NO2_excluded_dataset_..".
excluded_csv_path = Path(base_filename).with_name(f"NO2_excluded_dataset_{Path(base_filename).name}")
df_no2gios_excl.to_csv(excluded_csv_path, index=False)

In [None]:
# base_filename includes the "../data/" directory, so the dataset prefix has to be
# put in front of the file name only; prefixing the whole string would point into
# a directory named "NO2_test_dataset_..".
test_csv_path = Path(base_filename).with_name(f"NO2_test_dataset_{Path(base_filename).name}")
df_test_NO2.to_csv(test_csv_path, index=False)

In [None]:
# base_filename includes the "../data/" directory, so the dataset prefix has to be
# put in front of the file name only; prefixing the whole string would point into
# a directory named "NO2_inference_dataset_..".
inference_csv_path = Path(base_filename).with_name(f"NO2_inference_dataset_{Path(base_filename).name}")
df_inference.to_csv(inference_csv_path, index=False)