In [20]:
import os
import sys
import json
import re
import numpy as np
import rasterio
import pandas as pd
import shapely
import geopandas as gpd

sys.path.append("../")
from config.config import BASE_PATH, PATH_TO_PATH_CONFIG_FILE
from src.utils import load_paths_from_yaml, replace_base_path
from src.inca_data_extraction import calculate_wind_speed
from src.fwi_system_calculator import calculate_ffmc

In [21]:
def add_static_feature_from_raster(events: gpd.GeoDataFrame,
                                   path_to_raster: str, feature_name: str) -> gpd.GeoDataFrame:
    """Adds a column to a GeoDataFrame holding the raster value found at each point geometry.

    The input frame is not modified; a copy with the additional column is returned.
    Only the first band of the raster is used. Raster cells equal to the raster's
    nodata value are replaced by NaN.

    Note: the point coordinates are passed to the raster as-is, i.e. the events
    are assumed to be in the same CRS as the raster (no reprojection is done here).

    Args:
        events (gpd.GeoDataFrame): GeoDataFrame containing the date and location (point
            geometries) of the fire and non-fire events
        path_to_raster (str): path to raster that contains certain feature values
            (e.g. farmyard density)
        feature_name (str): name of the new feature column in the GeoDataFrame

    Returns:
        gpd.GeoDataFrame: fire and non-fire events with new column for feature values
    """
    events_updated = events.copy()
    coords = list(zip(events.geometry.x, events.geometry.y))

    # Keep the raster open only for as long as it is actually read
    with rasterio.open(path_to_raster) as src:
        nodata_value = src.nodata
        # src.sample yields one array per coordinate with one entry per band
        events_updated[feature_name] = [
            band_values[0] for band_values in src.sample(coords)]

    # Rasters without a declared nodata value have nothing to mask
    if nodata_value is not None:
        events_updated.loc[events_updated[feature_name] ==
                           nodata_value, feature_name] = np.nan
    return events_updated


def get_nearest_pop_value(row, years=(2006, 2011, 2018, 2021)) -> float:
    """Choose the population value from the year closest to the event date.

    Args:
        row (pd.Series): one row of the dataframe; must contain a "date" entry that
            pd.to_datetime can parse and one "pop_<year>" entry for every year in `years`
        years (iterable of int): years for which a "pop_<year>" column is available.
            Defaults to the four population layers used in this notebook.

    Returns:
        float: value of the "pop_<year>" column whose year is nearest to the event year
    """
    event_year = pd.to_datetime(row["date"]).year
    # On equal distance, min() keeps the first candidate in `years`
    nearest_year = min(years, key=lambda candidate: abs(candidate - event_year))
    return row[f"pop_{nearest_year}"]


def add_static_features(event_data: gpd.GeoDataFrame, feature_info: "dict | list") -> gpd.GeoDataFrame:
    """Adds static features from rasters to the dataframe.

    Every (column name, raster path) pair in feature_info is sampled at the event
    locations and stored in a column of that name. Afterwards the four population
    columns (pop_2006, pop_2011, pop_2018, pop_2021) are collapsed into a single
    "pop_dens" column holding the value of the year closest to the event date.

    Args:
        event_data (gpd.GeoDataFrame): fire and non-fire events; must contain a "date" column
        feature_info (dict | list): column names and paths to rasters, either as a
            mapping {column name: path} or as a sequence of (column name, path) pairs.
            Must include the four population columns named above.

    Returns:
        gpd.GeoDataFrame: dataframe with labels and static features
    """
    # Iterating a mapping directly yields only its keys, which cannot be unpacked
    # into (name, path); go through .items() in that case.
    if isinstance(feature_info, dict):
        feature_pairs = feature_info.items()
    else:
        feature_pairs = feature_info

    for feature_name, rel_feature_layer_path in feature_pairs:
        # os.path.join leaves an already absolute path untouched
        path_to_feature_layer = os.path.join(BASE_PATH, rel_feature_layer_path)
        event_data = add_static_feature_from_raster(
            event_data, path_to_feature_layer, feature_name)

    # creating population column, with population values closest to event date (drop others)
    event_data['pop_dens'] = event_data.apply(
        get_nearest_pop_value, axis=1)
    event_data = event_data.drop(
        ["pop_2006", "pop_2011", "pop_2018", "pop_2021"], axis=1)

    return event_data

In [22]:
# Read the path configuration from the YAML file and pass it, together with
# BASE_PATH, through replace_base_path
paths = replace_base_path(
    load_paths_from_yaml(PATH_TO_PATH_CONFIG_FILE), BASE_PATH)

In [23]:
# Read the fire event dataset and turn its row index into an "index" column
# (that column is the join key for the weather data further down)
event_data = gpd.read_file(paths["fire_events"]["final"]).reset_index()

In [24]:
file_path = paths["inca"]["training_data"]

# One dict per successfully parsed line of the input file
data_list = []

with open(file_path, 'r') as file:
    for line in file:
        # Each usable line is a numeric ID immediately followed by a JSON object;
        # lines that do not look like that are skipped
        record_match = re.match(r'(\d+)(\{.+})', line)
        if not record_match:
            continue

        numeric_id = int(record_match.group(1))
        payload = json.loads(record_match.group(2))

        last_timestamp = payload["timestamps"][-1]
        parameters = payload["features"][0]["properties"]["parameters"]

        # Last value of each instantaneous parameter series
        latest = {name: parameters[name]["data"][-1]
                  for name in ("T2M", "RH2M", "UU", "VV")}

        # RR is summed over the whole series; a single missing (None) entry
        # makes the sum undefined
        rr_data = parameters["RR"]["data"]
        if all(val is not None for val in rr_data):
            sum_RR = np.sum(rr_data)
        else:
            sum_RR = None

        # Location of the record as a point geometry
        point = shapely.Point(payload["features"][0]["geometry"]["coordinates"])

        data_list.append({
            "ID": numeric_id,
            "Timestamp": last_timestamp,
            **latest,
            "RR_sum_24h": sum_RR,
            "geometry": point
        })

# Create a GeoDataFrame
gdf = gpd.GeoDataFrame(data_list, geometry='geometry', crs="EPSG:4326")


In [33]:
# calculate windspeed and FFMC
# Rows with any missing value are removed first (in place), so the helpers
# below never receive a None/NaN input.
gdf.dropna(inplace=True)
# Wind speed is derived row by row from the UU and VV columns.
gdf["windspeed"] = gdf.apply(lambda row: calculate_wind_speed(row["UU"], row["VV"]), axis=1)
# NOTE(review): 85 is passed as the first argument of calculate_ffmc — presumably
# the starting (previous) FFMC value; confirm against src.fwi_system_calculator.
gdf["ffmc"] = gdf.apply(lambda row:  calculate_ffmc(85, row["RH2M"], row["T2M"], row["RR_sum_24h"], row["windspeed"]), axis=1)

In [34]:
# Join the weather-derived FFMC onto the fire events (event "index" == weather "ID"),
# keep only the columns needed for training and use the event geometry
# (suffix "_x" = left side of the merge) as the geometry column
train_data = gpd.GeoDataFrame(
    pd.merge(event_data, gdf, left_on="index", right_on="ID")
    .loc[:, ["date", "Pufferradi", "fire", "ffmc", "geometry_x"]]
    .rename(columns={"geometry_x": "geometry"}),
    geometry="geometry",
    crs="EPSG:31287",
)

In [35]:
# Static features: (column name, raster path) pairs, sampled in this order.
# The population layers come first, one per available year.
feature_info = [
    *((f"pop_{year}", paths["population_layers"][year]["final"])
      for year in ("2006", "2011", "2018", "2021")),
    ("farmyard_ds", paths["farmyard_density"]["final"]),
    ("hiking_ds", paths["roads"]["hikingtrails"]["final"]),
    ("forest_ds", paths["roads"]["forestroads"]["final"]),
    ("rail_dens", paths["railways"]["final"]),
    ("elevation", paths["topographical_layers"]["elevation"]["final"]),
    ("slope", paths["topographical_layers"]["slope"]["final"]),
    ("aspect", paths["topographical_layers"]["aspect"]["final"]),
    ("foresttype", paths["forest_type"]["final"]),
]

train_data = add_static_features(train_data, feature_info)

In [39]:
# Number of samples per value of the "fire" label
train_data["fire"].value_counts()

1    744
0    177
Name: fire, dtype: int64

In [42]:
# Write the assembled training data to the configured "subset" output path.
# NOTE(review): the leftover output below this cell looks like a warning raised by
# this call — possibly column-name truncation if the target is a shapefile
# ("farmyard_ds" has 11 characters); confirm and shorten the name if so.
train_data.to_file(paths["training_data"]["subset"])

  train_data.to_file(paths["training_data"]["subset"])
