In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append("../../")

In [3]:
import pandas as pd
import geopandas as gpd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
import bambi as bmb

from config.config import BASE_PATH, PATH_TO_PATH_CONFIG_FILE, PROJECT_EPSG
from src.utils import load_paths_from_yaml, replace_base_path
from src.modeling.encodings import convert_aspect_to_cardinal_direction, encode_nuts_id, nuts_lvl_3_encoding
from src.modeling.utils import temporal_train_test_split



In [4]:
paths = load_paths_from_yaml(PATH_TO_PATH_CONFIG_FILE)
paths = replace_base_path(paths, BASE_PATH)

### Read in data

In [5]:
# read in training data
training_data = gpd.read_file(paths["training_data"]["subset"])
training_data.date = pd.to_datetime(training_data.date)


In [6]:
# read in NUTS data
nuts_data = gpd.read_file(paths["nuts_data"])

### Data Cleaning & Preparation

In [7]:
training_data.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 921 entries, 0 to 920
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   date        921 non-null    datetime64[ns]
 1   Pufferradi  744 non-null    object        
 2   fire        921 non-null    int64         
 3   ffmc        921 non-null    float64       
 4   farmyard_d  921 non-null    float64       
 5   hiking_ds   921 non-null    float64       
 6   forest_ds   921 non-null    float64       
 7   rail_dens   921 non-null    float64       
 8   elevation   921 non-null    float64       
 9   slope       921 non-null    float64       
 10  aspect      921 non-null    float64       
 11  foresttype  886 non-null    float64       
 12  pop_dens    921 non-null    float64       
 13  geometry    921 non-null    geometry      
dtypes: datetime64[ns](1), float64(10), geometry(1), int64(1), object(1)
memory usage: 100.9+ KB


In [8]:
# rename columns
training_data.rename(columns={"farmyard_d": "farmyard_density", 
                              "hiking_ds": "hikingtrail_density", 
                              "forest_ds": "forestroad_density", 
                              "rail_dens": "railway_density", 
                              "foresttype": "forest_type", 
                              "pop_dens": "population_density"}, inplace=True)

# create column with nuts id for level 3
nuts_austria_lvl3 = nuts_data.loc[(nuts_data['CNTR_CODE'] == "AT") & (nuts_data['LEVL_CODE'] == 3)]
nuts_austria_lvl3.to_crs(PROJECT_EPSG, inplace=True)
training_data = training_data.sjoin(nuts_austria_lvl3.loc[:, ["NUTS_ID", "geometry"]])
training_data.drop(columns=["index_right"], inplace=True)

# create season column (0 = winter (1,2,3), 1 = spring (4,5,6), 2 = sommer (7,8,9), 3 = winter (10,11,12))
training_data["season"] = (training_data.date.dt.month -1) // 3 

# encode aspect 
training_data["aspect_encoded"] = training_data["aspect"].apply(convert_aspect_to_cardinal_direction)

# encode NUTS_ID
training_data["nuts_id_encoded"] = training_data["NUTS_ID"].apply(encode_nuts_id, mapping=nuts_lvl_3_encoding)

# replace None values of forest type with class "low and no vegetation (6)"
training_data["forest_type"].fillna(6, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


### Temporal train test split

In [12]:
X_train, X_test = temporal_train_test_split(training_data, "date", 0.7)
y_train, y_test = X_train["fire"], X_test["fire"]


### Feature Scaling

In [20]:
# Define the transformer
features_to_scale = ['ffmc', 'farmyard_density',
       'hikingtrail_density', 'forestroad_density', 'railway_density',
       'elevation', 'slope', 'population_density']
passthrough_columns = ['date', 'Pufferradi', 'fire', 'geometry', 'season', 'nuts_id_encoded', 'aspect_encoded', 'forest_type']

preprocessor = ColumnTransformer(
    transformers=[
        ('std_scaler', StandardScaler(), features_to_scale)
    ],
    remainder='drop'
)


train_data_transformed = preprocessor.fit_transform(X_train)
test_data_transformed = preprocessor.transform(X_test)

In [24]:

X_train_features_to_scale = pd.DataFrame(train_data_transformed, columns=features_to_scale)
X_test_features_to_scale = pd.DataFrame(test_data_transformed, columns=features_to_scale)

X_train = X_train_features_to_scale[passthrough_columns] = X_train.loc[:, passthrough_columns]

X_train = pd.concat([X_train.loc[:, passthrough_columns], X_train_features_to_scale], axis=1)
X_test = pd.concat([X_test.loc[:, passthrough_columns], X_test_features_to_scale], axis=1)

In [25]:
X_train

Unnamed: 0,date,Pufferradi,fire,geometry,season,nuts_id_encoded,aspect_encoded,forest_type,ffmc,farmyard_density,hikingtrail_density,forestroad_density,railway_density,elevation,slope,population_density
743,2012-03-03,300,1.0,POINT (522728.961 323463.211),0.0,13.0,5.0,6.0,,,,,,,,
742,2012-03-04,150,1.0,POINT (543781.483 390795.811),0.0,16.0,5.0,5.0,,,,,,,,
741,2012-03-05,50,1.0,POINT (619837.545 430594.145),0.0,4.0,5.0,5.0,,,,,,,,
740,2012-03-08,500,1.0,POINT (581080.494 379160.770),0.0,17.0,5.0,0.0,,,,,,,,
739,2012-03-09,50,1.0,POINT (653485.118 456344.785),0.0,1.0,3.0,5.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
639,NaT,,,,,,,,0.260730,-0.063006,-0.408619,0.097437,-0.166806,-0.844439,-0.860790,-0.413098
640,NaT,,,,,,,,-1.788347,-0.063006,-0.408619,-0.615602,-0.166806,-1.065771,-0.820890,-0.413098
641,NaT,,,,,,,,0.219736,-0.063006,-0.408619,-0.615602,-0.166806,0.229487,0.975020,-0.413098
642,NaT,,,,,,,,0.353244,-0.063006,-0.408619,0.434599,-0.166806,-0.449505,-0.507601,-0.173901


In [19]:
train_data_transformed[0]

array([0.07261644005703585, -0.06300606200675278, -0.4086187413730573,
       -0.615602331603915, -0.16680618779707798, 2.652453533873805,
       -0.7080039807006189, -0.41309781871649154,
       Timestamp('2012-03-03 00:00:00'), '300', 1, 227.39126586914062,
       6.0, <POINT (522728.961 323463.211)>, 'AT213', 0, 5, 13],
      dtype=object)

### Modeling

In [90]:
def create_formula_string(features: list, spatial_group: str, temporal_group: str, pooling: str):

    if pooling == "no":
        formula = f'fire["1"] ~ 0 + ({spatial_group}:{temporal_group})'
    elif pooling == "partial":
        formula = f'fire["1"] ~ 0 + (1|{spatial_group}:{temporal_group})'
    elif pooling == "full":
        formula = f'fire["1"] ~ 0'
    else:
        raise Exception("Pooling argument not valid. Choose one of [full, partial, no]")

    for feature in features:
        formula += f" + {feature}"
    return formula

In [95]:
spatial_grouping_variable = "nuts_id_encoded"
temporal_grouping_variable = "season"
pooling = "partial"
common_features = ['ffmc', 'farmyard_density',
       'hikingtrail_density', 'forestroad_density', 'railway_density',
       'elevation', 'slope', 'forest_type', 'population_density', 'season', 'aspect_encoded', 'nuts_id_encoded']
X_train = X_train.loc[:, common_features + ["fire"]]

priors = {
    f"1|{spatial_grouping_variable}:{temporal_grouping_variable}": bmb.Prior("Normal", mu=0, sigma=bmb.Prior("Exponential", lam=1)),
    "sigma": bmb.Prior("Exponential", lam=1)
}

# create formula string
formula = create_formula_string(common_features, spatial_grouping_variable, temporal_grouping_variable, pooling)
# log parameters

# create model
model = bmb.Model(formula=formula,
            data=X_train,
            family="bernoulli",
            priors=priors)

ValueError: 'data' contains 149 incomplete rows.

In [96]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 644 entries, 0 to 643
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   ffmc                 644 non-null    object        
 1   farmyard_density     644 non-null    object        
 2   hikingtrail_density  644 non-null    object        
 3   forestroad_density   644 non-null    object        
 4   railway_density      644 non-null    object        
 5   elevation            644 non-null    datetime64[ns]
 6   slope                495 non-null    object        
 7   forest_type          644 non-null    object        
 8   population_density   644 non-null    object        
 9   season               644 non-null    object        
 10  aspect_encoded       644 non-null    object        
 11  nuts_id_encoded      644 non-null    object        
 12  fire                 644 non-null    object        
dtypes: datetime64[ns](1), object(12)
me

In [97]:
X_train

Unnamed: 0,ffmc,farmyard_density,hikingtrail_density,forestroad_density,railway_density,elevation,slope,forest_type,population_density,season,aspect_encoded,nuts_id_encoded,fire
0,-0.615602,-0.166806,2.652454,-0.708004,-0.413098,2012-03-03,300,227.391266,6.0,0,5,13,-0.408619
1,-0.615602,-0.166806,-0.539812,-1.078875,-0.068134,2012-03-04,150,214.852341,5.0,0,5,16,0.527793
2,-0.615602,-0.166806,-0.871747,-0.685186,-0.283537,2012-03-05,50,203.67482,5.0,0,5,4,-0.408619
3,-0.615602,-0.166806,0.21068,-0.151278,-0.312212,2012-03-08,500,209.085068,0.0,0,5,17,-0.408619
4,-0.615602,-0.166806,-1.176502,-0.894181,-0.214123,2012-03-09,50,126.39402,5.0,0,3,1,0.40855
...,...,...,...,...,...,...,...,...,...,...,...,...,...
639,0.097437,-0.166806,-0.844439,-0.86079,-0.413098,2019-03-25,50,106.183594,0.0,0,2,1,-0.408619
640,-0.615602,-0.166806,-1.065771,-0.82089,-0.413098,2019-03-26,,282.043121,6.0,0,6,2,-0.408619
641,-0.615602,-0.166806,0.229487,0.97502,-0.413098,2019-03-28,,308.370148,4.0,0,7,27,-0.408619
642,0.434599,-0.166806,-0.449505,-0.507601,-0.173901,2019-03-29,300,202.076981,0.0,0,4,11,-0.408619
