In [52]:
%load_ext autoreload
%autoreload 2

In [53]:
import sys
sys.path.append("../../")

In [89]:
import pandas as pd
import geopandas as gpd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
import bambi as bmb

from config.config import BASE_PATH, PATH_TO_PATH_CONFIG_FILE, PROJECT_EPSG
from src.utils import load_paths_from_yaml, replace_base_path
from src.modeling.encodings import convert_aspect_to_cardinal_direction, encode_nuts_id, nuts_lvl_3_encoding
from src.modeling.utils import temporal_train_test_split
from src.modeling.bayesian_models import create_st_blr

In [8]:
paths = load_paths_from_yaml(PATH_TO_PATH_CONFIG_FILE)
paths = replace_base_path(paths, BASE_PATH)

### Read in data

In [68]:
# read in training data
training_data = gpd.read_file(paths["training_data"]["subset"])
training_data.date = pd.to_datetime(training_data.date)


In [69]:
# read in NUTS data
nuts_data = gpd.read_file(paths["nuts_data"])

### Data Cleaning & Preparation

In [70]:
training_data.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 921 entries, 0 to 920
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   date        921 non-null    datetime64[ns]
 1   Pufferradi  744 non-null    object        
 2   fire        921 non-null    int64         
 3   ffmc        921 non-null    float64       
 4   farmyard_d  921 non-null    float64       
 5   hiking_ds   921 non-null    float64       
 6   forest_ds   921 non-null    float64       
 7   rail_dens   921 non-null    float64       
 8   elevation   921 non-null    float64       
 9   slope       921 non-null    float64       
 10  aspect      921 non-null    float64       
 11  foresttype  886 non-null    float64       
 12  pop_dens    921 non-null    float64       
 13  geometry    921 non-null    geometry      
dtypes: datetime64[ns](1), float64(10), geometry(1), int64(1), object(1)
memory usage: 100.9+ KB


In [71]:
# rename columns
training_data.rename(columns={"farmyard_d": "farmyard_density", 
                              "hiking_ds": "hikingtrail_density", 
                              "forest_ds": "forestroad_density", 
                              "rail_dens": "railway_density", 
                              "foresttype": "forest_type", 
                              "pop_dens": "population_density"}, inplace=True)

# create column with nuts id for level 3
nuts_austria_lvl3 = nuts_data.loc[(nuts_data['CNTR_CODE'] == "AT") & (nuts_data['LEVL_CODE'] == 3)]
nuts_austria_lvl3.to_crs(PROJECT_EPSG, inplace=True)
training_data = training_data.sjoin(nuts_austria_lvl3.loc[:, ["NUTS_ID", "geometry"]])
training_data.drop(columns=["index_right"], inplace=True)

# create season column (0 = winter (1,2,3), 1 = spring (4,5,6), 2 = sommer (7,8,9), 3 = winter (10,11,12))
training_data["season"] = (training_data.date.dt.month -1) // 3 

# encode aspect 
training_data["aspect_encoded"] = training_data["aspect"].apply(convert_aspect_to_cardinal_direction)

# encode NUTS_ID
training_data["nuts_id_encoded"] = training_data["NUTS_ID"].apply(encode_nuts_id, mapping=nuts_lvl_3_encoding)

# replace None values of forest type with class "low and no vegetation (6)"
training_data["forest_type"].fillna(6, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


### Temporal train test split

In [80]:
X_train, X_test = temporal_train_test_split(training_data, "date", 0.7)
y_train, y_test = X_train["fire"], X_test["fire"]


### Feature Scaling

In [86]:
# Define the transformer
features_to_scale = ['ffmc', 'farmyard_density',
       'hikingtrail_density', 'forestroad_density', 'railway_density',
       'elevation', 'slope', 'population_density']

preprocessor = ColumnTransformer(
    transformers=[
        ('std_scaler', StandardScaler(), features_to_scale)
    ],
    remainder='passthrough'
)


train_data_transformed = preprocessor.fit_transform(X_train)
test_data_transformed = preprocessor.transform(X_test)
X_train = pd.DataFrame(train_data_transformed, columns=X_train.columns)
X_test = pd.DataFrame(test_data_transformed, columns=X_test.columns)

### Modeling

In [90]:
def create_formula_string(features: list, spatial_group: str, temporal_group: str, pooling: str):

    if pooling == "no":
        formula = f'fire["1"] ~ 0 + ({spatial_group}:{temporal_group})'
    elif pooling == "partial":
        formula = f'fire["1"] ~ 0 + (1|{spatial_group}:{temporal_group})'
    elif pooling == "full":
        formula = f'fire["1"] ~ 0'
    else:
        raise Exception("Pooling argument not valid. Choose one of [full, partial, no]")

    for feature in features:
        formula += f" + {feature}"
    return formula

In [91]:
train_df

Unnamed: 0,date,Pufferradi,fire,ffmc,farmyard_density,hikingtrail_density,forestroad_density,railway_density,elevation,slope,aspect,forest_type,population_density,geometry,NUTS_ID,season,aspect_encoded,nuts_id_encoded
743,2012-03-03,300,1,83.028005,0.0,0.000000,0.000000,0.0,1816.184570,9.141391,227.391266,6.0,0.000000,POINT (522728.961 323463.211),AT213,0,5,13
742,2012-03-04,150,1,84.866463,0.0,50.502659,0.000000,0.0,482.732086,4.330055,214.852341,5.0,212.000000,POINT (543781.483 390795.811),AT223,0,5,16
741,2012-03-05,50,1,85.928877,0.0,0.000000,0.000000,0.0,344.078430,9.437416,203.674820,5.0,79.622655,POINT (619837.545 430594.145),AT122,0,5,4
740,2012-03-08,500,1,85.047707,0.0,0.000000,0.000000,0.0,796.222778,16.363840,209.085068,0.0,62.000000,POINT (581080.494 379160.770),AT224,0,5,17
739,2012-03-09,50,1,85.140994,0.0,44.071628,0.000000,0.0,216.778168,6.726098,126.394020,5.0,122.281547,POINT (653485.118 456344.785),AT112,0,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251,2019-03-25,50,1,85.667564,0.0,0.000000,41.416047,0.0,355.485138,7.159290,106.183594,0.0,0.000000,POINT (633214.286 425910.893),AT112,0,2,1
790,2019-03-26,,0,56.915488,0.0,0.000000,0.000000,0.0,263.031982,7.676914,282.043121,6.0,0.000000,POINT (630606.252 360486.260),AT113,0,6,2
764,2019-03-28,,0,85.092341,0.0,0.000000,0.000000,0.0,804.078735,30.975384,308.370148,4.0,0.000000,POINT (403158.058 432818.866),AT323,0,7,27
250,2019-03-29,300,1,86.965689,0.0,0.000000,60.999740,0.0,520.454529,11.741232,202.076981,0.0,147.000000,POINT (444796.763 302927.192),AT211,0,4,11


In [None]:
spatial_grouping_variable = "nuts_id_encoded"
temporal_grouping_variable = "season"
pooling = "partial"
common_features = 

priors = {
    f"1|{spatial_grouping_variable}:{temporal_grouping_variable}": bmb.Prior("Normal", mu=0, sigma=bmb.Prior("Exponential", lam=1)),
    "sigma": bmb.Prior("Exponential", lam=1)
}

# create formula string
formula = create_formula_string(common_features, spatial_grouping_variable, temporal_grouping_variable, pooling)
# log parameters

# create model
model = bmb.Model(formula=formula,
            data=train_data,
            family="bernoulli",
            priors=priors)