In [None]:
import numpy as np                  # Version: 1.24.3
import pandas as pd                 # Version: 1.5.3
from   pandas.api.types import CategoricalDtype
import geopandas as gpd             # Version: 1.4.0
import fiona                        # Version: 1.16.0
import seaborn as sns               # Version: 0.13.2
import matplotlib.pyplot as plt     # Version: 3.8.2
import matplotlib.gridspec as gridspec
import matplotlib.patches as mpatches
from   matplotlib.patches import Patch
from   matplotlib.font_manager import FontProperties
import scipy                        # Version: 1.10.1
from   scipy.stats import zscore, genpareto, gaussian_kde
import statsmodels                  # Version: 0.14.0
from   statsmodels.tsa.seasonal import STL
import sklearn                      # Version: 1.3.0
from   sklearn.preprocessing import MinMaxScaler, LabelEncoder
from   sklearn.feature_selection import mutual_info_classif

# Python version: 3.11.4
import os
import calendar

In [None]:
# Locate the input folder: a "Data" directory that is a sibling of the current
# working directory (i.e. <parent of cwd>/Data).
notebook_dir = os.getcwd()
parent_dir   = os.path.dirname(notebook_dir)
target_dir   = os.path.join(parent_dir, "Data")

# --- Historical wildfires ---------------------------------------------------
# Every row of this file is flagged with fire=True, and the key columns are
# renamed so they match the "Date"/"State" keys used for the merge below.
csv_path = os.path.join(target_dir, "01_historical_wildfires.csv")
fires = (
    pd.read_csv(csv_path)
    .assign(fire=True)
    .rename(columns={"ignition_date": "Date", "state": "State"})
)

# --- Weather metrics --------------------------------------------------------
csv_path = os.path.join(target_dir, "02_weather_metrics.csv")
weather  = pd.read_csv(csv_path)

# --- NDVI -------------------------------------------------------------------
# Parse the date and derive the year/month keys used to join NDVI onto the
# weather/fire table.
csv_path = os.path.join(target_dir, "03_NDVI.csv")
ndvi = pd.read_csv(csv_path).assign(
    Date=lambda frame: pd.to_datetime(frame["Date"]),
    year=lambda frame: frame["Date"].dt.year,
    month=lambda frame: frame["Date"].dt.month,
)

# --- Weather + fires --------------------------------------------------------
# LEFT JOIN keeps every weather row. Rows without a matching fire record get a
# missing "fire" value from the join, which notna() turns into False.
df = weather.merge(fires, how="left", on=["Date", "State"]).assign(
    fire=lambda frame: frame["fire"].notna(),
    Date=lambda frame: pd.to_datetime(frame["Date"]),
    year=lambda frame: frame["Date"].dt.year,
    month=lambda frame: frame["Date"].dt.month,
)

# --- Add NDVI on state-year-month -------------------------------------------
# The year/month helper keys are only needed for this join and are removed
# straight afterwards.
ndvi_without_date = ndvi.drop(columns="Date")
df = (
    df.merge(ndvi_without_date, how="left", on=["State", "year", "month"])
      .drop(columns=["year", "month"])
)

# Drop all columns whose names start with "Unnamed"
is_unnamed = df.columns.str.startswith('Unnamed')
df = df.loc[:, ~is_unnamed]

#### **Feature Engineering**

In [None]:
### FEATURE ENGINEERING ###
# For every state: aggregate the numeric columns to monthly means, run an STL
# decomposition (seasonal period of 12 months, robust fitting) on selected
# variables, and attach the monthly trend/residual values back onto every row
# of the same state and month.
#
# NOTE(review): each STL fit uses a state's complete monthly series, so the
# values attached to a given month may be influenced by later months. If these
# columns feed a model that is evaluated on held-out later years, confirm that
# this look-ahead is acceptable.

df_engineered = df.copy()
df_engineered['Date'] = pd.to_datetime(df_engineered['Date'])
# Month-start timestamp, computed once: it is the monthly aggregation key and
# also the key used to merge the monthly features back onto the original rows.
df_engineered['YearMonth'] = df_engineered['Date'].dt.to_period('M').dt.to_timestamp()

# Source column -> prefix of the derived "<prefix>_trend" / "<prefix>_residual" columns
stl_features = {
    'Temperature [°C]_mean': 'Temperature',
    'Relative humidity [%]_mean': 'Humidity',
    'NDVI_mean': 'NDVI'
}

# Solar radiation variable to de-seasonalize
solar_radiation_col = 'Solar radiation [Jm2/day]_sum'

stl_monthly_features = []
# Process per state
for state in df_engineered['State'].unique():
    df_state = df_engineered[df_engineered['State'] == state]
    # Monthly means of the numeric columns only (groupby returns months in
    # ascending order).
    numeric_cols = df_state.select_dtypes(include=np.number).columns
    df_monthly   = df_state.groupby('YearMonth')[numeric_cols].mean().reset_index()
    # NOTE(review): period=12 presumably relies on consecutive calendar months
    # with no gaps and no missing values per state — TODO confirm.
    monthly_results = pd.DataFrame({'YearMonth': df_monthly['YearMonth']})
    monthly_results['State'] = state
    # One full-series STL fit per variable; keep trend and residual
    for var, prefix in stl_features.items():
        res = STL(df_monthly[var], period=12, robust=True).fit()
        monthly_results[f'{prefix}_trend'] = res.trend
        monthly_results[f'{prefix}_residual'] = res.resid
    # De-seasonalize solar radiation: trend + residual, i.e. the series with
    # its seasonal component removed
    solar_res = STL(df_monthly[solar_radiation_col], period=12, robust=True).fit()
    monthly_results['Solar_de-seasonalized'] = solar_res.trend + solar_res.resid
    stl_monthly_features.append(monthly_results)

stl_monthly_df = pd.concat(stl_monthly_features, ignore_index=True)

# Attach the monthly features to every row of the matching state/month, then
# drop the helper key.
df_engineered = (
    df_engineered
    .merge(stl_monthly_df, on=['State', 'YearMonth'], how='left')
    .drop(columns=['YearMonth'])
)

print(df_engineered.columns)
exp_csv_path = os.path.join(target_dir, "full_merged_data_all_features.csv")
df_engineered.to_csv(exp_csv_path, index = True)

Index(['State', 'Date', 'Temperature [°C]_mean', 'Relative humidity [%]_mean',
       'Wind speed [m/s]_mean', 'Precipitation [mm/day]_sum',
       'Solar radiation [Jm2/day]_sum', 'Temperature [°C]_min',
       'Relative humidity [%]_min', 'Wind speed [m/s]_min',
       'Precipitation [mm/day]_min', 'Solar radiation [Jm2/day]_min',
       'Temperature [°C]_max', 'Relative humidity [%]_max',
       'Wind speed [m/s]_max', 'Precipitation [mm/day]_max',
       'Solar radiation [Jm2/day]_max', 'Temperature [°C]_std',
       'Relative humidity [%]_std', 'Wind speed [m/s]_std',
       'Precipitation [mm/day]_std', 'Solar radiation [Jm2/day]_std',
       'total_area_ha', 'log_total_area_ha', 'fire_count', 'outlier_type',
       'ignition_cause', 'fire', 'NDVI_min', 'NDVI_max', 'NDVI_mean',
       'NDVI_std', 'NDVI_var', 'Temperature_trend', 'Temperature_residual',
       'Humidity_trend', 'Humidity_residual', 'NDVI_trend', 'NDVI_residual',
       'Solar_de-seasonalized'],
      dtype='object')

#### **Feature Selection**

In [None]:
### PREPROCESSING (encoding + scaling) ###
# Builds a numeric copy of the engineered table, splits it by calendar year,
# Min-Max scales the candidate features using statistics from the training
# years only, and ranks the features by mutual information with the binary
# `fire` target.

full_df = df_engineered.copy()
full_df['Date'] = pd.to_datetime(full_df['Date'])

# Make target binary
full_df['fire'] = full_df['fire'].astype(int)

# Encode State as integer codes (nominal labels, the code order has no meaning)
le = LabelEncoder()
full_df['State'] = le.fit_transform(full_df['State'])

# Split by calendar year: 2005-2016 train, 2017-2018 validation, 2019-2020 test
train_df = full_df[(full_df['Date'].dt.year >= 2005) & (full_df['Date'].dt.year <= 2016)]
val_df   = full_df[(full_df['Date'].dt.year >= 2017) & (full_df['Date'].dt.year <= 2018)]
test_df  = full_df[(full_df['Date'].dt.year >= 2019) & (full_df['Date'].dt.year <= 2020)]

# Columns that are never scaled nor scored: the date, the target, and the
# remaining columns contributed by the wildfire table.
protected_cols = ['Date', 'total_area_ha', 'log_total_area_ha', 'fire_count', 'outlier_type', 'ignition_cause', 'fire']
all_columns = full_df.columns.tolist()

# Columns to scale and use for MI
features_to_scale = [col for col in all_columns if col not in protected_cols]
scaler = MinMaxScaler()
# Fit only on training features so validation/test statistics do not leak in
scaler.fit(train_df[features_to_scale])
# Scale train separately
train_scaled = train_df.copy()
train_scaled[features_to_scale] = scaler.transform(train_df[features_to_scale])


### SELECTION: MUTUAL INFORMATION ANALYSIS ###

X_train = train_scaled[features_to_scale].copy()
y_train = train_scaled['fire']

# 'State' is a label-encoded category, not a continuous measurement. Scoring it
# with the continuous (nearest-neighbour) estimator would treat the arbitrary
# code order as a distance, so it is flagged as discrete and handed over as its
# integer codes rather than the Min-Max scaled floats.
X_train['State'] = train_df['State']
discrete_mask = np.array([col == 'State' for col in features_to_scale], dtype=bool)

# NOTE(review): several feature columns hold one value per state and month,
# repeated on every row of that month (they were merged in by state/month
# upstream). Their many exact ties may affect the nearest-neighbour estimate —
# TODO confirm before relying on the ranking.
mi_scores = mutual_info_classif(X_train, y_train, discrete_features=discrete_mask, random_state=42)
mi_series = pd.Series(mi_scores, index=features_to_scale).sort_values(ascending=False)

# Display ranked features
print(mi_series)

NDVI_residual                    0.138212
NDVI_std                         0.138151
NDVI_trend                       0.137508
Solar_de-seasonalized            0.137135
Temperature_residual             0.136522
Humidity_trend                   0.135862
NDVI_var                         0.135521
Humidity_residual                0.135198
NDVI_mean                        0.135190
Temperature_trend                0.134469
Temperature [°C]_max             0.070649
Temperature [°C]_mean            0.064331
Temperature [°C]_min             0.055774
Solar radiation [Jm2/day]_std    0.054594
Solar radiation [Jm2/day]_max    0.052395
Relative humidity [%]_mean       0.050675
Solar radiation [Jm2/day]_sum    0.050072
State                            0.048611
Relative humidity [%]_min        0.047061
Relative humidity [%]_max        0.045169
Temperature [°C]_std             0.030598
NDVI_max                         0.027498
NDVI_min                         0.016999
Precipitation [mm/day]_min       0

#### **Save Data**

In [None]:
### SAVE DATA S2 ###

columns_to_keep = ["total_area_ha", "log_total_area_ha", "fire_count", "outlier_type", "ignition_cause", "fire",
                   "State", "Date", "NDVI_residual", "NDVI_std", "NDVI_trend", "Solar_de-seasonalized", "Temperature_residual",
                   "Humidity_trend", "NDVI_var", "Humidity_residual", "NDVI_mean", "Temperature_trend", "Temperature [°C]_max",
                   "Solar radiation [Jm2/day]_std", "Relative humidity [%]_mean",
                   "Temperature [°C]_mean", "Solar radiation [Jm2/day]_sum"
                   ]

df_engineered_filtered = df_engineered[columns_to_keep]

exp_csv_path = os.path.join(target_dir, "S2_full_merged_data.csv")
df_engineered_filtered.to_csv(exp_csv_path, index = True)

full_df         = df_engineered_filtered.copy()
full_df['Date'] = pd.to_datetime(full_df['Date'])
full_df['fire'] = full_df['fire'].astype(int)

# Split
train_df = full_df[(full_df['Date'].dt.year >= 2005) & (full_df['Date'].dt.year <= 2016)]
val_df   = full_df[(full_df['Date'].dt.year >= 2017) & (full_df['Date'].dt.year <= 2018)]
test_df  = full_df[(full_df['Date'].dt.year >= 2019) & (full_df['Date'].dt.year <= 2020)]

protected_cols    = ["State", 'Date', 'total_area_ha', 'log_total_area_ha', 'fire_count', 'outlier_type', 'ignition_cause', 'fire']
all_columns       = full_df.columns.tolist()
features_to_scale = [col for col in all_columns if col not in protected_cols]

scaler = MinMaxScaler()
# Fit only on training features
scaler.fit(train_df[features_to_scale])
# Scale train/val/test separately
train_scaled = train_df.copy()
val_scaled   = val_df.copy()
test_scaled  = test_df.copy()

train_scaled[features_to_scale] = scaler.transform(train_df[features_to_scale])
val_scaled[features_to_scale]   = scaler.transform(val_df[features_to_scale])
test_scaled[features_to_scale]  = scaler.transform(test_df[features_to_scale])

full_scaled_df = pd.concat([train_scaled, val_scaled, test_scaled], axis=0)
exp_csv_path   = os.path.join(target_dir, "S2_full_merged_data_scaled.csv")
full_scaled_df.to_csv(exp_csv_path, index = True)

exp_csv_path = os.path.join(target_dir, "S2_scaled_train.csv")
train_scaled.to_csv(exp_csv_path, index = True)
exp_csv_path = os.path.join(target_dir, "S2_scaled_val.csv")
val_scaled.to_csv(exp_csv_path, index = True)
exp_csv_path = os.path.join(target_dir, "S2_scaled_test.csv")
test_scaled.to_csv(exp_csv_path, index = True)

In [None]:
### SAVE DATA S1 ###

columns_to_keep = ["total_area_ha", "log_total_area_ha", "fire_count", "outlier_type", "ignition_cause", "fire",
                   "State", "Date", "NDVI_residual", "NDVI_std", "NDVI_trend", "Solar_de-seasonalized", "Temperature_residual",
                   "Humidity_trend", "NDVI_var", "Humidity_residual", "NDVI_mean", "Temperature_trend", "Temperature [°C]_max",
                   "Solar radiation [Jm2/day]_std", "Relative humidity [%]_mean",
                   "Temperature [°C]_mean", "Solar radiation [Jm2/day]_sum"
                   ]

df_engineered_filtered = df_engineered[columns_to_keep]

states_to_exclude = ["Queensland", "South Australia", "Victoria"]
df_engineered_filtered = df_engineered_filtered[~df_engineered_filtered["State"].isin(states_to_exclude)]

exp_csv_path = os.path.join(target_dir, "S1_full_merged_data.csv")
df_engineered_filtered.to_csv(exp_csv_path, index = True)

full_df         = df_engineered_filtered.copy()
full_df['Date'] = pd.to_datetime(full_df['Date'])
full_df['fire'] = full_df['fire'].astype(int)

# Split
train_df = full_df[(full_df['Date'].dt.year >= 2005) & (full_df['Date'].dt.year <= 2016)]
val_df   = full_df[(full_df['Date'].dt.year >= 2017) & (full_df['Date'].dt.year <= 2018)]
test_df  = full_df[(full_df['Date'].dt.year >= 2019) & (full_df['Date'].dt.year <= 2020)]

protected_cols    = ["State", 'Date', 'total_area_ha', 'log_total_area_ha', 'fire_count', 'outlier_type', 'ignition_cause', 'fire']
all_columns       = full_df.columns.tolist()
features_to_scale = [col for col in all_columns if col not in protected_cols]

scaler = MinMaxScaler()
# Fit only on training features
scaler.fit(train_df[features_to_scale])
# Scale train/val/test separately
train_scaled = train_df.copy()
val_scaled   = val_df.copy()
test_scaled  = test_df.copy()

train_scaled[features_to_scale] = scaler.transform(train_df[features_to_scale])
val_scaled[features_to_scale]   = scaler.transform(val_df[features_to_scale])
test_scaled[features_to_scale]  = scaler.transform(test_df[features_to_scale])

full_scaled_df = pd.concat([train_scaled, val_scaled, test_scaled], axis=0)
exp_csv_path   = os.path.join(target_dir, "S1_full_merged_data_scaled.csv")
full_scaled_df.to_csv(exp_csv_path, index = True)

exp_csv_path = os.path.join(target_dir, "S1_scaled_train.csv")
train_scaled.to_csv(exp_csv_path, index = True)
exp_csv_path = os.path.join(target_dir, "S1_scaled_val.csv")
val_scaled.to_csv(exp_csv_path, index = True)
exp_csv_path = os.path.join(target_dir, "S1_scaled_test.csv")
test_scaled.to_csv(exp_csv_path, index = True)