In [1]:
import pandas as pd
import numpy as np
import os
import sys
import itertools
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Locate the project root and add its source folders to the import path.
#
# Walk upwards from the current working directory until a directory containing
# data/final_data/total_data.csv is found; that directory is the project root.
current_path = os.path.abspath('.')
root_path = None

while True:
    if os.path.exists(os.path.join(current_path, "data", "final_data", "total_data.csv")):
        root_path = current_path
        break
    parent_path = os.path.dirname(current_path)
    if parent_path == current_path:
        # dirname() of a filesystem root returns the root itself ("/" on POSIX,
        # "C:\\" on Windows), so this is the portable stop condition.
        break
    current_path = parent_path

if root_path is None:
    raise FileNotFoundError("Could not find project root.")

# Make the project modules importable; skip entries that are already present so
# re-running this cell does not keep growing sys.path.
for module_dir in (
    os.path.join(root_path, 'src', 'dataset_preprocess'),
    os.path.join(root_path, 'src'),
):
    if module_dir not in sys.path:
        sys.path.append(module_dir)

# Now import your feature engineering functions
from feature_engineering_functions import (
    add_min_max_quantile_features,
    add_trend_diff_features,
    add_extreme_event_flags,
    add_interaction_features,
    add_rolling_std_features,
    add_ratio_features,
    add_cumulative_climate_load,
    add_soil_dryness_index
)

In [3]:
# Registry of feature-engineering steps: short label -> callable.
# The training loops look functions up by label and call them on the working
# DataFrame; insertion order here is the order in which they are iterated.
feature_engineering_functions = dict(
    MinMaxQuantile=add_min_max_quantile_features,
    TrendDiff=add_trend_diff_features,
    ExtremeFlags=add_extreme_event_flags,
    InteractionFeatures=add_interaction_features,
    StdFeatures=add_rolling_std_features,
    RatioFeatures=add_ratio_features,
    CumulativeLoad=add_cumulative_climate_load,
    SoilDrynessIndex=add_soil_dryness_index,
)

In [4]:
# Read the untouched base dataset from <root>/data/final_data/total_data.csv.
# Later cells always work on copies of `df_base`.
final_data_dir = os.path.join(root_path, 'data', 'final_data')
data_path = os.path.join(final_data_dir, 'total_data.csv')
df_base = pd.read_csv(data_path)

In [5]:
from preprocess_functions import DataPreprocess
from train_model import TrainModel


In [None]:
# Baseline runs: train all models once per individual feature-engineering step.
results = []

for feature_name, feature_function in feature_engineering_functions.items():
    print(f"=== Training with feature set: {feature_name} ===")

    # Every run starts from a fresh copy of the base frame.
    df = df_base.copy()
    dp = DataPreprocess(df)
    df = dp.fill_climate_nan_value(method='mean')  # data preprocessing
    df = feature_function(df)                      # feature engineering

    # training
    train_module = TrainModel(df)
    train_module.train_all_models()

    # NOTE(review): this record nests everything under 'results', while the
    # combination loop appends flat Model/Accuracy/F1 Score/AUC rows to the same
    # `results` list, so a DataFrame built from it mixes two schemas. The saved
    # output also shows {'model': None, 'result': {}} entries — confirm that
    # train_all_models() actually populates `train_module.models`.
    record = {
        'FeatureSet': feature_name,
        'results': train_module.models,
    }
    results.append(record)

=== Training with feature set: MinMaxQuantile ===
Training set size: 3358 samples
Test set size: 840 samples
Number of features after encoding: 162
=== Training with feature set: TrendDiff ===
Training set size: 3358 samples
Test set size: 840 samples
Number of features after encoding: 147
=== Training with feature set: ExtremeFlags ===
Training set size: 3358 samples
Test set size: 840 samples
Number of features after encoding: 140
=== Training with feature set: InteractionFeatures ===
Training set size: 3358 samples
Test set size: 840 samples
Number of features after encoding: 139
=== Training with feature set: StdFeatures ===
Training set size: 3358 samples
Test set size: 840 samples
Number of features after encoding: 142
=== Training with feature set: RatioFeatures ===
Training set size: 3358 samples
Test set size: 840 samples
Number of features after encoding: 139
=== Training with feature set: CumulativeLoad ===
Training set size: 3358 samples
Test set size: 840 samples
Number of

In [7]:
# Tabulate the records collected so far and print them under a banner.
results_df = pd.DataFrame(data=results)
print("\n=== Model Performance Summary ===", results_df, sep="\n")


=== Model Performance Summary ===
            FeatureSet                                            results
0       MinMaxQuantile  {'XGBoost': {'model': None, 'result': {}}, 'Lo...
1            TrendDiff  {'XGBoost': {'model': None, 'result': {}}, 'Lo...
2         ExtremeFlags  {'XGBoost': {'model': None, 'result': {}}, 'Lo...
3  InteractionFeatures  {'XGBoost': {'model': None, 'result': {}}, 'Lo...
4          StdFeatures  {'XGBoost': {'model': None, 'result': {}}, 'Lo...
5        RatioFeatures  {'XGBoost': {'model': None, 'result': {}}, 'Lo...
6       CumulativeLoad  {'XGBoost': {'model': None, 'result': {}}, 'Lo...
7     SoilDrynessIndex  {'XGBoost': {'model': None, 'result': {}}, 'Lo...


In [8]:
# Prepare for the multi-feature experiments: the labels of all registered
# feature-engineering steps, plus an empty container for their combinations.
feature_keys = [*feature_engineering_functions]
all_combinations = list()

In [10]:
# Build every combination of two or more feature-set labels, ordered by size
# and then in itertools.combinations order.
# The list is rebuilt from scratch instead of being extended in place, so
# re-running this cell cannot append duplicate combinations.
all_combinations = [
    combo
    for r in range(2, len(feature_keys) + 1)
    for combo in itertools.combinations(feature_keys, r)
]

In [11]:
# Inspect the generated combinations (a bare last expression is rendered as the cell output).
all_combinations

[('MinMaxQuantile', 'TrendDiff'),
 ('MinMaxQuantile', 'ExtremeFlags'),
 ('MinMaxQuantile', 'InteractionFeatures'),
 ('MinMaxQuantile', 'StdFeatures'),
 ('MinMaxQuantile', 'RatioFeatures'),
 ('MinMaxQuantile', 'CumulativeLoad'),
 ('MinMaxQuantile', 'SoilDrynessIndex'),
 ('TrendDiff', 'ExtremeFlags'),
 ('TrendDiff', 'InteractionFeatures'),
 ('TrendDiff', 'StdFeatures'),
 ('TrendDiff', 'RatioFeatures'),
 ('TrendDiff', 'CumulativeLoad'),
 ('TrendDiff', 'SoilDrynessIndex'),
 ('ExtremeFlags', 'InteractionFeatures'),
 ('ExtremeFlags', 'StdFeatures'),
 ('ExtremeFlags', 'RatioFeatures'),
 ('ExtremeFlags', 'CumulativeLoad'),
 ('ExtremeFlags', 'SoilDrynessIndex'),
 ('InteractionFeatures', 'StdFeatures'),
 ('InteractionFeatures', 'RatioFeatures'),
 ('InteractionFeatures', 'CumulativeLoad'),
 ('InteractionFeatures', 'SoilDrynessIndex'),
 ('StdFeatures', 'RatioFeatures'),
 ('StdFeatures', 'CumulativeLoad'),
 ('StdFeatures', 'SoilDrynessIndex'),
 ('RatioFeatures', 'CumulativeLoad'),
 ('RatioFeatures'

In [14]:
# Sanity check: with 8 feature sets, combinations of size 2..8 total 2**8 - 8 - 1 = 247.
len(all_combinations)

247

In [13]:
# Evaluate every multi-feature combination with five classifiers and append one
# flat metrics record per (combination, model) pair to `results`.

# Columns removed from the base frame before feature engineering; names that
# are not present in the frame are skipped.
drop_cols = [
    'burn_probability', 'conditional_flame_length', 'conditional_risk_to_structures',
    'distance_km', 'exposure', 'flame_length_exceedance_4ft', 'flame_length_exceedance_8ft',
    'wildfire_hazard_potential', 'risk_to_structures', 'acres_burned', 'CBD_VALUE',
    'EVC_VALUE', 'FBFM_VALUE', 'FDIST_VALUE', 'FVC_VALUE', 'Unnamed: 0', 'lat', 'lng',
]

for combo in all_combinations:
    print(f"=== Training with feature set: {'+'.join(combo)} ===")

    # Fresh copy of the base data for each combination, minus the excluded columns.
    df = df_base.copy().drop(columns=drop_cols, errors='ignore')

    # Apply the selected feature-engineering steps in combination order.
    for func_name in combo:
        df = feature_engineering_functions[func_name](df)

    # Predictors: numeric columns only, without the target and the date /
    # coordinate columns, restricted to rows with no missing values.
    # NOTE(review): select_dtypes(np.number) excludes bool and object columns —
    # if a feature function emits boolean flags they are dropped here; confirm.
    # NOTE(review): dropna() may keep a different row subset per combination, so
    # the fixed-seed split is not guaranteed to hold out the same rows; confirm
    # this is acceptable when comparing combinations.
    X = (
        df.drop(columns=['is_fire', 'date', 'latitude', 'longitude'], errors='ignore')
        .select_dtypes(include=[np.number])
        .dropna()
    )
    # Target aligned to the rows that survived dropna().
    y = df['is_fire'].loc[X.index]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Estimators are re-created on every iteration so no fitted state carries over.
    models = {
        'Logistic Regression': LogisticRegression(max_iter=1000),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        'Decision Tree': DecisionTreeClassifier(random_state=42),
        'Gradient Boosting': GradientBoostingClassifier(random_state=42),
        'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    }

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Positive-class probabilities are needed for AUC; estimators without
        # predict_proba get no AUC.
        if hasattr(model, 'predict_proba'):
            y_prob = model.predict_proba(X_test)[:, 1]
        else:
            y_prob = None

        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        auc = None if y_prob is None else roc_auc_score(y_test, y_prob)

        results.append({
            'FeatureSet': '+'.join(combo),
            'Model': model_name,
            'Accuracy': acc,
            'F1 Score': f1,
            'AUC': auc,
        })

=== Training with feature set: MinMaxQuantile+TrendDiff ===
=== Training with feature set: MinMaxQuantile+ExtremeFlags ===
=== Training with feature set: MinMaxQuantile+InteractionFeatures ===
=== Training with feature set: MinMaxQuantile+StdFeatures ===
=== Training with feature set: MinMaxQuantile+RatioFeatures ===
=== Training with feature set: MinMaxQuantile+CumulativeLoad ===
=== Training with feature set: MinMaxQuantile+SoilDrynessIndex ===
=== Training with feature set: TrendDiff+ExtremeFlags ===
=== Training with feature set: TrendDiff+InteractionFeatures ===
=== Training with feature set: TrendDiff+StdFeatures ===
=== Training with feature set: TrendDiff+RatioFeatures ===
=== Training with feature set: TrendDiff+CumulativeLoad ===
=== Training with feature set: TrendDiff+SoilDrynessIndex ===
=== Training with feature set: ExtremeFlags+InteractionFeatures ===
=== Training with feature set: ExtremeFlags+StdFeatures ===
=== Training with feature set: ExtremeFlags+RatioFeatures ==

In [15]:
# Summary: a single table built from every record accumulated in `results`.
results_df = pd.DataFrame(data=results)
print(results_df)

                                             FeatureSet                Model  \
0                                        MinMaxQuantile  Logistic Regression   
1                                        MinMaxQuantile        Random Forest   
2                                        MinMaxQuantile        Decision Tree   
3                                        MinMaxQuantile    Gradient Boosting   
4                                        MinMaxQuantile              XGBoost   
...                                                 ...                  ...   
1270  MinMaxQuantile+TrendDiff+ExtremeFlags+Interact...  Logistic Regression   
1271  MinMaxQuantile+TrendDiff+ExtremeFlags+Interact...        Random Forest   
1272  MinMaxQuantile+TrendDiff+ExtremeFlags+Interact...        Decision Tree   
1273  MinMaxQuantile+TrendDiff+ExtremeFlags+Interact...    Gradient Boosting   
1274  MinMaxQuantile+TrendDiff+ExtremeFlags+Interact...              XGBoost   

      Accuracy  F1 Score       AUC  
0 

In [16]:
# Persist the results table as CSV under <root>/data/featured_data,
# creating the folder if it does not exist yet.
output_dir = os.path.join(root_path, 'data', 'featured_data')
os.makedirs(output_dir, exist_ok=True)
results_csv_out = os.path.join(output_dir, 'model_feature_set_combinations_results.csv')
results_df.to_csv(results_csv_out, index=False)

In [None]:
# Reload the saved evaluation results from disk.
results_csv_path = os.path.join(
    root_path, 'data', 'featured_data', 'model_feature_set_combinations_results.csv'
)
results_df = pd.read_csv(results_csv_path)

# Rank all rows by Accuracy (highest first) and keep the first ten.
ranked_by_accuracy = results_df.sort_values(by='Accuracy', ascending=False)
top_10_models = ranked_by_accuracy.head(10)

# Show only the identifying and metric columns.
report_columns = ['FeatureSet', 'Model', 'Accuracy', 'F1 Score', 'AUC']
print("==== Top 10 Models by Accuracy ====")
display(top_10_models[report_columns])