In [None]:
import pandas as pd
import numpy as np
import os
import sys
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Add project root to path
current_path = os.path.abspath('.')
root_path = None

while current_path != "/":
    if os.path.exists(os.path.join(current_path, "data", "final_data", "total_data.csv")):
        root_path = current_path
        break
    current_path = os.path.dirname(current_path)

if root_path is None:
    raise FileNotFoundError("Could not find project root.")

sys.path.append(os.path.join(root_path, 'src', 'dataset_preprocess'))

# Now import your feature engineering functions
from feature_engineering_functions import (
    add_min_max_quantile_features,
    add_trend_diff_features,
    add_extreme_event_flags,
    add_interaction_features,
    add_rolling_std_features,
    add_ratio_features,
    add_cumulative_climate_load,
    add_soil_dryness_index
)

In [3]:
# Map feature engineering functions
feature_engineering_functions = {
    'MinMaxQuantile': add_min_max_quantile_features,
    'TrendDiff': add_trend_diff_features,
    'ExtremeFlags': add_extreme_event_flags,
    'InteractionFeatures': add_interaction_features,
    'StdFeatures': add_rolling_std_features,
    'RatioFeatures': add_ratio_features,
    'CumulativeLoad': add_cumulative_climate_load,
    'SoilDrynessIndex': add_soil_dryness_index
}

In [4]:
# Load original data
data_path = os.path.join(root_path, 'data', 'final_data', 'total_data.csv')
df_base = pd.read_csv(data_path)

In [5]:
# Results storage
results = []


In [6]:
for feature_name, feature_function in feature_engineering_functions.items():
    print(f"=== Training with feature set: {feature_name} ===")
    df = df_base.copy()

    # Drop leakage columns
    drop_cols = ['burn_probability', 'conditional_flame_length', 'conditional_risk_to_structures',
                 'distance_km', 'exposure', 'flame_length_exceedance_4ft', 'flame_length_exceedance_8ft',
                 'wildfire_hazard_potential', 'risk_to_structures', 'acres_burned', 'CBD_VALUE',
                 'EVC_VALUE', 'FBFM_VALUE', 'FDIST_VALUE', 'FVC_VALUE', 'Unnamed: 0', 'lat', 'lng']
    df.drop(columns=[col for col in drop_cols if col in df.columns], inplace=True)

    # Apply feature engineering
    df = feature_function(df)

    # Label
    y = df['is_fire']
    X = df.drop(columns=['is_fire', 'date', 'latitude', 'longitude'], errors='ignore')
    X = X.select_dtypes(include=[np.number])

    # Drop rows with NaN after feature engineering
    X = X.dropna()
    y = y.loc[X.index]

    # Train/Test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    models = {
        'Logistic Regression': LogisticRegression(max_iter=1000),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        'Decision Tree': DecisionTreeClassifier(random_state=42),
        'Gradient Boosting': GradientBoostingClassifier(random_state=42),
        'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    }

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None

        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_prob) if y_prob is not None else None

        results.append({
            'FeatureSet': feature_name,
            'Model': model_name,
            'Accuracy': acc,
            'F1 Score': f1,
            'AUC': auc
        })

=== Training with feature set: MinMaxQuantile ===


NameError: name 'roc_auc_score' is not defined

In [None]:
# Show Results
results_df = pd.DataFrame(results)
print("\n=== Model Performance Summary ===")
print(results_df)


=== Model Performance Summary ===
             FeatureSet                Model  Accuracy  F1 Score
0        MinMaxQuantile  Logistic Regression   0.51250  0.540094
1        MinMaxQuantile        Random Forest   0.50750  0.486979
2        MinMaxQuantile        Decision Tree   0.52000  0.520000
3        MinMaxQuantile    Gradient Boosting   0.46750  0.480488
4        MinMaxQuantile              XGBoost   0.50125  0.495575
5             TrendDiff  Logistic Regression   0.51125  0.543757
6             TrendDiff        Random Forest   0.49500  0.488608
7             TrendDiff        Decision Tree   0.50625  0.500632
8             TrendDiff    Gradient Boosting   0.49250  0.500000
9             TrendDiff              XGBoost   0.50250  0.501253
10         ExtremeFlags  Logistic Regression   0.51750  0.546948
11         ExtremeFlags        Random Forest   0.49625  0.506732
12         ExtremeFlags        Decision Tree   0.52000  0.508951
13         ExtremeFlags    Gradient Boosting   0.49625 