In [1]:
import pandas as pd
import numpy as np
import os
import sys
import itertools
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Add project root to path
current_path = os.path.abspath('.')
root_path = None

while current_path != "/":
    if os.path.exists(os.path.join(current_path, "data", "final_data", "total_data.csv")):
        root_path = current_path
        break
    current_path = os.path.dirname(current_path)

if root_path is None:
    raise FileNotFoundError("Could not find project root.")

sys.path.append(os.path.join(root_path, 'src', 'dataset_preprocess'))
sys.path.append(os.path.join(root_path, 'src'))

# Now import your feature engineering functions
from feature_engineering_functions import (
    add_min_max_quantile_features,
    add_trend_diff_features,
    add_extreme_event_flags,
    add_interaction_features,
    add_rolling_std_features,
    add_ratio_features,
    add_cumulative_climate_load,
    add_soil_dryness_index
)

In [3]:
# Map feature engineering functions
feature_engineering_functions = {
    'MinMaxQuantile': add_min_max_quantile_features,
    'TrendDiff': add_trend_diff_features,
    'ExtremeFlags': add_extreme_event_flags,
    'InteractionFeatures': add_interaction_features,
    'StdFeatures': add_rolling_std_features,
    'RatioFeatures': add_ratio_features,
    'CumulativeLoad': add_cumulative_climate_load,
    'SoilDrynessIndex': add_soil_dryness_index
}

In [4]:
# Load original data
data_path = os.path.join(root_path, 'data', 'final_data', 'total_data.csv')
df_base = pd.read_csv(data_path)

In [5]:
from preprocess_functions import DataPreprocess
from train_model import TrainModel


In [None]:
results = []

for feature_name, feature_function in tqdm(feature_engineering_functions.items(), desc='Training One Feature'):
    df = df_base.copy()
    dp = DataPreprocess(df)

    # data preprocessing
    df = dp.fill_climate_nan_value(method='mean')
    
    # feature engineering
    df = feature_function(df)
    
    # training
    train_module = TrainModel(df)
    train_module.train_all_models()
    results.append({
        'FeatureSet': feature_name,
        'results': train_module.models
    })

Training One Feature: 100%|██████████| 8/8 [00:26<00:00,  3.34s/it]


In [9]:
rows = []

for res in results:
    feature_set = res['FeatureSet']
    model_results = res['results']

    for model_name, model_info in model_results.items():
        result_metrics = model_info['result']
        rows.append({
            'FeatureSet': feature_set,
            'Model': model_name,
            'Precision': result_metrics.get('prec', None),
            'Recall': result_metrics.get('rec', None),
            'F1': result_metrics.get('f1', None),
            'Accuracy': result_metrics.get('acc', None),
            'ConfusionMatrix': result_metrics.get('cm', None),
            'ROC_AUC': result_metrics.get('auc', None)
        })

results_df = pd.DataFrame(rows)

print("\n=== Model Performance Summary ===")
results_df


=== Model Performance Summary ===


Unnamed: 0,FeatureSet,Model,Precision,Recall,F1,Accuracy,ConfusionMatrix,ROC_AUC
0,MinMaxQuantile,XGBoost,0.765432,0.782334,0.773791,0.826762,"[[444, 76], [69, 248]]",0.81809
1,MinMaxQuantile,LogisticRegression,0.618182,0.643533,0.630603,0.714456,"[[394, 126], [113, 204]]",0.700613
2,MinMaxQuantile,GradientBoosting,0.720339,0.804416,0.76006,0.807646,"[[421, 99], [62, 255]]",0.807016
3,MinMaxQuantile,DecisionTree,0.640523,0.927445,0.757732,0.775388,"[[355, 165], [23, 294]]",0.805069
4,TrendDiff,XGBoost,0.756024,0.791798,0.773498,0.824373,"[[439, 81], [66, 251]]",0.818014
5,TrendDiff,LogisticRegression,0.607362,0.624606,0.615863,0.704898,"[[392, 128], [119, 198]]",0.689226
6,TrendDiff,GradientBoosting,0.733333,0.798107,0.76435,0.81362,"[[428, 92], [64, 253]]",0.810592
7,TrendDiff,DecisionTree,0.640523,0.927445,0.757732,0.775388,"[[355, 165], [23, 294]]",0.805069
8,ExtremeFlags,XGBoost,0.750769,0.769716,0.760125,0.81601,"[[439, 81], [73, 244]]",0.806973
9,ExtremeFlags,LogisticRegression,0.613293,0.640379,0.626543,0.710872,"[[392, 128], [114, 203]]",0.697112


In [10]:
# Now train on combinations of 2 or more features
feature_keys = list(feature_engineering_functions.keys())
all_combinations = []

In [11]:
for r in range(2, len(feature_keys) + 1):
    combos = list(itertools.combinations(feature_keys, r))
    all_combinations.extend(combos)

In [12]:
len(all_combinations)

247

In [13]:
for combo in tqdm(all_combinations, desc="Training Feature Combinations"):
    # print(f"=== Training with feature set: {'+'.join(combo)} ===")
    df = df_base.copy()
    dp = DataPreprocess(df)

    # data preprocessing
    df = dp.fill_climate_nan_value(method='mean')
    for func_name in combo:
        df = feature_engineering_functions[func_name](df)
    
    # Train model
    train_module = TrainModel(df)
    train_module.train_all_models()
    results.append({
        'FeatureSet': '+'.join(combo),
        'results': train_module.models
    })

Training Feature Combinations: 100%|██████████| 247/247 [17:50<00:00,  4.33s/it]


In [15]:
rows = []

for res in results:
    feature_set = res['FeatureSet']
    model_results = res['results']

    for model_name, model_info in model_results.items():
        result_metrics = model_info['result']
        rows.append({
            'FeatureSet': feature_set,
            'Model': model_name,
            'Precision': result_metrics.get('prec', None),
            'Recall': result_metrics.get('rec', None),
            'F1': result_metrics.get('f1', None),
            'Accuracy': result_metrics.get('acc', None),
            'ConfusionMatrix': result_metrics.get('cm', None),
            'ROC_AUC': result_metrics.get('auc', None),
            'y_pred': result_metrics.get('y_pred', None),
            'y_true': result_metrics.get('y_true', None)
        })

results_df = pd.DataFrame(rows)

print("\n=== Model Performance Summary ===")
results_df


=== Model Performance Summary ===


Unnamed: 0,FeatureSet,Model,Precision,Recall,F1,Accuracy,ConfusionMatrix,ROC_AUC,y_pred,y_true
0,MinMaxQuantile,XGBoost,0.765432,0.782334,0.773791,0.826762,"[[444, 76], [69, 248]]",0.818090,"[0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, ...",1908 0 255 0 3184 1 1133 0 960 ...
1,MinMaxQuantile,LogisticRegression,0.618182,0.643533,0.630603,0.714456,"[[394, 126], [113, 204]]",0.700613,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",1908 0 255 0 3184 1 1133 0 960 ...
2,MinMaxQuantile,GradientBoosting,0.720339,0.804416,0.760060,0.807646,"[[421, 99], [62, 255]]",0.807016,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, ...",1908 0 255 0 3184 1 1133 0 960 ...
3,MinMaxQuantile,DecisionTree,0.640523,0.927445,0.757732,0.775388,"[[355, 165], [23, 294]]",0.805069,"[0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, ...",1908 0 255 0 3184 1 1133 0 960 ...
4,TrendDiff,XGBoost,0.756024,0.791798,0.773498,0.824373,"[[439, 81], [66, 251]]",0.818014,"[0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, ...",1908 0 255 0 3184 1 1133 0 960 ...
...,...,...,...,...,...,...,...,...,...,...
1015,TrendDiff+ExtremeFlags+InteractionFeatures+Std...,DecisionTree,0.641304,0.930599,0.759331,0.776583,"[[355, 165], [22, 295]]",0.806646,"[0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, ...",1908 0 255 0 3184 1 1133 0 960 ...
1016,MinMaxQuantile+TrendDiff+ExtremeFlags+Interact...,XGBoost,0.759146,0.785489,0.772093,0.824373,"[[441, 79], [68, 249]]",0.816783,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",1908 0 255 0 3184 1 1133 0 960 ...
1017,MinMaxQuantile+TrendDiff+ExtremeFlags+Interact...,LogisticRegression,0.607903,0.630915,0.619195,0.706093,"[[391, 129], [117, 200]]",0.691419,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",1908 0 255 0 3184 1 1133 0 960 ...
1018,MinMaxQuantile+TrendDiff+ExtremeFlags+Interact...,GradientBoosting,0.725989,0.810726,0.766021,0.812425,"[[423, 97], [60, 257]]",0.812094,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",1908 0 255 0 3184 1 1133 0 960 ...


In [16]:
# Save results
output_dir = os.path.join(root_path, 'data', 'model_results')
os.makedirs(output_dir, exist_ok=True)
results_df.to_csv(os.path.join(output_dir, 'model_feature_set_combinations_results.csv'), index=False)

In [23]:
# Load the model evaluation results
results_csv_path = os.path.join(root_path, 'data', 'model_results', 'model_feature_set_combinations_results.csv')
results_df = pd.read_csv(results_csv_path)

# Sort by Recall descending
top_10_models_recall = results_df.sort_values(by='Recall', ascending=False).head(10)

# Display
print("==== Top 10 Models by Recall ====")
display(top_10_models_recall[['FeatureSet', 'Model', 'Recall', 'F1', 'Precision', 'Accuracy', 'ROC_AUC']])


# Sort by ROC descending
top_10_models_auc = results_df.sort_values(by='ROC_AUC', ascending=False).head(10)

# Display
print("==== Top 10 Models by ROC_AUC ====")
display(top_10_models_auc[['FeatureSet', 'Model', 'Recall', 'F1', 'Precision', 'Accuracy', 'ROC_AUC']])

==== Top 10 Models by Recall ====


Unnamed: 0,FeatureSet,Model,Recall,F1,Precision,Accuracy,ROC_AUC
1019,MinMaxQuantile+TrendDiff+ExtremeFlags+Interact...,DecisionTree,0.930599,0.759331,0.641304,0.776583,0.806646
743,MinMaxQuantile+ExtremeFlags+InteractionFeature...,DecisionTree,0.930599,0.759331,0.641304,0.776583,0.806646
727,MinMaxQuantile+TrendDiff+RatioFeatures+Cumulat...,DecisionTree,0.930599,0.759331,0.641304,0.776583,0.806646
719,MinMaxQuantile+TrendDiff+StdFeatures+RatioFeat...,DecisionTree,0.930599,0.759331,0.641304,0.776583,0.806646
715,MinMaxQuantile+TrendDiff+StdFeatures+RatioFeat...,DecisionTree,0.930599,0.759331,0.641304,0.776583,0.806646
159,MinMaxQuantile+TrendDiff+RatioFeatures,DecisionTree,0.930599,0.759331,0.641304,0.776583,0.806646
707,MinMaxQuantile+TrendDiff+InteractionFeatures+R...,DecisionTree,0.930599,0.759331,0.641304,0.776583,0.806646
703,MinMaxQuantile+TrendDiff+InteractionFeatures+R...,DecisionTree,0.930599,0.759331,0.641304,0.776583,0.806646
691,MinMaxQuantile+TrendDiff+InteractionFeatures+S...,DecisionTree,0.930599,0.759331,0.641304,0.776583,0.806646
683,MinMaxQuantile+TrendDiff+ExtremeFlags+RatioFea...,DecisionTree,0.930599,0.759331,0.641304,0.776583,0.806646


==== Top 10 Models by ROC_AUC ====


Unnamed: 0,FeatureSet,Model,Recall,F1,Precision,Accuracy,ROC_AUC
904,MinMaxQuantile+TrendDiff+ExtremeFlags+StdFeatu...,XGBoost,0.823344,0.798165,0.774481,0.842294,0.838595
460,MinMaxQuantile+ExtremeFlags+RatioFeatures+Soil...,XGBoost,0.826498,0.793939,0.763848,0.837515,0.835365
984,MinMaxQuantile+TrendDiff+ExtremeFlags+Interact...,XGBoost,0.829653,0.793363,0.760116,0.83632,0.835019
948,MinMaxQuantile+ExtremeFlags+StdFeatures+RatioF...,XGBoost,0.804416,0.793157,0.782209,0.841099,0.833939
878,MinMaxQuantile+TrendDiff+ExtremeFlags+Interact...,GradientBoosting,0.851735,0.79063,0.737705,0.829152,0.83356
694,MinMaxQuantile+TrendDiff+InteractionFeatures+S...,GradientBoosting,0.851735,0.79063,0.737705,0.829152,0.83356
184,MinMaxQuantile+ExtremeFlags+SoilDrynessIndex,XGBoost,0.801262,0.791277,0.781538,0.839904,0.832362
760,MinMaxQuantile+ExtremeFlags+StdFeatures+Cumula...,XGBoost,0.801262,0.791277,0.781538,0.839904,0.832362
912,MinMaxQuantile+TrendDiff+InteractionFeatures+S...,XGBoost,0.823344,0.789713,0.758721,0.833931,0.831864
790,TrendDiff+ExtremeFlags+InteractionFeatures+Std...,GradientBoosting,0.826498,0.789157,0.755043,0.832736,0.831518


In [19]:
best_recall = results_df.sort_values(by='Recall', ascending=False).head(1)
best_auc = results_df.sort_values(by='ROC_AUC', ascending=False).head(1)
best_prec = results_df.sort_values(by='Precision', ascending=False).head(1)
best_f1 = results_df.sort_values(by='F1', ascending=False).head(1)

In [20]:
best_recall

Unnamed: 0,FeatureSet,Model,Precision,Recall,F1,Accuracy,ConfusionMatrix,ROC_AUC,y_pred,y_true
1019,MinMaxQuantile+TrendDiff+ExtremeFlags+Interact...,DecisionTree,0.641304,0.930599,0.759331,0.776583,[[355 165]\n [ 22 295]],0.806646,[0 1 1 0 0 0 0 0 1 0 1 0 1 0 1 1 0 1 1 0 1 1 0...,1908 0\n255 0\n3184 1\n1133 0\n96...
