In [1]:
import pandas as pd
import numpy as np
import os
import sys
import itertools
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Add project root to path.
# Walk upward from the working directory until a directory containing
# data/final_data/total_data.csv is found; that directory is the project root.
current_path = os.path.abspath('.')
root_path = None

while True:
    if os.path.exists(os.path.join(current_path, "data", "final_data", "total_data.csv")):
        root_path = current_path
        break
    parent_path = os.path.dirname(current_path)
    # dirname() of a filesystem root returns the root itself ("/" on POSIX,
    # a drive root such as "C:\\" on Windows), so "no further parent" is the
    # portable stop condition. Comparing against the literal "/" would loop
    # forever on Windows and would never probe the root directory itself.
    if parent_path == current_path:
        break
    current_path = parent_path

if root_path is None:
    raise FileNotFoundError("Could not find project root.")

# Make the project's modules importable. Entries already present are skipped
# so that re-running this cell does not keep growing sys.path.
for module_dir in (
    os.path.join(root_path, 'src', 'dataset_preprocess'),
    os.path.join(root_path, 'src'),
):
    if module_dir not in sys.path:
        sys.path.append(module_dir)

# Now import your feature engineering functions
from feature_engineering_functions import (
    add_min_max_quantile_features,
    add_trend_diff_features,
    add_extreme_event_flags,
    add_interaction_features,
    add_rolling_std_features,
    add_ratio_features,
    add_cumulative_climate_load,
    add_soil_dryness_index
)

In [3]:
# Registry of feature-engineering steps: short label -> callable.
# Each callable is invoked as `func(df)` by the training cells and its return
# value replaces `df`. The insertion order here is the order in which the
# single-feature experiments run and in which combinations are generated.
feature_engineering_functions = dict(
    MinMaxQuantile=add_min_max_quantile_features,
    TrendDiff=add_trend_diff_features,
    ExtremeFlags=add_extreme_event_flags,
    InteractionFeatures=add_interaction_features,
    StdFeatures=add_rolling_std_features,
    RatioFeatures=add_ratio_features,
    CumulativeLoad=add_cumulative_climate_load,
    SoilDrynessIndex=add_soil_dryness_index,
)

In [4]:
# Load original data.
# df_base is read once from <root_path>/data/final_data/total_data.csv (the
# same file used to locate the project root); the training cells below call
# df_base.copy() before each experiment.
data_path = os.path.join(root_path, 'data', 'final_data', 'total_data.csv')
df_base = pd.read_csv(data_path)

In [5]:
from preprocess_functions import DataPreprocess
from train_model import TrainModel


In [6]:
# Baseline experiment: evaluate each feature-engineering step on its own.
# `results` collects one entry per feature set; later cells append to it and
# flatten it into a summary table.
results = []

for feature_name, feature_function in tqdm(feature_engineering_functions.items(), desc='Training One Feature'):
    # Every run starts from its own copy of the base data
    df = df_base.copy()
    dp = DataPreprocess(df)

    # Preprocess (fill NaNs in the climate columns using method='mean'),
    # then apply this run's single feature-engineering step
    df = feature_function(dp.fill_climate_nan_value(method='mean'))

    # Train every model on the engineered frame and keep what TrainModel
    # exposes in `.models` for the summary
    train_module = TrainModel(df)
    train_module.train_all_models()
    results.append(dict(FeatureSet=feature_name, results=train_module.models))

Training One Feature: 100%|██████████| 8/8 [00:26<00:00,  3.34s/it]


In [9]:
# Flatten the nested results (feature set -> model -> metrics) into one row
# per (feature set, model) pair. Metrics missing from a model's 'result' dict
# become None instead of raising.
rows = [
    {
        'FeatureSet': res['FeatureSet'],
        'Model': model_name,
        'Precision': model_info['result'].get('prec'),
        'Recall': model_info['result'].get('rec'),
        'F1': model_info['result'].get('f1'),
        'Accuracy': model_info['result'].get('acc'),
        'ConfusionMatrix': model_info['result'].get('cm'),
        'ROC_AUC': model_info['result'].get('auc'),
    }
    for res in results
    for model_name, model_info in res['results'].items()
]

results_df = pd.DataFrame(rows)

print("\n=== Model Performance Summary ===")
results_df


=== Model Performance Summary ===


Unnamed: 0,FeatureSet,Model,Precision,Recall,F1,Accuracy,ConfusionMatrix,ROC_AUC
0,MinMaxQuantile,XGBoost,0.765432,0.782334,0.773791,0.826762,"[[444, 76], [69, 248]]",0.81809
1,MinMaxQuantile,LogisticRegression,0.618182,0.643533,0.630603,0.714456,"[[394, 126], [113, 204]]",0.700613
2,MinMaxQuantile,GradientBoosting,0.720339,0.804416,0.76006,0.807646,"[[421, 99], [62, 255]]",0.807016
3,MinMaxQuantile,DecisionTree,0.640523,0.927445,0.757732,0.775388,"[[355, 165], [23, 294]]",0.805069
4,TrendDiff,XGBoost,0.756024,0.791798,0.773498,0.824373,"[[439, 81], [66, 251]]",0.818014
5,TrendDiff,LogisticRegression,0.607362,0.624606,0.615863,0.704898,"[[392, 128], [119, 198]]",0.689226
6,TrendDiff,GradientBoosting,0.733333,0.798107,0.76435,0.81362,"[[428, 92], [64, 253]]",0.810592
7,TrendDiff,DecisionTree,0.640523,0.927445,0.757732,0.775388,"[[355, 165], [23, 294]]",0.805069
8,ExtremeFlags,XGBoost,0.750769,0.769716,0.760125,0.81601,"[[439, 81], [73, 244]]",0.806973
9,ExtremeFlags,LogisticRegression,0.613293,0.640379,0.626543,0.710872,"[[392, 128], [114, 203]]",0.697112


In [10]:
# Next stage: train on combinations of 2 or more feature sets.
# `all_combinations` starts empty here and is populated by the cell below;
# `feature_keys` keeps the registry's labels in insertion order.
all_combinations = []
feature_keys = list(feature_engineering_functions)

In [11]:
# Build every combination of 2..len(feature_keys) feature-set labels.
# The list is rebuilt from scratch in this cell rather than extended in
# place, so re-running the cell cannot append a second copy of every
# combination (which would silently double the training work).
all_combinations = [
    combo
    for r in range(2, len(feature_keys) + 1)
    for combo in itertools.combinations(feature_keys, r)
]

In [12]:
# Sanity check: with 8 feature sets there are 2**8 - 1 - 8 = 247 subsets of
# size >= 2. A larger number means the combination list was extended twice.
len(all_combinations)

247

In [None]:
# Train on every multi-feature combination.
# `results` is shared with the single-feature cell, so a plain append makes
# this cell unsafe to re-run: after an interrupt (or a second execution) the
# same feature sets would be trained and recorded again, duplicating rows in
# the summary. Feature sets already present in `results` are skipped, which
# makes the cell idempotent and lets an interrupted run resume.
trained_sets = {res['FeatureSet'] for res in results}
pending_combinations = [
    combo for combo in all_combinations if '+'.join(combo) not in trained_sets
]

for combo in tqdm(pending_combinations, desc="Training Feature Combinations"):
    feature_set_label = '+'.join(combo)
    if feature_set_label in trained_sets:
        # Guards against the same combination appearing twice in the input list
        continue

    df = df_base.copy()
    dp = DataPreprocess(df)

    # data preprocessing
    df = dp.fill_climate_nan_value(method='mean')
    # feature engineering: apply the selected steps in the order given by combo
    for func_name in combo:
        df = feature_engineering_functions[func_name](df)

    # Train model
    train_module = TrainModel(df)
    train_module.train_all_models()
    results.append({
        'FeatureSet': feature_set_label,
        'results': train_module.models
    })
    trained_sets.add(feature_set_label)

Training Feature Combinations:  13%|█▎        | 32/247 [02:03<16:37,  4.64s/it]

In [None]:
# Flatten the nested results (feature set -> model -> metrics) into one row
# per (feature set, model) pair, this time also keeping the per-model
# predictions. Metrics missing from a 'result' dict become None.
rows = []

for res in results:
    feature_set = res['FeatureSet']
    model_results = res['results']

    for model_name, model_info in model_results.items():
        result_metrics = model_info['result']
        rows.append({
            'FeatureSet': feature_set,
            'Model': model_name,
            'Precision': result_metrics.get('prec', None),
            'Recall': result_metrics.get('rec', None),
            'F1': result_metrics.get('f1', None),
            'Accuracy': result_metrics.get('acc', None),
            'ConfusionMatrix': result_metrics.get('cm', None),
            # The comma after this entry was missing, which made the whole
            # cell a SyntaxError as soon as the two keys below were added.
            'ROC_AUC': result_metrics.get('auc', None),
            'y_pred': result_metrics.get('y_pred', None),
            'y_true': result_metrics.get('y_true', None),
        })

results_df = pd.DataFrame(rows)

print("\n=== Model Performance Summary ===")
results_df


=== Model Performance Summary ===
                                             FeatureSet               Model  \
0                                        MinMaxQuantile             XGBoost   
1                                        MinMaxQuantile  LogisticRegression   
2                                        MinMaxQuantile    GradientBoosting   
3                                        MinMaxQuantile        DecisionTree   
4                                             TrendDiff             XGBoost   
...                                                 ...                 ...   
1027  TrendDiff+ExtremeFlags+InteractionFeatures+Std...        DecisionTree   
1028  MinMaxQuantile+TrendDiff+ExtremeFlags+Interact...             XGBoost   
1029  MinMaxQuantile+TrendDiff+ExtremeFlags+Interact...  LogisticRegression   
1030  MinMaxQuantile+TrendDiff+ExtremeFlags+Interact...    GradientBoosting   
1031  MinMaxQuantile+TrendDiff+ExtremeFlags+Interact...        DecisionTree   

      Precision 

In [14]:
# Persist the summary table under <root_path>/data/model_results,
# creating the directory first if it does not exist yet.
output_dir = os.path.join(root_path, 'data', 'model_results')
os.makedirs(output_dir, exist_ok=True)
output_csv_path = os.path.join(output_dir, 'model_feature_set_combinations_results.csv')
results_df.to_csv(output_csv_path, index=False)

In [18]:
# Load the model evaluation results back from the CSV written above
results_csv_path = os.path.join(root_path, 'data', 'model_results', 'model_feature_set_combinations_results.csv')
results_df = pd.read_csv(results_csv_path)

# Rank by one metric, descending. The sort key has always been Recall, while
# the old comment and printed heading claimed "Accuracy"; the heading is now
# derived from the same variable so the two cannot drift apart again.
ranking_metric = 'Recall'
top_10_models = results_df.sort_values(by=ranking_metric, ascending=False).head(10)

# Display
print(f"==== Top 10 Models by {ranking_metric} ====")
display(top_10_models[['FeatureSet', 'Model', 'Recall', 'F1', 'Precision', 'Accuracy']])

==== Top 10 Models by Accuracy ====


Unnamed: 0,FeatureSet,Model,Recall,F1,Precision,Accuracy
1031,MinMaxQuantile+TrendDiff+ExtremeFlags+Interact...,DecisionTree,0.930599,0.759331,0.641304,0.776583
107,ExtremeFlags+RatioFeatures,DecisionTree,0.930599,0.759331,0.641304,0.776583
231,MinMaxQuantile+RatioFeatures+CumulativeLoad,DecisionTree,0.930599,0.759331,0.641304,0.776583
235,MinMaxQuantile+RatioFeatures+SoilDrynessIndex,DecisionTree,0.930599,0.759331,0.641304,0.776583
831,TrendDiff+ExtremeFlags+StdFeatures+RatioFeatur...,DecisionTree,0.930599,0.759331,0.641304,0.776583
827,TrendDiff+ExtremeFlags+StdFeatures+RatioFeatur...,DecisionTree,0.930599,0.759331,0.641304,0.776583
251,TrendDiff+ExtremeFlags+RatioFeatures,DecisionTree,0.930599,0.759331,0.641304,0.776583
819,TrendDiff+ExtremeFlags+InteractionFeatures+Rat...,DecisionTree,0.930599,0.759331,0.641304,0.776583
815,TrendDiff+ExtremeFlags+InteractionFeatures+Rat...,DecisionTree,0.930599,0.759331,0.641304,0.776583
267,TrendDiff+InteractionFeatures+RatioFeatures,DecisionTree,0.930599,0.759331,0.641304,0.776583
