In [79]:
# Import libraries
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score, fbeta_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import pickle
import json
import warnings
import logging
import sys
from datetime import datetime
import time
import subprocess
import shutil

warnings.filterwarnings('ignore')

print("‚úÖ Libraries imported successfully")


‚úÖ Libraries imported successfully


In [80]:
# ============================================
# CONFIGURATION
# ============================================
USE_FULL_DATASET = True
ENHANCED_FEATURES = True
CLASS_WEIGHT_MULTIPLIER = 1.0  # OPTIMIZED: Gi·∫£m t·ª´ 20.0 xu·ªëng 5.0 ƒë·ªÉ balance Precision v√† Recall
F_BETA = 2.0  # F-beta score, beta=2 ∆∞u ti√™n recall g·∫•p ƒë√¥i precision
USE_SMOTE = False  # False = NHANH HON v·ªõi dataset l·ªõn, ch·ªâ d√πng class_weight
SMOTE_SAMPLE_SIZE = 500000  # Sample tr∆∞·ªõc khi SMOTE n·∫øu USE_SMOTE=True
FORCE_GPU = False  # True = force GPU (fail n·∫øu kh√¥ng c√≥), False = auto-detect (recommended)

print("‚úÖ Configuration loaded:")
print(f"  - USE_FULL_DATASET: {USE_FULL_DATASET}")
print(f"  - ENHANCED_FEATURES: {ENHANCED_FEATURES}")
print(f"  - CLASS_WEIGHT_MULTIPLIER: {CLASS_WEIGHT_MULTIPLIER}")
print(f"  - F_BETA: {F_BETA}")
print(f"  - USE_SMOTE: {USE_SMOTE}")
print(f"  - FORCE_GPU: {FORCE_GPU}")


‚úÖ Configuration loaded:
  - USE_FULL_DATASET: True
  - ENHANCED_FEATURES: True
  - CLASS_WEIGHT_MULTIPLIER: 1.0
  - F_BETA: 2.0
  - USE_SMOTE: False
  - FORCE_GPU: False


In [81]:
# Load data
dataset_dir = Path("dataset")

# Support c·∫£ old structure (file tr·ª±c ti·∫øp) v√† new structure (file trong subdir)
parquet_files_direct = sorted(dataset_dir.glob("scenario_*.parquet"))
parquet_files_subdir = []
for scenario_dir in sorted(dataset_dir.glob("scenario_*")):
    if scenario_dir.is_dir():
        # New structure: t√¨m nodes.parquet trong subdirectory
        nodes_file = scenario_dir / "nodes.parquet"
        if nodes_file.exists():
            parquet_files_subdir.append(nodes_file)
        # Ho·∫∑c t√¨m b·∫•t k·ª≥ parquet n√†o trong subdir (fallback)
        elif not parquet_files_subdir:
            parquet_in_dir = list(scenario_dir.glob("*.parquet"))
            if parquet_in_dir:
                parquet_files_subdir.extend(parquet_in_dir)

parquet_files = sorted(parquet_files_direct + parquet_files_subdir)

if len(parquet_files) == 0:
    raise FileNotFoundError("Khong tim thay parquet files!")

print(f"‚úÖ Found {len(parquet_files)} scenarios")

max_scenarios = None if USE_FULL_DATASET else 100
files_to_load = parquet_files[:max_scenarios] if max_scenarios else parquet_files

print(f"üìÇ Loading {len(files_to_load)} scenarios...")
start_time = time.time()

dfs = []
for i, f in enumerate(files_to_load):
    if (i + 1) % 100 == 0 or (i + 1) == len(files_to_load):
        elapsed = time.time() - start_time
        rate = (i + 1) / elapsed if elapsed > 0 else 0
        remaining = (len(files_to_load) - i + 1) / rate if rate > 0 else 0
        print(f"  Loaded {i+1}/{len(files_to_load)} ({elapsed:.1f}s, ~{remaining:.1f}s remaining)")
    dfs.append(pd.read_parquet(f))

df_all = pd.concat(dfs, ignore_index=True)
load_time = time.time() - start_time
print(f"‚úÖ Loaded {len(df_all):,} records in {load_time:.1f}s")


‚úÖ Found 1500 scenarios
üìÇ Loading 1500 scenarios...
  Loaded 100/1500 (1.9s, ~26.3s remaining)
  Loaded 200/1500 (2.3s, ~15.1s remaining)
  Loaded 300/1500 (2.8s, ~11.2s remaining)
  Loaded 400/1500 (3.3s, ~9.1s remaining)
  Loaded 500/1500 (3.8s, ~7.7s remaining)
  Loaded 600/1500 (4.3s, ~6.5s remaining)
  Loaded 700/1500 (5.0s, ~5.7s remaining)
  Loaded 800/1500 (5.5s, ~4.9s remaining)
  Loaded 900/1500 (6.1s, ~4.1s remaining)
  Loaded 1000/1500 (6.6s, ~3.3s remaining)
  Loaded 1100/1500 (7.2s, ~2.6s remaining)
  Loaded 1200/1500 (7.9s, ~2.0s remaining)
  Loaded 1300/1500 (8.5s, ~1.3s remaining)
  Loaded 1400/1500 (9.2s, ~0.7s remaining)
  Loaded 1500/1500 (9.9s, ~0.0s remaining)
‚úÖ Loaded 28,227,000 records in 14.3s


In [82]:
# Filter reservoir nodes (ch·ªâ gi·ªØ junction nodes)
reservoir_nodes = df_all[df_all['demand'] < -0.1]['node_id'].unique().tolist()
df_ml = df_all[~df_all['node_id'].isin(reservoir_nodes)].copy()

print(f"‚úÖ After filter: {len(df_ml):,} records")
print(f"‚úÖ Junction nodes: {df_ml['node_id'].nunique()}")
print(f"‚úÖ Reservoir nodes filtered: {len(reservoir_nodes)}")


‚úÖ After filter: 28,081,500 records
‚úÖ Junction nodes: 193
‚úÖ Reservoir nodes filtered: 1


In [83]:
# Basic features
df_ml['has_leak'] = (df_ml['leak_demand'] > 0).astype(int)
df_ml['hour'] = (df_ml['timestamp'] / 3600).astype(int)
df_ml['hour_sin'] = np.sin(2 * np.pi * df_ml['hour'] / 24)
df_ml['hour_cos'] = np.cos(2 * np.pi * df_ml['hour'] / 24)

# Node ID encoding
try:
    df_ml['node_id_int'] = pd.to_numeric(df_ml['node_id'], errors='coerce')
    if df_ml['node_id_int'].isna().any():
        from sklearn.preprocessing import LabelEncoder
        le = LabelEncoder()
        df_ml['node_id_int'] = le.fit_transform(df_ml['node_id'].astype(str))
except:
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    df_ml['node_id_int'] = le.fit_transform(df_ml['node_id'].astype(str))

print("‚úÖ Basic features created")
print(f"  Leak records: {df_ml['has_leak'].sum():,} ({100*df_ml['has_leak'].mean():.2f}%)")


‚úÖ Basic features created
  Leak records: 0 (0.00%)


In [64]:
# Enhanced features
if ENHANCED_FEATURES:
    print("üîß Adding enhanced features...")
    df_ml = df_ml.sort_values(['node_id', 'timestamp']).reset_index(drop=True)
    
    # Moving averages
    for window in [3, 5]:
        df_ml[f'pressure_ma{window}'] = df_ml.groupby('node_id')['pressure'].transform(
            lambda x: x.rolling(window=window, min_periods=1).mean()
        )
        df_ml[f'head_ma{window}'] = df_ml.groupby('node_id')['head'].transform(
            lambda x: x.rolling(window=window, min_periods=1).mean()
        )
    
    # Changes
    df_ml['pressure_change'] = df_ml.groupby('node_id')['pressure'].diff().fillna(0)
    df_ml['head_change'] = df_ml.groupby('node_id')['head'].diff().fillna(0)
    
    # Drops
    df_ml['pressure_drop'] = df_ml.groupby('node_id')['pressure'].transform(
        lambda x: x.rolling(window=5, min_periods=1).max() - x
    )
    df_ml['head_drop'] = df_ml.groupby('node_id')['head'].transform(
        lambda x: x.rolling(window=5, min_periods=1).max() - x
    )
    
    # Pressure drop rate
    df_ml['pressure_drop_rate'] = df_ml.groupby('node_id')['pressure_change'].transform(
        lambda x: x.rolling(window=3, min_periods=1).mean()
    )
    
    # Cumulative pressure drop
    df_ml['pressure_cumulative_drop'] = df_ml.groupby(['node_id', 'scenario_id'])['pressure_drop'].cumsum()
    
    print("‚úÖ Enhanced features added")
else:
    print("‚ÑπÔ∏è Enhanced features disabled")

# Feature selection
basic_features = ['pressure', 'head', 'demand', 'hour_sin', 'hour_cos', 'node_id_int']
if ENHANCED_FEATURES:
    enhanced_features = [
        'pressure_ma3', 'pressure_ma5', 'head_ma3', 'head_ma5',
        'pressure_change', 'head_change',
        'pressure_drop', 'head_drop',
        'pressure_drop_rate', 'pressure_cumulative_drop'
    ]
    feature_cols = basic_features + enhanced_features
else:
    feature_cols = basic_features

print(f"‚úÖ Total features: {len(feature_cols)}")
print(f"   Features: {feature_cols}")


üîß Adding enhanced features...
‚úÖ Enhanced features added
‚úÖ Total features: 16
   Features: ['pressure', 'head', 'demand', 'hour_sin', 'hour_cos', 'node_id_int', 'pressure_ma3', 'pressure_ma5', 'head_ma3', 'head_ma5', 'pressure_change', 'head_change', 'pressure_drop', 'head_drop', 'pressure_drop_rate', 'pressure_cumulative_drop']


In [65]:
# Split by scenario
scenario_ids = df_ml['scenario_id'].unique()
train_scenarios, temp_scenarios = train_test_split(scenario_ids, test_size=0.3, random_state=42)
val_scenarios, test_scenarios = train_test_split(temp_scenarios, test_size=0.5, random_state=42)

train_df = df_ml[df_ml['scenario_id'].isin(train_scenarios)]
val_df = df_ml[df_ml['scenario_id'].isin(val_scenarios)]
test_df = df_ml[df_ml['scenario_id'].isin(test_scenarios)]

print(f"‚úÖ Train: {len(train_df):,} records ({len(train_scenarios)} scenarios)")
print(f"‚úÖ Val:   {len(val_df):,} records ({len(val_scenarios)} scenarios)")
print(f"‚úÖ Test:  {len(test_df):,} records ({len(test_scenarios)} scenarios)")

X_train = train_df[feature_cols]
y_train = train_df['has_leak']
X_val = val_df[feature_cols]
y_val = val_df['has_leak']
X_test = test_df[feature_cols]
y_test = test_df['has_leak']

leak_ratio = y_train.mean()
print(f"\nüìä Leak ratio: {leak_ratio:.4f} ({100*leak_ratio:.2f}%)")


‚úÖ Train: 19,657,050 records (1050 scenarios)
‚úÖ Val:   4,212,225 records (225 scenarios)
‚úÖ Test:  4,212,225 records (225 scenarios)

üìä Leak ratio: 0.0000 (0.00%)


In [66]:
# Normalize features
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=feature_cols,
    index=X_train.index
)
X_val_scaled = pd.DataFrame(scaler.transform(X_val), columns=feature_cols, index=X_val.index)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=feature_cols, index=X_test.index)

print("‚úÖ Features normalized using StandardScaler")


‚úÖ Features normalized using StandardScaler


In [67]:
# Check class distribution
print(f"üìä Class distribution:")
print(f"  No Leak: {(y_train == 0).sum():,} ({100*(y_train == 0).mean():.2f}%)")
print(f"  Leak:    {(y_train == 1).sum():,} ({100*(y_train == 1).mean():.2f}%)")

use_smote = USE_SMOTE
if USE_SMOTE:
    try:
        from imblearn.over_sampling import SMOTE
        print("\nüîÑ Applying SMOTE...")
        print(f"   Training data size: {len(X_train_scaled):,} records")
        
        # Sample tr∆∞·ªõc n·∫øu dataset qu√° l·ªõn
        if SMOTE_SAMPLE_SIZE and len(X_train_scaled) > SMOTE_SAMPLE_SIZE:
            print(f"   Dataset qu√° l·ªõn, sampling {SMOTE_SAMPLE_SIZE:,} records...")
            sample_idx = []
            leak_idx = np.where(y_train == 1)[0]
            no_leak_idx = np.where(y_train == 0)[0]
            
            sample_idx.extend(leak_idx.tolist())
            
            n_no_leak_needed = SMOTE_SAMPLE_SIZE - len(leak_idx)
            if n_no_leak_needed > 0:
                np.random.seed(42)
                sampled_no_leak = np.random.choice(no_leak_idx, size=min(n_no_leak_needed, len(no_leak_idx)), replace=False)
                sample_idx.extend(sampled_no_leak.tolist())
            
            X_train_smote = X_train_scaled.iloc[sample_idx].copy()
            y_train_smote = y_train.iloc[sample_idx].copy()
            print(f"   Sampled to {len(X_train_smote):,} records")
        else:
            X_train_smote = X_train_scaled
            y_train_smote = y_train
        
        start_smote = time.time()
        smote = SMOTE(random_state=42, k_neighbors=3, n_jobs=-1)
        X_train_balanced_array, y_train_balanced_array = smote.fit_resample(
            X_train_smote.values, y_train_smote.values
        )
        
        X_train_balanced = pd.DataFrame(X_train_balanced_array, columns=X_train_smote.columns)
        y_train_balanced = pd.Series(y_train_balanced_array)
        
        smote_time = time.time() - start_smote
        print(f"‚úÖ SMOTE completed in {smote_time:.1f}s")
        print(f"   Before: {len(X_train_smote):,} records")
        print(f"   After:  {len(X_train_balanced):,} records")
        print(f"   Distribution: No Leak {100*(y_train_balanced == 0).mean():.1f}%, Leak {100*(y_train_balanced == 1).mean():.1f}%")
        
    except ImportError:
        print("‚ö†Ô∏è imbalanced-learn not installed, skipping SMOTE")
        X_train_balanced = X_train_scaled
        y_train_balanced = y_train
        use_smote = False
    except Exception as e:
        print(f"‚ö†Ô∏è SMOTE failed: {e}")
        print("   Falling back to class_weight only")
        X_train_balanced = X_train_scaled
        y_train_balanced = y_train
        use_smote = False
else:
    print("\n‚ÑπÔ∏è SMOTE disabled - using class_weight only")
    X_train_balanced = X_train_scaled
    y_train_balanced = y_train
    use_smote = False

# Calculate class weight
class_weight_ratio = (y_train_balanced == 0).sum() / (y_train_balanced == 1).sum() if use_smote else (y_train == 0).sum() / (y_train == 1).sum()
adjusted_class_weight = class_weight_ratio * CLASS_WEIGHT_MULTIPLIER
print(f"\nüìä Class weight:")
print(f"   Base class_weight: {class_weight_ratio:.1f}")
print(f"   Adjusted (x{CLASS_WEIGHT_MULTIPLIER}): {adjusted_class_weight:.1f}")


üìä Class distribution:
  No Leak: 19,657,050 (100.00%)
  Leak:    0 (0.00%)

‚ÑπÔ∏è SMOTE disabled - using class_weight only

üìä Class weight:
   Base class_weight: inf
   Adjusted (x1.0): inf


In [68]:
import lightgbm as lgb

# GPU Detection
device = 'cpu'  # Default
if FORCE_GPU:
    device = 'gpu'
    print("üîß FORCE_GPU=True - attempting GPU...")
else:
    try:
        result = subprocess.run(['nvidia-smi'], capture_output=True, text=True, timeout=5)
        if result.returncode == 0:
            print("üîß GPU detected (nvidia-smi), testing LightGBM GPU support...")
            try:
                # Test GPU with small dataset
                test_model = lgb.LGBMClassifier(n_estimators=1, device='gpu', verbose=-1)
                test_model.fit(X_train_balanced[:1000], y_train_balanced[:1000])
                device = 'gpu'
                print("‚úÖ GPU tested successfully - using GPU for training!")
                del test_model
            except Exception as gpu_error:
                print(f"‚ö†Ô∏è GPU available but LightGBM GPU test failed: {gpu_error}")
                print("   Falling back to CPU (LightGBM CPU is still very fast)")
                device = 'cpu'
        else:
            print("‚ÑπÔ∏è No GPU detected (nvidia-smi not found) - using CPU")
    except FileNotFoundError:
        print("‚ÑπÔ∏è nvidia-smi not found - no GPU available, using CPU")
    except subprocess.TimeoutExpired:
        print("‚ö†Ô∏è nvidia-smi timeout - using CPU")
    except Exception as e:
        print(f"‚ÑπÔ∏è Cannot check GPU ({type(e).__name__}), using CPU: {e}")

print(f"\n‚úÖ Device selected: {device.upper()}")


üîß GPU detected (nvidia-smi), testing LightGBM GPU support...
‚úÖ GPU tested successfully - using GPU for training!

‚úÖ Device selected: GPU


In [69]:
# Create model
model = lgb.LGBMClassifier(
    n_estimators=1000,
    max_depth=12,
    learning_rate=0.03,
    num_leaves=63,
    subsample=0.8,
    colsample_bytree=0.8,
    class_weight='balanced',
    scale_pos_weight=adjusted_class_weight,
    random_state=42,
    device=device,
    n_jobs=-1,
    verbose=-1
)

print("üöÄ Training LightGBM model...")
print(f"   Parameters:")
print(f"     - n_estimators: 1000")
print(f"     - max_depth: 12")
print(f"     - device: {device.upper()}")
print(f"     - scale_pos_weight: {adjusted_class_weight:.1f} (x{CLASS_WEIGHT_MULTIPLIER})")

start_train = time.time()

model.fit(
    X_train_balanced, y_train_balanced,
    eval_set=[(X_val_scaled, y_val)],
    eval_names=['validation'],
    eval_metric='f1',
    callbacks=[lgb.early_stopping(100), lgb.log_evaluation(200)]
)

train_time = time.time() - start_train
print(f"\n‚úÖ Model trained in {train_time:.1f}s ({train_time/60:.1f} minutes)")


üöÄ Training LightGBM model...
   Parameters:
     - n_estimators: 1000
     - max_depth: 12
     - device: GPU
     - scale_pos_weight: inf (x1.0)
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	validation's binary_logloss: 9.99201e-16

‚úÖ Model trained in 11.2s (0.2 minutes)


In [None]:
# Predictions
y_test_proba = model.predict_proba(X_test_scaled)[:, 1]

# Accuracy
train_acc = model.score(X_train_scaled, y_train)
val_acc = model.score(X_val_scaled, y_val)
test_acc = model.score(X_test_scaled, y_test)

print("üìä Accuracy:")
print(f"   Train: {train_acc:.4f}")
print(f"   Val:   {val_acc:.4f}")
print(f"   Test:  {test_acc:.4f}")

# ROC-AUC
auc_score = None
if y_test.sum() > 0:
    auc_score = roc_auc_score(y_test, y_test_proba)
    print(f"\nüìä ROC-AUC: {auc_score:.4f}")
else:
    print("\n‚ö†Ô∏è ROC-AUC cannot be calculated (no positive cases in y_test)")
    auc_score = 0.0  # Default value for metrics


üìä Accuracy:
   Train: 1.0000
   Val:   1.0000
   Test:  1.0000

‚ö†Ô∏è ROC-AUC cannot be calculated (no positive cases in y_test)


In [71]:
# Classification Report (Default Threshold)
y_test_pred_default = model.predict(X_test_scaled)
print("üìä Classification Report (Default Threshold 0.5):")
print(classification_report(y_test, y_test_pred_default, target_names=['No Leak', 'Leak'], labels=[0, 1]))


üìä Classification Report (Default Threshold 0.5):
              precision    recall  f1-score   support

     No Leak       1.00      1.00      1.00   4212225
        Leak       0.00      0.00      0.00         0

    accuracy                           1.00   4212225
   macro avg       0.50      0.50      0.50   4212225
weighted avg       1.00      1.00      1.00   4212225



In [72]:
# F-Beta Optimization (∆∞u ti√™n recall)
print(f"üîç Optimizing threshold for F-{F_BETA} score (∆∞u ti√™n recall)...")

test_thresholds = np.arange(0.05, 0.95, 0.05)
best_threshold = 0.5
best_f_beta = 0
best_metrics = {}

for thresh in test_thresholds:
    y_pred_thresh = (y_test_proba >= thresh).astype(int)
    
    if y_pred_thresh.sum() == 0:
        continue
    
    f_beta = fbeta_score(y_test, y_pred_thresh, beta=F_BETA)
    
    if f_beta > best_f_beta:
        best_f_beta = f_beta
        best_threshold = thresh
        
        best_metrics = {
            'precision': precision_score(y_test, y_pred_thresh),
            'recall': recall_score(y_test, y_pred_thresh),
            'f1': f1_score(y_test, y_pred_thresh)
        }

print(f"\n‚úÖ Best threshold (F-{F_BETA}): {best_threshold:.2f}")
print(f"   Best F-{F_BETA}: {best_f_beta:.4f}")
print(f"   Precision: {best_metrics.get('precision', 0):.4f}")
print(f"   Recall: {best_metrics.get('recall', 0):.4f}")
print(f"   F1: {best_metrics.get('f1', 0):.4f}")


üîç Optimizing threshold for F-2.0 score (∆∞u ti√™n recall)...

‚úÖ Best threshold (F-2.0): 0.50
   Best F-2.0: 0.0000
   Precision: 0.0000
   Recall: 0.0000
   F1: 0.0000


In [None]:
# Classification Report (Best Threshold)
y_test_pred_best = (y_test_proba >= best_threshold).astype(int)
print(f"\nüìä Classification Report (Best Threshold {best_threshold:.2f}):")
print(classification_report(y_test, y_test_pred_best, target_names=['No Leak', 'Leak'], labels=[0, 1]))



üìä Classification Report (Best Threshold 0.50):
              precision    recall  f1-score   support

     No Leak       1.00      1.00      1.00   4212225
        Leak       0.00      0.00      0.00         0

    accuracy                           1.00   4212225
   macro avg       0.50      0.50      0.50   4212225
weighted avg       1.00      1.00      1.00   4212225



In [74]:
# Feature Importance
if hasattr(model, 'feature_importances_'):
    feature_importance = pd.DataFrame({
        'feature': feature_cols,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False).head(10)
    
    print("\nüìä Feature Importance (Top 10):")
    print(feature_importance.to_string(index=False))



üìä Feature Importance (Top 10):
     feature  importance
    pressure           0
        head           0
      demand           0
    hour_sin           0
    hour_cos           0
 node_id_int           0
pressure_ma3           0
pressure_ma5           0
    head_ma3           0
    head_ma5           0


In [None]:
# Prepare metrics
report_default = classification_report(y_test, y_test_pred_default, target_names=['No Leak', 'Leak'], labels=[0, 1], output_dict=True)

metrics = {
    'train_accuracy': float(train_acc),
    'val_accuracy': float(val_acc),
    'test_accuracy': float(test_acc),
    'roc_auc': float(auc_score),
    'best_threshold': float(best_threshold),
    'best_f_beta': float(best_f_beta),
    'precision_leak_default': float(report_default['Leak']['precision']),
    'recall_leak_default': float(report_default['Leak']['recall']),
    'f1_leak_default': float(report_default['Leak']['f1-score']),
    'precision_leak_best': float(best_metrics.get('precision', 0)),
    'recall_leak_best': float(best_metrics.get('recall', 0)),
    'f1_leak_best': float(best_metrics.get('f1', 0)),
}

# Save files
model_dir = Path("models")
model_dir.mkdir(exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

model_file = model_dir / f"leak_detection_final_{timestamp}.pkl"
scaler_file = model_dir / f"scaler_final_{timestamp}.pkl"
metadata_file = model_dir / f"model_metadata_final_{timestamp}.json"

with open(model_file, 'wb') as f:
    pickle.dump(model, f)
print(f"‚úÖ Model saved: {model_file}")

with open(scaler_file, 'wb') as f:
    pickle.dump(scaler, f)
print(f"‚úÖ Scaler saved: {scaler_file}")

metadata = {
    'model_type': 'lightgbm_final',
    'enhanced_features': ENHANCED_FEATURES,
    'class_weight_multiplier': CLASS_WEIGHT_MULTIPLIER,
    'f_beta': F_BETA,
    'feature_cols': feature_cols,
    'use_smote': use_smote,
    'scale_pos_weight': float(adjusted_class_weight),
    'n_scenarios_used': len(files_to_load),
    'n_train': len(train_df),
    'n_val': len(val_df),
    'n_test': len(test_df),
    'training_time_seconds': float(train_time),
    'leak_ratio': float(leak_ratio),
    **metrics
}

with open(metadata_file, 'w') as f:
    json.dump(metadata, f, indent=2)
print(f"‚úÖ Metadata saved: {metadata_file}")

# Create latest symlinks
try:
    latest_model = model_dir / "leak_detection_final_latest.pkl"
    latest_scaler = model_dir / "scaler_final_latest.pkl"
    latest_metadata = model_dir / "model_metadata_final_latest.json"
    
    for old in [latest_model, latest_scaler, latest_metadata]:
        if old.exists():
            old.unlink()
    
    shutil.copy(model_file, latest_model)
    shutil.copy(scaler_file, latest_scaler)
    shutil.copy(metadata_file, latest_metadata)
    print("‚úÖ Created symlinks to latest")
except Exception as e:
    print(f"‚ö†Ô∏è Could not create symlinks: {e}")


‚úÖ Model saved: models\leak_detection_final_20251102_054630.pkl
‚úÖ Scaler saved: models\scaler_final_20251102_054630.pkl
‚úÖ Metadata saved: models\model_metadata_final_20251102_054630.json
‚úÖ Created symlinks to latest


In [76]:
# Summary
print("\n" + "="*80)
print("TRAINING COMPLETE!")
print("="*80)
print(f"Total time: {(time.time() - start_time)/60:.1f} minutes")
print(f"\nüìä Key Metrics:")
print(f"   ROC-AUC: {metrics['roc_auc']:.4f}")
print(f"   F-{F_BETA} Score: {metrics['best_f_beta']:.4f}")
print(f"   F1-Score: {metrics['f1_leak_best']:.4f}")
print(f"   Precision: {metrics['precision_leak_best']:.4f}")
print(f"   Recall: {metrics['recall_leak_best']:.4f}")
print("\n" + "="*80)



TRAINING COMPLETE!
Total time: 2.7 minutes

üìä Key Metrics:
   ROC-AUC: 0.0000
   F-2.0 Score: 0.0000
   F1-Score: 0.0000
   Precision: 0.0000
   Recall: 0.0000

