# NIDS-ML: Phase 1 - Hyperparameter Tuning Pipeline

This notebook runs the complete pipeline for hyperparameter tuning:
1. **Preprocessing**: Clean and prepare raw data
2. **Feature Engineering**: Statistical preprocessing, scaling, and feature selection
3. **Hyperparameter Tuning**: Bayesian optimization with Optuna

**Compatible with:** Local environment and Kaggle notebooks

---
## Cell 1: Configuration & Parameters

**EDIT THIS CELL to configure your run**

In [None]:
# ----------------------------------------------------------------------------
# RUN PARAMETERS
# Edit the values in this cell, then re-run the notebook from the top.
# ----------------------------------------------------------------------------

# Estimator to tune. Accepted values: 'lightgbm', 'random_forest', 'xgboost'.
MODEL_TYPE = 'lightgbm'

# Search budget. The search ends as soon as either N_TRIALS or TIMEOUT is
# reached, whichever happens first.
N_TRIALS = 5000   # maximum number of optimization trials
TIMEOUT = 41400   # wall-clock limit in seconds (41400 s = 11.5 hours)
CV_FOLDS = 5      # cross-validation folds

# Name of the Kaggle dataset holding the raw CSVs; change it if yours differs.
KAGGLE_DATASET_NAME = 'network-intrusion-dataset'

# ----------------------------------------------------------------------------
# Echo the effective configuration so it is recorded in the cell output.
# ----------------------------------------------------------------------------
print("\n".join([
    "=" * 70,
    "PIPELINE CONFIGURATION",
    "=" * 70,
    f"Model Type:       {MODEL_TYPE}",
    f"N Trials:         {N_TRIALS}",
    f"Timeout:          {TIMEOUT}s ({TIMEOUT/3600:.2f} hours)",
    f"CV Folds:         {CV_FOLDS}",
    f"Kaggle Dataset:   {KAGGLE_DATASET_NAME}",
    "=" * 70,
]))

PIPELINE CONFIGURATION
Model Type:       lightgbm
N Trials:         100
Timeout:          60s (0.02 hours)
CV Folds:         5
Kaggle Dataset:   network-intrusion-dataset


---
## Cell 2: Environment Detection & Path Setup

In [2]:
import sys
import os
from pathlib import Path

# ============================================================================
# DETECT ENVIRONMENT
# ============================================================================

def detect_environment():
    """Return "kaggle" when the /kaggle/input directory exists, else "local"."""
    on_kaggle = Path("/kaggle/input").exists()
    return "kaggle" if on_kaggle else "local"

# Resolve the runtime once; the path setup and packaging cells branch on ENV.
ENV = detect_environment()

print("=" * 70, "ENVIRONMENT DETECTION", "=" * 70, sep="\n")
print(f"Detected Environment: {ENV.upper()}")

# ============================================================================
# SETUP PATHS
# ============================================================================

if ENV == "kaggle":
    # Kaggle layout: raw CSVs come from the attached dataset under
    # /kaggle/input, while the project code lives in /kaggle/working.
    WORKING_DIR = Path("/kaggle/working")
    RAW_DATA_DIR = Path(f"/kaggle/input/{KAGGLE_DATASET_NAME}")

    # The project package is expected directly inside the working directory;
    # fetch it from GitHub when it is not there yet.
    SRC_DIR = WORKING_DIR / "src"

    if not SRC_DIR.exists():
        print("\n⚠️  'src' directory not found. Downloading from GitHub...")
        import subprocess
        import shutil

        temp_repo = WORKING_DIR / "temp_repo"
        # A checkout left behind by an interrupted run would make `git clone`
        # refuse to write into the non-empty destination, so clear it first.
        shutil.rmtree(temp_repo, ignore_errors=True)

        try:
            # Shallow clone into a scratch directory...
            subprocess.run(
                ["git", "clone", "--depth", "1",
                 "https://github.com/Riiccardob/NIDS-ML-SSR2.git",
                 str(temp_repo)],
                capture_output=True,
                text=True,
                check=True
            )

            # ...and keep only the src/ package.
            shutil.copytree(temp_repo / "src", SRC_DIR)

            print("✓ src/ directory downloaded successfully")
        except subprocess.CalledProcessError as e:
            print(f"❌ ERROR downloading repository: {e.stderr}")
            raise
        except Exception as e:
            print(f"❌ ERROR setting up src/: {e}")
            # SRC_DIR did not exist before this attempt; remove a half-copied
            # tree so the next run retries instead of skipping the download.
            shutil.rmtree(SRC_DIR, ignore_errors=True)
            raise
        finally:
            # The scratch checkout is never needed afterwards, whether the
            # download succeeded or failed.
            shutil.rmtree(temp_repo, ignore_errors=True)

    # src/ sits directly in the working directory, so that is the import root.
    REPO_ROOT = WORKING_DIR

else:
    # Local run: the notebook is assumed to be started either from the project
    # root or from its notebooks/ subdirectory.
    CURRENT_DIR = Path.cwd()
    REPO_ROOT = CURRENT_DIR.parent if CURRENT_DIR.name == "notebooks" else CURRENT_DIR

    RAW_DATA_DIR = REPO_ROOT / "data" / "raw"

# Common to both environments: make `import src` resolve against REPO_ROOT and
# run from there so the relative paths used in later cells are anchored on it.
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))

os.chdir(REPO_ROOT)

# ============================================================================
# VERIFY SETUP
# ============================================================================

print(f"\nRepository Root:  {REPO_ROOT}")
print(f"Raw Data Path:    {RAW_DATA_DIR}")
print(f"Working Dir:      {os.getcwd()}")

# Fail fast when the project package cannot be imported: every later cell
# imports from it. The first sys.path entries are printed to help diagnose.
try:
    import src
    print(f"\n✓ 'src' module found at: {src.__path__[0]}")
except ImportError as e:
    print("\n❌ ERROR: Cannot import 'src' module")
    print(f"   {e}")
    print("\n   Current sys.path:")
    for p in sys.path[:5]:
        print(f"     - {p}")
    raise

# Missing raw data is reported as a warning only; this cell does not abort.
if not RAW_DATA_DIR.exists():
    print(f"\n⚠️  WARNING: Raw data directory not found: {RAW_DATA_DIR}")
    if ENV == "kaggle":
        print(f"   Make sure the dataset '{KAGGLE_DATASET_NAME}' is attached to this notebook.")
    else:
        print(f"   Make sure CSV files are in: {RAW_DATA_DIR}")
else:
    csv_files = list(RAW_DATA_DIR.glob("*.csv"))
    if csv_files:
        print(f"\n✓ Found {len(csv_files)} CSV file(s) in raw data directory")
    else:
        # An existing but empty directory must not be reported as a success.
        print(f"\n⚠️  WARNING: No CSV files found in: {RAW_DATA_DIR}")

print("=" * 70)

ENVIRONMENT DETECTION
Detected Environment: LOCAL

Repository Root:  /home/enea/Desktop/NIDS-ML-SSR2
Raw Data Path:    /home/enea/Desktop/NIDS-ML-SSR2/data/raw
Working Dir:      /home/enea/Desktop/NIDS-ML-SSR2

✓ 'src' module found at: /home/enea/Desktop/NIDS-ML-SSR2/src

✓ Found 8 CSV file(s) in raw data directory


---
## Cell 3: Import Required Modules

In [3]:
# Standard library imports
import sys
import subprocess
from datetime import datetime

# Project imports
from src import preprocessing
from src import feature_engineering
from src import hyperparameter_tuning
from src import utils

# Reaching this point means every project import above succeeded; list them.
print("=" * 70)
print("MODULES IMPORTED SUCCESSFULLY")
print("=" * 70)
for module_name in ("preprocessing", "feature_engineering", "hyperparameter_tuning", "utils"):
    print(f"✓ {module_name}")
print("=" * 70)

MODULES IMPORTED SUCCESSFULLY
✓ preprocessing
✓ feature_engineering
✓ hyperparameter_tuning
✓ utils


---
## Step 1: Preprocessing

Loads raw CSV files, cleans data, encodes labels, and splits into train/val/test sets.

In [4]:
print("\n" + "#" * 70)
print("# STEP 1: PREPROCESSING")
print("#" * 70 + "\n")

start_time = datetime.now()

# The arguments for preprocessing.main() are supplied through sys.argv, as if
# the script had been launched from the command line. The kernel's own argv
# is saved here and restored afterwards so the override does not leak into
# the rest of the session.
_saved_argv = sys.argv
sys.argv = [
    'preprocessing.py',
    '--input-dir', str(RAW_DATA_DIR),
    '--n-jobs', '4'
]

try:
    preprocessing.main()
    elapsed = datetime.now() - start_time
    print(f"\n✓ Preprocessing completed in {elapsed}")
except Exception as e:
    # Report, then re-raise so the notebook stops at the failing step.
    print(f"\n❌ ERROR during preprocessing: {e}")
    raise
finally:
    sys.argv = _saved_argv

print("\n" + "=" * 70)


######################################################################
# STEP 1: PREPROCESSING
######################################################################


PREPROCESSING CIC-IDS2017

Parametri:
  Input:         /home/enea/Desktop/NIDS-ML-SSR2/data/raw
  Output:        /home/enea/Desktop/NIDS-ML-SSR2/data/processed
  Balance:       Si (ratio 2.0:1)
  Chunk size:    Disabilitato
  Split:         70/15/15
  CPU cores:     14/16

1. Caricamento CSV da /home/enea/Desktop/NIDS-ML-SSR2/data/raw...
2026-01-28 19:48:22 | INFO     | Trovati 8 file CSV


Caricamento CSV: 100%|██████████| 8/8 [00:18<00:00,  2.33s/it]

2026-01-28 19:48:40 | INFO     | Concatenazione 8 DataFrame...





2026-01-28 19:48:41 | INFO     | Dataset combinato: 2,830,743 righe, 79 colonne
2026-01-28 19:48:41 | INFO     | Memoria: 1.7 GB

2. Pulizia dati...
2026-01-28 19:48:41 | INFO     | Inizio pulizia dati...
2026-01-28 19:48:41 | INFO     | Rimosse 1 colonne identificative
2026-01-28 19:48:42 | INFO     | Rimosse 2,867 righe con valori infiniti
2026-01-28 19:49:01 | INFO     | Rimosse 594,712 righe duplicate
2026-01-28 19:49:01 | INFO     | Pulizia completata: 2,830,743 -> 2,233,164 righe

3. Encoding label...
2026-01-28 19:49:01 | INFO     | Encoding label...
2026-01-28 19:49:02 | INFO     | Classi trovate: 15
2026-01-28 19:49:02 | INFO     | Distribuzione binaria: Benign=1,896,672, Attack=336,492

   Distribuzione classi:
   - BENIGN: 1,896,672 (84.93%)
   - DoS Hulk: 172,846 (7.74%)
   - DDoS: 128,014 (5.73%)
   - DoS GoldenEye: 10,286 (0.46%)
   - FTP-Patator: 5,931 (0.27%)
   - DoS slowloris: 5,385 (0.24%)
   - DoS Slowhttptest: 5,228 (0.23%)
   - SSH-Patator: 3,219 (0.14%)
   - Port

---
## Step 2: Feature Engineering

Applies statistical preprocessing, scaling (RobustScaler), and feature selection (Random Forest Importance).

In [5]:
print("\n" + "#" * 70)
print("# STEP 2: FEATURE ENGINEERING")
print("#" * 70 + "\n")

start_time = datetime.now()

# The arguments for feature_engineering.main() are supplied through sys.argv.
# Only --n-jobs is passed; per the original note the defaults keep statistical
# preprocessing and RobustScaler enabled. The kernel's own argv is restored
# afterwards so the override does not leak into the rest of the session.
_saved_argv = sys.argv
sys.argv = [
    'feature_engineering.py',
    '--n-jobs', '4'
]

try:
    feature_engineering.main()
    elapsed = datetime.now() - start_time
    print(f"\n✓ Feature engineering completed in {elapsed}")
except Exception as e:
    # Report, then re-raise so the notebook stops at the failing step.
    print(f"\n❌ ERROR during feature engineering: {e}")
    raise
finally:
    sys.argv = _saved_argv

print("\n" + "=" * 70)


######################################################################
# STEP 2: FEATURE ENGINEERING
######################################################################


FEATURE ENGINEERING v2

Parametri:
  Statistical preprocessing: ON (DEFAULT)
    - Variance threshold:    0.0
    - Correlation threshold: 0.95
  Scaler:                    RobustScaler (DEFAULT)
  Metodo selezione:          Random Forest Importance
  Feature da selezionare:    30
  RF estimators:             100
  CPU cores:                 4/16

1. Caricamento dati preprocessati...
2026-01-28 19:49:08 | INFO     | Caricati: train=706,632, val=151,422, test=151,422
   Train: 706,632 | Val: 151,422 | Test: 151,422
2026-01-28 19:49:08 | INFO     | CPU: 4.5% | RAM: 74.8% | Disponibile: 3.4GB | Core attivi: 14/16

2. Esecuzione pipeline feature engineering...
2026-01-28 19:49:08 | INFO     | Feature iniziali: 77
2026-01-28 19:49:09 | INFO     | STATISTICAL PREPROCESSING
2026-01-28 19:49:09 | INFO     | Step 1: Removi

---
## Step 3: Hyperparameter Tuning

Runs Bayesian optimization with Optuna to find optimal hyperparameters.

**Metric:** 70% F2-Score + 30% Latency (composite score)

**Stop Condition:** Whichever comes first: N_TRIALS or TIMEOUT

In [6]:
print("\n" + "#" * 70)
print("# STEP 3: HYPERPARAMETER TUNING")
print("#" * 70 + "\n")

# Echo the budget taken from the configuration cell.
print(f"Model:     {MODEL_TYPE}")
print(f"N Trials:  {N_TRIALS}")
print(f"Timeout:   {TIMEOUT}s ({TIMEOUT/3600:.2f}h)")
print(f"CV Folds:  {CV_FOLDS}")
print("\nNote: Will stop when EITHER limit is reached\n")

start_time = datetime.now()

# The arguments for hyperparameter_tuning.main() are supplied through
# sys.argv; every value must be a string, hence the str() conversions. The
# kernel's own argv is restored afterwards so the override does not leak into
# the rest of the session.
_saved_argv = sys.argv
sys.argv = [
    'hyperparameter_tuning.py',
    '--model', MODEL_TYPE,
    '--n-trials', str(N_TRIALS),
    '--timeout', str(TIMEOUT),
    '--cv', str(CV_FOLDS),
    '--n-jobs', '4'
]

try:
    hyperparameter_tuning.main()
    elapsed = datetime.now() - start_time
    print(f"\n✓ Hyperparameter tuning completed in {elapsed}")
except Exception as e:
    # Report, then re-raise so the notebook stops at the failing step.
    print(f"\n❌ ERROR during hyperparameter tuning: {e}")
    raise
finally:
    sys.argv = _saved_argv

print("\n" + "=" * 70)


######################################################################
# STEP 3: HYPERPARAMETER TUNING
######################################################################

Model:     lightgbm
N Trials:  100
Timeout:   60s (0.02h)
CV Folds:  5

Note: Will stop when EITHER limit is reached


HYPERPARAMETER TUNING

Modello:      lightgbm
Metodo:       Bayesian Optimization (Optuna)
Metrica:      70% F2-Score + 30% Latency (composite)
Task:         binary
CV:           5
Max Latency:  1.0ms/sample
CPU:          4/16
Limiti:       100 trials OPPURE 60s (0.0h)
              (si ferma al primo raggiunto)

1. Caricamento dati...
2026-01-28 19:49:57 | INFO     | Caricati: train=706,632, val=151,422, test=151,422
2. Preparazione feature...
2026-01-28 19:49:57 | INFO     | ✓ Checksum colonne verificato: 2fd9541623be6663
2026-01-28 19:49:57 | INFO     | Caricati artifacts da /home/enea/Desktop/NIDS-ML-SSR2/artifacts
   Scaler feature alignment: 44 feature richieste
   Shape: (706632, 30)

3.

  from .autonotebook import tqdm as notebook_tqdm
Best trial: 16. Best value: 0.998877:  19%|█▉        | 19/100 [01:02<04:24,  3.27s/it, 62.13/60 seconds]


2026-01-28 19:51:00 | INFO     | Best composite score: 0.9989
2026-01-28 19:51:00 | INFO     |   - F2-Score: 0.9989
2026-01-28 19:51:00 | INFO     |   - Latency: 0.0001ms/sample
2026-01-28 19:51:00 | INFO     | Completed trials: 19
2026-01-28 19:51:00 | INFO     | Best params: {'n_estimators': 8, 'max_depth': 22, 'learning_rate': 0.16980141538927607, 'num_leaves': 297, 'subsample': 0.9909175233513504, 'colsample_bytree': 0.8832829939444796, 'min_child_samples': 27, 'reg_alpha': 0.016109046194566698, 'reg_lambda': 3.709876048497122e-07, 'class_weight': 'balanced'}

4. Salvataggio risultati...

TUNING COMPLETATO

Best composite score: 0.9989
  - F2-Score: 0.9989
  - Latency:  0.0001ms/sample

Trials completati: 19
Tempo totale: 62.1s (0.02h)

Best params:
  n_estimators: 8
  max_depth: 22
  learning_rate: 0.16980141538927607
  num_leaves: 297
  subsample: 0.9909175233513504
  colsample_bytree: 0.8832829939444796
  min_child_samples: 27
  reg_alpha: 0.016109046194566698
  reg_lambda: 3.70

---
## Pipeline Complete!

**Output Location:**
- Tuning results saved in: `tuning_results/<model_type>/`
- Processed data in: `data/processed/`
- Feature engineering artifacts in: `artifacts/`

**Next Steps:**
1. Review the tuning results JSON file in `tuning_results/<model_type>/`
2. Run the training script with the tuned parameters:
   ```bash
   python src/training/<model_type>_model.py
   ```

In [7]:
from pathlib import Path
import json

def _format_metric(value, spec=".4f", missing="N/A"):
    """Format ``value`` with ``spec``, tolerating absent or non-numeric values.

    Returns ``missing`` when ``value`` is None and ``str(value)`` when the
    format spec cannot be applied, so one missing key in the results file
    does not abort the whole summary.
    """
    if value is None:
        return missing
    try:
        return format(value, spec)
    except (TypeError, ValueError):
        return str(value)


print("\n" + "=" * 70)
print("PIPELINE SUMMARY")
print("=" * 70)

# Results for the selected model; the path is relative to the working directory.
tuning_dir = Path(f"tuning_results/{MODEL_TYPE}")

if tuning_dir.exists():
    # Newest file first, by modification time.
    json_files = sorted(tuning_dir.glob("*.json"), key=lambda x: x.stat().st_mtime, reverse=True)

    if json_files:
        latest_result = json_files[0]

        print(f"\n✓ Latest tuning result: {latest_result.name}")

        # Displaying details is best-effort: an unreadable or malformed file
        # is reported but does not fail the cell.
        try:
            with open(latest_result, 'r', encoding='utf-8') as f:
                result_data = json.load(f)

            search_config = result_data.get('search_config', {})
            best_params = result_data.get('best_params', {})

            print("\nKey Metrics:")
            print(f"  Best Composite Score: {_format_metric(result_data.get('best_score'))}")
            print(f"  Best F2-Score:        {_format_metric(result_data.get('best_f2_score'))}")
            print(f"  Best Latency:         {_format_metric(result_data.get('best_latency_ms'))} ms/sample")
            print(f"  Trials Completed:     {search_config.get('n_trials', 'N/A')}")
            print(f"  Search Time:          {_format_metric(search_config.get('search_time_seconds', 0), '.1f')}s")

            # Show at most ten parameters to keep the summary short.
            print("\nBest Parameters:")
            for param, value in list(best_params.items())[:10]:
                print(f"  {param:20}: {value}")

            if len(best_params) > 10:
                print(f"  ... and {len(best_params) - 10} more parameters")

        except Exception as e:
            print(f"  (Could not load result details: {e})")
    else:
        print(f"\n⚠️  No tuning results found in {tuning_dir}")
else:
    print(f"\n⚠️  Tuning results directory not found: {tuning_dir}")

print("\n" + "=" * 70)
print("NEXT STEP: Run training with tuned parameters")
print("=" * 70)
print("\nCommand:")
print(f"  python src/training/{MODEL_TYPE}_model.py")
print("\n" + "=" * 70)


PIPELINE SUMMARY

✓ Latest tuning result: bayesian_trials19_cv5_2026-01-28_19.51.json

Key Metrics:
  Best Composite Score: 0.9989
  Best F2-Score:        0.9989
  Best Latency:         0.0001 ms/sample
  Trials Completed:     19
  Search Time:          62.1s

Best Parameters:
  n_estimators        : 8
  max_depth           : 22
  learning_rate       : 0.16980141538927607
  num_leaves          : 297
  subsample           : 0.9909175233513504
  colsample_bytree    : 0.8832829939444796
  min_child_samples   : 27
  reg_alpha           : 0.016109046194566698
  reg_lambda          : 3.709876048497122e-07
  class_weight        : balanced

NEXT STEP: Run training with tuned parameters

Command:
  python src/training/lightgbm_model.py



---
## Package Results (Kaggle Only)

Creates a zip file with artifacts and tuning results for easy download.

In [8]:
if ENV == "kaggle":
    import zipfile
    from pathlib import Path
    from datetime import datetime

    print("\n" + "=" * 70)
    print("PACKAGING RESULTS")
    print("=" * 70)

    # Timestamped name so repeated runs do not overwrite earlier archives.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    zip_filename = f"nids_tuning_{MODEL_TYPE}_{timestamp}.zip"
    zip_path = Path("/kaggle/working") / zip_filename

    # All three sources are relative to the current working directory.
    artifacts_dir = Path("artifacts")
    tuning_dir = Path(f"tuning_results/{MODEL_TYPE}")
    processed_dir = Path("data/processed")

    # (heading, directory, glob pattern, recurse into subdirectories?)
    # Only the JSON files of data/processed are packaged, not the data itself.
    package_sources = [
        ("artifacts", artifacts_dir, "*", True),
        ("tuning results", tuning_dir, "*.json", False),
        ("data metadata", processed_dir, "*.json", False),
    ]

    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for heading, source_dir, pattern, recursive in package_sources:
            if not source_dir.exists():
                continue

            print(f"\n📦 Adding {heading}...")
            matches = source_dir.rglob(pattern) if recursive else source_dir.glob(pattern)
            for file in sorted(matches):
                if file.is_file():
                    # The globbed paths are already relative to the working
                    # directory. Calling relative_to(Path.cwd()) on them
                    # raises ValueError (relative path vs. absolute base),
                    # so the path itself is used as the archive name.
                    arcname = file.as_posix()
                    zipf.write(file, arcname)
                    print(f"   ✓ {arcname}")

    # Report the size of the finished archive.
    size_mb = zip_path.stat().st_size / (1024 * 1024)

    print("\n" + "=" * 70)
    print("✓ RESULTS PACKAGED SUCCESSFULLY")
    print("=" * 70)
    print(f"\nZip file: {zip_filename}")
    print(f"Size:     {size_mb:.2f} MB")
    print(f"Location: /kaggle/working/{zip_filename}")
    print("\nDownload this file from Kaggle output to use in local training.")
    print("=" * 70)
else:
    # Nothing to package locally: the results are already on disk.
    print("\n" + "=" * 70)
    print("PACKAGING SKIPPED (Local Environment)")
    print("=" * 70)
    print("\nResults are already in your local directories:")
    print("  - artifacts/")
    print(f"  - tuning_results/{MODEL_TYPE}/")
    print("=" * 70)


PACKAGING SKIPPED (Local Environment)

Results are already in your local directories:
  - artifacts/
  - tuning_results/lightgbm/
