# NIDS-ML Phase 1: Hyperparameter Tuning

Questo notebook esegue il tuning degli iperparametri per un algoritmo specifico.

## Parametri

- `TARGET_ALGO`: Algoritmo da ottimizzare (random_forest, xgboost, lightgbm)
- `TUNING_METHOD`: Metodo di ricerca (random, bayesian)
- `N_ITERATIONS`: Numero iterazioni/trials
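- `CV_FOLDS`: Numero di fold per la cross-validation
- `TASK`: Tipo di task passato allo script di tuning (es. `binary`)
- `TIMEOUT_HOURS`: Limite di tempo del tuning, in ore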

In [None]:
# ============================================================================
# CONFIGURATION
# ============================================================================

# Algorithm to tune: random_forest, xgboost or lightgbm.
TARGET_ALGO = 'xgboost'

# Trial budget. It probably makes sense to ignore the trial count and let
# Optuna run against the time limit (it completes as many trials as it can).
N_ITERATIONS = 5000

# Safest choice: CV=5.
# To explore the search space better: CV=3 (more trials in the same time).
CV_FOLDS = 5
TASK = 'binary'
TIMEOUT_HOURS = 11
MAX_RUNTIME_HOURS = 11.5

# Echo the main settings so they are recorded next to the run output.
print(
    f"Target: {TARGET_ALGO}",
    f"Iterations: {N_ITERATIONS}",
    f"CV: {CV_FOLDS}",
    sep="\n",
)

Target: xgboost
Iterations: None
CV: 5


In [2]:
# ============================================================================
# ENVIRONMENT SETUP
# ============================================================================

import os
import sys
from pathlib import Path

# The presence of /kaggle/input is what identifies a Kaggle session.
ENV = "kaggle" if Path("/kaggle/input").exists() else "local"

if ENV == "kaggle":
    PROJECT_ROOT = Path("/kaggle/working")
    DATA_INPUT = Path("/kaggle/input")
else:
    # Use the current directory when it contains src/, otherwise fall back
    # to its parent.
    PROJECT_ROOT = Path.cwd()
    PROJECT_ROOT = PROJECT_ROOT if (PROJECT_ROOT / "src").exists() else PROJECT_ROOT.parent
    DATA_INPUT = PROJECT_ROOT / "data" / "raw"

print(f"Ambiente: {ENV}")
print(f"Project root: {PROJECT_ROOT}")

# Put the project root first on the import path and make it the working
# directory, so the relative paths used by later cells resolve against it.
sys.path.insert(0, str(PROJECT_ROOT))
os.chdir(PROJECT_ROOT)

Ambiente: local
Project root: /home/enea/Desktop/NIDS-ML-SSR2


In [3]:
# ============================================================================
# CLONE REPOSITORY
# ============================================================================

REPO_URL = "https://github.com/Riiccardob/NIDS-ML-SSR2"

if ENV == "kaggle":
    print(f"Cloning {REPO_URL}...")
    !rm -rf temp_repo 2>/dev/null
    !git clone --depth 1 {REPO_URL} temp_repo 2>/dev/null
    !cp -r temp_repo/* {PROJECT_ROOT}/ 2>/dev/null
    !rm -rf temp_repo
    print("Done.")
    
    if (PROJECT_ROOT / "requirements.txt").exists():
        !pip install -q -r {PROJECT_ROOT}/requirements.txt 2>/dev/null
        !pip install -q optuna 2>/dev/null

for d in ["data/raw", "data/processed", "artifacts", "tuning_results", "logs"]:
    (PROJECT_ROOT / d).mkdir(parents=True, exist_ok=True)

In [4]:
# ============================================================================
# COPY CSV DATASET
# ============================================================================

if ENV == "kaggle":
    import shutil

    name_hints = ["cicids2017", "cic-ids", "ids2017", "network-intrusion"]

    for candidate in DATA_INPUT.iterdir():
        lowered = candidate.name.lower()
        # Entries whose name mentions "pcap" are never used as the CSV source.
        if "pcap" in lowered:
            continue
        if not any(hint in lowered for hint in name_hints):
            continue
        csv_files = list(candidate.glob("**/*.csv"))
        if not csv_files:
            continue

        print(f"Dataset CSV: {candidate.name}")
        for csv_path in csv_files:
            target = PROJECT_ROOT / "data" / "raw" / csv_path.name
            # Files already present in data/raw are left untouched.
            if not target.exists():
                shutil.copy(csv_path, target)
        # Only the first matching dataset is copied.
        break

available_csvs = list((PROJECT_ROOT / 'data' / 'raw').glob('*.csv'))
print(f"CSV disponibili: {len(available_csvs)}")

CSV disponibili: 8


In [5]:
# ============================================================================
# PREPROCESSING
# ============================================================================

processed_test = PROJECT_ROOT / "data" / "processed" / "test.parquet"

if not processed_test.exists():
    print("Preprocessing...")
    !python src/preprocessing.py --balance-ratio 2.0 --n-jobs 4
else:
    print("Dati gia processati.")

Dati gia processati.


In [6]:
# ============================================================================
# FEATURE ENGINEERING
# ============================================================================

if not (PROJECT_ROOT / "artifacts" / "scaler.pkl").exists():
    print("Feature engineering...")
    !python src/feature_engineering.py --n-features 30 --n-jobs 4
else:
    print("Artifacts gia presenti.")

Artifacts gia presenti.


In [11]:
# ============================================================================
# HYPERPARAMETER TUNING
# ============================================================================

timeout_seconds = int(TIMEOUT_HOURS * 3600)


!python src/hyperparameter_tuning.py \
    --model {TARGET_ALGO} \
    #--n-trials {N_ITERATIONS} \
    --cv {CV_FOLDS} \
    --task {TASK} \
    --timeout {timeout_seconds} \
    --n-jobs 4


HYPERPARAMETER TUNING

Modello:      xgboost
Metrica:      70% F2-Score + 30% Latency (composite)
Task:         binary
CV:           5
Max Latency:  1.0ms/sample
CPU:          14/16
N trials:     100

1. Caricamento dati...
2026-01-25 17:10:39 | INFO     | Caricati: train=706,632, val=151,422, test=151,422
2. Preparazione feature...
2026-01-25 17:10:39 | INFO     | Caricati artifacts da /home/enea/Desktop/NIDS-ML-SSR2/artifacts
   Shape: (706632, 30)
   NOTA: Misura latency durante CV, rallenta il processo
2026-01-25 17:10:39 | INFO     | Bayesian Optimization (Optuna): 100 trials, cv=5
2026-01-25 17:10:39 | INFO     | Metrica: 70% F2-Score + 30% Latency
2026-01-25 17:10:39 | INFO     | Max latency constraint: 1.0ms/sample
[33m[W 2026-01-25 17:11:00,892][0m Trial 0 failed with parameters: {'n_estimators': 1935, 'max_depth': 20, 'learning_rate': 0.06504856968981275, 'subsample': 0.7993292420985183, 'colsample_bytree': 0.5780093202212182, 'min_child_weight': 2, 'gamma': 0.290418060840

In [8]:
# ============================================================================
# VERIFY OUTPUT
# ============================================================================

import json

tuning_file = PROJECT_ROOT / "tuning_results" / f"{TARGET_ALGO}_best.json"

if not tuning_file.exists():
    print("ERRORE: File tuning non trovato!")
else:
    with tuning_file.open() as fh:
        data = json.load(fh)

    # Summary of the best configuration stored in the JSON file.
    print("\n" + "="*60)
    print("TUNING COMPLETATO")
    print("="*60)
    print(f"\nModello: {data['model_type']}")
    print(f"Metodo: {data['tuning_method']}")
    print(f"Best score: {data['best_score']:.4f}")
    print(f"\nBest params:")
    for param_name, param_value in data['best_params'].items():
        print(f"  {param_name}: {param_value}")
    print(f"\nFile salvato: {tuning_file}")

ERRORE: File tuning non trovato!


In [9]:
# ============================================================================
# SAVE OUTPUT FOR PHASE 2
# ============================================================================

output_dir = PROJECT_ROOT / "phase1_output"
output_dir.mkdir(exist_ok=True)

# Nothing is copied (and nothing is printed) when the tuning result is missing.
if tuning_file.exists():
    import shutil

    destination = output_dir / f"{TARGET_ALGO}_best.json"
    shutil.copy(tuning_file, destination)
    print(f"Output copiato in: {output_dir}")
    print(f"\nProssimo step: Eseguire phase2_training.ipynb")