# NIDS-ML: Pipeline Completa con Multi-Training e Versionamento

## Modalità Disponibili

| MODE | Cosa fa | Quando usarla |
|------|---------|---------------|
| `'all'` | Pull + Preproc + Training + Eval + Compare + Sniff | Training completo da zero |
| `'training'` | Pull + Preproc + Training | Solo training, poi push manuale |
| `'evaluation'` | Pull + Preproc + Eval + Compare | Valutare modelli già su GitHub |
| `'sniffing'` | Pull + Preproc + Eval + Compare + Sniff | Testare e confrontare su PCAP |

## Nuove Funzionalità

- **Multi-Training**: Esegui più configurazioni dello stesso algoritmo in una run
- **Versionamento**: Ogni training salvato con ID univoco (es. `xgboost/cv5_iter100_gpu`)
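- **Timeout 12h**: Salta i training non ancora avviati quando ci si avvicina al limite di 12h di Kaggle (soglia `MAX_RUNTIME_HOURS`), per non perdere i risultati già completati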
- **Compare Tutte Versioni**: Confronta TUTTE le versioni e trova il best assoluto
- **Grafico Plateau**: Visualizza quando ulteriore training non migliora

## Parametri Importanti

- `TRAINING_CONFIGS`: Lista di configurazioni multi-training
- `MODELS_TO_EVALUATE`: Lista modelli da valutare (default: tutti quelli disponibili)
- `MODELS_TO_SNIFF`: Lista modelli per test PCAP con confronto

In [None]:
# ============================================================================
# MAIN CONFIGURATION
# ============================================================================

# Pipeline mode; one of 'all', 'training', 'evaluation', 'sniffing':
# - 'all':        Pull + Preproc + Training + Eval + Compare + Sniff
# - 'training':   Pull + Preproc + Training
# - 'evaluation': Pull + Preproc + Eval + Compare
# - 'sniffing':   Pull + Preproc + Eval + Compare + Sniff (uses models from GitHub)
MODE = 'evaluation'

# Task name forwarded to the training / evaluation entry points
TASK = 'binary'

# ============================================================================
# MULTI-TRAINING CONFIGURATION
# ============================================================================
# Configurations executed one after the other.
# Each training is stored under a unique ID, e.g. models/xgboost/cv5_iter100_gpu/
#
# Entry format: {'model': type, 'n_iter': N, 'cv': K, 'gpu': bool}
#
TRAINING_CONFIGS = [
    # XGBoost - increasing effort
    {'model': 'xgboost', 'n_iter': 10, 'cv': 2, 'gpu': True},
    {'model': 'xgboost', 'n_iter': 20, 'cv': 3, 'gpu': True},
    {'model': 'xgboost', 'n_iter': 30, 'cv': 3, 'gpu': True},
    {'model': 'xgboost', 'n_iter': 50, 'cv': 3, 'gpu': True},
    {'model': 'xgboost', 'n_iter': 50, 'cv': 5, 'gpu': True},
    # {'model': 'xgboost', 'n_iter': 100, 'cv': 5, 'gpu': True},  # Long training

    # LightGBM - increasing effort
    # {'model': 'lightgbm', 'n_iter': 20, 'cv': 3, 'gpu': False},
    # {'model': 'lightgbm', 'n_iter': 50, 'cv': 5, 'gpu': False},

    # Random Forest - optional (very slow)
    # {'model': 'random_forest', 'n_iter': 20, 'cv': 3, 'gpu': False},
]

# ============================================================================
# MODELS FOR EVALUATION/SNIFFING (when no training is run)
# ============================================================================
# Model types to evaluate (evaluation and compare steps).
# None = every available model
MODELS_TO_EVALUATE = None  # or e.g.: ['xgboost', 'lightgbm']

# Model types to test on PCAP files (for comparison).
# None = best_model only, list = compare all listed models
MODELS_TO_SNIFF = None
# MODELS_TO_SNIFF = ['xgboost', 'lightgbm']  # Compare these two

# ============================================================================
# GENERAL PARAMETERS
# ============================================================================
N_FEATURES = 30
BALANCE_RATIO = 2.0

# ============================================================================
# COMPARE PARAMETERS
# ============================================================================
MAX_FPR = 0.02          # 2% maximum False Positive Rate
MAX_LATENCY_MS = 2.0    # 2ms maximum latency per sample

# ============================================================================
# SNIFFING PARAMETERS
# ============================================================================
SNIFF_THRESHOLD = 0.5
SNIFF_MIN_PACKETS = 2
SNIFF_TIMEOUT = 120

# ============================================================================
# KAGGLE TIMEOUT (crash protection)
# ============================================================================
# Kaggle enforces a 12h limit; stay below it so results can still be saved.
MAX_RUNTIME_HOURS = 11.5  # Leaves 30 min for eval/compare/push

# ============================================================================
# PRINT CONFIGURATION
# ============================================================================
_CONFIG_RULE = "=" * 60
print(_CONFIG_RULE)
print("CONFIGURAZIONE")
print(_CONFIG_RULE)
print(f"MODE:              {MODE}")
print(f"Task:              {TASK}")

# Training configs are only relevant to the modes that actually train.
if MODE in ('all', 'training'):
    print(f"\nTraining configs ({len(TRAINING_CONFIGS)}):")
    for position, entry in enumerate(TRAINING_CONFIGS, start=1):
        gpu_tag = " [GPU]" if entry.get('gpu') else ""
        print(f"  {position}. {entry['model']} cv={entry['cv']} n_iter={entry['n_iter']}{gpu_tag}")

if MODE in ('all', 'evaluation', 'sniffing'):
    print(f"\nModelli eval:      {MODELS_TO_EVALUATE or 'tutti disponibili'}")
if MODE in ('all', 'sniffing'):
    print(f"Modelli sniff:     {MODELS_TO_SNIFF or 'solo best_model'}")

print(f"\nCompare:           FPR<={MAX_FPR*100}%, Latency<={MAX_LATENCY_MS}ms")
print(f"Max runtime:       {MAX_RUNTIME_HOURS}h")

---
## 1. Setup Ambiente

In [None]:
import os
import sys
import json
import shutil
from pathlib import Path
from datetime import datetime, timedelta

# ============================================================================
# TIMEOUT UTILITIES
# ============================================================================
# Wall-clock instant captured when this cell runs; check_timeout() measures
# the elapsed runtime from this value.
NOTEBOOK_START_TIME = datetime.now()

def check_timeout(margin_minutes=30):
    """
    Report whether the notebook's runtime budget has been used up.

    The budget is ``MAX_RUNTIME_HOURS`` minus ``margin_minutes``, measured
    from ``NOTEBOOK_START_TIME``.

    Args:
        margin_minutes: Safety margin, in minutes, subtracted from the budget.

    Returns:
        Tuple (is_timeout: bool, remaining: timedelta). Once the elapsed time
        exceeds the budget the result is ``(True, timedelta(0))``; otherwise
        it is ``(False, time_left)``.
    """
    budget = timedelta(hours=MAX_RUNTIME_HOURS) - timedelta(minutes=margin_minutes)
    time_left = budget - (datetime.now() - NOTEBOOK_START_TIME)

    # A negative remainder means the elapsed time already exceeds the budget.
    if time_left < timedelta(0):
        return True, timedelta(0)
    return False, time_left

def format_timedelta(td):
    """
    Render a timedelta as a short human-readable string.

    The two most significant units are shown: ``"<h>h <m>m"`` when at least
    one hour has passed, ``"<m>m <s>s"`` when at least one minute has passed,
    otherwise ``"<s>s"``. Negative durations are rendered as ``"0s"``.
    """
    whole_seconds = int(td.total_seconds())
    if whole_seconds < 0:
        return "0s"

    # Peel off seconds first, then split the remaining minutes into hours.
    whole_minutes, secs = divmod(whole_seconds, 60)
    hrs, mins = divmod(whole_minutes, 60)

    if hrs > 0:
        return f"{hrs}h {mins}m"
    if mins > 0:
        return f"{mins}m {secs}s"
    return f"{secs}s"

# Show when the notebook started and the absolute deadline implied by
# MAX_RUNTIME_HOURS.
_notebook_deadline = NOTEBOOK_START_TIME + timedelta(hours=MAX_RUNTIME_HOURS)
print(f"Notebook started:  {NOTEBOOK_START_TIME:%Y-%m-%d %H:%M:%S}")
print(f"Max runtime:       {MAX_RUNTIME_HOURS}h")
print(f"Deadline:          {_notebook_deadline:%H:%M:%S}")

In [None]:
# ============================================================================
# DETECT ENVIRONMENT
# ============================================================================
# The environment is chosen from well-known marker directories; ENV,
# PROJECT_ROOT and DATA_INPUT are read by the cells that follow.
if Path("/kaggle/input").exists():
    ENV = "kaggle"
    PROJECT_ROOT = Path("/kaggle/working")
    DATA_INPUT = Path("/kaggle/input")
elif Path("/content").exists():
    ENV = "colab"
    PROJECT_ROOT = Path("/content/NIDS-ML")
    DATA_INPUT = Path("/content")
else:
    ENV = "local"
    PROJECT_ROOT = Path.cwd()
    # No "src" folder in the working directory: fall back to its parent
    # (presumably the notebook is being run from a subfolder of the repo).
    if not (PROJECT_ROOT / "src").exists():
        PROJECT_ROOT = PROJECT_ROOT.parent
    DATA_INPUT = PROJECT_ROOT / "data" / "raw"

print(f"Ambiente: {ENV}")
print(f"Project root: {PROJECT_ROOT}")

# The project root may not exist yet (e.g. /content/NIDS-ML on a fresh Colab
# runtime, where the repository is only cloned by a later cell). Create it so
# that os.chdir() below does not raise FileNotFoundError.
PROJECT_ROOT.mkdir(parents=True, exist_ok=True)

sys.path.insert(0, str(PROJECT_ROOT))
os.chdir(PROJECT_ROOT)

In [None]:
# Clone the repository from GitHub (remote environments only) and install
# its requirements.
REPO_URL = "https://github.com/Riiccardob/NIDS-ML-SSR2"

if ENV in ["kaggle", "colab"]:
    print(f"Cloning {REPO_URL}...")
    # Shallow-clone into a scratch folder, copy its contents into the project
    # root, then remove the scratch folder. stderr is discarded
    # (2>/dev/null), so a failed clone or copy is not reported here.
    !rm -rf temp_repo 2>/dev/null
    !git clone --depth 1 {REPO_URL} temp_repo 2>/dev/null
    !cp -r temp_repo/* {PROJECT_ROOT}/ 2>/dev/null
    !rm -rf temp_repo
    print("Done.")
    
    # Install Python dependencies only when the repo ships a requirements file.
    if (PROJECT_ROOT / "requirements.txt").exists():
        !pip install -q -r {PROJECT_ROOT}/requirements.txt 2>/dev/null

# Make sure the working directories exist
for subdir in ("data/raw", "data/processed", "artifacts", "models", "logs", "reports"):
    (PROJECT_ROOT / subdir).mkdir(parents=True, exist_ok=True)

# List the models that came with the repository (including versions)
model_filename = f"model_{TASK}.pkl"
models_root = PROJECT_ROOT / "models"

print("\nModelli da GitHub:")
for name in ('random_forest', 'xgboost', 'lightgbm'):
    model_dir = models_root / name
    found = []
    if model_dir.exists():
        # Every real (non-symlink) subfolder holding a model file is a version
        found = [
            entry.name
            for entry in model_dir.iterdir()
            if entry.is_dir()
            and not entry.is_symlink()
            and (entry / model_filename).exists()
        ]
        # A model stored directly in the type folder is listed first
        if (model_dir / model_filename).exists():
            found.insert(0, "[default]")
    print(f"  {name}: {', '.join(found) if found else '-'}")

if (models_root / "best_model" / model_filename).exists():
    print("  best_model: OK")

In [None]:
# Copy the CSV dataset into data/raw (remote environments only)
raw_dir = PROJECT_ROOT / "data" / "raw"

if ENV in ("kaggle", "colab"):
    name_hints = ("cicids2017", "cic-ids", "ids2017", "network-intrusion")
    for entry in DATA_INPUT.iterdir():
        lowered = entry.name.lower()
        # PCAP datasets are handled by the sniffing section; skip them and
        # anything whose name does not look like the CSV dataset.
        if "pcap" in lowered or not any(hint in lowered for hint in name_hints):
            continue

        csv_files = list(entry.glob("**/*.csv"))
        if not csv_files:
            continue

        print(f"Dataset CSV: {entry.name}")
        for source_csv in csv_files:
            target = raw_dir / source_csv.name
            # Never overwrite a file that is already in place
            if not target.exists():
                shutil.copy(source_csv, target)
        # Only the first matching dataset is used
        break

print(f"CSV disponibili: {len(list(raw_dir.glob('*.csv')))}")

---
## 2. Preprocessing & Feature Engineering

*Eseguito per MODE: all, training, evaluation, sniffing*

In [None]:
%%time
if MODE not in ('all', 'training', 'evaluation', 'sniffing'):
    print(f"Skip (MODE={MODE})")
else:
    from src.preprocessing import main as preprocessing_main, load_processed_data

    # The processed test split doubles as the "already preprocessed" marker.
    processed_test = PROJECT_ROOT / "data" / "processed" / "test.parquet"
    if processed_test.exists():
        print("Dati gia' processati.")
    else:
        print("Preprocessing...")
        # The entry point is driven through sys.argv, like a CLI invocation
        sys.argv = ['preprocessing.py', '--balance-ratio', str(BALANCE_RATIO), '--n-jobs', '4']
        preprocessing_main()

    train, val, test, mappings = load_processed_data()
    print(f"Train: {len(train):,} | Val: {len(val):,} | Test: {len(test):,}")

In [None]:
%%time
if MODE not in ('all', 'training', 'evaluation', 'sniffing'):
    print(f"Skip (MODE={MODE})")
else:
    from src.feature_engineering import main as fe_main, load_artifacts

    # The scaler file doubles as the "artifacts already built" marker.
    if (PROJECT_ROOT / "artifacts" / "scaler.pkl").exists():
        print("Artifacts gia' presenti.")
    else:
        print("Feature engineering...")
        # The entry point is driven through sys.argv, like a CLI invocation
        sys.argv = ['feature_engineering.py', '--n-features', str(N_FEATURES), '--n-jobs', '4']
        fe_main()

    scaler, selected_features, _, _ = load_artifacts()
    print(f"Feature: {len(selected_features)}")

---
## 3. Multi-Training con Versionamento

*Eseguito per MODE: all, training*

Ogni training viene salvato con ID univoco basato sui parametri:
- `models/xgboost/cv3_iter20_gpu/model_binary.pkl`
- `models/xgboost/cv5_iter100_gpu/model_binary.pkl`
- `models/lightgbm/cv5_iter50/model_binary.pkl`

Se il timeout si avvicina, i training non ancora avviati vengono saltati (un training già in corso non viene interrotto), così da preservare i risultati già completati.

In [None]:
# Models handled in THIS session, as (model_type, version_id) pairs
TRAINED_THIS_SESSION = []
# Configs that were not trained (timeout, error or unknown model type)
SKIPPED_CONFIGS = []

if MODE not in ('all', 'training'):
    print(f"Skip training (MODE={MODE})")
else:
    print(f"Training configs da eseguire: {len(TRAINING_CONFIGS)}")
    for position, entry in enumerate(TRAINING_CONFIGS, start=1):
        gpu_tag = " [GPU]" if entry.get('gpu') else ""
        print(f"  {position}. {entry['model']} cv={entry['cv']} n_iter={entry['n_iter']}{gpu_tag}")

In [None]:
def check_version_exists(model_type, n_iter, cv, gpu=False):
    """
    Check whether a model file already exists for the given configuration.

    Args:
        model_type: Model type name passed to ``get_version_dir``.
        n_iter: Number of search iterations of the configuration.
        cv: Number of cross-validation folds of the configuration.
        gpu: Whether the configuration uses the GPU.

    Returns:
        Tuple (exists: bool, version_id: str, version_dir: Path)
    """
    from src.model_versioning import generate_version_id, get_version_dir

    # The GPU flag is only forwarded as an extra parameter when it is set.
    extras = {'gpu': gpu} if gpu else None

    version_id = generate_version_id(n_iter, cv, extras)
    # create=False: this is a lookup only.
    version_dir = get_version_dir(model_type, n_iter, cv, extras, create=False)

    return (version_dir / f"model_{TASK}.pkl").exists(), version_id, version_dir


def run_single_training(config, config_idx, total_configs):
    """
    Run a single training with the given configuration.

    A configuration whose model file already exists is not retrained. A
    configuration is skipped (and appended to ``SKIPPED_CONFIGS``) when the
    time budget is exhausted, the model type is unknown, or training fails.

    Args:
        config: Dict with keys 'model', 'n_iter', 'cv' and optionally 'gpu'.
        config_idx: 1-based position of this config, used for progress output.
        total_configs: Total number of configs, used for progress output.

    Returns:
        bool: True if completed (or already existing), False on error/timeout
    """
    model_type = config['model']
    n_iter = config['n_iter']
    cv = config['cv']
    use_gpu = config.get('gpu', False)

    # Check whether this version already exists
    exists, version_id, _ = check_version_exists(model_type, n_iter, cv, use_gpu)

    if exists:
        print(f"\n[{config_idx}/{total_configs}] {model_type}/{version_id}")
        print(f"  -> Già esistente, skip")
        # NOTE(review): pre-existing versions are recorded here as well, so
        # code that tests membership in TRAINED_THIS_SESSION treats them as
        # trained in this session - confirm this is intended.
        TRAINED_THIS_SESSION.append((model_type, version_id))
        return True

    # Header
    print(f"\n{'#' * 70}")
    print(f"# TRAINING {config_idx}/{total_configs}: {model_type.upper()}")
    print(f"# Config: cv={cv}, n_iter={n_iter}, gpu={use_gpu}")
    print(f"# Version ID: {version_id}")
    print(f"{'#' * 70}")

    # Check timeout (60 min margin reserved for the training itself)
    is_timeout, remaining = check_timeout(margin_minutes=60)
    if is_timeout:
        print(f"\n  TIMEOUT: Tempo insufficiente, training skippato")
        SKIPPED_CONFIGS.append(config)
        return False

    print(f"  Tempo rimanente: {format_timedelta(remaining)}")

    start_time = datetime.now()
    # train_main() is driven through sys.argv (set below); remember the
    # current value so it can be restored whatever happens.
    saved_argv = sys.argv

    try:
        # Build the CLI-style arguments for the selected trainer
        if model_type == 'xgboost':
            from src.training.xgboost_model import main as train_main
            args = ['xgb.py', '--task', TASK, '--n-iter', str(n_iter), '--cv', str(cv)]
            if use_gpu:
                args.append('--gpu')
            else:
                args.extend(['--n-jobs', '4'])

        elif model_type == 'lightgbm':
            from src.training.lightgbm_model import main as train_main
            args = ['lgbm.py', '--task', TASK, '--n-iter', str(n_iter), '--cv', str(cv), '--n-jobs', '4']

        elif model_type == 'random_forest':
            from src.training.random_forest import main as train_main
            args = ['rf.py', '--task', TASK, '--n-iter', str(n_iter), '--cv', str(cv), '--n-jobs', '4']

        else:
            print(f"  ERRORE: Tipo modello sconosciuto: {model_type}")
            SKIPPED_CONFIGS.append(config)
            return False

        # Run the training
        sys.argv = args
        try:
            train_main()
        except SystemExit as exc:
            # SystemExit is not an Exception subclass, so it would bypass the
            # handler below and abort the whole multi-training run. A clean
            # exit (code None/0) counts as success; anything else is turned
            # into a regular error.
            if exc.code not in (None, 0):
                raise RuntimeError(f"training script exited with code {exc.code}") from exc

        elapsed = datetime.now() - start_time
        print(f"\n  Training completato in {format_timedelta(elapsed)}")

        TRAINED_THIS_SESSION.append((model_type, version_id))
        return True

    except Exception as e:
        print(f"\n  ERRORE durante training: {e}")
        import traceback
        traceback.print_exc()
        SKIPPED_CONFIGS.append(config)
        return False

    finally:
        # Do not leak the trainer's argument list into later cells
        sys.argv = saved_argv

In [None]:
%%time
# ============================================================================
# MULTI-TRAINING EXECUTION
# ============================================================================
if MODE not in ('all', 'training'):
    print(f"Skip training (MODE={MODE})")
else:
    total = len(TRAINING_CONFIGS)
    completed = 0
    bang_rule = '!' * 70
    eq_rule = '=' * 70

    for offset, config in enumerate(TRAINING_CONFIGS):
        # Stop launching new trainings once the time budget is used up
        out_of_time, _time_left = check_timeout(margin_minutes=45)
        if out_of_time:
            print(f"\n{bang_rule}")
            print("! TIMEOUT RAGGIUNTO - Training rimanenti skippati")
            print(bang_rule)
            # The current config and everything after it count as skipped
            SKIPPED_CONFIGS.extend(TRAINING_CONFIGS[offset:])
            break

        # Run the training (progress numbering is 1-based)
        if run_single_training(config, offset + 1, total):
            completed += 1

    # Summary
    print(f"\n{eq_rule}")
    print("TRAINING SUMMARY")
    print(eq_rule)
    print(f"Completati:  {completed}/{total}")
    print(f"Skippati:    {len(SKIPPED_CONFIGS)}")

    if TRAINED_THIS_SESSION:
        print("\nVersioni create/confermate:")
        for model_type, version_id in TRAINED_THIS_SESSION:
            print(f"  - {model_type}/{version_id}")

    if SKIPPED_CONFIGS:
        print("\nConfigs skippati:")
        for skipped in SKIPPED_CONFIGS:
            print(f"  - {skipped['model']} cv={skipped['cv']} n_iter={skipped['n_iter']}")

In [None]:
# Overview of the available models (every version included)
_table_rule = "=" * 70
print(f"\n{_table_rule}")
print("MODELLI DISPONIBILI (tutte le versioni)")
print(_table_rule)

from src.model_versioning import list_model_versions

versions = list_model_versions(task=TASK)
if not versions:
    print("Nessun modello trovato")
else:
    print(f"\n{'Modello':<15} {'Versione':<25} {'Origine':<15} {'F1':>10}")
    print("-" * 70)

    for info in versions:
        # Versions recorded during this run are labelled differently from
        # the ones that were already present.
        session_key = (info['model_type'], info['version_id'])
        origine = "TRAINATO ORA" if session_key in TRAINED_THIS_SESSION else "GitHub"
        f1 = info.get('validation_metrics', {}).get('f1', 0)
        print(f"{info['model_type']:<15} {info['version_id']:<25} {origine:<15} {f1:>10.4f}")

---
## 4. Evaluation

*Eseguito per MODE: all, evaluation, sniffing*

In [None]:
if MODE not in ('all', 'evaluation', 'sniffing'):
    print(f"Skip evaluation (MODE={MODE})")
    versions_to_eval = []
else:
    from src.evaluation import main as evaluation_main
    from src.model_versioning import list_model_versions

    # Collect every version, then narrow down to the requested model types
    all_versions = list_model_versions(task=TASK)
    versions_to_eval = (
        [item for item in all_versions if item['model_type'] in MODELS_TO_EVALUATE]
        if MODELS_TO_EVALUATE
        else all_versions
    )

    print(f"Versioni da valutare: {len(versions_to_eval)}")
    for item in versions_to_eval:
        session_key = (item['model_type'], item['version_id'])
        origine = "TRAINATO ORA" if session_key in TRAINED_THIS_SESSION else "GitHub"
        print(f"  - {item['model_type']}/{item['version_id']} ({origine})")

In [None]:
%%time
# Run the evaluation for every selected version
if MODE in ('all', 'evaluation', 'sniffing') and versions_to_eval:
    n_versions = len(versions_to_eval)
    eval_rule = '=' * 60

    for position, entry in enumerate(versions_to_eval, start=1):
        full_id = f"{entry['model_type']}/{entry['version_id']}"
        model_path = entry['model_path']

        print(f"\n{eval_rule}")
        print(f"EVALUATION [{position}/{n_versions}]: {full_id}")
        trained_now = (entry['model_type'], entry['version_id']) in TRAINED_THIS_SESSION
        print(f"Origine modello: {'TRAINATO ORA' if trained_now else 'GitHub'}")
        print(eval_rule)

        # Stop evaluating once the time budget is used up
        out_of_time, _ = check_timeout(margin_minutes=20)
        if out_of_time:
            print("  TIMEOUT - evaluation interrotta")
            break

        # The entry point is driven through sys.argv, like a CLI invocation
        sys.argv = ['eval.py', '--model-path', str(model_path), '--task', TASK]
        try:
            evaluation_main()
        except Exception as err:
            # One failing model must not stop the remaining evaluations
            print(f"ERRORE: {err}")

---
## 5. Compare Models (Tutte le Versioni)

*Eseguito per MODE: all, evaluation, sniffing*

Confronta **TUTTE** le versioni disponibili e genera:
- Ranking per ogni algoritmo
- Grafico plateau (F1/Recall vs Training Effort)
- Selezione best_model assoluto

In [None]:
%%time
if MODE not in ('all', 'evaluation', 'sniffing'):
    print(f"Skip compare (MODE={MODE})")
else:
    from src.compare_models import main as compare_main

    print(f"Compare: FPR <= {MAX_FPR*100}%, Latency <= {MAX_LATENCY_MS}ms")
    # The entry point is driven through sys.argv, like a CLI invocation
    compare_args = ['compare.py', '--max-fpr', str(MAX_FPR), '--max-latency-ms', str(MAX_LATENCY_MS)]
    sys.argv = compare_args
    compare_main()

In [None]:
# Show the selected best model and the generated charts
best_model_dir = PROJECT_ROOT / "models" / "best_model"
best_meta_file = best_model_dir / "metadata.json"

if best_model_dir.exists() and best_meta_file.exists():
    meta = json.loads(best_meta_file.read_text())
    metrics = meta.get('metrics', {})

    print(f"\nBEST MODEL: {meta.get('best_model', 'N/A').upper()}")
    print(f"Score: {meta.get('score', 0):.4f}")
    print(f"F1: {metrics.get('f1', 0):.4f}")
    print(f"Recall: {metrics.get('recall', 0):.4f}")

    # Charts: display whichever of the expected images are present
    from IPython.display import Image, display
    chart_names = ('plateau_analysis.png', 'scorecard_comparison.png', 'algorithm_rankings.png')
    for chart_name in chart_names:
        chart_path = best_model_dir / chart_name
        if not chart_path.exists():
            continue
        print(f"\n{chart_name}:")
        display(Image(filename=str(chart_path)))

---
## 6. Test Sniffer su PCAP con CONFRONTO

*Eseguito per MODE: all, sniffing*

Questa sezione permette di **confrontare** le performance di diversi modelli sugli stessi PCAP.

In [None]:
# Look for a PCAP dataset among the input folders
pcap_files = []

if not (MODE in ('all', 'sniffing') and ENV in ("kaggle", "colab")):
    print(f"Skip ricerca PCAP (MODE={MODE} o ambiente locale)")
else:
    pcap_patterns = ("pcap", "cic-ids-2017-pcap", "cicids")

    for candidate in DATA_INPUT.iterdir():
        # Underscores and dashes are treated alike when matching names
        normalized = candidate.name.lower().replace("_", "-")
        if not any(pattern in normalized for pattern in pcap_patterns):
            continue

        captures = [*candidate.glob("**/*.pcap"), *candidate.glob("**/*.pcapng")]
        if captures:
            pcap_files = sorted(captures, key=lambda path: path.name)
            print(f"Dataset PCAP: {candidate.name}")
            # Only the first dataset that contains captures is used
            break

    if not pcap_files:
        print("Dataset PCAP non trovato.")
    else:
        print(f"\nPCAP disponibili ({len(pcap_files)}):")
        for capture in pcap_files:
            size_mb = capture.stat().st_size / (1024 ** 2)
            print(f"  - {capture.name}: {size_mb:.1f} MB")

In [None]:
# Decide which models are tested on the PCAP files.
# Result: models_to_sniff, a list of (display_name, model_path) pairs.
models_to_sniff = []

if MODE in ['all', 'sniffing'] and pcap_files:
    model_filename = f"model_{TASK}.pkl"
    models_root = PROJECT_ROOT / "models"

    if MODELS_TO_SNIFF:
        # Imported here so this cell does not depend on an import performed
        # by an earlier cell.
        from src.model_versioning import list_model_versions

        # List the available versions once instead of once per requested model
        available_versions = list_model_versions(task=TASK)

        # Explicit list: pick the best version of each requested type
        for name in MODELS_TO_SNIFF:
            type_versions = [v for v in available_versions if v['model_type'] == name]
            if type_versions:
                # Best version = highest validation F1 (missing metrics count as 0)
                best_v = max(type_versions, key=lambda x: x.get('validation_metrics', {}).get('f1', 0))
                models_to_sniff.append((f"{name}/{best_v['version_id']}", best_v['model_path']))
                continue

            # Fallback: a model stored directly in the type folder
            model_path = models_root / name / model_filename
            if model_path.exists():
                models_to_sniff.append((name, model_path))
            else:
                print(f"WARNING: {name} non trovato, skip")
    else:
        # No explicit list: use best_model only
        best_path = models_root / "best_model" / model_filename
        if best_path.exists():
            models_to_sniff.append(('best_model', best_path))
        else:
            # Fallback: first model type that has a model file
            for name in ['lightgbm', 'xgboost', 'random_forest']:
                p = models_root / name / model_filename
                if p.exists():
                    models_to_sniff.append((name, p))
                    break

    print(f"\nModelli per test PCAP:")
    for name, path in models_to_sniff:
        print(f"  - {name}")

In [None]:
%%time
# Run every selected model over every PCAP so results can be compared
all_sniff_results = {}  # {model_name: [one result per pcap]}

if not (MODE in ('all', 'sniffing') and pcap_files and models_to_sniff):
    print("Test PCAP non eseguito")
else:
    from src.sniffer import analyze_pcap_file

    hash_rule = '#' * 70

    for model_name, model_path in models_to_sniff:
        print(f"\n{hash_rule}")
        print(f"# MODELLO: {model_name.upper()}")
        print(hash_rule)

        model_results = []

        for pcap_path in pcap_files:
            print(f"\n  Testing: {pcap_path.name}...")

            # Stop processing captures for this model once time is up
            out_of_time, _ = check_timeout(margin_minutes=20)
            if out_of_time:
                print("  TIMEOUT - PCAP rimanenti skippati")
                break

            try:
                result = analyze_pcap_file(
                    pcap_path=str(pcap_path),
                    model_path=str(model_path),
                    threshold=SNIFF_THRESHOLD,
                    timeout=SNIFF_TIMEOUT,
                    min_packets=SNIFF_MIN_PACKETS,
                    verbose=False,
                    progress_interval=500000,
                    show_progress=True,
                )
                # Tag the result so it can be traced back to its model
                result['model'] = model_name
                model_results.append(result)

                rate = result.get('detection_rate', 0)
                print(f"  -> Flows: {result['flows_analyzed']:,}, Attacks: {result['attacks_detected']:,} ({rate:.1f}%)")

            except Exception as err:
                # A failing capture must not stop the remaining ones
                print(f"  ERRORE: {err}")

        all_sniff_results[model_name] = model_results

In [None]:
# COMPARISON TABLE: per-PCAP results of every model, plus totals.
# Also writes the raw results to reports/pcap_comparison.json.
if all_sniff_results:
    # A data cell renders as "{attacks:>6} / {flows:<6} ({rate:>5.1f}%)",
    # which is 24 characters wide. Header and N/A cells use the same width
    # so that the columns line up.
    col_width = 24

    print("\n" + "="*90)
    print("CONFRONTO PERFORMANCE SU PCAP")
    print("="*90)

    models = list(all_sniff_results.keys())

    # Index each model's results by PCAP name once, instead of scanning the
    # result list for every table cell. setdefault keeps the first result
    # for a given name.
    results_by_pcap = {}
    for m, results in all_sniff_results.items():
        by_name = {}
        for r in results:
            by_name.setdefault(r['pcap'], r)
        results_by_pcap[m] = by_name

    # Every PCAP seen by at least one model
    all_pcaps = sorted({name for by_name in results_by_pcap.values() for name in by_name})

    # Header
    header = f"{'PCAP':<35}" + "".join(f" | {m:^{col_width}}" for m in models)
    print(header)
    print("-" * len(header))

    # One row per PCAP, accumulating per-model totals along the way
    totals = {m: {'flows': 0, 'attacks': 0} for m in models}

    for pcap_name in all_pcaps:
        row = f"{pcap_name[:35]:<35}"
        for m in models:
            result = results_by_pcap[m].get(pcap_name)

            if result:
                attacks = result['attacks_detected']
                flows = result['flows_analyzed']
                rate = result.get('detection_rate', 0)
                row += f" | {attacks:>6} / {flows:<6} ({rate:>5.1f}%)"
                totals[m]['flows'] += flows
                totals[m]['attacks'] += attacks
            else:
                # This model has no result for this PCAP
                row += f" | {'N/A':^{col_width}}"
        print(row)

    # Totals
    print("-" * len(header))
    row = f"{'TOTALE':<35}"
    for m in models:
        t = totals[m]
        if t['flows'] > 0:
            rate = t['attacks'] / t['flows'] * 100
            row += f" | {t['attacks']:>6} / {t['flows']:<6} ({rate:>5.1f}%)"
        else:
            row += f" | {'N/A':^{col_width}}"
    print(row)

    # Save results
    comparison_data = {
        'timestamp': datetime.now().isoformat(),
        'models': models,
        'parameters': {
            'threshold': SNIFF_THRESHOLD,
            'min_packets': SNIFF_MIN_PACKETS,
            'timeout': SNIFF_TIMEOUT
        },
        'results': dict(all_sniff_results),
        'totals': totals
    }

    with open(PROJECT_ROOT / "reports" / "pcap_comparison.json", 'w') as f:
        # default=str stringifies any value json cannot serialize natively
        json.dump(comparison_data, f, indent=2, default=str)
    print(f"\nRisultati salvati in: reports/pcap_comparison.json")

---
## 7. Download Output

In [None]:
import zipfile

# Bundle artifacts, models and reports into one archive (remote runs only)
if ENV not in ("kaggle", "colab"):
    print("Locale - output nelle cartelle del progetto")
else:
    zip_path = PROJECT_ROOT / "nids_ml_output.zip"
    models_root = PROJECT_ROOT / "models"
    reports_root = PROJECT_ROOT / "reports"

    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as archive:
        # Artifacts: top-level files only
        for item in (PROJECT_ROOT / "artifacts").glob("*"):
            if item.is_file():
                archive.write(item, f"artifacts/{item.name}")

        # Models: every file below each model folder (versions included)
        for model_dir in models_root.iterdir():
            if not model_dir.is_dir():
                continue
            for item in model_dir.rglob("*"):
                if item.is_file():
                    archive.write(item, f"models/{item.relative_to(models_root)}")

        # Reports: every file, keeping the folder layout
        for item in reports_root.rglob("*"):
            if item.is_file():
                archive.write(item, f"reports/{item.relative_to(reports_root)}")

    zip_size_mb = zip_path.stat().st_size / (1024 ** 2)
    print(f"ZIP: {zip_path.name} ({zip_size_mb:.1f} MB)")
    print("\nPer usare i modelli:")
    print("  1. Scarica lo ZIP")
    print("  2. Estrai e copia models/ nel repo locale")
    print("  3. git add models/ && git commit && git push")

---
## 8. Riepilogo

In [None]:
elapsed = datetime.now() - NOTEBOOK_START_TIME
summary_rule = "=" * 70

print(summary_rule)
print("RIEPILOGO ESECUZIONE")
print(summary_rule)

print(f"\nTempo totale: {format_timedelta(elapsed)}")
print(f"Modalità: {MODE}")

# Training
if TRAINED_THIS_SESSION:
    print(f"\nTrainati in questa sessione ({len(TRAINED_THIS_SESSION)}):")
    for model_type, version_id in TRAINED_THIS_SESSION:
        print(f"  - {model_type}/{version_id}")

if SKIPPED_CONFIGS:
    print(f"\nSkippati per timeout ({len(SKIPPED_CONFIGS)}):")
    for skipped in SKIPPED_CONFIGS:
        print(f"  - {skipped['model']} cv={skipped['cv']} n_iter={skipped['n_iter']}")

# Best model
best_meta_file = PROJECT_ROOT / "models" / "best_model" / "metadata.json"
if best_meta_file.exists():
    meta = json.loads(best_meta_file.read_text())
    print(f"\nBest Model: {meta.get('best_model', 'N/A').upper()}")
    print(f"  Score: {meta.get('score', 0):.4f}")

# PCAP comparison
if all_sniff_results:
    print(f"\nTest PCAP eseguito su {len(models_to_sniff)} modelli")
    for model_name in all_sniff_results:
        model_totals = totals.get(model_name, {})
        n_flows = model_totals.get('flows', 0)
        # Models without any analyzed flow are left out of the summary
        if n_flows > 0:
            n_attacks = model_totals['attacks']
            rate = n_attacks / n_flows * 100
            print(f"  {model_name}: {n_attacks:,} attacchi su {n_flows:,} flussi ({rate:.1f}%)")

# Closing hints depend on whether this run recorded any trained model
print(f"\n{summary_rule}")
if MODE in ('all', 'training') and TRAINED_THIS_SESSION:
    closing_lines = (
        "PROSSIMI STEP:",
        "1. Scarica lo ZIP",
        "2. Estrai models/ e copia nel repo locale",
        "3. git add models/ && git commit -m 'Add trained models' && git push",
    )
else:
    closing_lines = (
        "Per usare il modello in locale:",
        "  sudo python src/sniffer.py --interface eth0 --verbose",
    )
for line in closing_lines:
    print(line)
print(summary_rule)