# NIDS-ML: Pipeline Completa

## Modalità Disponibili

| MODE | Cosa fa | Quando usarla |
|------|---------|---------------|
| `'all'` | Pull + Preproc + Training + Eval + Compare + Sniff | Training completo da zero |
| `'training'` | Pull + Preproc + Training | Solo training, poi push manuale |
| `'evaluation'` | Pull + Preproc + Eval + Compare | Modelli già su GitHub |
| `'sniffing'` | Pull + Test PCAP con confronto | Testare/confrontare modelli |

## Parametri Importanti

- `MODELS_TO_TRAIN`: Lista modelli da trainare (es. `['xgboost']` o `['xgboost', 'lightgbm']`)
- `MODELS_TO_EVALUATE`: Lista modelli da valutare (default: tutti quelli disponibili)
- `MODELS_TO_SNIFF`: Lista modelli per test PCAP con confronto (`None` = solo `best_model`)

In [None]:
# ============================================================================
# MAIN CONFIGURATION
# ============================================================================

# Pipeline mode: 'all', 'training', 'evaluation' or 'sniffing'
MODE = 'all'

# Task name forwarded to the training / evaluation scripts
TASK = 'binary'

# ---- MODELS ----
# Models to train (only used when MODE is 'all' or 'training').
# Options: 'random_forest', 'xgboost', 'lightgbm'
MODELS_TO_TRAIN = ['xgboost', 'lightgbm']  # Random Forest is slow, skipped

# Models to evaluate (evaluation and compare steps).
# None = every model that is available
MODELS_TO_EVALUATE = None  # or e.g. ['xgboost', 'lightgbm']

# Models to test on PCAP files (for the comparison).
# None = best_model only, list = compare all the listed models
MODELS_TO_SNIFF = ['xgboost', 'lightgbm']  # Compare these two

# ---- TRAINING PARAMETERS ----
TRAINING_PARAMS = {
    'random_forest': {'n_iter': 30, 'cv': 3},   # RF is slow
    'xgboost': {'n_iter': 50, 'cv': 5, 'gpu': True},
    'lightgbm': {'n_iter': 50, 'cv': 5},
}

# ---- GENERAL PARAMETERS ----
N_FEATURES = 30
BALANCE_RATIO = 2.0

# ---- COMPARE PARAMETERS ----
MAX_FPR = 0.02
MAX_LATENCY_MS = 2.0

# ---- SNIFFING PARAMETERS ----
SNIFF_THRESHOLD = 0.5
SNIFF_MIN_PACKETS = 2
SNIFF_TIMEOUT = 60

# Echo the effective configuration; model lists are shown only for the
# modes that actually use them.
print("=" * 60)
print("CONFIGURAZIONE")
print("=" * 60)
print(f"MODE:              {MODE}")
print(f"Task:              {TASK}")
if MODE in ('all', 'training'):
    print(f"Modelli training:  {MODELS_TO_TRAIN}")
if MODE in ('all', 'evaluation'):
    print(f"Modelli eval:      {MODELS_TO_EVALUATE or 'tutti disponibili'}")
if MODE in ('all', 'sniffing'):
    print(f"Modelli sniff:     {MODELS_TO_SNIFF or 'solo best_model'}")
print(f"Compare:           FPR<={MAX_FPR*100}%, Latency<={MAX_LATENCY_MS}ms")

---
## 1. Setup Ambiente

In [None]:
import os
import sys
import json
from pathlib import Path
from datetime import datetime

# Detect the runtime by probing well-known mount points, then derive the
# project root and the folder that holds the input datasets.
if Path("/kaggle/input").exists():
    ENV = "kaggle"
    PROJECT_ROOT = Path("/kaggle/working")
    DATA_INPUT = Path("/kaggle/input")
elif Path("/content").exists():
    ENV = "colab"
    PROJECT_ROOT = Path("/content/NIDS-ML")
    DATA_INPUT = Path("/content")
else:
    ENV = "local"
    PROJECT_ROOT = Path.cwd()
    # No src/ next to the current directory: fall back to the parent folder
    # (presumably the notebook was started from a subfolder of the repo).
    if not (PROJECT_ROOT / "src").exists():
        PROJECT_ROOT = PROJECT_ROOT.parent
    DATA_INPUT = PROJECT_ROOT / "data" / "raw"

print(f"Ambiente: {ENV}")
print(f"Project root: {PROJECT_ROOT}")

# The Colab project root (/content/NIDS-ML) does not exist on a fresh runtime,
# and os.chdir() raises FileNotFoundError on a missing directory, so make sure
# the root exists before switching to it.
PROJECT_ROOT.mkdir(parents=True, exist_ok=True)

# Make `src` importable and resolve relative paths against the project root.
sys.path.insert(0, str(PROJECT_ROOT))
os.chdir(PROJECT_ROOT)

In [None]:
# Clone/pull the project sources from GitHub
REPO_URL = "https://github.com/Riiccardob/NIDS-ML-SSR2"

if ENV in ["kaggle", "colab"]:
    print(f"Cloning {REPO_URL}...")
    # Shallow-clone into a scratch folder, copy its content over the project
    # root, then delete the scratch folder.
    # NOTE(review): stderr is discarded (2>/dev/null) and no exit status is
    # checked, so "Done." is printed even when the clone or the copy fails.
    # NOTE(review): the `temp_repo/*` glob presumably skips dotfiles - confirm
    # that none of them is needed.
    !rm -rf temp_repo 2>/dev/null
    !git clone --depth 1 {REPO_URL} temp_repo 2>/dev/null
    !cp -r temp_repo/* {PROJECT_ROOT}/ 2>/dev/null
    !rm -rf temp_repo
    print("Done.")
    
    # Install the project requirements when the repository ships them
    if (PROJECT_ROOT / "requirements.txt").exists():
        !pip install -q -r {PROJECT_ROOT}/requirements.txt 2>/dev/null

# Create the working directories (no-op when they already exist)
for d in ["data/raw", "data/processed", "artifacts", "models", "logs", "reports"]:
    (PROJECT_ROOT / d).mkdir(parents=True, exist_ok=True)

# Report which model folders already contain at least one *.pkl file
print(f"\nModelli da GitHub:")
for name in ['random_forest', 'xgboost', 'lightgbm', 'best_model']:
    model_dir = PROJECT_ROOT / "models" / name
    if model_dir.exists() and list(model_dir.glob("*.pkl")):
        print(f"  {name}: OK")
    else:
        print(f"  {name}: -")

In [None]:
# Copy the dataset CSV files into data/raw
if ENV in ["kaggle", "colab"]:
    # Substrings used to recognise the dataset folder by its name
    patterns = ["cicids2017", "cic-ids", "ids2017", "network-intrusion"]
    for p in DATA_INPUT.iterdir():
        # Ignore entries whose name mentions "pcap"
        if "pcap" in p.name.lower():
            continue
        # A candidate must match a name pattern AND contain at least one CSV
        if any(pat in p.name.lower() for pat in patterns) and list(p.glob("**/*.csv")):
            print(f"Dataset CSV: {p.name}")
            for csv in p.glob("**/*.csv"):
                # CSVs are flattened by file name into data/raw; a file that
                # is already there is not copied again
                dest = PROJECT_ROOT / "data" / "raw" / csv.name
                if not dest.exists():
                    !cp "{csv}" "{dest}"
            # Only the first matching dataset folder is used
            break

# Number of CSV files now present in data/raw
print(f"CSV disponibili: {len(list((PROJECT_ROOT / 'data' / 'raw').glob('*.csv')))}")

---
## 2. Preprocessing & Feature Engineering

*Eseguito per MODE: all, training, evaluation*

In [None]:
%%time
if MODE not in ('all', 'training', 'evaluation'):
    print(f"Skip (MODE={MODE})")
else:
    from src.preprocessing import load_processed_data, main as preprocessing_main

    # The processed test split doubles as the "already preprocessed" marker.
    processed_test = PROJECT_ROOT / "data" / "processed" / "test.parquet"
    if processed_test.exists():
        print("Dati gia' processati.")
    else:
        print("Preprocessing...")
        # The script entry point takes its options from sys.argv.
        sys.argv = [
            'preprocessing.py',
            '--balance-ratio', str(BALANCE_RATIO),
            '--n-jobs', '4',
        ]
        preprocessing_main()

    train, val, test, mappings = load_processed_data()
    print(f"Train: {len(train):,} | Val: {len(val):,} | Test: {len(test):,}")

In [None]:
%%time
if MODE not in ('all', 'training', 'evaluation'):
    print(f"Skip (MODE={MODE})")
else:
    from src.feature_engineering import load_artifacts, main as fe_main

    # The fitted scaler doubles as the "artifacts already built" marker.
    scaler_file = PROJECT_ROOT / "artifacts" / "scaler.pkl"
    if scaler_file.exists():
        print("Artifacts gia' presenti.")
    else:
        print("Feature engineering...")
        # The script entry point takes its options from sys.argv.
        sys.argv = [
            'feature_engineering.py',
            '--n-features', str(N_FEATURES),
            '--n-jobs', '4',
        ]
        fe_main()

    scaler, selected_features, _, _ = load_artifacts()
    print(f"Feature: {len(selected_features)}")

---
## 3. Training

*Eseguito per MODE: all, training*

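I modelli trainati vengono salvati nella cartella `models/` sotto la root del progetto (su Kaggle: `/kaggle/working/models/`), NON su GitHub.
Per usarli successivamente, scarica lo ZIP (sezione 7), estrai `models/` e `artifacts/` e pushali su GitHub.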

In [None]:
# Names of the models trained during THIS session.
# Later cells use it to label a model as freshly trained or as coming
# from GitHub.
TRAINED_THIS_SESSION = []

print(
    f"Modelli da trainare: {MODELS_TO_TRAIN}"
    if MODE in ('all', 'training')
    else f"Skip training (MODE={MODE})"
)

In [None]:
%%time
# RANDOM FOREST
if MODE not in ('all', 'training') or 'random_forest' not in MODELS_TO_TRAIN:
    print("Skip Random Forest")
else:
    from src.training.random_forest import main as rf_main

    # Search settings for this model, with fallbacks when an entry is missing
    params = TRAINING_PARAMS.get('random_forest', {})
    n_iter = params.get('n_iter', 30)
    cv = params.get('cv', 3)

    print(f"Training Random Forest: n_iter={n_iter}, cv={cv}")
    # The training entry point takes its options from sys.argv.
    sys.argv = [
        'rf.py',
        '--task', TASK,
        '--n-iter', str(n_iter),
        '--cv', str(cv),
        '--n-jobs', '4',
    ]
    rf_main()
    TRAINED_THIS_SESSION.append('random_forest')

In [None]:
%%time
# XGBOOST
if MODE not in ('all', 'training') or 'xgboost' not in MODELS_TO_TRAIN:
    print("Skip XGBoost")
else:
    from src.training.xgboost_model import main as xgb_main

    # Search settings for this model, with fallbacks when an entry is missing
    params = TRAINING_PARAMS.get('xgboost', {})
    n_iter = params.get('n_iter', 50)
    cv = params.get('cv', 5)
    use_gpu = params.get('gpu', True)

    print(f"Training XGBoost: n_iter={n_iter}, cv={cv}, GPU={use_gpu}")
    # Common options first; the tail is either the GPU flag or the CPU job count.
    xgb_argv = ['xgb.py', '--task', TASK, '--n-iter', str(n_iter), '--cv', str(cv)]
    xgb_argv += ['--gpu'] if use_gpu else ['--n-jobs', '4']
    sys.argv = xgb_argv
    xgb_main()
    TRAINED_THIS_SESSION.append('xgboost')

In [None]:
%%time
# LIGHTGBM
if MODE not in ('all', 'training') or 'lightgbm' not in MODELS_TO_TRAIN:
    print("Skip LightGBM")
else:
    from src.training.lightgbm_model import main as lgbm_main

    # Search settings for this model, with fallbacks when an entry is missing
    params = TRAINING_PARAMS.get('lightgbm', {})
    n_iter = params.get('n_iter', 50)
    cv = params.get('cv', 5)

    print(f"Training LightGBM: n_iter={n_iter}, cv={cv}")
    # The training entry point takes its options from sys.argv.
    sys.argv = [
        'lgbm.py',
        '--task', TASK,
        '--n-iter', str(n_iter),
        '--cv', str(cv),
        '--n-jobs', '4',
    ]
    lgbm_main()
    TRAINED_THIS_SESSION.append('lightgbm')

In [None]:
# Summary of the available models
print(f"\n{'=' * 60}")
print("MODELLI DISPONIBILI")
print("=" * 60)
print(f"{'Modello':<20} {'Origine':<15} {'F1':>10} {'Accuracy':>10}")
print("-" * 60)

for name in ('random_forest', 'xgboost', 'lightgbm'):
    model_path = PROJECT_ROOT / "models" / name / f"model_{TASK}.pkl"
    results_path = model_path.with_name(f"results_{TASK}.json")

    if not model_path.exists():
        print(f"{name:<20} {'MANCANTE':<15}")
        continue

    origine = "GitHub" if name not in TRAINED_THIS_SESSION else "TRAINATO ORA"
    # Metrics stay as "-" when the results file is missing
    f1 = acc = "-"
    if results_path.exists():
        r = json.loads(results_path.read_text())
        m = r.get('validation_metrics', {})
        f1, acc = f"{m.get('f1', 0):.4f}", f"{m.get('accuracy', 0):.4f}"
    print(f"{name:<20} {origine:<15} {f1:>10} {acc:>10}")

print(f"\nTrainati in questa sessione: {TRAINED_THIS_SESSION or 'nessuno'}")

---
## 4. Evaluation

*Eseguito per MODE: all, evaluation*

In [None]:
if MODE in ('all', 'evaluation'):
    from src.evaluation import main as evaluation_main

    # An explicit list wins; otherwise take every model whose pickle is on disk
    models_eval = MODELS_TO_EVALUATE or [
        name
        for name in ('random_forest', 'xgboost', 'lightgbm')
        if (PROJECT_ROOT / "models" / name / f"model_{TASK}.pkl").exists()
    ]

    print(f"Modelli da valutare: {models_eval}")
else:
    print(f"Skip evaluation (MODE={MODE})")
    models_eval = []

In [None]:
%%time
# Run the evaluation
if MODE in ('all', 'evaluation') and models_eval:
    separator = '=' * 60
    for name in models_eval:
        model_path = PROJECT_ROOT / "models" / name / f"model_{TASK}.pkl"
        origine = "TRAINATO ORA" if name in TRAINED_THIS_SESSION else "GitHub"

        print(f"\n{separator}")
        print(f"EVALUATION: {name}")
        print(f"Origine modello: {origine}")
        print(separator)

        # The evaluation entry point takes its options from sys.argv.
        sys.argv = ['eval.py', '--model-path', str(model_path), '--task', TASK]
        try:
            evaluation_main()
        except Exception as e:
            # Best effort: one failing model must not stop the remaining ones
            print(f"ERRORE: {e}")

---
## 5. Compare Models

*Eseguito per MODE: all, evaluation*

In [None]:
%%time
if MODE not in ('all', 'evaluation'):
    print(f"Skip compare (MODE={MODE})")
else:
    from src.compare_models import main as compare_main

    print(f"Compare: FPR <= {MAX_FPR*100}%, Latency <= {MAX_LATENCY_MS}ms")
    # The compare entry point takes its thresholds from sys.argv.
    sys.argv = [
        'compare.py',
        '--max-fpr', str(MAX_FPR),
        '--max-latency-ms', str(MAX_LATENCY_MS),
    ]
    compare_main()

In [None]:
# Show the best model
best_model_dir = PROJECT_ROOT / "models" / "best_model"
metadata_file = best_model_dir / "metadata.json"
if metadata_file.exists():
    meta = json.loads(metadata_file.read_text())
    print(f"\nBEST MODEL: {meta.get('best_model', 'N/A').upper()}")
    print(f"Score: {meta.get('score', 0):.4f}")

    # Scorecard image, displayed inline when it is available
    from IPython.display import Image, display
    scorecard = best_model_dir / "scorecard_comparison.png"
    if scorecard.exists():
        display(Image(filename=str(scorecard)))

---
## 6. Test Sniffer su PCAP con CONFRONTO

*Eseguito per MODE: all, sniffing*

Questa sezione permette di **confrontare** le performance di diversi modelli sugli stessi PCAP.

In [None]:
# Look for PCAP files
pcap_files = []

if MODE not in ('all', 'sniffing') or ENV not in ("kaggle", "colab"):
    print(f"Skip ricerca PCAP (MODE={MODE} o ambiente locale)")
else:
    pcap_patterns = ("pcap", "cic-ids-2017-pcap", "cicids")

    for p in DATA_INPUT.iterdir():
        # Underscores are normalised to dashes before matching the patterns
        name_lower = p.name.lower().replace("_", "-")
        if not any(pat in name_lower for pat in pcap_patterns):
            continue
        found = [*p.glob("**/*.pcap"), *p.glob("**/*.pcapng")]
        if not found:
            continue
        # Only the first folder that actually holds captures is used
        pcap_files = sorted(found, key=lambda x: x.name)
        print(f"Dataset PCAP: {p.name}")
        break

    if not pcap_files:
        print("Dataset PCAP non trovato.")
    else:
        print(f"\nPCAP disponibili ({len(pcap_files)}):")
        for f in pcap_files:
            print(f"  - {f.name}: {f.stat().st_size/(1024**2):.1f} MB")

In [None]:
# Decide which models to test on the PCAP files
models_to_sniff = []

if MODE in ('all', 'sniffing') and pcap_files:
    models_root = PROJECT_ROOT / "models"
    pkl_name = f"model_{TASK}.pkl"

    if MODELS_TO_SNIFF:
        # Explicit list: keep the models whose pickle exists, warn on the others
        for name in MODELS_TO_SNIFF:
            model_path = models_root / name / pkl_name
            if not model_path.exists():
                print(f"WARNING: {name} non trovato, skip")
                continue
            models_to_sniff.append((name, model_path))
    else:
        # best_model only
        best_path = models_root / "best_model" / pkl_name
        if best_path.exists():
            models_to_sniff.append(('best_model', best_path))
        else:
            # Fallback: first model available, in this preference order
            for name in ('lightgbm', 'xgboost', 'random_forest'):
                candidate = models_root / name / pkl_name
                if candidate.exists():
                    models_to_sniff.append((name, candidate))
                    break

    print("\nModelli per test PCAP:")
    for name, path in models_to_sniff:
        # NOTE: 'best_model' is never added to TRAINED_THIS_SESSION, so it is
        # always reported as "GitHub" here.
        origine = "TRAINATO ORA" if name in TRAINED_THIS_SESSION else "GitHub"
        print(f"  - {name} ({origine})")

In [None]:
%%time
# PCAP test with model comparison
all_sniff_results = {}  # {model_name: [one result per pcap]}

if not (MODE in ('all', 'sniffing') and pcap_files and models_to_sniff):
    print("Test PCAP non eseguito")
else:
    from src.sniffer import analyze_pcap_file

    banner = '#' * 70

    for model_name, model_path in models_to_sniff:
        origine = "TRAINATO ORA" if model_name in TRAINED_THIS_SESSION else "GitHub"

        print(f"\n{banner}")
        print(f"# MODELLO: {model_name.upper()}")
        print(f"# Origine: {origine}")
        print(banner)

        model_results = []

        for pcap_path in pcap_files:
            print(f"\n  Testing: {pcap_path.name}...")

            try:
                result = analyze_pcap_file(
                    pcap_path=str(pcap_path),
                    model_path=str(model_path),
                    threshold=SNIFF_THRESHOLD,
                    timeout=SNIFF_TIMEOUT,
                    min_packets=SNIFF_MIN_PACKETS,
                    progress_interval=100000,
                )
                # Tag the result so it can be attributed in the comparison
                result['model'] = model_name
                result['model_origin'] = origine
                model_results.append(result)

                rate = result.get('detection_rate', 0)
                print(f"  -> Flows: {result['flows_analyzed']:,}, Attacks: {result['attacks_detected']:,} ({rate:.1f}%)")

            except Exception as e:
                # Best effort: a failing capture must not stop the other ones
                print(f"  ERRORE: {e}")

        all_sniff_results[model_name] = model_results

In [None]:
# MODEL COMPARISON TABLE ON PCAP
if all_sniff_results:
    print("\n" + "=" * 90)
    print("CONFRONTO PERFORMANCE SU PCAP")
    print("=" * 90)

    # Every PCAP name reported by at least one model, sorted
    all_pcaps = sorted({
        r['pcap']
        for results in all_sniff_results.values()
        for r in results
    })

    # Header: one centred column per model
    models = list(all_sniff_results)
    header = f"{'PCAP':<35}" + "".join(f" | {m:^20}" for m in models)
    print(header)
    print("-" * len(header))

    # Per-model running totals, filled while the per-PCAP rows are built
    totals = {m: {'flows': 0, 'attacks': 0} for m in models}

    for pcap_name in all_pcaps:
        cells = [f"{pcap_name[:35]:<35}"]
        for m in models:
            # First result this model produced for this PCAP, if any
            result = next(
                (r for r in all_sniff_results.get(m, []) if r['pcap'] == pcap_name),
                None,
            )
            if not result:
                cells.append(f" | {'N/A':^20}")
                continue

            attacks = result['attacks_detected']
            flows = result['flows_analyzed']
            rate = result.get('detection_rate', 0)
            cells.append(f" | {attacks:>6} / {flows:<6} ({rate:>5.1f}%)")
            totals[m]['flows'] += flows
            totals[m]['attacks'] += attacks
        print("".join(cells))

    # Totals row
    print("-" * len(header))
    cells = [f"{'TOTALE':<35}"]
    for m in models:
        t = totals[m]
        if t['flows'] > 0:
            rate = t['attacks'] / t['flows'] * 100
            cells.append(f" | {t['attacks']:>6} / {t['flows']:<6} ({rate:>5.1f}%)")
        else:
            cells.append(f" | {'N/A':^20}")
    print("".join(cells))

    # Statistics
    print("\n" + "=" * 90)
    print("STATISTICHE CONFRONTO")
    print("=" * 90)
    print(f"\n{'Modello':<20} {'Origine':<15} {'Tot Flows':>12} {'Tot Attacks':>12} {'Rate':>10}")
    print("-" * 70)

    # "Best" here means the highest share of analysed flows flagged as attacks
    best_rate = 0
    best_model = None

    for m in models:
        t = totals[m]
        origine = "TRAINATO ORA" if m in TRAINED_THIS_SESSION else "GitHub"
        if not t['flows'] > 0:
            print(f"{m:<20} {origine:<15} {'N/A':>12}")
            continue

        rate = t['attacks'] / t['flows'] * 100
        print(f"{m:<20} {origine:<15} {t['flows']:>12,} {t['attacks']:>12,} {rate:>9.2f}%")
        if rate > best_rate:
            best_rate, best_model = rate, m

    if best_model and len(models) > 1:
        print(f"\nMigliore su PCAP: {best_model.upper()} ({best_rate:.2f}% detection rate)")

    # Persist the raw results together with the parameters that produced them
    comparison_data = {
        'timestamp': datetime.now().isoformat(),
        'models': models,
        'parameters': {
            'threshold': SNIFF_THRESHOLD,
            'min_packets': SNIFF_MIN_PACKETS,
            'timeout': SNIFF_TIMEOUT,
        },
        'results': all_sniff_results,
        'totals': totals,
    }

    with open(PROJECT_ROOT / "reports" / "pcap_comparison.json", 'w') as f:
        json.dump(comparison_data, f, indent=2, default=str)
    print("\nRisultati salvati in: reports/pcap_comparison.json")

---
## 7. Download Output

In [None]:
import zipfile

# Bundle artifacts, models and reports into one ZIP on hosted environments.
# Each tree is walked recursively and only regular files are archived:
# ZipFile.write() on a directory stores an empty directory entry, so a
# non-recursive glob would silently drop the files nested below it.
if ENV in ["kaggle", "colab"]:
    zip_path = PROJECT_ROOT / "nids_ml_output.zip"

    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as z:
        # Artifacts
        artifacts_dir = PROJECT_ROOT / "artifacts"
        for f in artifacts_dir.rglob("*"):
            if f.is_file():
                z.write(f, f"artifacts/{f.relative_to(artifacts_dir)}")

        # Models (every model folder)
        models_dir = PROJECT_ROOT / "models"
        for model_dir in models_dir.iterdir():
            if model_dir.is_dir():
                for f in model_dir.rglob("*"):
                    if f.is_file():
                        z.write(f, f"models/{f.relative_to(models_dir)}")

        # Reports
        reports_dir = PROJECT_ROOT / "reports"
        for f in reports_dir.rglob("*"):
            if f.is_file():
                z.write(f, f"reports/{f.relative_to(reports_dir)}")

    print(f"ZIP: {zip_path.name} ({zip_path.stat().st_size/(1024**2):.1f} MB)")
else:
    print("Locale - output nelle cartelle del progetto")

---
## 8. Riepilogo

In [None]:
print("=" * 70)
print("RIEPILOGO ESECUZIONE")
print("=" * 70)

print(f"\nModalita': {MODE}")
print(f"Trainati in questa sessione: {TRAINED_THIS_SESSION or 'nessuno'}")

# Best model (as written by the compare step)
metadata_file = PROJECT_ROOT / "models" / "best_model" / "metadata.json"
if metadata_file.exists():
    meta = json.loads(metadata_file.read_text())
    print(f"\nBest Model (da compare): {meta.get('best_model', 'N/A').upper()}")

# PCAP comparison
if all_sniff_results:
    print(f"\nTest PCAP eseguito su {len(models_to_sniff)} modelli")
    for m in all_sniff_results:
        t = totals.get(m, {})
        # Models without any analysed flow are left out of the recap
        if t.get('flows', 0) > 0:
            rate = t['attacks'] / t['flows'] * 100
            print(f"  {m}: {t['attacks']:,} attacchi su {t['flows']:,} flussi ({rate:.1f}%)")

print("\n" + "=" * 70)
# Closing hint: how to publish fresh models, or how to run the sniffer locally
if MODE in ('all', 'training') and TRAINED_THIS_SESSION:
    closing_lines = (
        "PROSSIMI STEP:",
        "1. Scarica lo ZIP",
        "2. Estrai models/ e artifacts/",
        "3. git add models/ artifacts/ && git commit && git push",
    )
else:
    closing_lines = (
        "Per usare il modello in locale:",
        "  sudo python src/sniffer.py --interface eth0 --verbose",
    )
print("\n".join(closing_lines))
print("=" * 70)