# NIDS-ML: Pipeline con Multi-Training e Versionamento

## Modalità
| MODE | Cosa fa |
|------|---------|  
| `'all'` | Preproc + Multi-Training + Eval + Compare + Sniff |
| `'training'` | Preproc + Multi-Training |
| `'evaluation'` | Preproc + Eval + Compare |
| `'sniffing'` | Preproc + Eval + Compare + Sniff |

In [None]:
# ============================================================================
# MAIN CONFIGURATION
# ============================================================================

# MODE: 'all', 'training', 'evaluation', 'sniffing'
# Later cells gate their work with `if MODE in [...]` checks.
MODE = 'all'
# Task name: used in model file names (model_{TASK}.pkl) and passed as --task.
TASK = 'binary'

# ============================================================================
# MULTI-TRAINING: list of configurations to run
# Each training run is saved as: models/xgboost/cv5_iter100_gpu/
# ============================================================================
# Each entry provides 'model', 'n_iter' and 'cv'; 'gpu' is optional and is
# read with a default of False.
TRAINING_CONFIGS = [
    {'model': 'xgboost', 'n_iter': 20, 'cv': 3, 'gpu': True},
    {'model': 'xgboost', 'n_iter': 50, 'cv': 5, 'gpu': True},
    {'model': 'lightgbm', 'n_iter': 20, 'cv': 3, 'gpu': False},
    {'model': 'lightgbm', 'n_iter': 50, 'cv': 5, 'gpu': False},
]

# ============================================================================
# MODELS FOR EVAL/SNIFF (when not training)
# NOTE(review): neither name is referenced by any cell visible in this file —
# confirm where (or whether) they are consumed.
# ============================================================================
MODELS_TO_EVALUATE = None  # None = all available
MODELS_TO_SNIFF = None     # None = best_model only

# ============================================================================
# PARAMETERS
# ============================================================================
N_FEATURES = 30        # passed as --n-features to feature engineering
BALANCE_RATIO = 2.0    # passed as --balance-ratio to preprocessing
MAX_FPR = 0.02         # passed as --max-fpr to the model comparison
MAX_LATENCY_MS = 2.0   # passed as --max-latency-ms to the model comparison

# Sniffing (forwarded to analyze_pcap_file as threshold / min_packets /
# timeout / verbose)
SNIFF_THRESHOLD = 0.5
SNIFF_MIN_PACKETS = 2
SNIFF_TIMEOUT = 120
SNIFF_VERBOSE = False

# Kaggle timeout (crash protection); read by check_timeout() as the total
# runtime budget in hours.
MAX_RUNTIME_HOURS = 11.5

# Git
# NOTE(review): GIT_PUSH_ENABLED and GIT_USER are not referenced by any cell
# visible in this file — confirm they are still needed.
GIT_PUSH_ENABLED = False
GIT_USER = "Riiccardob"

# Echo the active configuration so the run log records what was requested.
print(f"MODE: {MODE}, Task: {TASK}")
print(f"Training configs: {len(TRAINING_CONFIGS)}")
for position, config in enumerate(TRAINING_CONFIGS, start=1):
    uses_gpu = config.get('gpu', False)
    print(f"  {position}. {config['model']} cv={config['cv']} n_iter={config['n_iter']} gpu={uses_gpu}")

In [None]:
# Setup
import os, sys, json, shutil
from pathlib import Path
from datetime import datetime, timedelta

# Wall-clock reference point against which the runtime budget is measured.
NOTEBOOK_START = datetime.now()

def check_timeout(margin_min=30):
    """Report whether the runtime budget, minus a safety margin, is used up.

    Args:
        margin_min: Minutes subtracted from MAX_RUNTIME_HOURS to obtain the
            effective limit.

    Returns:
        A ``(timed_out, remaining)`` tuple. ``timed_out`` is True once the
        time elapsed since NOTEBOOK_START exceeds the effective limit;
        ``remaining`` is the timedelta still available before that limit
        (negative once the limit has been passed).
    """
    safety_margin = timedelta(minutes=margin_min)
    budget = timedelta(hours=MAX_RUNTIME_HOURS) - safety_margin
    spent = datetime.now() - NOTEBOOK_START
    timed_out = spent > budget
    return timed_out, budget - spent

def format_td(td):
    """Render a timedelta as ``"<hours>h <minutes>m"``.

    Fractional seconds are truncated. A duration whose whole-second value is
    zero or negative is rendered as ``"0s"``.
    """
    total_seconds = int(td.total_seconds())
    if total_seconds <= 0:
        return "0s"
    hours, leftover = divmod(total_seconds, 3600)
    return f"{hours}h {leftover // 60}m"

# Detect the runtime environment and derive the project and data locations.
if Path("/kaggle/input").exists():
    ENV, PROJECT_ROOT, DATA_INPUT = "kaggle", Path("/kaggle/working"), Path("/kaggle/input")
elif Path("/content").exists():
    ENV, PROJECT_ROOT, DATA_INPUT = "colab", Path("/content/NIDS-ML"), Path("/content")
else:
    ENV = "local"
    PROJECT_ROOT = Path.cwd()
    # Started from a subdirectory (no src/ here): use the parent directory
    # as the project root instead.
    if not (PROJECT_ROOT / "src").exists():
        PROJECT_ROOT = PROJECT_ROOT.parent
    DATA_INPUT = PROJECT_ROOT / "data" / "raw"

# On a fresh Colab runtime /content/NIDS-ML does not exist yet (the repository
# is only cloned by a later cell), so os.chdir() below would raise
# FileNotFoundError. Creating the directory first is a no-op when it exists.
PROJECT_ROOT.mkdir(parents=True, exist_ok=True)

# Make the project's `src` package importable and resolve relative paths
# against the project root.
sys.path.insert(0, str(PROJECT_ROOT))
os.chdir(PROJECT_ROOT)
print(f"ENV: {ENV}, ROOT: {PROJECT_ROOT}")

In [None]:
# Clone repo
# On hosted environments (Kaggle/Colab) fetch the project sources into
# PROJECT_ROOT, install its requirements, then create the working directories.
REPO_URL = "https://github.com/Riiccardob/NIDS-ML-SSR2"
if ENV in ["kaggle", "colab"]:
    # Shallow clone (--depth 1) into a scratch directory. stderr is discarded,
    # so a failed clone produces no visible error at this point.
    !rm -rf temp_repo 2>/dev/null; git clone --depth 1 {REPO_URL} temp_repo 2>/dev/null
    # Copy the checkout into PROJECT_ROOT, then remove the scratch directory.
    # NOTE(review): `temp_repo/*` presumably skips dotfiles (default shell
    # globbing) — confirm that is intended.
    !cp -r temp_repo/* {PROJECT_ROOT}/ 2>/dev/null; rm -rf temp_repo
    if (PROJECT_ROOT / "requirements.txt").exists():
        # Quiet install (-q) with stderr discarded as well.
        !pip install -q -r {PROJECT_ROOT}/requirements.txt 2>/dev/null
# Create the project's working directories in every environment (no-op when
# they already exist).
for d in ["data/raw", "data/processed", "artifacts", "models", "logs", "reports"]:
    (PROJECT_ROOT / d).mkdir(parents=True, exist_ok=True)
print("Setup completato")

In [None]:
# Copy the CSV dataset into data/raw (hosted environments only).
# Only the first input entry whose name matches a dataset tag and does not
# contain "pcap" is used; files already present in data/raw are not overwritten.
if ENV in ["kaggle", "colab"]:
    dataset_tags = ("cicids", "cic-ids", "ids2017")
    raw_dir = PROJECT_ROOT / "data" / "raw"
    for entry in DATA_INPUT.iterdir():
        lowered = entry.name.lower()
        if "pcap" in lowered or not any(tag in lowered for tag in dataset_tags):
            continue
        for source_csv in entry.glob("**/*.csv"):
            target = raw_dir / source_csv.name
            if not target.exists():
                shutil.copy(source_csv, target)
        break
print(f"CSV: {len(list((PROJECT_ROOT / 'data' / 'raw').glob('*.csv')))}")

In [None]:
%%time
# Preprocessing: build the processed splits only when missing, then load them.
if MODE in ['all', 'training', 'evaluation', 'sniffing']:
    from src.preprocessing import main as preproc_main, load_processed_data

    processed_test = PROJECT_ROOT / "data" / "processed" / "test.parquet"
    if not processed_test.exists():
        # The entry point is driven through sys.argv, like a CLI invocation.
        preproc_cli = ['p.py']
        preproc_cli += ['--balance-ratio', str(BALANCE_RATIO)]
        preproc_cli += ['--n-jobs', '4']
        sys.argv = preproc_cli
        preproc_main()

    train, val, test, mappings = load_processed_data()
    print(f"Train: {len(train):,} | Val: {len(val):,} | Test: {len(test):,}")

In [None]:
%%time
# Feature engineering: produce the artifacts only when the scaler is missing,
# then load them.
if MODE in ['all', 'training', 'evaluation', 'sniffing']:
    from src.feature_engineering import main as fe_main, load_artifacts

    scaler_file = PROJECT_ROOT / "artifacts" / "scaler.pkl"
    if not scaler_file.exists():
        # The entry point is driven through sys.argv, like a CLI invocation.
        fe_cli = ['fe.py']
        fe_cli += ['--n-features', str(N_FEATURES)]
        fe_cli += ['--n-jobs', '4']
        sys.argv = fe_cli
        fe_main()

    scaler, selected_features, _, scaler_columns = load_artifacts()
    print(f"Features: {len(selected_features)}")

In [None]:
# Multi-training bookkeeping.
# TRAINED_THIS_SESSION collects (model_type, version_id) pairs; SKIPPED holds
# the configs left out when the runtime budget is exhausted.
TRAINED_THIS_SESSION, SKIPPED = [], []

if MODE in ('all', 'training'):
    from src.model_versioning import generate_version_id, get_version_dir

    config_count = len(TRAINING_CONFIGS)
    print(f"Training {config_count} configs...")

In [None]:
%%time
# Multi-Training execution
# Runs every entry of TRAINING_CONFIGS in order. A config whose model file
# already exists is not retrained; once the runtime budget (with a 60-minute
# margin) is exhausted, the remaining configs are stored in SKIPPED.
if MODE in ['all', 'training']:
    total = len(TRAINING_CONFIGS)
    for i, cfg in enumerate(TRAINING_CONFIGS, 1):
        mt, ni, cv, gpu = cfg['model'], cfg['n_iter'], cfg['cv'], cfg.get('gpu', False)
        extra = {'gpu': gpu} if gpu else None
        vid = generate_version_id(ni, cv, extra)
        vdir = get_version_dir(mt, ni, cv, extra, create=False)

        if (vdir / f"model_{TASK}.pkl").exists():
            print(f"[{i}/{total}] {mt}/{vid} exists, skip")
            # Existing versions are still recorded so the final summary lists them.
            TRAINED_THIS_SESSION.append((mt, vid))
            continue

        is_timeout, remaining = check_timeout(60)
        if is_timeout:
            print("TIMEOUT - skip remaining")
            # i is 1-based, so i-1 is the index of the current (untrained) config.
            SKIPPED = TRAINING_CONFIGS[i-1:]
            break

        print(f"\n[{i}/{total}] Training {mt}/{vid} (remaining: {format_td(remaining)})")
        try:
            if mt == 'xgboost':
                from src.training.xgboost_model import main as train_fn
                args = ['x.py', '--task', TASK, '--n-iter', str(ni), '--cv', str(cv)]
                if gpu:
                    args.append('--gpu')
                else:
                    args.extend(['--n-jobs', '4'])
            elif mt == 'lightgbm':
                from src.training.lightgbm_model import main as train_fn
                args = ['l.py', '--task', TASK, '--n-iter', str(ni), '--cv', str(cv), '--n-jobs', '4']
            elif mt == 'random_forest':
                from src.training.random_forest import main as train_fn
                args = ['r.py', '--task', TASK, '--n-iter', str(ni), '--cv', str(cv), '--n-jobs', '4']
            else:
                # Without this branch an unrecognised model type would reuse
                # train_fn/args left over from the previous iteration (or hit
                # a NameError on the first one) and train the wrong model.
                raise ValueError(f"unknown model type: {mt!r}")
            # The training entry points are driven through sys.argv.
            sys.argv = args
            train_fn()
            TRAINED_THIS_SESSION.append((mt, vid))
        except Exception as e:
            # Best effort: report the failure and continue with the next config.
            print(f"ERROR: {e}")
    print(f"\nTrained: {len(TRAINED_THIS_SESSION)}, Skipped: {len(SKIPPED)}")

In [None]:
# Show versions
# Calls the versions summary helper from src.model_versioning for the current
# TASK. This cell has no MODE guard, so it runs in every mode.
from src.model_versioning import print_versions_summary
print_versions_summary(task=TASK)

In [None]:
%%time
# Evaluation: run the evaluation entry point once per listed model version.
if MODE in ['all', 'evaluation', 'sniffing']:
    from src.evaluation import main as eval_main
    from src.model_versioning import list_model_versions

    versions = list_model_versions(task=TASK)
    print(f"Evaluating {len(versions)} versions...")
    for version in versions:
        print(f"  Eval: {version['model_type']}/{version['version_id']}")
        sys.argv = ['e.py', '--model-path', str(version['model_path']), '--task', TASK]
        try:
            eval_main()
        except Exception as err:
            # One failing version must not stop the remaining evaluations.
            print(f"    Error: {err}")

In [None]:
%%time
# Compare all versions, passing the FPR and latency limits on the command line.
if MODE in ['all', 'evaluation', 'sniffing']:
    from src.compare_models import main as compare_main

    print(f"Compare: FPR<={MAX_FPR*100}%, Latency<={MAX_LATENCY_MS}ms")
    compare_cli = ['c.py']
    compare_cli += ['--max-fpr', str(MAX_FPR)]
    compare_cli += ['--max-latency-ms', str(MAX_LATENCY_MS)]
    sys.argv = compare_cli
    compare_main()

In [None]:
# Show results: print the best-model metadata and display the comparison plots
# found in models/best_model.
from IPython.display import Image, display

best_dir = PROJECT_ROOT / "models" / "best_model"
metadata_file = best_dir / "metadata.json"
if metadata_file.exists():
    with open(metadata_file) as f:
        meta = json.load(f)
    print(f"BEST: {meta.get('best_model')} (score={meta.get('score', 0):.4f})")

for plot_name in ('plateau_analysis.png', 'scorecard_comparison.png', 'algorithm_rankings.png'):
    plot_path = best_dir / plot_name
    # Plots are optional: show only the ones that were actually generated.
    if plot_path.exists():
        display(Image(filename=str(plot_path)))

In [None]:
# Find PCAP: take up to three capture files (sorted by path) from the first
# matching input entry that contains any.
pcap_files = []
if MODE in ['all', 'sniffing'] and ENV in ["kaggle", "colab"]:
    for candidate in DATA_INPUT.iterdir():
        folder = candidate.name.lower()
        if "pcap" not in folder and "cic-ids" not in folder:
            continue
        pcap_files = sorted(candidate.glob("**/*.pcap"))[:3]
        if pcap_files:
            print(f"PCAP: {[f.name for f in pcap_files]}")
            break
    if not pcap_files:
        print("No PCAP found")

In [None]:
%%time
# Sniffing: analyse each PCAP with the best model and collect the results
# keyed by capture file name.
all_sniff_results = {}
if MODE in ['all', 'sniffing'] and pcap_files:
    from src.sniffer import analyze_pcap_file

    best_model = PROJECT_ROOT / "models" / "best_model" / f"model_{TASK}.pkl"
    if not best_model.exists():
        print("best_model not found")
    else:
        # Options shared by every capture; only pcap_path changes per file.
        sniff_options = dict(
            model_path=str(best_model),
            threshold=SNIFF_THRESHOLD,
            timeout=SNIFF_TIMEOUT,
            min_packets=SNIFF_MIN_PACKETS,
            verbose=SNIFF_VERBOSE,
            progress_interval=500000,
            show_progress=True,
        )
        for pcap in pcap_files:
            print(f"\nTesting: {pcap.name}")
            try:
                r = analyze_pcap_file(pcap_path=str(pcap), **sniff_options)
                all_sniff_results[pcap.name] = r
                print(f"  Flows: {r['flows_analyzed']:,}, Attacks: {r['attacks_detected']:,} ({r.get('detection_rate',0):.1f}%)")
            except Exception as err:
                # A failing capture is reported and the next one is still analysed.
                print(f"  Error: {err}")

In [None]:
# Download ZIP: bundle artifacts/, models/ and reports/ into a single archive
# in the project root (hosted environments only).
import zipfile

if ENV in ["kaggle", "colab"]:
    zip_path = PROJECT_ROOT / "nids_output.zip"
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as archive:
        for folder in ("artifacts", "models", "reports"):
            base = PROJECT_ROOT / folder
            for item in base.rglob("*"):
                if not item.is_file():
                    continue
                # Archive names are kept relative to the project root.
                archive.write(item, f"{folder}/{item.relative_to(base)}")
    size_mb = zip_path.stat().st_size / (1024**2)
    print(f"ZIP: {zip_path.name} ({size_mb:.1f} MB)")

In [None]:
# Summary: total runtime, the versions recorded during this session and the
# selected best model (when its metadata file exists).
elapsed = datetime.now() - NOTEBOOK_START
print(f"\n{'='*60}\nCOMPLETED in {format_td(elapsed)}")

trained_labels = [f'{m}/{v}' for m, v in TRAINED_THIS_SESSION]
print(f"Trained: {trained_labels}")

best_meta_path = PROJECT_ROOT / "models" / "best_model" / "metadata.json"
if best_meta_path.exists():
    with open(best_meta_path) as f:
        best_name = json.load(f).get('best_model')
    print(f"Best: {best_name}")