# NIDS-ML: Network Intrusion Detection System

Questo notebook esegue la pipeline completa usando i moduli del progetto.

**Requisiti**:
- Dataset CIC-IDS2017 aggiunto al notebook
- Cartella `src/` del progetto caricata

## 1. Setup Ambiente

In [None]:
# Su Kaggle, installa dipendenze mancanti se necessario
# !pip install -q xgboost lightgbm

In [None]:
import os
import sys
from pathlib import Path

# Detect the execution environment by probing well-known paths.
# Kaggle is tested first, so a machine exposing both /kaggle/input and
# /content is treated as Kaggle.
if Path("/kaggle/input").exists():
    # Kaggle
    ENV = "kaggle"
    PROJECT_ROOT = Path("/kaggle/working")
    DATA_INPUT = Path("/kaggle/input")
elif Path("/content").exists():
    # Google Colab
    ENV = "colab"
    PROJECT_ROOT = Path("/content/NIDS-ML")
    DATA_INPUT = Path("/content")
else:
    # Local checkout: use the current directory, or its parent when the
    # current directory has no src/ folder.
    ENV = "local"
    PROJECT_ROOT = Path.cwd()
    if not (PROJECT_ROOT / "src").exists():
        PROJECT_ROOT = PROJECT_ROOT.parent
    DATA_INPUT = PROJECT_ROOT / "data" / "raw"

print(f"Ambiente: {ENV}")
print(f"Project root: {PROJECT_ROOT}")
print(f"Data input: {DATA_INPUT}")

# The Colab branch only checks that /content exists, not /content/NIDS-ML,
# so the project root may be missing at this point. Create it first so that
# os.chdir() below cannot fail with FileNotFoundError.
PROJECT_ROOT.mkdir(parents=True, exist_ok=True)

# Put the project root first on sys.path so that `src` is importable as a
# package, and make relative paths resolve against the project root.
sys.path.insert(0, str(PROJECT_ROOT))
os.chdir(PROJECT_ROOT)

In [None]:
# Su Kaggle/Colab: clona o carica il progetto
if ENV == "kaggle":
    # Opzione 1: Se hai caricato il progetto come dataset
    # Cerca la cartella src nei dataset aggiunti
    for p in DATA_INPUT.iterdir():
        if (p / "src").exists():
            print(f"Progetto trovato in: {p}")
            # Copia src nella working directory
            !cp -r {p}/src {PROJECT_ROOT}/
            !cp -r {p}/config.yaml {PROJECT_ROOT}/ 2>/dev/null || true
            break
    else:
        print("ATTENZIONE: Cartella src non trovata!")
        print("Carica il progetto come dataset Kaggle.")

elif ENV == "colab":
    # Clona da GitHub (se disponibile) o carica manualmente
    if not (PROJECT_ROOT / "src").exists():
        print("Carica la cartella del progetto in /content/NIDS-ML")

# Verifica
if (PROJECT_ROOT / "src").exists():
    print(f"\nModuli disponibili:")
    for f in sorted((PROJECT_ROOT / "src").glob("*.py")):
        print(f"  - {f.name}")
else:
    raise FileNotFoundError("Cartella src/ non trovata!")

In [None]:
# Create the working directory layout under the project root
# (a no-op for folders that already exist).
WORK_SUBDIRS = ("data/raw", "data/processed", "artifacts", "models", "logs", "reports")
for subdir in WORK_SUBDIRS:
    PROJECT_ROOT.joinpath(subdir).mkdir(parents=True, exist_ok=True)

print("Directory create.")

In [None]:
# Locate the CIC-IDS2017 dataset and copy its CSV files into data/raw.
import shutil

raw_dir = PROJECT_ROOT / "data" / "raw"

if ENV in ["kaggle", "colab"]:
    # Lower-case substrings matched against the names of the entries found
    # directly under DATA_INPUT.
    dataset_patterns = ["cicids2017", "cic-ids-2017", "cicids", "ids2017"]
    dataset_path = None

    for p in DATA_INPUT.iterdir():
        # Only directories can hold the dataset; skip stray files.
        if not p.is_dir():
            continue
        name_lower = p.name.lower()
        if any(pat in name_lower for pat in dataset_patterns):
            # Accept the candidate only if it contains at least one CSV.
            # any() stops at the first hit instead of listing the whole tree.
            if any(p.glob("**/*.csv")):
                dataset_path = p
                break

    if dataset_path:
        print(f"Dataset trovato: {dataset_path}")
        # Copy (not link) every CSV into data/raw, flattening sub-folders.
        # A file name that already exists in data/raw is left untouched, so
        # the first file seen with a given name wins.
        raw_dir.mkdir(parents=True, exist_ok=True)
        for csv_file in dataset_path.glob("**/*.csv"):
            dest = raw_dir / csv_file.name
            if not dest.exists():
                shutil.copy(csv_file, dest)
        print(f"CSV copiati in: {raw_dir}")
    else:
        print("ERRORE: Dataset CIC-IDS2017 non trovato!")
        print("Aggiungi il dataset al notebook Kaggle.")

# List the CSV files now available in data/raw together with their size.
csv_files = list(raw_dir.glob("*.csv"))
print(f"\nCSV disponibili: {len(csv_files)}")
for f in csv_files:
    size_mb = f.stat().st_size / (1024**2)
    print(f"  - {f.name}: {size_mb:.1f} MB")

## 2. Preprocessing

In [None]:
# Import moduli progetto
from src.preprocessing import main as preprocessing_main
from src.preprocessing import load_processed_data

In [None]:
%%time

# Run the preprocessing step in-process.
# Same as the CLI call:
#   python src/preprocessing.py --balance-ratio 2.0 --n-jobs 4

import sys

# main() presumably parses its options from sys.argv, as it would on the
# command line (TODO confirm in src/preprocessing.py).
preprocessing_cli = ["--balance-ratio", "2.0", "--n-jobs", "4"]
sys.argv = ["preprocessing.py", *preprocessing_cli]

preprocessing_main()

In [None]:
# Reload the processed splits and print their sizes as a sanity check.
train, val, test, mappings = load_processed_data()
n_train, n_val, n_test = (len(split) for split in (train, val, test))
print(f"Train: {n_train:,} | Val: {n_val:,} | Test: {n_test:,}")

## 3. Feature Engineering

In [None]:
from src.feature_engineering import main as feature_engineering_main
from src.feature_engineering import load_artifacts

In [None]:
%%time

# Run the feature-engineering step in-process.
# Same as the CLI call:
#   python src/feature_engineering.py --n-features 30 --rf-estimators 100 --n-jobs 4

# main() presumably parses its options from sys.argv (TODO confirm in
# src/feature_engineering.py).
feature_engineering_cli = ["--n-features", "30", "--rf-estimators", "100", "--n-jobs", "4"]
sys.argv = ["feature_engineering.py", *feature_engineering_cli]

feature_engineering_main()

In [None]:
# Load the saved artifacts and summarise them.
scaler, selected_features, importances, scaler_columns = load_artifacts()
print(f"Feature selezionate: {len(selected_features)}")
print(f"Colonne scaler: {len(scaler_columns)}")

# First ten entries of selected_features, in the order they were returned
# (presumably sorted by importance; verify against load_artifacts).
print("\nTop 10 feature:")
for rank, feat in enumerate(selected_features[:10], start=1):
    print(f"  {rank:2}. {feat}: {importances[feat]:.4f}")

## 4. Training Modelli

In [None]:
from src.training.random_forest import main as rf_main
from src.training.xgboost_model import main as xgb_main
from src.training.lightgbm_model import main as lgbm_main

In [None]:
%%time

# Train the Random Forest model in-process.
# Same as the CLI call:
#   python src/training/random_forest.py --task binary --n-iter 20 --cv 3 --n-jobs 4

rf_cli = ["--task", "binary", "--n-iter", "20", "--cv", "3", "--n-jobs", "4"]
sys.argv = ["random_forest.py", *rf_cli]

rf_main()

In [None]:
%%time

# Train the XGBoost model in-process.
# Same as the CLI call:
#   python src/training/xgboost_model.py --task binary --n-iter 20 --cv 3 --n-jobs 4

xgb_cli = ["--task", "binary", "--n-iter", "20", "--cv", "3", "--n-jobs", "4"]
sys.argv = ["xgboost_model.py", *xgb_cli]

xgb_main()

In [None]:
%%time

# Train the LightGBM model in-process.
# Same as the CLI call:
#   python src/training/lightgbm_model.py --task binary --n-iter 20 --cv 3 --n-jobs 4

lgbm_cli = ["--task", "binary", "--n-iter", "20", "--cv", "3", "--n-jobs", "4"]
sys.argv = ["lightgbm_model.py", *lgbm_cli]

lgbm_main()

In [None]:
# List the trained models together with their validation metrics.
import json

print("Modelli addestrati:")
for model_dir in (PROJECT_ROOT / "models").iterdir():
    # Skip stray files and the best_model folder.
    if not model_dir.is_dir() or model_dir.name == "best_model":
        continue

    # Folders without a binary-task results file are silently ignored.
    results_file = model_dir / "results_binary.json"
    if not results_file.exists():
        continue

    with results_file.open() as fh:
        results = json.load(fh)

    # Missing metrics are reported as 0.
    metrics = results.get('validation_metrics', {})
    print(f"\n  {model_dir.name}:")
    print(f"    Accuracy: {metrics.get('accuracy', 0):.4f}")
    print(f"    F1:       {metrics.get('f1', 0):.4f}")

## 5. Evaluation

In [None]:
from src.evaluation import main as evaluation_main

In [None]:
%%time

# Evaluate the saved Random Forest model.
# The model path is relative to the current working directory.
rf_model_path = "models/random_forest/model_binary.pkl"
sys.argv = ["evaluation.py", "--model-path", rf_model_path]
evaluation_main()

In [None]:
%%time

# Evaluate the saved XGBoost model.
# The model path is relative to the current working directory.
xgb_model_path = "models/xgboost/model_binary.pkl"
sys.argv = ["evaluation.py", "--model-path", xgb_model_path]
evaluation_main()

In [None]:
%%time

# Evaluate the saved LightGBM model.
# The model path is relative to the current working directory.
lgbm_model_path = "models/lightgbm/model_binary.pkl"
sys.argv = ["evaluation.py", "--model-path", lgbm_model_path]
evaluation_main()

## 6. Confronto e Selezione Best Model

In [None]:
from src.compare_models import main as compare_main

In [None]:
%%time

# Compare the trained models with the scorecard.
# Same as the CLI call:
#   python src/compare_models.py --max-fpr 0.02 --max-latency-ms 1.0
# NOTE(review): an earlier version of this comment cited --max-fpr 0.01;
# confirm that 0.02 is the intended false-positive-rate threshold.

compare_cli = ["--max-fpr", "0.02", "--max-latency-ms", "1.0"]
sys.argv = ["compare_models.py", *compare_cli]

compare_main()

In [None]:
# Inspect the contents of models/best_model (presumably written by the
# comparison step; verify against src/compare_models.py).
best_model_dir = PROJECT_ROOT / "models" / "best_model"
metadata_file = best_model_dir / "metadata.json"

if best_model_dir.exists():
    print(f"Best model salvato in: {best_model_dir}")
    print("\nFile:")
    for entry in best_model_dir.iterdir():
        print(f"  - {entry.name}")

    # Show the winner recorded in the metadata, when the file is present.
    if metadata_file.exists():
        with metadata_file.open() as fh:
            metadata = json.load(fh)
        print(f"\nBest model: {metadata.get('best_model', 'N/A')}")

## 7. Timing Report

In [None]:
from src.timing import main as timing_main

In [None]:
# Ask the timing module for its report; a failure here is non-fatal and is
# only printed.
# NOTE(review): SystemExit is not an Exception subclass, so an exit raised
# inside timing_main() (e.g. by an argument parser) would still propagate.
timing_cli = ["--report"]
sys.argv = ["timing.py", *timing_cli]

try:
    timing_main()
except Exception as err:
    print(f"Timing report non disponibile: {err}")

## 8. Visualizzazioni

In [None]:
import matplotlib.pyplot as plt
from pathlib import Path
from IPython.display import Image, display

# Display the charts generated by the earlier steps.
reports_dir = PROJECT_ROOT / "reports"

# Scorecard comparison image; skipped when the file does not exist.
scorecard_img = PROJECT_ROOT.joinpath("models", "best_model", "scorecard_comparison.png")
if scorecard_img.exists():
    print("Scorecard Comparison:")
    display(Image(filename=str(scorecard_img)))

In [None]:
# Show the confusion matrix, ROC curve and feature importance of the best model.
best_model_name = "lightgbm"  # fallback; change if a different model won

# Prefer the name recorded in the best-model metadata when it is available.
metadata_file = PROJECT_ROOT / "models" / "best_model" / "metadata.json"
if metadata_file.exists():
    with metadata_file.open() as fh:
        best_model_name = json.load(fh).get('best_model', 'lightgbm')

report_dir = reports_dir / best_model_name

REPORT_IMAGES = (
    "confusion_matrix_binary.png",
    "roc_curve_binary.png",
    "feature_importance_binary.png",
)

if report_dir.exists():
    for img_name in REPORT_IMAGES:
        img_path = report_dir / img_name
        # Images that were not generated are skipped without a message.
        if not img_path.exists():
            continue
        print(f"\n{img_name}:")
        display(Image(filename=str(img_path)))

## 9. Download Output (Kaggle/Colab)

In [None]:
if ENV in ["kaggle", "colab"]:
    import zipfile
    import shutil

    def _zip_tree(zipf, root, arc_prefix):
        """Add every regular file below *root* to *zipf* under *arc_prefix*.

        The sub-folder structure below *root* is preserved in the archive
        names. Directories themselves are not written as entries, and a
        *root* that does not exist is skipped.
        """
        if not root.is_dir():
            return
        # Sorted so that the archive layout is deterministic across runs.
        for path in sorted(root.rglob("*")):
            if path.is_file():
                rel_path = path.relative_to(root).as_posix()
                zipf.write(path, f"{arc_prefix}/{rel_path}")

    # Bundle all pipeline outputs into a single ZIP in the project root.
    zip_path = PROJECT_ROOT / "nids_ml_output.zip"

    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        # Artifacts
        _zip_tree(zipf, PROJECT_ROOT / "artifacts", "artifacts")

        # Models: one archive folder per model directory. Files placed
        # directly under models/ are not archived.
        for model_dir in sorted((PROJECT_ROOT / "models").iterdir()):
            if model_dir.is_dir():
                _zip_tree(zipf, model_dir, f"models/{model_dir.name}")

        # Reports
        _zip_tree(zipf, PROJECT_ROOT / "reports", "reports")

        # Timing logs (optional: the folder may not exist)
        _zip_tree(zipf, PROJECT_ROOT / "logs" / "timing", "logs/timing")

    zip_size = zip_path.stat().st_size / (1024 * 1024)
    print(f"ZIP creato: {zip_path}")
    print(f"Dimensione: {zip_size:.1f} MB")
    print(f"\nScarica dalla tab 'Output' (Kaggle) o Files (Colab)")
else:
    print("Ambiente locale - output gia disponibile nella cartella del progetto.")

## 10. Riepilogo Finale

In [None]:
# Imported here so that this cell also works when run on its own.
import json

print("="*70)
print("PIPELINE COMPLETATA")
print("="*70)

# Show the metrics of the best model, when the comparison results exist.
best_report = PROJECT_ROOT / "models" / "best_model" / "comparison_results.json"
if best_report.exists():
    with open(best_report) as f:
        results = json.load(f)

    # Pick the entry with the highest score (entries without one count as 0).
    # default=None keeps an empty result set from raising ValueError.
    # NOTE(review): assumes results is a list of dicts; verify against the
    # code that writes comparison_results.json.
    best = max(results, key=lambda x: x.get('score', 0), default=None)

    if best is None:
        print("\nNessun risultato di confronto disponibile.")
    else:
        print(f"\nBest Model: {best['model_name'].upper()}")

        # Only float values are printed; other value types are skipped.
        for title, section in (("Metriche", "metrics"), ("Latenza", "latency")):
            print(f"\n{title}:")
            for k, v in best.get(section, {}).items():
                if isinstance(v, float):
                    print(f"  {k:25}: {v:.4f}")

print("\n" + "="*70)
print("Per usare lo sniffer in locale:")
print("  sudo python src/sniffer.py --interface eth0 --verbose")
print("="*70)