# NIDS-ML: Network Intrusion Detection System

Pipeline completa: preprocessing, feature engineering, training, evaluation e test su PCAP.

**Requisiti Kaggle**:
- Dataset CIC-IDS2017 (CSV)
- Dataset `cicids2017-test-pcap` (opzionale, per test PCAP)

## 1. Setup Ambiente

In [None]:
import os
import sys
from pathlib import Path

# Detect the runtime from well-known mount points and derive the project
# root and the data input folder from it.
if Path("/kaggle/input").exists():
    ENV = "kaggle"
    PROJECT_ROOT = Path("/kaggle/working")
    DATA_INPUT = Path("/kaggle/input")
elif Path("/content").exists():
    ENV = "colab"
    PROJECT_ROOT = Path("/content/NIDS-ML")
    DATA_INPUT = Path("/content")
else:
    ENV = "local"
    PROJECT_ROOT = Path.cwd()
    # When the notebook is started from a subfolder (no src/ next to it),
    # fall back to the parent directory as project root.
    if not (PROJECT_ROOT / "src").exists():
        PROJECT_ROOT = PROJECT_ROOT.parent
    DATA_INPUT = PROJECT_ROOT / "data" / "raw"

print(f"Ambiente: {ENV}")
print(f"Project root: {PROJECT_ROOT}")
print(f"Data input: {DATA_INPUT}")

# On a fresh Colab runtime /content/NIDS-ML does not exist yet, so
# os.chdir() would raise FileNotFoundError. Create the root first; this is
# a no-op when the directory is already there.
PROJECT_ROOT.mkdir(parents=True, exist_ok=True)

# Make `src` importable and resolve relative paths against the project root.
sys.path.insert(0, str(PROJECT_ROOT))
os.chdir(PROJECT_ROOT)

In [None]:
REPO_URL = "https://github.com/Riiccardob/NIDS-ML-SSR2"

if ENV in ["kaggle", "colab"]:
    if not (PROJECT_ROOT / "src").exists():
        print(f"Cloning project from {REPO_URL}...")
        !git clone {REPO_URL} temp_repo
        !cp -r temp_repo/* {PROJECT_ROOT}/
        !rm -rf temp_repo
        print("Project cloned.")
        if (PROJECT_ROOT / "requirements.txt").exists():
            !pip install -q -r {PROJECT_ROOT}/requirements.txt
    else:
        print("Project already loaded.")

if (PROJECT_ROOT / "src").exists():
    print(f"\nModuli disponibili:")
    for f in sorted((PROJECT_ROOT / "src").glob("*.py")):
        print(f"  - {f.name}")
else:
    raise FileNotFoundError("Cartella src/ non trovata!")

In [None]:
# Create the working folders under the project root (existing ones are kept).
workspace_dirs = ("data/raw", "data/processed", "artifacts", "models", "logs", "reports")
for rel_dir in workspace_dirs:
    target_dir = PROJECT_ROOT / rel_dir
    target_dir.mkdir(parents=True, exist_ok=True)
print("Directory create.")

In [None]:
# Locate the CIC-IDS2017 CSV dataset among the attached inputs and copy its
# CSV files into data/raw (hosted runtimes only).
if ENV in ["kaggle", "colab"]:
    import shutil  # local import: only needed for the copy below

    dataset_patterns = ["cicids2017", "cic-ids-2017", "cicids", "ids2017", "network-intrusion-dataset"]
    dataset_path = None
    for p in DATA_INPUT.iterdir():
        name_lower = p.name.lower()
        # PCAP datasets share the same name patterns; they are handled
        # separately in the sniffer test section.
        if "pcap" in name_lower:
            continue
        if any(pat in name_lower for pat in dataset_patterns):
            # any() stops at the first CSV instead of listing all of them.
            if any(p.glob("**/*.csv")):
                dataset_path = p
                break
    if dataset_path:
        print(f"Dataset CSV trovato: {dataset_path}")
        raw_dir = PROJECT_ROOT / "data" / "raw"
        raw_dir.mkdir(parents=True, exist_ok=True)
        for csv_path in dataset_path.glob("**/*.csv"):
            dest = raw_dir / csv_path.name
            # Files already copied by a previous run are left untouched.
            if not dest.exists():
                # Pure-Python copy: no shell quoting issues with unusual
                # file names and no dependency on IPython's `!` syntax.
                shutil.copyfile(csv_path, dest)
        print(f"CSV copiati in: {raw_dir}")
    else:
        print("ERRORE: Dataset CIC-IDS2017 non trovato!")

# List whatever CSV files are now available, in every environment.
csv_files = list((PROJECT_ROOT / "data" / "raw").glob("*.csv"))
print(f"\nCSV disponibili: {len(csv_files)}")
for f in csv_files:
    size_mb = f.stat().st_size / (1024**2)
    print(f"  - {f.name}: {size_mb:.1f} MB")

## 2. Preprocessing

In [None]:
from src.preprocessing import main as preprocessing_main
from src.preprocessing import load_processed_data

In [None]:
%%time
import sys
sys.argv = ['preprocessing.py', '--balance-ratio', '2.0', '--n-jobs', '4']
preprocessing_main()

In [None]:
# Load the three splits plus the mappings and report the size of each split.
train, val, test, mappings = load_processed_data()
split_summary = f"Train: {len(train):,} | Val: {len(val):,} | Test: {len(test):,}"
print(split_summary)

## 3. Feature Engineering

In [None]:
from src.feature_engineering import main as feature_engineering_main
from src.feature_engineering import load_artifacts

In [None]:
%%time
sys.argv = ['feature_engineering.py', '--n-features', '30', '--rf-estimators', '100', '--n-jobs', '4']
feature_engineering_main()

In [None]:
# Load the feature-engineering artifacts and print a short summary: counts
# first, then the first ten selected features with their importance.
scaler, selected_features, importances, scaler_columns = load_artifacts()
print(f"Feature selezionate: {len(selected_features)}")
print(f"Colonne scaler: {len(scaler_columns)}")
print(f"\nTop 10 feature:")
for rank, feature_name in enumerate(selected_features[:10], start=1):
    print(f"  {rank:2}. {feature_name}: {importances[feature_name]:.4f}")

## 4. Training Modelli (Definitivo)

Parametri: `--n-iter 50 --cv 5` (250 fit per modello)

In [None]:
from src.training.random_forest import main as rf_main
from src.training.xgboost_model import main as xgb_main
from src.training.lightgbm_model import main as lgbm_main

In [None]:
%%time
# Random Forest
sys.argv = ['random_forest.py', '--task', 'binary', '--n-iter', '50', '--cv', '5', '--n-jobs', '4']
rf_main()

In [None]:
%%time
# XGBoost
sys.argv = ['xgboost_model.py', '--task', 'binary', '--n-iter', '50', '--cv', '5', '--n-jobs', '4']
xgb_main()

In [None]:
%%time
# LightGBM
sys.argv = ['lightgbm_model.py', '--task', 'binary', '--n-iter', '50', '--cv', '5', '--n-jobs', '4']
lgbm_main()

In [None]:
import json

# Print validation accuracy and F1 for every model folder that contains a
# results_binary.json file.
print("Modelli addestrati:")
for model_dir in (PROJECT_ROOT / "models").iterdir():
    # Plain files and the best_model folder are skipped.
    if not model_dir.is_dir() or model_dir.name == "best_model":
        continue
    results_file = model_dir / "results_binary.json"
    if not results_file.exists():
        continue
    with open(results_file) as f:
        results = json.load(f)
    metrics = results.get('validation_metrics', {})
    accuracy = metrics.get('accuracy', 0)
    f1_score = metrics.get('f1', 0)
    print(f"\n  {model_dir.name}: Acc={accuracy:.4f}, F1={f1_score:.4f}")

## 5. Evaluation

In [None]:
from src.evaluation import main as evaluation_main

In [None]:
%%time
sys.argv = ['evaluation.py', '--model-path', 'models/random_forest/model_binary.pkl']
evaluation_main()

In [None]:
%%time
sys.argv = ['evaluation.py', '--model-path', 'models/xgboost/model_binary.pkl']
evaluation_main()

In [None]:
%%time
sys.argv = ['evaluation.py', '--model-path', 'models/lightgbm/model_binary.pkl']
evaluation_main()

## 6. Confronto e Selezione Best Model

In [None]:
from src.compare_models import main as compare_main

In [None]:
%%time
# max-fpr: False Positive Rate massimo accettabile (2%)
# max-latency-ms: Latenza massima per predizione (1ms)
sys.argv = ['compare_models.py', '--max-fpr', '0.02', '--max-latency-ms', '1.0']
compare_main()

In [None]:
# List the content of models/best_model and, when metadata.json is present,
# print the name stored under its 'best_model' key.
best_model_dir = PROJECT_ROOT / "models" / "best_model"
if best_model_dir.exists():
    print(f"Best model: {best_model_dir}")
    for entry in best_model_dir.iterdir():
        print(f"  - {entry.name}")
    metadata_file = best_model_dir / "metadata.json"
    if metadata_file.exists():
        with open(metadata_file) as f:
            metadata = json.load(f)
        print(f"\nBest: {metadata.get('best_model', 'N/A')}")

## 7. Visualizzazioni

In [None]:
import matplotlib.pyplot as plt
from IPython.display import Image, display

# Show the comparison scorecard inline when the image file exists.
scorecard_img = PROJECT_ROOT.joinpath("models", "best_model", "scorecard_comparison.png")
if scorecard_img.exists():
    print("Scorecard:")
    scorecard_picture = Image(filename=str(scorecard_img))
    display(scorecard_picture)

In [None]:
# Resolve the best model name: "lightgbm" unless metadata.json names another.
metadata_file = PROJECT_ROOT / "models" / "best_model" / "metadata.json"
best_model_name = "lightgbm"
if metadata_file.exists():
    with open(metadata_file) as f:
        metadata = json.load(f)
    best_model_name = metadata.get('best_model', 'lightgbm')

# Display the evaluation plots of that model, skipping any missing image.
report_dir = PROJECT_ROOT / "reports" / best_model_name
if report_dir.exists():
    plot_names = ["confusion_matrix_binary.png", "roc_curve_binary.png"]
    for img_path in (report_dir / plot_name for plot_name in plot_names):
        if not img_path.exists():
            continue
        print(f"\n{img_path.name}:")
        display(Image(filename=str(img_path)))

---
## 8. Test Sniffer su PCAP

Usa la funzione `analyze_pcap_file()` da `src/sniffer.py`.

Dataset: `cicids2017-test-pcap` con `test_pcap/*.pcap`

In [None]:
# Look for a PCAP dataset among the attached inputs (hosted runtimes only).
pcap_patterns = ["pcap", "test-pcap", "test_pcap"]
pcap_dataset_path = None
pcap_files = []

if ENV in ["kaggle", "colab"]:
    for candidate in DATA_INPUT.iterdir():
        candidate_name = candidate.name.lower()
        if not any(pat in candidate_name for pat in pcap_patterns):
            continue
        # Keep the first matching entry that actually contains captures.
        captures = [*candidate.glob("**/*.pcap"), *candidate.glob("**/*.pcapng")]
        if captures:
            pcap_dataset_path = candidate
            pcap_files = captures
            break

if not pcap_files:
    print("Dataset PCAP non trovato (opzionale).")
    print("Per testare, aggiungi dataset 'cicids2017-test-pcap' al notebook.")
else:
    print(f"Dataset PCAP: {pcap_dataset_path}")
    for capture in pcap_files:
        capture_mb = capture.stat().st_size / (1024**2)
        print(f"  - {capture.name}: {capture_mb:.1f} MB")

In [None]:
# Import analyze_pcap_file from src/sniffer.py, but only when PCAP files were
# found: the name stays undefined otherwise, and the analysis cell guards on
# the same pcap_files condition before calling it.
if pcap_files:
    from src.sniffer import analyze_pcap_file
    print("Funzione analyze_pcap_file importata da src/sniffer.py")

In [None]:
%%time
# Analizza tutti i PCAP usando la funzione del progetto
all_pcap_results = []

if pcap_files:
    for pcap_path in pcap_files:
        print(f"\n\n{'#'*70}")
        print(f"# PCAP: {pcap_path.name}")
        print(f"{'#'*70}")
        
        # Usa la funzione analyze_pcap_file da sniffer.py
        result = analyze_pcap_file(
            pcap_path=str(pcap_path),
            model_path=None,  # Usa best_model di default
            threshold=0.5,    # Soglia standard
            timeout=60,       # Timeout flusso 60s
            min_packets=2,    # Minimo 2 pacchetti
            verbose=False,    # Non stampare ogni flusso
            progress_interval=50000  # Progress ogni 50k pacchetti
        )
        all_pcap_results.append(result)
else:
    print("Nessun PCAP da analizzare.")

In [None]:
# Final PCAP summary: overall totals, one table row per capture, and a JSON
# dump of the raw results under reports/.
if all_pcap_results:
    import json  # local import: this cell no longer relies on an earlier one

    print("\n" + "="*70)
    print("RIEPILOGO TEST PCAP")
    print("="*70)

    tot_pkt = sum(r['packets_processed'] for r in all_pcap_results)
    tot_fl = sum(r['flows_analyzed'] for r in all_pcap_results)
    tot_att = sum(r['attacks_detected'] for r in all_pcap_results)
    tot_ben = sum(r['benign_detected'] for r in all_pcap_results)

    # Guard the division so the attack line is still printed (with 0.0%)
    # when no flow was analysed, instead of an empty line.
    attack_pct = tot_att / tot_fl * 100 if tot_fl > 0 else 0.0

    print(f"\nTotale:")
    print(f"  Pacchetti:  {tot_pkt:,}")
    print(f"  Flussi:     {tot_fl:,}")
    print(f"  Attacchi:   {tot_att:,} ({attack_pct:.1f}%)")
    print(f"  Benigni:    {tot_ben:,}")

    print(f"\n{'PCAP':<35} {'Packets':>12} {'Flows':>10} {'Attacks':>10} {'Rate':>8}")
    print("-"*77)
    for r in all_pcap_results:
        # detection_rate is optional in a result; missing values show as 0.
        pct = r.get('detection_rate', 0)
        print(f"{r['pcap']:<35} {r['packets_processed']:>12,} {r['flows_analyzed']:>10,} {r['attacks_detected']:>10,} {pct:>7.1f}%")

    # Save results; default=str stringifies any value json cannot encode.
    results_path = PROJECT_ROOT / "reports" / "pcap_test_results.json"
    with open(results_path, 'w', encoding='utf-8') as f:
        json.dump(all_pcap_results, f, indent=2, default=str)
    print(f"\nRisultati salvati: {results_path}")

In [None]:
# Two bar charts over the PCAP results: detected attacks vs benign flows per
# capture, and the detection rate per capture. The figure is saved under
# reports/ and shown inline.
if all_pcap_results:
    import matplotlib.pyplot as plt

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    counts_ax, rate_ax = axes

    # Chart 1: attacks vs benign per PCAP (names truncated to 25 characters)
    pcap_names = []
    attacks = []
    benign = []
    for res in all_pcap_results:
        pcap_names.append(res['pcap'][:25])
    for res in all_pcap_results:
        attacks.append(res['attacks_detected'])
    for res in all_pcap_results:
        benign.append(res['benign_detected'])

    x = range(len(pcap_names))
    width = 0.35
    attack_positions = [pos - width/2 for pos in x]
    benign_positions = [pos + width/2 for pos in x]
    counts_ax.bar(attack_positions, attacks, width, label='Attacks', color='red', alpha=0.7)
    counts_ax.bar(benign_positions, benign, width, label='Benign', color='green', alpha=0.7)
    counts_ax.set_xticks(x)
    counts_ax.set_xticklabels(pcap_names, rotation=45, ha='right')
    counts_ax.set_ylabel('Flussi')
    counts_ax.set_title('Distribuzione Rilevamenti')
    counts_ax.legend()

    # Chart 2: detection rate, coloured by band (>30 red, >10 orange, else green)
    rates = [res.get('detection_rate', 0) for res in all_pcap_results]
    colors = []
    for rate in rates:
        if rate > 30:
            colors.append('red')
        elif rate > 10:
            colors.append('orange')
        else:
            colors.append('green')
    rate_ax.bar(x, rates, color=colors, alpha=0.7)
    rate_ax.set_xticks(x)
    rate_ax.set_xticklabels(pcap_names, rotation=45, ha='right')
    rate_ax.set_ylabel('Detection Rate (%)')
    rate_ax.set_title('Percentuale Attacchi Rilevati')
    rate_ax.axhline(y=20, color='orange', linestyle='--', alpha=0.5, label='20% atteso')
    rate_ax.legend()

    plt.tight_layout()
    chart_path = PROJECT_ROOT / "reports" / "pcap_test_chart.png"
    plt.savefig(chart_path, dpi=150)
    plt.show()

## 9. Download Output

In [None]:
# Bundle artifacts/, models/ and reports/ into a single ZIP for download
# (hosted runtimes only; locally the outputs already sit in the project).
if ENV in ["kaggle", "colab"]:
    import zipfile
    zip_path = PROJECT_ROOT / "nids_ml_output.zip"
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as z:
        # One recursive, files-only walk for all three folders. The previous
        # flat glob on artifacts/ and models/<name>/ stored nested folders as
        # empty directory entries and dropped the files inside them.
        for folder in ("artifacts", "models", "reports"):
            for f in sorted((PROJECT_ROOT / folder).rglob("*")):
                if f.is_file():
                    # Archive names keep the path relative to the project root.
                    z.write(f, f.relative_to(PROJECT_ROOT).as_posix())
    print(f"ZIP: {zip_path} ({zip_path.stat().st_size/(1024**2):.1f} MB)")
else:
    print("Locale - output in cartella progetto.")

## 10. Riepilogo

In [None]:
# Closing recap: best model metrics, PCAP totals and usage hints.
import json  # local import: this cell no longer relies on an earlier one

print("="*70)
print("PIPELINE COMPLETATA")
print("="*70)

best_report = PROJECT_ROOT / "models" / "best_model" / "comparison_results.json"
if best_report.exists():
    with open(best_report) as f:
        results = json.load(f)
    # The code below treats the file as a list of dicts with 'score',
    # 'model_name' and 'metrics' keys - TODO confirm against compare_models.
    # An empty list is skipped because max() would raise ValueError on it.
    if results:
        best = max(results, key=lambda x: x.get('score', 0))
        print(f"\nBest Model: {best['model_name'].upper()}")
        for k, v in best.get('metrics', {}).items():
            # Only float metrics are printed.
            if isinstance(v, float):
                print(f"  {k}: {v:.4f}")

if all_pcap_results:
    tot_fl = sum(r['flows_analyzed'] for r in all_pcap_results)
    tot_att = sum(r['attacks_detected'] for r in all_pcap_results)
    # Guard the division so the line is still printed (with 0.0%) when no
    # flow was analysed, instead of an empty line.
    attack_pct = tot_att / tot_fl * 100 if tot_fl > 0 else 0.0
    print(f"\nTest PCAP: {tot_fl:,} flussi, {tot_att:,} attacchi ({attack_pct:.1f}%)")

print("\n" + "="*70)
print("Comandi per uso locale:")
print("  # Live capture")
print("  sudo python src/sniffer.py --interface eth0 --verbose")
print("")
print("  # Analisi PCAP")
print("  sudo python src/sniffer.py --pcap file.pcap --threshold 0.3 --min-packets 1")
print("")
print("  # Da Python/notebook")
print("  from src.sniffer import analyze_pcap_file")
print("  results = analyze_pcap_file('file.pcap', threshold=0.3)")
print("="*70)