# NIDS-ML: Network Intrusion Detection System

Pipeline completa: preprocessing, feature engineering, training, evaluation e test su PCAP.

**Requisiti Kaggle**:
- Dataset CIC-IDS2017 (CSV)
- Dataset `cicids2017-test-pcap` (opzionale, per test PCAP)

## 1. Setup Ambiente

In [None]:
import os
import sys
from pathlib import Path

# Detect the runtime from well-known mount points and derive, for each case,
# the project root and the directory where input datasets are looked up.
if Path("/kaggle/input").exists():
    ENV = "kaggle"
    PROJECT_ROOT = Path("/kaggle/working")
    DATA_INPUT = Path("/kaggle/input")
elif Path("/content").exists():
    ENV = "colab"
    PROJECT_ROOT = Path("/content/NIDS-ML")
    DATA_INPUT = Path("/content")
else:
    ENV = "local"
    PROJECT_ROOT = Path.cwd()
    # No src/ in the current folder: fall back to the parent directory.
    if not (PROJECT_ROOT / "src").exists():
        PROJECT_ROOT = PROJECT_ROOT.parent
    DATA_INPUT = PROJECT_ROOT / "data" / "raw"

print(f"Ambiente: {ENV}")
print(f"Project root: {PROJECT_ROOT}")
print(f"Data input: {DATA_INPUT}")

# Nothing before this point creates PROJECT_ROOT (e.g. /content/NIDS-ML), so
# create it here; otherwise os.chdir below raises FileNotFoundError.
PROJECT_ROOT.mkdir(parents=True, exist_ok=True)

# Keep the cell idempotent: do not stack duplicate entries when it is re-run.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))
os.chdir(PROJECT_ROOT)

In [None]:
# Git repository the project sources are cloned from on hosted runtimes.
REPO_URL = "https://github.com/Riiccardob/NIDS-ML-SSR2"

# On Kaggle/Colab, fetch the project once; a present src/ folder means it is already there.
if ENV in ["kaggle", "colab"]:
    if not (PROJECT_ROOT / "src").exists():
        print(f"Cloning project from {REPO_URL}...")
        # Clone into a scratch folder, copy its contents into PROJECT_ROOT, then remove the scratch copy.
        # NOTE(review): the `temp_repo/*` glob normally skips dotfiles — confirm none are required.
        !git clone {REPO_URL} temp_repo
        !cp -r temp_repo/* {PROJECT_ROOT}/
        !rm -rf temp_repo
        print("Project cloned.")
        # Dependencies are installed only right after a fresh clone.
        if (PROJECT_ROOT / "requirements.txt").exists():
            !pip install -q -r {PROJECT_ROOT}/requirements.txt
    else:
        print("Project already loaded.")

# In every environment, list the modules found in src/ or stop if the folder is missing.
if (PROJECT_ROOT / "src").exists():
    print(f"\nModuli disponibili:")
    for f in sorted((PROJECT_ROOT / "src").glob("*.py")):
        print(f"  - {f.name}")
else:
    raise FileNotFoundError("Cartella src/ non trovata!")

In [None]:
# Create every working folder under PROJECT_ROOT; existing folders are left untouched.
for subdir in ("data/raw", "data/processed", "artifacts", "models", "logs", "reports"):
    target_dir = PROJECT_ROOT / subdir
    target_dir.mkdir(parents=True, exist_ok=True)
print("Directory create.")

In [None]:
if ENV in ["kaggle", "colab"]:
    import shutil

    # Substrings (lower-case) used to recognise the CSV dataset folder under DATA_INPUT.
    dataset_patterns = ["cicids2017", "cic-ids-2017", "cicids", "ids2017", "network-intrusion-dataset"]
    dataset_path = None
    for p in DATA_INPUT.iterdir():
        name_lower = p.name.lower()
        # Entries with "pcap" in their name are handled by the PCAP test section.
        if "pcap" in name_lower:
            continue
        if any(pat in name_lower for pat in dataset_patterns):
            # Stop at the first match; any() avoids building the full file list.
            if any(p.glob("**/*.csv")):
                dataset_path = p
                break
    if dataset_path:
        print(f"Dataset CSV trovato: {dataset_path}")
        raw_dir = PROJECT_ROOT / "data" / "raw"
        raw_dir.mkdir(parents=True, exist_ok=True)
        for csv_path in dataset_path.glob("**/*.csv"):
            dest = raw_dir / csv_path.name
            # Skip files copied by a previous run.
            if not dest.exists():
                # shutil.copy needs no shell quoting and raises if the copy fails.
                shutil.copy(csv_path, dest)
        print(f"CSV copiati in: {raw_dir}")
    else:
        print("ERRORE: Dataset CIC-IDS2017 non trovato!")

# Report what is available in data/raw, whatever the environment.
csv_files = list((PROJECT_ROOT / "data" / "raw").glob("*.csv"))
print(f"\nCSV disponibili: {len(csv_files)}")
for f in csv_files:
    size_mb = f.stat().st_size / (1024**2)
    print(f"  - {f.name}: {size_mb:.1f} MB")

## 2. Preprocessing

In [None]:
from src.preprocessing import main as preprocessing_main
from src.preprocessing import load_processed_data

In [None]:
%%time
import sys
# Emulate a command-line call: the entry point is invoked without arguments, so it
# presumably reads these options from sys.argv — TODO confirm in src/preprocessing.py.
sys.argv = ['preprocessing.py', '--balance-ratio', '2.0', '--n-jobs', '4']
preprocessing_main()

In [None]:
# Load the three splits plus the mappings object and print the size of each split.
train, val, test, mappings = load_processed_data()
print(f"Train: {len(train):,} | Val: {len(val):,} | Test: {len(test):,}")

## 3. Feature Engineering

In [None]:
from src.feature_engineering import main as feature_engineering_main
from src.feature_engineering import load_artifacts

In [None]:
%%time
# Emulate a command-line call to the feature-engineering entry point
# (options are presumably parsed from sys.argv — TODO confirm in src/feature_engineering.py).
sys.argv = ['feature_engineering.py', '--n-features', '30', '--rf-estimators', '100', '--n-jobs', '4']
feature_engineering_main()

In [None]:
# Load the saved feature-engineering artifacts and summarise them.
scaler, selected_features, importances, scaler_columns = load_artifacts()
print(f"Feature selezionate: {len(selected_features)}")
print(f"Colonne scaler: {len(scaler_columns)}")
print(f"\nTop 10 feature:")
# Print the first ten selected features, numbered from 1, with their importance value.
for rank, feature_name in enumerate(selected_features[:10], start=1):
    importance = importances[feature_name]
    print(f"  {rank:2}. {feature_name}: {importance:.4f}")

## 4. Training Modelli (Definitivo)

Parametri: `--n-iter 50 --cv 5` (250 fit per modello)

In [None]:
from src.training.random_forest import main as rf_main
from src.training.xgboost_model import main as xgb_main
from src.training.lightgbm_model import main as lgbm_main

In [None]:
%%time
# Random Forest: binary task, run through the module's CLI-style entry point
# (arguments are presumably parsed from sys.argv — TODO confirm in src/training/random_forest.py).
sys.argv = ['random_forest.py', '--task', 'binary', '--n-iter', '50', '--cv', '5', '--n-jobs', '4']
rf_main()

In [None]:
%%time
# XGBoost: binary task, same CLI-style invocation and options as the other models.
sys.argv = ['xgboost_model.py', '--task', 'binary', '--n-iter', '50', '--cv', '5', '--n-jobs', '4']
xgb_main()

In [None]:
%%time
# LightGBM: binary task, same CLI-style invocation and options as the other models.
sys.argv = ['lightgbm_model.py', '--task', 'binary', '--n-iter', '50', '--cv', '5', '--n-jobs', '4']
lgbm_main()

In [None]:
import json

# Print the validation accuracy and F1 stored for each model folder.
print("Modelli addestrati:")
for model_dir in (PROJECT_ROOT / "models").iterdir():
    # Skip plain files and the best_model folder.
    if not model_dir.is_dir() or model_dir.name == "best_model":
        continue
    results_file = model_dir / "results_binary.json"
    if not results_file.exists():
        continue
    with open(results_file) as results_fh:
        results = json.load(results_fh)
    # Missing metrics are shown as 0.
    metrics = results.get('validation_metrics', {})
    print(f"\n  {model_dir.name}: Acc={metrics.get('accuracy', 0):.4f}, F1={metrics.get('f1', 0):.4f}")

## 5. Evaluation

In [None]:
from src.evaluation import main as evaluation_main

In [None]:
%%time
# Evaluate the saved Random Forest binary model (the path is relative to the working directory).
sys.argv = ['evaluation.py', '--model-path', 'models/random_forest/model_binary.pkl']
evaluation_main()

In [None]:
%%time
# Evaluate the saved XGBoost binary model (the path is relative to the working directory).
sys.argv = ['evaluation.py', '--model-path', 'models/xgboost/model_binary.pkl']
evaluation_main()

In [None]:
%%time
# Evaluate the saved LightGBM binary model (the path is relative to the working directory).
sys.argv = ['evaluation.py', '--model-path', 'models/lightgbm/model_binary.pkl']
evaluation_main()

## 6. Confronto e Selezione Best Model

In [None]:
from src.compare_models import main as compare_main

In [None]:
%%time
# Run the model comparison with the two limits passed on the command line
# (their exact meaning is defined in src/compare_models.py — not visible here).
sys.argv = ['compare_models.py', '--max-fpr', '0.02', '--max-latency-ms', '1.0']
compare_main()

In [None]:
# List the contents of models/best_model and, if metadata.json exists, the winning model's name.
best_model_dir = PROJECT_ROOT / "models" / "best_model"
if best_model_dir.exists():
    print(f"Best model: {best_model_dir}")
    for entry in best_model_dir.iterdir():
        print(f"  - {entry.name}")
    metadata_file = best_model_dir / "metadata.json"
    if metadata_file.exists():
        with open(metadata_file) as metadata_fh:
            metadata = json.load(metadata_fh)
        # 'N/A' is printed when the metadata has no best_model entry.
        winner = metadata.get('best_model', 'N/A')
        print(f"\nBest: {winner}")

## 7. Visualizzazioni

In [None]:
import matplotlib.pyplot as plt
from IPython.display import Image, display

# Display the scorecard image stored in models/best_model, if the file exists.
scorecard_img = PROJECT_ROOT / "models" / "best_model" / "scorecard_comparison.png"
if scorecard_img.exists():
    print("Scorecard:")
    display(Image(filename=str(scorecard_img)))

In [None]:
# Resolve the best model's name; "lightgbm" is used when metadata.json or its entry is missing.
best_model_name = "lightgbm"
metadata_file = PROJECT_ROOT / "models" / "best_model" / "metadata.json"
if metadata_file.exists():
    with open(metadata_file) as metadata_fh:
        metadata = json.load(metadata_fh)
    best_model_name = metadata.get('best_model', 'lightgbm')

# Display whichever of the two report images exist for that model.
report_dir = PROJECT_ROOT / "reports" / best_model_name
if report_dir.exists():
    for img_name in ("confusion_matrix_binary.png", "roc_curve_binary.png"):
        img_path = report_dir / img_name
        if not img_path.exists():
            continue
        print(f"\n{img_name}:")
        display(Image(filename=str(img_path)))

---
## 8. Test Sniffer su PCAP

Dataset: `cicids2017-test-pcap` con `test_pcap/*.pcap`

In [None]:
import numpy as np
import pandas as pd
import joblib
from collections import defaultdict
from tqdm.notebook import tqdm

# Substrings (lower-case) that mark an input folder as a PCAP dataset.
pcap_patterns = ["pcap", "test-pcap", "test_pcap"]
pcap_dataset_path = None
pcap_files = []

# PCAP datasets are searched only on hosted runtimes; the first folder with captures wins.
if ENV in ["kaggle", "colab"]:
    for candidate in DATA_INPUT.iterdir():
        candidate_name = candidate.name.lower()
        if not any(pat in candidate_name for pat in pcap_patterns):
            continue
        found = [*candidate.glob("**/*.pcap"), *candidate.glob("**/*.pcapng")]
        if found:
            pcap_dataset_path = candidate
            pcap_files = found
            break

if not pcap_files:
    print("Dataset PCAP non trovato (opzionale).")
else:
    print(f"Dataset PCAP: {pcap_dataset_path}")
    for capture in pcap_files:
        print(f"  - {capture.name}: {capture.stat().st_size/(1024**2):.1f} MB")

In [None]:
# Load the model and artifacts only when there are PCAP files to analyse.
if pcap_files:
    print("Caricamento modello...")
    _best_dir = PROJECT_ROOT / "models" / "best_model"
    model = joblib.load(_best_dir / "model_binary.pkl")
    scaler, selected_features, _, scaler_columns = load_artifacts()
    # A feature list saved next to the model overrides the one from the artifacts.
    model_feat_path = _best_dir / "features_binary.json"
    if model_feat_path.exists():
        with open(model_feat_path) as features_fh:
            selected_features = json.load(features_fh)
    print(f"  Features: {len(selected_features)}")

In [None]:
class LightweightFlow:
    """Accumulate per-packet data for one bidirectional flow and derive flow features.

    Packets are added with :meth:`add_packet`; :meth:`extract_features` turns
    the accumulated lengths, timestamps and TCP flag counts into a flat dict.
    The feature names are the keys looked up against the scaler columns in
    ``predict_flow`` (presumably the CIC-IDS2017 CSV column names).
    """

    def __init__(self, src_ip, dst_ip, src_port, dst_port, protocol):
        """Create an empty flow for the given endpoints (source = forward side)."""
        self.src_ip, self.dst_ip = src_ip, dst_ip
        self.src_port, self.dst_port = src_port, dst_port
        self.protocol = protocol
        # Packet lengths and timestamps, kept separately per direction.
        self.fwd_lengths, self.bwd_lengths = [], []
        self.fwd_times, self.bwd_times = [], []
        # Count of packets carrying each TCP flag letter (F, S, R, P, A, U).
        self.tcp_flags = defaultdict(int)
        self.start_time = self.end_time = None

    @property
    def flow_id(self):
        """Return a printable ``src:port->dst:port`` identifier."""
        return f"{self.src_ip}:{self.src_port}->{self.dst_ip}:{self.dst_port}"

    @property
    def total_packets(self):
        """Return the number of packets seen in both directions."""
        return len(self.fwd_lengths) + len(self.bwd_lengths)

    def add_packet(self, pkt_len, ts, is_fwd, flags=None):
        """Record one packet.

        Args:
            pkt_len: Packet length in bytes.
            ts: Packet timestamp in seconds.
            is_fwd: True for the forward direction, False for backward.
            flags: Optional string of TCP flag letters (e.g. ``"SA"``).
        """
        if is_fwd:
            self.fwd_lengths.append(pkt_len)
            self.fwd_times.append(ts)
        else:
            self.bwd_lengths.append(pkt_len)
            self.bwd_times.append(ts)
        # Track the earliest and latest timestamps so an out-of-order packet
        # cannot yield a negative duration (the IAT code sorts for the same reason).
        if self.start_time is None or ts < self.start_time:
            self.start_time = ts
        if self.end_time is None or ts > self.end_time:
            self.end_time = ts
        if flags:
            for letter in 'FSRPAU':
                if letter in flags:
                    self.tcp_flags[letter] += 1

    @staticmethod
    def _std(values):
        """Population standard deviation, or 0 when fewer than two values."""
        return np.std(values, ddof=0) if len(values) > 1 else 0

    @staticmethod
    def _iat(times):
        """Return consecutive differences of ``times``; ``[0]`` when fewer than two."""
        if len(times) < 2:
            return [0]
        return [later - earlier for earlier, later in zip(times, times[1:])]

    def extract_features(self):
        """Return the flow's features as a ``{feature name: value}`` dict.

        Durations and inter-arrival times are scaled by 1e6 (seconds to
        microseconds); rates are per second. Header lengths, initial window
        sizes and Active/Idle statistics are fixed placeholder values.
        """
        feat = {}
        # `is not None` so that a legitimate first timestamp of 0.0 still counts.
        dur = (self.end_time - self.start_time) if self.start_time is not None else 0
        feat['Flow Duration'] = dur * 1e6
        feat['Total Fwd Packets'] = feat['Subflow Fwd Packets'] = len(self.fwd_lengths)
        feat['Total Backward Packets'] = feat['Subflow Bwd Packets'] = len(self.bwd_lengths)

        # Per-direction statistics use a single 0 as a stand-in for an empty direction.
        fwd = self.fwd_lengths or [0]
        bwd = self.bwd_lengths or [0]
        feat['Total Length of Fwd Packets'] = feat['Subflow Fwd Bytes'] = sum(fwd)
        feat['Total Length of Bwd Packets'] = feat['Subflow Bwd Bytes'] = sum(bwd)
        feat['Fwd Packet Length Max'], feat['Fwd Packet Length Min'] = max(fwd), min(fwd)
        feat['Fwd Packet Length Mean'] = feat['Avg Fwd Segment Size'] = np.mean(fwd)
        feat['Fwd Packet Length Std'] = self._std(fwd)
        feat['Bwd Packet Length Max'], feat['Bwd Packet Length Min'] = max(bwd), min(bwd)
        feat['Bwd Packet Length Mean'] = feat['Avg Bwd Segment Size'] = np.mean(bwd)
        feat['Bwd Packet Length Std'] = self._std(bwd)

        # Whole-flow statistics must use the real packets only: the stand-in 0
        # of an empty direction would otherwise drag down mean/std/variance.
        all_len = (self.fwd_lengths + self.bwd_lengths) or [0]
        feat['Packet Length Mean'] = feat['Average Packet Size'] = np.mean(all_len)
        feat['Packet Length Std'] = self._std(all_len)
        feat['Packet Length Variance'] = np.var(all_len, ddof=0) if len(all_len) > 1 else 0
        feat['Max Packet Length'] = max(all_len)
        feat['Min Packet Length'] = min(all_len)

        if dur > 0:
            feat['Flow Bytes/s'] = sum(all_len) / dur
            feat['Flow Packets/s'] = self.total_packets / dur
            feat['Fwd Packets/s'] = len(self.fwd_lengths) / dur
            feat['Bwd Packets/s'] = len(self.bwd_lengths) / dur
        else:
            feat['Flow Bytes/s'] = feat['Flow Packets/s'] = 0
            feat['Fwd Packets/s'] = feat['Bwd Packets/s'] = 0

        # Inter-arrival times for the whole flow and for each direction.
        iat_groups = [
            ('Flow IAT', self._iat(sorted(self.fwd_times + self.bwd_times))),
            ('Fwd IAT', self._iat(sorted(self.fwd_times))),
            ('Bwd IAT', self._iat(sorted(self.bwd_times))),
        ]
        for prefix, gaps in iat_groups:
            # `gaps` is never empty: _iat returns [0] when there are no gaps.
            feat[f'{prefix} Mean'] = np.mean(gaps) * 1e6
            feat[f'{prefix} Std'] = self._std(gaps) * 1e6
            feat[f'{prefix} Max'] = max(gaps) * 1e6
            feat[f'{prefix} Min'] = min(gaps) * 1e6
            # Only the directional groups get a Total.
            if prefix != 'Flow IAT':
                feat[f'{prefix} Total'] = sum(gaps) * 1e6

        for letter, flag_name in [('F', 'FIN'), ('S', 'SYN'), ('R', 'RST'),
                                  ('P', 'PSH'), ('A', 'ACK'), ('U', 'URG')]:
            feat[f'{flag_name} Flag Count'] = self.tcp_flags.get(letter, 0)

        # Placeholders: a fixed 20 bytes of header per packet, a fixed initial
        # window, and no Active/Idle tracking.
        feat['Fwd Header Length'] = feat['Fwd Header Length.1'] = len(self.fwd_lengths) * 20
        feat['Bwd Header Length'] = len(self.bwd_lengths) * 20
        feat['Init_Win_bytes_forward'] = feat['Init_Win_bytes_backward'] = 65535
        for stat in ['Mean', 'Std', 'Max', 'Min']:
            feat[f'Active {stat}'] = feat[f'Idle {stat}'] = 0
        return feat

In [None]:
def predict_flow(flow, model, scaler, scaler_cols, sel_feat):
    """Classify a single flow.

    Args:
        flow: Object exposing ``extract_features()`` returning a feature dict.
        model: Estimator with ``predict`` and, optionally, ``predict_proba``.
        scaler: Fitted transformer with ``transform``.
        scaler_cols: Column names, in the order the scaler is fed.
        sel_feat: Subset of ``scaler_cols`` passed to the model.

    Returns:
        ``(label, probability)``: the predicted label as int and the second
        entry of ``predict_proba`` as float, or 0.5 when the model has no
        ``predict_proba``.
    """
    extracted = flow.extract_features()
    # Features the flow does not provide are fed to the scaler as 0.
    row = {col: extracted.get(col, 0) for col in scaler_cols}
    raw_frame = pd.DataFrame([row])
    scaled_frame = pd.DataFrame(scaler.transform(raw_frame), columns=scaler_cols)
    model_input = scaled_frame[sel_feat]

    label = int(model.predict(model_input)[0])
    if hasattr(model, 'predict_proba'):
        probability = float(model.predict_proba(model_input)[0][1])
    else:
        probability = 0.5
    return label, probability

def analyze_pcap(pcap_path, model, scaler, scaler_cols, sel_feat, timeout=60, min_pkt=2, interval=100000):
    """Stream a PCAP file, group IP packets into bidirectional flows and classify them.

    Args:
        pcap_path: Path object of the capture file.
        model, scaler, scaler_cols, sel_feat: Forwarded to ``predict_flow``.
        timeout: Seconds without traffic after which a flow is considered
            finished at the next periodic check.
        min_pkt: Flows with fewer packets than this are discarded unclassified.
        interval: Number of processed packets between progress reports and
            expiry checks.

    Returns:
        A dict with the keys ``pcap``, ``packets_processed``,
        ``flows_analyzed``, ``attacks_detected``, ``benign_detected`` and
        ``attack_flows`` (at most 100 samples), plus ``error`` when reading or
        classification failed part-way.
    """
    from scapy.all import PcapReader, IP, TCP, UDP

    max_attack_samples = 100
    flows = {}
    res = {'pcap': pcap_path.name, 'packets_processed': 0, 'flows_analyzed': 0, 'attacks_detected': 0, 'benign_detected': 0, 'attack_flows': []}

    def _classify(flow):
        """Score one finished flow and add the outcome to ``res``."""
        if flow.total_packets < min_pkt:
            return
        label, prob = predict_flow(flow, model, scaler, scaler_cols, sel_feat)
        res['flows_analyzed'] += 1
        if label == 1:
            res['attacks_detected'] += 1
            if len(res['attack_flows']) < max_attack_samples:
                res['attack_flows'].append({'flow_id': flow.flow_id, 'prob': prob, 'pkts': flow.total_packets})
        else:
            res['benign_detected'] += 1

    print(f"\n{'='*60}\nAnalisi: {pcap_path.name}\n{'='*60}")
    try:
        with PcapReader(str(pcap_path)) as reader:
            for pkt in reader:
                if not pkt.haslayer(IP):
                    continue
                ip = pkt[IP]
                src_ip, dst_ip, proto, pkt_len, ts = ip.src, ip.dst, ip.proto, len(pkt), float(pkt.time)
                # Non-TCP/UDP packets keep port 0 and carry no flags.
                src_port = dst_port = 0
                flags = None
                if pkt.haslayer(TCP):
                    src_port, dst_port, flags = pkt[TCP].sport, pkt[TCP].dport, str(pkt[TCP].flags)
                elif pkt.haslayer(UDP):
                    src_port, dst_port = pkt[UDP].sport, pkt[UDP].dport

                # The first packet of a flow defines its forward direction (the
                # CICFlowMeter convention); later packets match either orientation.
                key = (src_ip, dst_ip, src_port, dst_port, proto)
                reverse_key = (dst_ip, src_ip, dst_port, src_port, proto)
                if key in flows:
                    flow, is_fwd = flows[key], True
                elif reverse_key in flows:
                    flow, is_fwd = flows[reverse_key], False
                else:
                    flow = flows[key] = LightweightFlow(*key)
                    is_fwd = True
                flow.add_packet(pkt_len, ts, is_fwd, flags)

                res['packets_processed'] += 1
                if res['packets_processed'] % interval == 0:
                    print(f"  Pkts: {res['packets_processed']:,} | Flows: {len(flows):,} | Attacks: {res['attacks_detected']}")
                    # Expiry is checked only at progress intervals to keep the
                    # per-packet cost low; idle flows are classified and dropped.
                    expired = [k for k, f in flows.items() if f.end_time and (ts - f.end_time) > timeout]
                    for k in expired:
                        _classify(flows.pop(k))
        print(f"  Final flows: {len(flows):,}")
        # Classify whatever is still open at end of capture.
        for f in tqdm(flows.values(), desc="Analyzing"):
            _classify(f)
    except Exception as e:
        # Best effort at the notebook boundary: report the failure and return
        # the partial counters so the remaining captures can still be processed.
        print(f"  ERROR: {e}")
        res['error'] = str(e)
    return res

In [None]:
%%time
# Analyse each discovered capture and keep the per-file result dicts for the summary cells.
all_pcap_results = []
if not pcap_files:
    print("Nessun PCAP.")
else:
    for pcap_path in pcap_files:
        r = analyze_pcap(pcap_path, model, scaler, scaler_columns, selected_features)
        all_pcap_results.append(r)
        print(f"\n  {r['pcap']}: Pkts={r['packets_processed']:,} Flows={r['flows_analyzed']:,} Attacks={r['attacks_detected']:,}")

In [None]:
# Print totals and a per-capture table, then save the raw results as JSON.
if all_pcap_results:
    print("\n" + "="*70 + "\nRIEPILOGO PCAP\n" + "="*70)
    tot_pkt = sum(entry['packets_processed'] for entry in all_pcap_results)
    tot_fl = sum(entry['flows_analyzed'] for entry in all_pcap_results)
    tot_att = sum(entry['attacks_detected'] for entry in all_pcap_results)
    # With no analysed flows the percentage is undefined: print an empty line instead.
    if tot_fl > 0:
        print(f"\nTotale: Pkts={tot_pkt:,} Flows={tot_fl:,} Attacks={tot_att:,} ({tot_att/tot_fl*100:.1f}%)")
    else:
        print("")
    print(f"\n{'PCAP':<35} {'Packets':>12} {'Flows':>10} {'Attacks':>10} {'%':>8}")
    print("-"*77)
    for r in all_pcap_results:
        if r['flows_analyzed'] > 0:
            pct = r['attacks_detected']/r['flows_analyzed']*100
        else:
            pct = 0
        print(f"{r['pcap']:<35} {r['packets_processed']:>12,} {r['flows_analyzed']:>10,} {r['attacks_detected']:>10,} {pct:>7.1f}%")
    # default=str stringifies any value json cannot encode natively.
    with open(PROJECT_ROOT/"reports"/"pcap_results.json",'w') as out_fh:
        json.dump(all_pcap_results,out_fh,indent=2,default=str)
    print(f"\nSalvato: reports/pcap_results.json")

## 9. Download Output

In [None]:
# On hosted runtimes, bundle artifacts, models and reports into one ZIP in PROJECT_ROOT.
if ENV not in ["kaggle", "colab"]:
    print("Locale - output in cartella progetto.")
else:
    import zipfile
    zip_path = PROJECT_ROOT / "nids_ml_output.zip"
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as z:
        # artifacts/: direct children only.
        for item in (PROJECT_ROOT/"artifacts").glob("*"):
            z.write(item, f"artifacts/{item.name}")
        # models/: direct children of each model folder.
        for model_folder in (PROJECT_ROOT/"models").iterdir():
            if not model_folder.is_dir():
                continue
            for item in model_folder.glob("*"):
                z.write(item, f"models/{model_folder.name}/{item.name}")
        # reports/: every file, keeping the path relative to reports/.
        for item in (PROJECT_ROOT/"reports").rglob("*"):
            if item.is_file():
                z.write(item, f"reports/{item.relative_to(PROJECT_ROOT/'reports')}")
    print(f"ZIP: {zip_path} ({zip_path.stat().st_size/(1024**2):.1f} MB)")

## 10. Riepilogo

In [None]:
# Closing banner: best model metrics (when the comparison file exists) and PCAP totals.
print("="*70 + "\nPIPELINE COMPLETATA\n" + "="*70)
best_report = PROJECT_ROOT/"models"/"best_model"/"comparison_results.json"
if best_report.exists():
    with open(best_report) as report_fh:
        results = json.load(report_fh)
    # Pick the entry with the highest 'score'; entries without one count as 0.
    best = max(results, key=lambda entry: entry.get('score',0))
    print(f"\nBest Model: {best['model_name'].upper()}")
    for metric_name, metric_value in best.get('metrics',{}).items():
        # Only float-valued metrics are printed.
        if isinstance(metric_value,float):
            print(f"  {metric_name}: {metric_value:.4f}")
if all_pcap_results:
    total_flows = sum(r['flows_analyzed'] for r in all_pcap_results)
    total_attacks = sum(r['attacks_detected'] for r in all_pcap_results)
    print(f"\nPCAP Test: {total_flows:,} flows, {total_attacks:,} attacks")
print("\n" + "="*70 + "\nComandi locali:\n  sudo python src/sniffer.py --interface eth0\n  sudo python src/sniffer.py --pcap file.pcap\n" + "="*70)