# imports


In [1]:

import torch 
import joblib
import pandas as pd
import seaborn as sns
import torch.nn as nn
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler


 # dataloader.py

In [2]:
import os
import joblib
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

def load_data(train_path, test_path, binary=True, features_file=None, top_k=None,
              artifacts_dir="../reports"):
    """
    Load and preprocess the NSL-KDD train/test files.

    Steps: read both CSV files with the fixed NSL-KDD column names, build the
    target column, label-encode the three categorical columns (fitted on the
    training data only), optionally restrict to a ranked feature list,
    standard-scale the features (fitted on the training data only) and persist
    the fitted encoders and scaler for later inference.

    Args:
        train_path: Path of the training CSV file (no header row).
        test_path: Path of the test CSV file (no header row).
        binary: If True the target is 0 for "normal" and 1 for anything else;
            otherwise the raw "label" column is used as the target.
        features_file: Optional CSV with a "feature" column listing feature
            names in ranked order. When None, all features are used.
        top_k: Optional number of leading rows of ``features_file`` to keep.
        artifacts_dir: Directory where "encoders.joblib" and "scaler.joblib"
            are written. Defaults to the previously hard-coded "../reports".

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)``. The X values are the
        outputs of ``StandardScaler``; the y values are pandas Series.

    Raises:
        ValueError: If a categorical column of the test set contains a value
            that never appears in the training set.
        KeyError: If ``features_file`` names a feature that is not a dataset
            column.
    """
    # NSL-KDD column names (the files ship without a header)
    columns = [
        "duration","protocol_type","service","flag","src_bytes","dst_bytes","land",
        "wrong_fragment","urgent","hot","num_failed_logins","logged_in","num_compromised",
        "root_shell","su_attempted","num_root","num_file_creations","num_shells","num_access_files",
        "num_outbound_cmds","is_host_login","is_guest_login","count","srv_count","serror_rate",
        "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate","diff_srv_rate",
        "srv_diff_host_rate","dst_host_count","dst_host_srv_count","dst_host_same_srv_rate",
        "dst_host_diff_srv_rate","dst_host_same_src_port_rate","dst_host_srv_diff_host_rate",
        "dst_host_serror_rate","dst_host_srv_serror_rate","dst_host_rerror_rate",
        "dst_host_srv_rerror_rate","label","difficulty"
    ]

    train = pd.read_csv(train_path, names=columns)
    test = pd.read_csv(test_path, names=columns)

    # Binary target instead of the multiclass attack name
    if binary:
        target = "binary_label"
        for frame in (train, test):
            # Vectorized equivalent of "0 if label == 'normal' else 1"
            frame[target] = (frame["label"] != "normal").astype("int64")
    else:
        target = "label"

    # Label-encode the categorical columns; each encoder is fitted on train only
    encoders = {}
    for col in ["protocol_type", "service", "flag"]:
        le = LabelEncoder()
        train[col] = le.fit_transform(train[col])
        try:
            test[col] = le.transform(test[col])
        except ValueError as err:
            # Same exception type as before, but say which column/file failed
            raise ValueError(
                f"Column '{col}' of {test_path!r} contains categories "
                f"not present in the training data: {err}"
            ) from err
        encoders[col] = le

    # Optional ranked feature list (keep only the first top_k entries if given)
    selected_features = None
    if features_file is not None:
        ranked = pd.read_csv(features_file)["feature"]
        if top_k is not None:
            ranked = ranked.head(top_k)
        selected_features = ranked.tolist()

    # When binary is False the target *is* "label"; avoid listing it twice
    drop_cols = ["label", "difficulty"]
    if target not in drop_cols:
        drop_cols.append(target)
    X_train = train.drop(columns=drop_cols)
    X_test = test.drop(columns=drop_cols)

    # An empty selection falls back to all features
    if selected_features:
        unknown = [name for name in selected_features if name not in X_train.columns]
        if unknown:
            raise KeyError(
                f"Features listed in {features_file!r} are not dataset columns: {unknown}"
            )
        X_train = X_train[selected_features]
        X_test = X_test[selected_features]

    y_train = train[target]
    y_test = test[target]

    # Scaling: statistics come from the training split only
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Persist encoders and scaler so inference can reuse the same transforms
    os.makedirs(artifacts_dir, exist_ok=True)
    joblib.dump(encoders, os.path.join(artifacts_dir, "encoders.joblib"))
    joblib.dump(scaler, os.path.join(artifacts_dir, "scaler.joblib"))

    return X_train, y_train, X_test, y_test


def preprocess_sample(
    sample,
    encoder_path="../reports/encoders.joblib",
    scaler_path="../reports/scaler.joblib",
    features=None
):
    """
    Preprocess a single sample (dict) for inference.

    The sample is turned into a one-row DataFrame, its categorical columns are
    encoded with the saved label encoders, the columns are reduced/reordered
    to the expected feature list and the saved scaler is applied.

    Args:
        sample: Mapping of column name -> raw value for one record.
        encoder_path: Path of the joblib file holding the dict of encoders.
        scaler_path: Path of the joblib file holding the fitted scaler.
        features: Ordered feature names to feed the scaler. When None, the
            feature names recorded by the scaler at fit time are used if the
            scaler exposes them and the sample contains all of them;
            otherwise the sample's columns are passed through unchanged.

    Returns:
        The output of ``scaler.transform`` for the one-row frame.

    Raises:
        KeyError: If ``features`` names a column that the sample lacks.
        ValueError: Propagated from the encoders for unseen categories, or
            from the scaler when the columns do not match those seen at fit.
    """
    # NOTE(security): joblib.load unpickles arbitrary objects; only point
    # these paths at artifacts produced by a trusted training run.
    encoders = joblib.load(encoder_path)
    scaler = joblib.load(scaler_path)

    df = pd.DataFrame([sample])

    # Encode categorical columns with the encoders fitted at training time
    for col in ["protocol_type", "service", "flag"]:
        if col in df.columns and col in encoders:
            df[col] = encoders[col].transform(df[col])

    if features is None:
        # Without an explicit list the dict's key order (and any extra keys)
        # would reach the scaler as-is. Align to the columns the scaler was
        # fitted on when it recorded them and the sample has all of them.
        fitted_names = getattr(scaler, "feature_names_in_", None)
        if fitted_names is not None and set(fitted_names) <= set(df.columns):
            features = list(fitted_names)

    if features is not None:
        features = list(features)
        missing = [name for name in features if name not in df.columns]
        if missing:
            raise KeyError(f"Sample is missing required feature(s): {missing}")
        df = df[features]

    return scaler.transform(df)


 # feature_selection.py

In [3]:


# === Dataset loading ===
import os  # needed below to make sure the reports directory exists

columns = [
    "duration","protocol_type","service","flag","src_bytes","dst_bytes","land",
    "wrong_fragment","urgent","hot","num_failed_logins","logged_in","num_compromised",
    "root_shell","su_attempted","num_root","num_file_creations","num_shells","num_access_files",
    "num_outbound_cmds","is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate","diff_srv_rate",
    "srv_diff_host_rate","dst_host_count","dst_host_srv_count","dst_host_same_srv_rate",
    "dst_host_diff_srv_rate","dst_host_same_src_port_rate","dst_host_srv_diff_host_rate",
    "dst_host_serror_rate","dst_host_srv_serror_rate","dst_host_rerror_rate",
    "dst_host_srv_rerror_rate","label","difficulty"
]

# NOTE(review): the training script reads "KDDTrain+.txt" (lower-case
# extension); on a case-sensitive filesystem only one spelling can exist.
train = pd.read_csv("../data/nsl-kdd/KDDTrain+.TXT", names=columns)

# === Preprocessing ===
# Binary target: 0 for "normal", 1 for every other label
train["binary_label"] = train["label"].apply(lambda x: 0 if x == "normal" else 1)

# One encoder object is refitted per column; the fitted mappings are not kept
encoder = LabelEncoder()
for col in ["protocol_type", "service", "flag"]:
    train[col] = encoder.fit_transform(train[col])

X = train.drop(columns=["label", "difficulty", "binary_label"])
y = train["binary_label"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# === Random Forest training ===
rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced")
rf.fit(X_scaled, y)

importances = rf.feature_importances_
feature_names = X.columns

# Rank features from most to least important
feature_importance = pd.DataFrame({
    "feature": feature_names,
    "importance": importances
}).sort_values(by="importance", ascending=False)

# === Saving results ===
# The reports directory may not exist yet when this script runs first
os.makedirs("../reports", exist_ok=True)
feature_importance.to_csv("../reports/feature_importance.csv", index=False)

plt.figure(figsize=(12,8))
# seaborn deprecates `palette` without `hue`; mapping hue to the y variable
# with legend=False gives the same plot without the warning.
sns.barplot(
    data=feature_importance.head(20),
    x="importance",
    y="feature",
    hue="feature",
    palette="viridis",
    legend=False,
)
plt.title("Top 20 Feature per importanza (Random Forest)", fontsize=14)
plt.xlabel("Importanza")
plt.ylabel("Feature")
plt.tight_layout()
plt.savefig("../reports/feature_importance.png")
plt.close()

print(" Feature importance salvata in reports/feature_importance.csv e reports/feature_importance.png")



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=feature_importance.head(20), x="importance", y="feature", palette="viridis")


 Feature importance salvata in reports/feature_importance.csv e reports/feature_importance.png


# torch_models.py


In [4]:
# src/torch_models.py


class MLPClassifier(nn.Module):
    """Multi-layer perceptron with two hidden stages and a linear output layer.

    Each hidden stage is Linear -> ReLU -> Dropout. The output layer returns
    raw scores of size ``output_dim`` (no softmax is applied here).

    Args:
        input_dim: Number of input features.
        hidden_dim: Width of both hidden layers.
        output_dim: Number of output scores.
        dropout: Dropout probability used after each hidden activation.
    """

    def __init__(self, input_dim, hidden_dim=64, output_dim=2, dropout=0.3):
        super().__init__()

        # Layer order (and therefore the "network.N" parameter names) must
        # stay fixed so that saved state dicts keep loading.
        stack = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, output_dim),
        ]
        self.network = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the output scores."""
        scores = self.network(x)
        return scores


# torch_train.py


In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

import numpy as np
import os


from dataloader import load_data
from torch_models import MLPClassifier

# === CONFIG ===
# Input files and feature-selection settings passed to load_data below.
TRAIN_PATH = "../data/nsl-kdd/KDDTrain+.txt"
TEST_PATH = "../data/nsl-kdd/KDDTest+.txt"
FEATURES_FILE = "../reports/feature_importance.csv"
TOP_K = 20

# Optimisation hyper-parameters.
BATCH_SIZE = 64
EPOCHS = 50
LEARNING_RATE = 1e-3
PATIENCE = 5  # epochs without validation-loss improvement before stopping

# Use the GPU when one is available, otherwise the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# === DIRECTORIES ===
# Checkpoints go to REPORTS_DIR, TensorBoard event files to RUNS_DIR.
REPORTS_DIR = "../reports"
RUNS_DIR = "../runs/ids_experiment"
os.makedirs(REPORTS_DIR, exist_ok=True)
os.makedirs(RUNS_DIR, exist_ok=True)

# === DATA LOADING ===
print("[INFO] Caricamento dataset...")
X_train, y_train, X_test, y_test = load_data(
    train_path=TRAIN_PATH,
    test_path=TEST_PATH,
    binary=True,
    features_file=FEATURES_FILE,
    top_k=TOP_K
)

# The model's input width is the number of feature columns actually loaded.
input_dim = X_train.shape[1]
print(f"[INFO] Numero di feature usate: {input_dim}")

# Convert to tensors (float32 features, int64 class indices for CrossEntropyLoss)
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

# Build the datasets and DataLoaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# NOTE(review): val_loader iterates the *test* split, so early stopping and
# the "model_best.pth" selection below are driven by test data. Consider
# holding out part of the training split for validation instead.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# === MODEL, LOSS AND OPTIMIZER ===
model = MLPClassifier(input_dim=input_dim).to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# TensorBoard
writer = SummaryWriter(log_dir=RUNS_DIR)

# === TRAINING LOOP ===
# Track the lowest validation loss seen so far and how many consecutive
# epochs have failed to improve on it.
best_val_loss = float("inf")
patience_counter = 0

for epoch in range(EPOCHS):
    # Training mode enables dropout.
    model.train()
    train_losses = []

    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(DEVICE), y_batch.to(DEVICE)

        # Standard step: clear gradients, forward, loss, backward, update.
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())

    # Mean of the per-batch losses (not weighted by batch size).
    avg_train_loss = np.mean(train_losses)

    # === VALIDATION ===
    # Eval mode disables dropout; no_grad skips gradient bookkeeping.
    model.eval()
    val_losses = []
    correct, total = 0, 0

    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(DEVICE), y_batch.to(DEVICE)
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            val_losses.append(loss.item())

            # Predicted class = index of the largest output score per row.
            _, predicted = torch.max(outputs, 1)
            total += y_batch.size(0)
            correct += (predicted == y_batch).sum().item()

    avg_val_loss = np.mean(val_losses)
    val_accuracy = correct / total

    print(f"[EPOCH {epoch+1}/{EPOCHS}] "
            f"Train Loss: {avg_train_loss:.4f} | "
            f"Val Loss: {avg_val_loss:.4f} | "
            f"Val Acc: {val_accuracy:.4f}")

    # TensorBoard logging
    writer.add_scalar("Loss/train", avg_train_loss, epoch)
    writer.add_scalar("Loss/val", avg_val_loss, epoch)
    writer.add_scalar("Accuracy/val", val_accuracy, epoch)

    # === MODEL CHECKPOINTING ===
    # The latest weights are overwritten every epoch.
    torch.save(model.state_dict(), os.path.join(REPORTS_DIR, "model_last.pth"))

    # A separate "best" checkpoint is written only when validation loss improves.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), os.path.join(REPORTS_DIR, "model_best.pth"))
        print("[INFO] Miglior modello salvato")
        patience_counter = 0
    else:
        patience_counter += 1

    # Early stopping: give up after PATIENCE consecutive non-improving epochs
    if patience_counter >= PATIENCE:
        print("[INFO] Early stopping attivato.")
        break

writer.close()
print("[INFO] Training completato.")


[INFO] Caricamento dataset...
[INFO] Numero di feature usate: 20
[EPOCH 1/50] Train Loss: 0.0881 | Val Loss: 1.0183 | Val Acc: 0.7662
[INFO] Miglior modello salvato
[EPOCH 2/50] Train Loss: 0.0550 | Val Loss: 1.0949 | Val Acc: 0.7784
[EPOCH 3/50] Train Loss: 0.0470 | Val Loss: 1.1929 | Val Acc: 0.7764
[EPOCH 4/50] Train Loss: 0.0423 | Val Loss: 1.2205 | Val Acc: 0.7868
[EPOCH 5/50] Train Loss: 0.0386 | Val Loss: 1.4332 | Val Acc: 0.7896
[EPOCH 6/50] Train Loss: 0.0364 | Val Loss: 1.5263 | Val Acc: 0.7804
[INFO] Early stopping attivato.
[INFO] Training completato.


# inference_torch.py

In [6]:
# src/inference_torch.py

import torch
import torch.nn as nn
import joblib
import pandas as pd
import numpy as np

from torch_models import MLPClassifier
from dataloader import preprocess_sample

# Config
# TOP_K must match the value used at training time: it fixes the model's
# input width and which scaler columns are expected.
MODEL_PATH = "../reports/model_best.pth"
FEATURES_FILE = "../reports/feature_importance.csv"
TOP_K = 20

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the selected features (first TOP_K rows of the ranked CSV)
features = pd.read_csv(FEATURES_FILE).head(TOP_K)["feature"].tolist()

# Load the model
input_dim = len(features)
model = MLPClassifier(input_dim=input_dim)
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state_dict holds; it avoids executing arbitrary pickled code.
model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE, weights_only=True))
model.to(DEVICE)
model.eval()  # inference mode: disables dropout

print("[INFO] Modello caricato correttamente.")

def predict(sample: dict):
    """
    Classify one record as "normal" or "attack".

    sample: dictionary with the same keys as the NSL-KDD dataset
    Example:
    {
        "duration": 0,
        "protocol_type": "tcp",
        "service": "http",
        "flag": "SF",
        "src_bytes": 181,
        "dst_bytes": 5450,
        ...
    }
    """
    # Preprocessing reuses the scaler/encoders saved at training time
    scaled_row = preprocess_sample(sample, features=features)
    batch = torch.tensor(scaled_row, dtype=torch.float32).to(DEVICE)

    with torch.no_grad():
        scores = model(batch)
        # Index of the highest score along the class dimension
        winner = torch.max(scores, 1)[1]

    class_id = int(winner.item())
    if class_id == 0:
        return "normal"
    return "attack"


if __name__ == "__main__":
    # Example of a normal sample
    sample_normal = {
        "duration": 0,
        # categorical fields (label-encoded during preprocessing)
        "protocol_type": "tcp",
        "service": "http",
        "flag": "SF",
        "src_bytes": 181,
        "dst_bytes": 5450,
        "land": 0,
        "wrong_fragment": 0,
        "urgent": 0,
        "hot": 0,
        "num_failed_logins": 0,
        "logged_in": 1,
        "num_compromised": 0,
        "root_shell": 0,
        "su_attempted": 0,
        "num_root": 0,
        "num_file_creations": 0,
        "num_shells": 0,
        "num_access_files": 0,
        "num_outbound_cmds": 0,
        "is_host_login": 0,
        "is_guest_login": 0,
        "count": 9,
        "srv_count": 9,
        "serror_rate": 0.0,
        "srv_serror_rate": 0.0,
        "rerror_rate": 0.0,
        "srv_rerror_rate": 0.0,
        "same_srv_rate": 1.0,
        "diff_srv_rate": 0.0,
        "srv_diff_host_rate": 0.0,
        # dst_host_* fields
        "dst_host_count": 9,
        "dst_host_srv_count": 9,
        "dst_host_same_srv_rate": 1.0,
        "dst_host_diff_srv_rate": 0.0,
        "dst_host_same_src_port_rate": 0.11,
        "dst_host_srv_diff_host_rate": 0.0,
        "dst_host_serror_rate": 0.0,
        "dst_host_srv_serror_rate": 0.0,
        "dst_host_rerror_rate": 0.0,
        "dst_host_srv_rerror_rate": 0.0,
    }

    verdict = predict(sample_normal)
    print("Predizione:", verdict)
# src/inference_torch.py

import torch
import pandas as pd
from torch_models import MLPClassifier
from dataloader import preprocess_sample

# Config
# TOP_K must equal the value used during training, since it determines the
# model's input width.
MODEL_PATH = "../reports/model_best.pth"
FEATURES_FILE = "../reports/feature_importance.csv"
TOP_K = 20

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the top-K features to use
features = pd.read_csv(FEATURES_FILE).head(TOP_K)["feature"].tolist()

# Initialise the model
input_dim = len(features)
model = MLPClassifier(input_dim=input_dim)
# A state_dict contains only tensors, so restrict unpickling accordingly
# (weights_only=True) instead of allowing arbitrary pickled objects.
model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE, weights_only=True))
model.to(DEVICE)
model.eval()  # disables dropout for inference

print("[INFO] Modello caricato correttamente.")

def predict(sample: dict):
    """
    Return "normal" or "attack" for a single record.

    sample: dictionary with all 41 original features of the NSL-KDD dataset.
    """
    # Preprocessing: saved encoders/scaler plus reduction to the top-K features
    model_input = torch.tensor(
        preprocess_sample(sample, features=features),
        dtype=torch.float32,
    ).to(DEVICE)

    with torch.no_grad():
        _, best_index = torch.max(model(model_input), 1)

    # Class 0 is "normal"; any other class index is reported as "attack"
    names = {0: "normal"}
    return names.get(int(best_index.item()), "attack")


if __name__ == "__main__":
    # Sample with all 41 NSL-KDD features (everything except difficulty and label)
    sample_example = dict(
        duration=0, protocol_type="tcp", service="http", flag="SF",
        src_bytes=181, dst_bytes=5450,
        land=0, wrong_fragment=0, urgent=0, hot=0,
        num_failed_logins=0, logged_in=1, num_compromised=0,
        root_shell=0, su_attempted=0, num_root=0,
        num_file_creations=0, num_shells=0, num_access_files=0,
        num_outbound_cmds=0, is_host_login=0, is_guest_login=0,
        count=9, srv_count=9,
        serror_rate=0.0, srv_serror_rate=0.0,
        rerror_rate=0.0, srv_rerror_rate=0.0,
        same_srv_rate=1.0, diff_srv_rate=0.0, srv_diff_host_rate=0.0,
        dst_host_count=9, dst_host_srv_count=9,
        dst_host_same_srv_rate=1.0, dst_host_diff_srv_rate=0.0,
        dst_host_same_src_port_rate=0.11, dst_host_srv_diff_host_rate=0.0,
        dst_host_serror_rate=0.0, dst_host_srv_serror_rate=0.0,
        dst_host_rerror_rate=0.0, dst_host_srv_rerror_rate=0.0,
    )

    outcome = predict(sample_example)
    print("Predizione:", outcome)


[INFO] Modello caricato correttamente.
Predizione: normal
[INFO] Modello caricato correttamente.
Predizione: normal
