In [2]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

# --------------------
# Config
# --------------------
original_dataset_folder = "original_dataset"
training_dataset_folder = "training_dataset"
adversarial_dataset_folder = "adversarial_dataset"

os.makedirs(training_dataset_folder, exist_ok=True)
os.makedirs(adversarial_dataset_folder, exist_ok=True)

label_encoder = LabelEncoder()
minmax_scaler = MinMaxScaler()
feature_names = None
epsilons = [0, 0.00005, 0.05, 0.1, 0.15]

# --------------------
# Simple NN for FGSM
# --------------------
class SimpleNet(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(SimpleNet, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

def fgsm_attack(data: torch.Tensor, epsilon: float, data_grad: torch.Tensor) -> torch.Tensor:
    sign_data_grad = data_grad.sign()
    perturbed_data = data + epsilon * sign_data_grad
    perturbed_data = torch.clamp(perturbed_data, 0.0, 1.0)
    return perturbed_data

# --------------------
# Utilities
# --------------------
def remove_highly_correlated_features(df: pd.DataFrame, threshold: float = 0.95) -> pd.DataFrame:
    corr_matrix = df.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    if to_drop:
        print(f"Removing {len(to_drop)} highly correlated features: {to_drop}")
    return df.drop(columns=to_drop)

def preprocess_data(df: pd.DataFrame, is_training: bool = True):
    global feature_names, label_encoder, minmax_scaler

    non_feature_cols = ['Flow ID', 'Src IP', 'Dst IP', 'Timestamp', 'source_file']
    feature_cols = [col for col in df.columns if col not in non_feature_cols + ['Label']]

    X = df[feature_cols].copy()
    y = df['Label'].copy()

    # Clean labels
    y = y.astype(str).str.strip().str.lower().replace(r'\s+', ' ', regex=True)

    # Handle missing values
    X = X.fillna(X.median(numeric_only=True))

    # Convert objects to numeric
    for col in X.columns:
        if X[col].dtype == 'object':
            X[col] = pd.to_numeric(X[col], errors='coerce')
            X[col] = X[col].fillna(X[col].median(numeric_only=True))

    # Replace inf
    X = X.replace([np.inf, -np.inf], np.nan)
    X = X.fillna(X.median(numeric_only=True))

    if is_training:
        X = remove_highly_correlated_features(X, threshold=0.95)
        feature_names = list(X.columns)
        y_encoded = label_encoder.fit_transform(y)
        X_minmax = minmax_scaler.fit_transform(X)
    else:
        missing_features = set(feature_names) - set(X.columns)
        extra_features = set(X.columns) - set(feature_names)

        for feat in missing_features:
            X[feat] = 0
        if extra_features:
            X = X.drop(columns=list(extra_features))

        X = X[feature_names]

        mask = y.isin(label_encoder.classes_)
        X = X.loc[mask]
        y = y.loc[mask]

        y_encoded = label_encoder.transform(y)
        X_minmax = minmax_scaler.transform(X)

    return X, y, X_minmax, y_encoded

def generate_adversarial_data(df: pd.DataFrame, epsilon: float, model: nn.Module, device: str) -> pd.DataFrame:
    X_df, y_series, X_minmax, y_encoded = preprocess_data(df, is_training=False)

    # Torch tensors
    X_tensor = torch.tensor(X_minmax, dtype=torch.float32, device=device, requires_grad=True)
    y_tensor = torch.tensor(y_encoded, dtype=torch.long, device=device)

    model.zero_grad()
    output = model(X_tensor)
    loss = F.nll_loss(output, y_tensor)
    loss.backward()

    data_grad = X_tensor.grad.data
    perturbed = fgsm_attack(X_tensor, epsilon, data_grad)

    perturbed_np = perturbed.detach().cpu().numpy()
    X_perturbed_orig = minmax_scaler.inverse_transform(perturbed_np)

    perturbed_df = df.loc[X_df.index].copy()
    perturbed_df[feature_names] = X_perturbed_orig
    return perturbed_df

# --------------------
# 1) Load, split, and save per file
# --------------------
train_parts = []
infer_parts = []

csv_files = [f for f in os.listdir(original_dataset_folder) if f.endswith('.csv')]
print("CSV files found:", csv_files)

for f in csv_files:
    df = pd.read_csv(os.path.join(original_dataset_folder, f)).drop_duplicates()

    train_df, infer_df = train_test_split(
        df,
        test_size=0.20,
        stratify=df["Label"],
        random_state=42
    )

    # Save training split immediately
    train_outfile = os.path.join(training_dataset_folder, f"training_{f}")
    train_df.to_csv(train_outfile, index=False)
    print(f"Saved training file: {train_outfile}")

    train_parts.append(train_df)
    infer_parts.append((f, infer_df))  # keep filename for later adversarial gen

# Merge all train parts for preprocessing & NN training
train_df = pd.concat(train_parts, ignore_index=True)
print("Global Train shape:", train_df.shape)

# --------------------
# 2) Preprocess training data
# --------------------
X_train_df, y_train_series, X_train_minmax, y_train_encoded = preprocess_data(train_df, is_training=True)

# --------------------
# 3) Train NN
# --------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_dim = X_train_minmax.shape[1]
num_classes = len(label_encoder.classes_)

nn_model = SimpleNet(input_dim=input_dim, num_classes=num_classes).to(device)
optimizer = torch.optim.Adam(nn_model.parameters(), lr=0.01)

nn_model.train()
X_tensor = torch.tensor(X_train_minmax, dtype=torch.float32, device=device)
y_tensor = torch.tensor(y_train_encoded, dtype=torch.long, device=device)
for epoch in range(5):
    optimizer.zero_grad()
    out = nn_model(X_tensor)
    loss = F.nll_loss(out, y_tensor)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")
nn_model.eval()

# --------------------
# 4) Generate adversarial inference data per file
# --------------------
for f, infer_df in infer_parts:
    for eps in epsilons:
        adv_df = generate_adversarial_data(infer_df, eps, nn_model, device)
        outfile = os.path.join(adversarial_dataset_folder, f"adv_eps_{eps}_inference_{f}")
        adv_df.to_csv(outfile, index=False)
        print(f"Saved adversarial file: {outfile}")


CSV files found: ['gtp_encapsulation_labeled.csv', 'gtp_malformed_labeled.csv', 'brute_force_attack_labeled.csv', 'intra_upf_ddos_attack_labeled.csv', 'ddos_attack_labeled.csv', 'benign_labeled.csv']
Saved training file: training_dataset/training_gtp_encapsulation_labeled.csv
Saved training file: training_dataset/training_gtp_malformed_labeled.csv
Saved training file: training_dataset/training_brute_force_attack_labeled.csv
Saved training file: training_dataset/training_intra_upf_ddos_attack_labeled.csv
Saved training file: training_dataset/training_ddos_attack_labeled.csv
Saved training file: training_dataset/training_benign_labeled.csv
Global Train shape: (53506, 84)
Removing 17 highly correlated features: ['Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Fwd Header Length', 'Fwd Packets/s', 'Packet Length Variance', 'PSH Flag Count', 'Average Packet Size', 'Fwd Segment Size Avg', 'Bwd Segment Size Avg', 'Active Max', 'Active Min', 'Idle Mean', 'Idle Max