In [6]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import TensorDataset

# ============================
# Step 1: Load CICIDS-2018 CSVs
# ============================

folder_path = "D:/federated learning/FL-IDS-Intrusion-Detection/CICIDS-2018"  # <-- update this path

# Example filenames (adjust to your dataset structure)
files = [
    "02-14-2018.csv",
    "02-15-2018.csv",
    "02-16-2018.csv",
    "02-20-2018.csv",
    "02-21-2018.csv",
    "02-22-2018.csv",
    "02-23-2018.csv",
    "02-28-2018.csv",
    "03-01-2018.csv",
    "03-02-2018.csv"
]

df_list = []
missing_files = []

for file in files:
    file_path = os.path.join(folder_path, file)
    if os.path.exists(file_path):
        print(f"Loading: {file}")
        df = pd.read_csv(file_path, low_memory=False)
        df_list.append(df)
    else:
        print(f"File not found: {file}")
        missing_files.append(file)

# Combine into one DataFrame
if df_list:
    df = pd.concat(df_list, axis=0, ignore_index=True)
    print(f"Combined dataset shape: {df.shape}")
else:
    raise FileNotFoundError("No CICIDS-2018 files found in the given path.")

# ============================
# Step 2: Preprocessing
# ============================

# Drop NA rows
df = df.dropna()

# Map attack labels to binary (0=BENIGN, 1=ATTACK)
df['Label'] = df['Label'].apply(lambda x: 0 if x == 'BENIGN' else 1)

# Separate features and labels
X = df.drop(columns=['Label']).values
y = df['Label'].values

# Handle inf and NaN
X = np.where(np.isinf(X), np.nan, X)
col_means = np.nanmean(X, axis=0)
inds = np.where(np.isnan(X))
X[inds] = np.take(col_means, inds[1])

# Standardize features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# ============================
# Step 3: Non-IID Split (5 clients)
# ============================

def split_noniid_data(X, y, num_clients):
    """
    Split data in a non-IID fashion among clients.
    Each client gets biased label distributions.
    """
    non_iid_data = []
    unique_labels = np.unique(y)

    label_indices = {label: np.where(y == label)[0] for label in unique_labels}

    for client_id in range(num_clients):
        client_data_indices = []
        for label in unique_labels:
            num_samples = int(len(label_indices[label]) / num_clients)
            if num_samples > 0:
                selected_indices = np.random.choice(label_indices[label], num_samples, replace=False)
                client_data_indices.extend(selected_indices)
                label_indices[label] = np.setdiff1d(label_indices[label], selected_indices)

        client_data_X = X[client_data_indices]
        client_data_y = y[client_data_indices]
        non_iid_data.append((client_data_X, client_data_y))

    return non_iid_data

num_clients = 5
client_data_splits = split_noniid_data(X_train, y_train, num_clients)

# Convert each client's data to PyTorch datasets
client_datasets = []
for client_data_X, client_data_y in client_data_splits:
    client_X_tensor = torch.tensor(client_data_X, dtype=torch.float32)
    client_y_tensor = torch.tensor(client_data_y, dtype=torch.long)
    client_datasets.append(TensorDataset(client_X_tensor, client_y_tensor))

# ============================
# Step 4: Verification
# ============================

for i, dataset in enumerate(client_datasets):
    print(f"Client {i+1} data size: {len(dataset)} samples")

print(f"Test dataset size: {len(test_dataset)} samples")


Loading: 02-14-2018.csv
Loading: 02-15-2018.csv
Loading: 02-16-2018.csv
Loading: 02-20-2018.csv


ParserError: Error tokenizing data. C error: out of memory