In [179]:
# Libraries
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import time
from sklearn.preprocessing import StandardScaler

from torch.utils.data import DataLoader, Dataset
import pandas as pd
import os, re, struct, socket

import import_ipynb
import project_models


In [180]:
#GPU Checking
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("GPU is not available. Using CPU.")


os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

Using GPU: NVIDIA GeForce RTX 3060


Data Preprocessing

In [181]:
#IDS2017_Monday ==== No anomolies
IDS2017_data = 'data\TrafficLabelling\Monday-WorkingHours.pcap_ISCX.csv'
IDS2017_DF = pd.DataFrame(pd.read_csv(IDS2017_data))



print(IDS2017_DF.columns)
IDS2017_DF.head()

Index(['Flow ID', ' Source IP', ' Source Port', ' Destination IP',
       ' Destination Port', ' Protocol', ' Timestamp', ' Flow Duration',
       ' Total Fwd Packets', ' Total Backward Packets',
       'Total Length of Fwd Packets', ' Total Length of Bwd Packets',
       ' Fwd Packet Length Max', ' Fwd Packet Length Min',
       ' Fwd Packet Length Mean', ' Fwd Packet Length Std',
       'Bwd Packet Length Max', ' Bwd Packet Length Min',
       ' Bwd Packet Length Mean', ' Bwd Packet Length Std', 'Flow Bytes/s',
       ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max',
       ' Flow IAT Min', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std',
       ' Fwd IAT Max', ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean',
       ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags',
       ' Bwd PSH Flags', ' Fwd URG Flags', ' Bwd URG Flags',
       ' Fwd Header Length', ' Bwd Header Length', 'Fwd Packets/s',
       ' Bwd Packets/s', ' Min Packet Length', ' Max Pa

Unnamed: 0,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,Total Backward Packets,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,192.168.10.5-8.254.250.126-49188-80-6,8.254.250.126,80,192.168.10.5,49188,6,03/07/2017 08:55:58,4,2,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
1,192.168.10.5-8.254.250.126-49188-80-6,8.254.250.126,80,192.168.10.5,49188,6,03/07/2017 08:55:58,1,2,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
2,192.168.10.5-8.254.250.126-49188-80-6,8.254.250.126,80,192.168.10.5,49188,6,03/07/2017 08:55:58,1,2,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
3,192.168.10.5-8.254.250.126-49188-80-6,8.254.250.126,80,192.168.10.5,49188,6,03/07/2017 08:55:58,1,2,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
4,192.168.10.14-8.253.185.121-49486-80-6,8.253.185.121,80,192.168.10.14,49486,6,03/07/2017 08:56:22,3,2,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN


In [182]:
## KDDcup Corrected File
#KDD_fp = 'data\kddcup\kddcup_corrected.csv'
#KDD_DF = pd.DataFrame(pd.read_csv(KDD_fp))
#KDD_DF.head()

In [183]:
# Helper function to convert IP addresses into integers
def encode_ip(ip_address):
    try:
        return struct.unpack("!L", socket.inet_aton(ip_address))[0]
    except socket.error:
        # Handle the case for invalid IP addresses
        return 0

Custom Dataset Construction

In [184]:
class NetworkDataset(Dataset):
    def __init__(self, directory_path, num_entries=1000):
        self.scaler = StandardScaler()  # Initialize the scaler once
        self.data = self.load_and_process_data(directory_path, num_entries)

    def load_and_process_data(self, file_path, num_entries):
        df = pd.read_csv(file_path)
        if num_entries is not None:
            df = df.iloc[:num_entries]  # Limit entries if needed
        df = self.preprocess_data(df)  # Apply preprocessing
        return df

    def preprocess_data(self, df):
        # Encode features and scale
        df = self.extract_and_encode_features(df)
        df = self.scale_features(df)
        return df

    def scale_features(self, df):
        # Scale model input features
        features_to_scale = ['Time_diff', 'Bandwidth', 'Throughput', 'Latency', 'Flow Duration',
                             'Fwd_to_Bwd_Packets', 'Fwd_to_Bwd_Bytes',
                             'Rolling_Mean_Packet_Size', 'Rolling_Std_Packet_Size','Total Length']
        df[features_to_scale] = self.scaler.fit_transform(df[features_to_scale])
        return df

    def get_dataframe(self):
        return self.data

    def extract_and_encode_features(self, df):
        df.columns = df.columns.str.lstrip()

        # Extract and encode features
        df['source_encoded'] = df['Source IP'].apply(encode_ip)
        df['destination_encoded'] = df['Destination IP'].apply(encode_ip)
        df = pd.get_dummies(df, columns=['Protocol'], prefix='Protocol')

        # Time
        df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%d/%m/%Y %H:%M:%S')
        df['Hour_of_Day'] = pd.to_datetime(df['Timestamp']).dt.hour
        df['Is_Weekend'] = (pd.to_datetime(df['Timestamp']).dt.weekday >= 5).astype(int)

        # Behaviors
        df['Session Duration'] = df.groupby('Flow ID')['Timestamp'].transform(lambda x: x.max() - x.min())
        df['Unique Source Ports'] = df.groupby('Flow ID')['Source Port'].transform('nunique')
        df['Unique Destination Ports'] = df.groupby('Flow ID')['Destination Port'].transform('nunique')

        # Ratios
        df['Fwd_to_Bwd_Packets'] = df['Total Fwd Packets'] / (df['Total Backward Packets'] + 0.001)
        df['Fwd_to_Bwd_Bytes'] = df['Total Length of Fwd Packets'] / (df['Total Length of Bwd Packets'] + 0.001)

        # Rolling Window Statistics
        window_size = 5  # Define the window size
        df['Rolling_Mean_Packet_Size'] = df['Packet Length Mean'].rolling(window=window_size).mean()
        df['Rolling_Std_Packet_Size'] = df['Packet Length Std'].rolling(window=window_size).std()
        df['EMA_Packet_Size'] = df['Packet Length Mean'].ewm(span=window_size).mean()

        # Precompute BW & Throughput
        df['Time_diff'] = (df['Timestamp'] - df['Timestamp'].shift()).dt.total_seconds().fillna(0).clip(lower=1e-5)
        df['Total Length'] = df['Total Length of Fwd Packets'] + df['Total Length of Bwd Packets']
        df['Bandwidth'] = df['Total Length'] / df['Time_diff']
        df['Throughput'] = df['Bandwidth']  # Simplified, adjust if different logic needed
        df['Latency'] = df['Total Length'] / 1000  # Assuming Length is in bytes and Latency is in ms
        float_cols = ['Time_diff', 'Bandwidth', 'Throughput', 'Latency', 'Flow Duration', 'Fwd_to_Bwd_Packets',
                      'Fwd_to_Bwd_Bytes', 'Rolling_Mean_Packet_Size', 'Rolling_Std_Packet_Size']
        df[float_cols] = df[float_cols].astype(float)
        return df

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        features = [
            row['Time_diff'],
            row['source_encoded'],
            row['destination_encoded'],
            row['Total Length'],
            row['Hour_of_Day'],
            row['Is_Weekend'],
            row['Fwd_to_Bwd_Packets'],
            row['Fwd_to_Bwd_Bytes'],
            row['Rolling_Mean_Packet_Size'],
            row['Rolling_Std_Packet_Size'],
            row['EMA_Packet_Size']
        ] + [row[col] for col in self.data.columns if col.startswith('Protocol_')]

        features = torch.tensor(features, dtype=torch.float32)  # Convert to PyTorch tensor
        bandwidth = torch.tensor([row['Bandwidth']], dtype=torch.float32)

        return features, bandwidth

In [185]:
def create_inout_sequences(df, seq_length):
    sequences = []
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns

    for i in range(len(df) - seq_length):
        seq_features = []

        for j in range(seq_length):
            # Get numeric features for each timestep in the sequence
            numeric_values = df.iloc[i + j][numeric_cols].values
            numeric_tensor = torch.tensor(numeric_values, dtype=torch.float32)

            # Get non-numeric features for each timestep in the sequence
            non_numeric_values = df.iloc[i + j].drop(numeric_cols, axis=1).values

            # Combine numeric and non-numeric features
            feature_values = torch.cat((numeric_tensor, torch.tensor(non_numeric_values)), dim=0)

            seq_features.append(feature_values)

        seq_label = df.iloc[i + seq_length]['Bandwidth']  # Extract bandwidth value
        sequences.append((seq_features, seq_label))

    return sequences



def custom_collate_fn(batch):
    features = []
    bandwidths = []
    for sample in batch:
        seq_features, seq_label = sample
        features.append(torch.stack(seq_features))
        bandwidths.append(seq_label)
    features = torch.stack(features)
    bandwidths = torch.tensor(bandwidths)
    return features, bandwidths

In [186]:
## Using the Dataset
batch_size = 200
dataset = NetworkDataset('data\Tempdir\Monday-WorkingHours.pcap_ISCX.csv', num_entries= 1000)
loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=custom_collate_fn)

# Get the data from the dataset
data = dataset.get_dataframe()
df = pd.DataFrame(data)
seq_length = 50

print(df.head())
print(df.keys())

# Create sequences
train_data = create_inout_sequences(df, seq_length)

# Training and Test set splits
train_size = int(len(train_data) * 0.8)
test_size = len(train_data) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(train_data, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last= True, collate_fn=custom_collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=custom_collate_fn)

print(len(train_loader.dataset))

AttributeError: 'DataLoader' object has no attribute 'get_dataframe'

In [None]:
for i, data in enumerate(train_loader):
    features, bandwidth = data  # Unpack the tuple
    print(features, bandwidth)
    if i == 1:  # Just check the first couple of batches
        break

for i, data in enumerate(test_loader):
    print(f"Batch {i+1} from test_loader:")
    print(data)  # Print out the batch
    if i == 0:  # Just check the first batch
        break


TypeError: expected Tensor as element 0 in argument 0, but got numpy.ndarray

In [None]:
input_size = 14
hidden_size = 128
output_size = 1
dropout = 0
bidirectional = False

In [None]:
model = project_models.ntwkRNN(input_size, hidden_size, output_size, dropout, bidirectional).to(device)

epochs = 10
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr = 0.001)

rnn_tloss, rnn_vloss = project_models.train_RNN(epochs, model, criterion, optimizer, train_loader, test_loader, device)

TypeError: expected Tensor as element 0 in argument 0, but got numpy.ndarray

In [None]:
#Transformer
model = project_models.ntwkPETransformer(input_size, hidden_size, output_size, num_layers = 4, nhead = 2).to(device)
optimizer = optim.Adam(model.parameters(), lr = 0.01)


Trans_tloss, Trans_vloss, Trans_vacc = project_models.trainTransformer(model,epochs, criterion, optimizer, train_loader, test_loader, device)

ValueError: too many values to unpack (expected 2)