In [145]:
# Libraries
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import time
from sklearn.preprocessing import StandardScaler, RobustScaler

from torch.utils.data import DataLoader, random_split
import pandas as pd
import os, re, struct, socket, csv
from collections import deque


import import_ipynb
import project_models


In [146]:
#GPU Checking
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("GPU is not available. Using CPU.")


os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

Using GPU: NVIDIA GeForce RTX 3060


Data Preprocessing

In [147]:
#IDS2017_Monday ==== No anomolies
IDS2017_data = 'data\TrafficLabelling\Monday-WorkingHours.pcap_ISCX.csv'
IDS2017_DF = pd.DataFrame(pd.read_csv(IDS2017_data))

print(IDS2017_DF.columns)
IDS2017_DF.head()

Index(['Flow ID', ' Source IP', ' Source Port', ' Destination IP',
       ' Destination Port', ' Protocol', ' Timestamp', ' Flow Duration',
       ' Total Fwd Packets', ' Total Backward Packets',
       'Total Length of Fwd Packets', ' Total Length of Bwd Packets',
       ' Fwd Packet Length Max', ' Fwd Packet Length Min',
       ' Fwd Packet Length Mean', ' Fwd Packet Length Std',
       'Bwd Packet Length Max', ' Bwd Packet Length Min',
       ' Bwd Packet Length Mean', ' Bwd Packet Length Std', 'Flow Bytes/s',
       ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max',
       ' Flow IAT Min', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std',
       ' Fwd IAT Max', ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean',
       ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags',
       ' Bwd PSH Flags', ' Fwd URG Flags', ' Bwd URG Flags',
       ' Fwd Header Length', ' Bwd Header Length', 'Fwd Packets/s',
       ' Bwd Packets/s', ' Min Packet Length', ' Max Pa

Unnamed: 0,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,Total Backward Packets,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,192.168.10.5-8.254.250.126-49188-80-6,8.254.250.126,80,192.168.10.5,49188,6,03/07/2017 08:55:58,4,2,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
1,192.168.10.5-8.254.250.126-49188-80-6,8.254.250.126,80,192.168.10.5,49188,6,03/07/2017 08:55:58,1,2,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
2,192.168.10.5-8.254.250.126-49188-80-6,8.254.250.126,80,192.168.10.5,49188,6,03/07/2017 08:55:58,1,2,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
3,192.168.10.5-8.254.250.126-49188-80-6,8.254.250.126,80,192.168.10.5,49188,6,03/07/2017 08:55:58,1,2,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
4,192.168.10.14-8.253.185.121-49486-80-6,8.253.185.121,80,192.168.10.14,49486,6,03/07/2017 08:56:22,3,2,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN


In [148]:
## KDDcup Corrected File
#KDD_fp = 'data\kddcup\kddcup_corrected.csv'
#KDD_DF = pd.DataFrame(pd.read_csv(KDD_fp))
#KDD_DF.head()

In [149]:
# Helper function to convert IP addresses into integers
def encode_ip(ip_address):
    try:
        return struct.unpack("!L", socket.inet_aton(ip_address))[0]
    except socket.error:
        # Handle the case for invalid IP addresses
        return 0

Custom Dataset Construction

In [150]:
class NetworkDataset(Dataset):
    def __init__(self, file_path, seq_duration=5, time_unit='minute', num_entries=None, window_size=5, decay=0.9):
        self.window_size = window_size
        self.decay = decay
        self.data = self.load_and_process_data(file_path, num_entries)
        self.seq_duration = seq_duration
        self.time_unit = time_unit
        self.seq_length = self.calculate_sequence_length()
        self.scaler = RobustScaler()
        self.scale_features()

    def load_and_process_data(self, file_path, num_entries):
        data = []
        with open(file_path, 'r') as file:
            reader = csv.DictReader(file)
            reader.fieldnames = [column.strip() for column in reader.fieldnames] # Removes Whitespace from headernames
            rolling_stats = {
                'Packet Length Mean': deque(maxlen=self.window_size),
                'Packet Length Std': deque(maxlen=self.window_size)
            }
            ema_packet_size = 0
            for i, row in enumerate(reader):
                if num_entries is not None and i >= num_entries:
                    break
                processed_row = self.preprocess_data(row, rolling_stats, ema_packet_size)
                ema_packet_size = processed_row['EMA_Packet_Size']
                data.append(processed_row)
        return data
    

    def preprocess_data(self, row, rolling_stats, prev_ema_packet_size):
        # Preprocess and encode features
        processed_row = {}
        
        # Add the 'Timestamp' column
        processed_row['Timestamp'] = row['Timestamp']
        
        processed_row['source_encoded'] = encode_ip(row['Source IP'])
        processed_row['destination_encoded'] = encode_ip(row['Destination IP'])
        processed_row['Protocol'] = row['Protocol']

        # Behaviors
        processed_row['Flow Duration'] = float(row['Flow Duration'])
        processed_row['Total Fwd Packets'] = float(row['Total Fwd Packets'])
        processed_row['Total Backward Packets'] = float(row['Total Backward Packets'])
        processed_row['Total Length of Fwd Packets'] = float(row['Total Length of Fwd Packets'])
        processed_row['Total Length of Bwd Packets'] = float(row['Total Length of Bwd Packets'])

        # Ratios
        processed_row['Fwd_to_Bwd_Packets'] = processed_row['Total Fwd Packets'] / (processed_row['Total Backward Packets'] + 0.001)
        processed_row['Fwd_to_Bwd_Bytes'] = processed_row['Total Length of Fwd Packets'] / (processed_row['Total Length of Bwd Packets'] + 0.001)

        # Rolling Window Statistics
        packet_length_mean = float(row['Packet Length Mean'])
        packet_length_std = float(row['Packet Length Std'])
        rolling_stats['Packet Length Mean'].append(packet_length_mean)
        rolling_stats['Packet Length Std'].append(packet_length_std)
        processed_row['Rolling_Mean_Packet_Size'] = sum(rolling_stats['Packet Length Mean']) / len(rolling_stats['Packet Length Mean'])
        processed_row['Rolling_Std_Packet_Size'] = sum(rolling_stats['Packet Length Std']) / len(rolling_stats['Packet Length Std'])

        # Exponential Moving Average (EMA) of Packet Size
        ema_packet_size = self.decay * prev_ema_packet_size + (1 - self.decay) * packet_length_mean
        processed_row['EMA_Packet_Size'] = ema_packet_size

        # Precompute BW, Throughput & Latency
        processed_row['Time_diff'] = 1.0  # Placeholder value, adjust as needed
        processed_row['Total Length'] = processed_row['Total Length of Fwd Packets'] + processed_row['Total Length of Bwd Packets']
        processed_row['Bandwidth'] = processed_row['Total Length'] / processed_row['Time_diff']
        processed_row['Throughput'] = float(row['Flow Bytes/s'])
        processed_row['Latency'] = processed_row['Total Length'] / 1000  # Assuming Length is in bytes and Latency is in ms

        # Label
        processed_row['Label'] = 1 if row['Label'] == 'BENIGN' else 0

        return processed_row


    def scale_features(self):
        # Scale model input features
        features_to_scale = ['Time_diff', 'Bandwidth', 'Throughput', 'Latency', 'Flow Duration',
                            'Fwd_to_Bwd_Packets', 'Fwd_to_Bwd_Bytes',
                            'Rolling_Mean_Packet_Size', 'Rolling_Std_Packet_Size', 'Total Length']
        features = [[row[feature] for feature in features_to_scale] for row in self.data]
        
        # Replace infinity values with a large finite value
        features = np.array(features)
        features[np.isinf(features)] = np.finfo(np.float64).max
        
        # Apply robust scaling
        scaled_features = self.scaler.fit_transform(features)
        
        for i, row in enumerate(self.data):
            for j, feature in enumerate(features_to_scale):
                row[feature] = scaled_features[i][j]


    def get_data(self):
        return self.data


    def __len__(self):
        return len(self.data)
    

    def calculate_sequence_length(self):
        # Calculate the number of entries per sequence based on the time duration and data frequency
        if self.time_unit == 'minute':
            time_delta = pd.Timedelta(minutes=1)
        elif self.time_unit == 'second':
            time_delta = pd.Timedelta(seconds=1)
        else:
            raise ValueError(f"Unsupported time unit: {self.time_unit}")

        start_time = pd.to_datetime(self.data[0]['Timestamp'])
        end_time = start_time + pd.Timedelta(minutes=self.seq_duration)
        seq_length = sum(1 for row in self.data if start_time <= pd.to_datetime(row['Timestamp']) < end_time)

        return seq_length


    def __getitem__(self, idx):
        start_idx = idx * self.seq_length
        end_idx = start_idx + self.seq_length

        sequence_data = []
        sequence_targets = []

        for i in range(start_idx, end_idx):
            if i >= len(self.data):
                break
            row = self.data[i]
            features = [
                row['Time_diff'],
                row['source_encoded'],
                row['destination_encoded'],
                row['Total Length'],
                row['Flow Duration'],
                row['Fwd_to_Bwd_Packets'],
                row['Fwd_to_Bwd_Bytes'],
                row['Rolling_Mean_Packet_Size'],
                row['Rolling_Std_Packet_Size'],
                row['EMA_Packet_Size']
            ] + [int(row['Protocol'] == protocol) for protocol in ['6']]  # One-hot encoding of Protocol

            sequence_data.append(features)

            targets = [
                row['Bandwidth'],
                row['Latency'],
                row['Throughput'],
                row['Label']
            ]
            sequence_targets.append(targets)

        # Pad the sequence if it has less than seq_length entries
        padding_length = self.seq_length - len(sequence_data)
        if padding_length > 0:
            padding_data = [0] * len(features)
            padding_targets = [0] * len(targets)
            sequence_data.extend([padding_data] * padding_length)
            sequence_targets.extend([padding_targets] * padding_length)

        sequence_data = torch.tensor(sequence_data, dtype=torch.float32)
        sequence_data = sequence_data.view(self.seq_length, -1)  # Reshape to (seq_length, input_size)

        sequence_targets = torch.tensor(sequence_targets, dtype=torch.float32)

        return sequence_data, sequence_targets

In [151]:
def collate_fn(batch):
    sequence_data, sequence_targets = zip(*batch)
    
    # Pad the sequences to the maximum length in the batch
    padded_sequence_data = torch.nn.utils.rnn.pad_sequence(sequence_data, batch_first=True)
    
    # Stack the targets into a tensor
    sequence_targets = torch.stack(sequence_targets)
    
    return padded_sequence_data, sequence_targets

In [152]:
## Using the Dataset
batch_size = 200
dataset = NetworkDataset('data\\Tempdir\\Monday-WorkingHours.pcap_ISCX.csv', seq_duration= 5, time_unit='minute', num_entries=1000, window_size=10, decay=0.9)

# Training and Test set splits
train_size = int(len(dataset) * 0.8)
test_size = len(dataset) - train_size

train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [153]:
for i, data in enumerate(train_loader):
    try:
        sequence_data, sequence_targets = data  # Unpack the tuple
        features = sequence_data
        targets = sequence_targets

        print("Features:", features)
        print("Targets:")
        print("  Bandwidth:", targets[:, 0])
        print("  Latency:", targets[:, 1])
        print("  Throughput:", targets[:, 2])
        print("  Label:", targets[:, 3])
    except Exception as e:
        print(f"Error processing train data at index {i}: {e}")

    if i == 1:  # Just check the first couple of batches
        break

for i, data in enumerate(test_loader):
    try:
        sequence_data, sequence_targets = data  # Unpack the tuple
        features = sequence_data
        targets = sequence_targets

        print(f"Batch {i+1} from test_loader:")
        print("Features:", features)
        print("Targets:")
        print("  Bandwidth:", targets[:, 0])
        print("  Latency:", targets[:, 1])
        print("  Throughput:", targets[:, 2])
        print("  Label:", targets[:, 3])
    except Exception as e:
        print(f"Error processing test data at index {i}: {e}")

    if i == 0:  # Just check the first batch
        break


UnboundLocalError: cannot access local variable 'features' where it is not associated with a value

In [None]:
input_size = len(features)
hidden_size = 128
output_size = 3
dropout = 0
bidirectional = False

model = project_models.ntwkRNN(input_size, hidden_size, output_size, dropout, bidirectional).to(device)

epochs = 10
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr = 0.001)

rnn_tloss, rnn_vloss = project_models.train_RNN(epochs, model, criterion, optimizer, train_loader, test_loader, device)

Padding sequence_data from length 0 to 276
Padding sequence_data from length 1 to 276
Padding sequence_data from length 2 to 276
Padding sequence_data from length 3 to 276
Padding sequence_data from length 4 to 276
Padding sequence_data from length 5 to 276
Padding sequence_data from length 6 to 276
Padding sequence_data from length 7 to 276
Padding sequence_data from length 8 to 276
Padding sequence_data from length 9 to 276
Padding sequence_data from length 10 to 276
Padding sequence_data from length 11 to 276
Padding sequence_data from length 12 to 276
Padding sequence_data from length 13 to 276
Padding sequence_data from length 14 to 276
Padding sequence_data from length 15 to 276
Padding sequence_data from length 16 to 276
Padding sequence_data from length 17 to 276
Padding sequence_data from length 18 to 276
Padding sequence_data from length 19 to 276
Padding sequence_data from length 20 to 276
Padding sequence_data from length 21 to 276
Padding sequence_data from length 22 to 27

RuntimeError: stack expects each tensor to be equal size, but got [276, 0] at entry 0 and [276, 11] at entry 95

In [None]:
#Transformer
model = project_models.ntwkPETransformer(input_size, hidden_size, output_size, num_layers = 4, nhead = 2).to(device)
optimizer = optim.Adam(model.parameters(), lr = 0.01)


Trans_tloss, Trans_vloss, Trans_vacc = project_models.trainTransformer(model,epochs, criterion, optimizer, train_loader, test_loader, device)

ValueError: too many values to unpack (expected 2)