In [1]:
# Libraries
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import time
from sklearn.preprocessing import StandardScaler

from torch.utils.data import DataLoader, Dataset
import pandas as pd
import os, re, struct, socket

import import_ipynb
import project_models

importing Jupyter notebook from project_models.ipynb


In [2]:
# Device selection: use CUDA when PyTorch reports it as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    # Report the name of GPU index 0.
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("GPU is not available. Using CPU.")

Using GPU: NVIDIA GeForce RTX 3060


Data Preprocessing

In [3]:
# IDS2017 Monday capture ==== no anomalies
# Build the path with os.path.join: the previous 'data\TrafficLabelling\...' literal relied on
# invalid escape sequences ('\T', '\M') being kept verbatim and only resolved on Windows.
IDS2017_data = os.path.join('data', 'TrafficLabelling', 'Monday-WorkingHours.pcap_ISCX.csv')
IDS2017_DF = pd.read_csv(IDS2017_data)  # read_csv already returns a DataFrame

# Select eight columns by position (the file has 85 columns). .copy() makes this an
# independent frame so adding a column below does not raise SettingWithCopyWarning.
IDS2017_features = IDS2017_DF.iloc[:, [6, 1, 3, 2, 4, 5, 7, 84]].copy()
# Summing the total length of forward and backward packets (columns 11 and 12)
IDS2017_features['Total Length'] = IDS2017_DF.iloc[:, 11] + IDS2017_DF.iloc[:, 12]
IDS2017_features.head()

In [4]:
## KDDcup Corrected File
#KDD_fp = 'data\kddcup\kddcup_corrected.csv'
#KDD_DF = pd.DataFrame(pd.read_csv(KDD_fp))
#KDD_DF.head()

In [5]:
## PCAP DATA
##pcap_fp = 'data\pcap\midnight.csv'
#pcap_DF = pd.DataFrame(pd.read_csv(pcap_fp))
#pcap_DF.head()

In [6]:
# Helper function to convert IP addresses into integers
def encode_ip(ip_address):
    """Convert an IPv4 address string into an unsigned 32-bit integer.

    The address is parsed with ``socket.inet_aton`` and unpacked as a
    big-endian (network order) unsigned long.

    Args:
        ip_address: Address text such as ``"192.168.0.1"``.

    Returns:
        int: The integer form of the address, or ``0`` when the value cannot
        be parsed. Note that ``"0.0.0.0"`` also encodes to ``0``, so the
        sentinel is indistinguishable from that address.
    """
    try:
        return struct.unpack("!L", socket.inet_aton(ip_address))[0]
    except (OSError, TypeError, ValueError):
        # OSError (alias of socket.error): malformed address text.
        # TypeError: non-string input such as None or a float NaN from a missing cell.
        # ValueError: strings the C layer rejects outright (e.g. embedded NUL).
        return 0


# Info extractor for pcap-type files
def extract_network_info(info_str):
    """Parse a packet "Info" string into a dictionary of fields.

    Args:
        info_str (str): Free-text info column of one packet row.

    Returns:
        dict: Always contains the same keys:
            - ``sending_port`` / ``receiving_port`` (int or None): the two
              numbers around ``>`` when the port pattern matches.
            - ``length`` (int or None): value of ``Len=`` when present after
              the ports.
            - ``seq``, ``ack``, ``win``, ``tsval``, ``tsecr`` (int or None):
              values of ``Seq=``, ``Ack=``, ``Win=``, ``TSval=``, ``TSecr=``.
            - ``sle_sre`` (list of (str, str)): every ``SLE=… SRE=…`` pair,
              kept as digit strings.
            - ``info_messages`` (list of str): the first ``[...]`` group, if any.
            - ``protocol`` (str or None): first token of the port pattern.
    """
    # Dictionary to hold the extracted values. 'protocol' is pre-seeded so the
    # key set is the same whether or not the port pattern matches.
    extracted_info = {
        'sending_port': None,
        'receiving_port': None,
        'length': None,
        'seq': None,
        'ack': None,
        'win': None,
        'tsval': None,
        'tsecr': None,
        'sle_sre': [],
        'info_messages': [],
        'protocol': None,
    }

    # Check for and extract port and length information
    port_match = re.search(r'(\S+)\s+\d+\s+(\d+)\s*>\s*(\d+)(?:.*Len=(\d+))?', info_str)
    if port_match:
        # Protocol and port information
        extracted_info['protocol'] = port_match.group(1)
        extracted_info['sending_port'] = int(port_match.group(2))
        extracted_info['receiving_port'] = int(port_match.group(3))

        # Length information if present
        if port_match.group(4):
            extracted_info['length'] = int(port_match.group(4))

    # Explicit label per key. Deriving the label with str.capitalize() produced
    # 'Tsval' / 'Tsecr', which never matched the 'TSval=' / 'TSecr=' spelling,
    # so both timestamp fields were always None. The old spelling is still accepted.
    field_patterns = {
        'seq': r'Seq=(\d+)',
        'ack': r'Ack=(\d+)',
        'win': r'Win=(\d+)',
        'tsval': r'T[Ss]val=(\d+)',
        'tsecr': r'T[Ss]ecr=(\d+)',
    }
    for key, pattern in field_patterns.items():
        field_match = re.search(pattern, info_str)
        if field_match:
            extracted_info[key] = int(field_match.group(1))

    # Extract SLE and SRE pairs
    extracted_info['sle_sre'] = re.findall(r'SLE=(\d+)\s*SRE=(\d+)', info_str)

    # Capture informational messages like "Ignored Unknown Record" or "Continuation Data"
    # (only the first bracketed group is kept)
    info_msg_match = re.search(r'\[.*?\]', info_str)
    if info_msg_match:
        extracted_info['info_messages'].append(info_msg_match.group(0))

    return extracted_info


Custom Dataset Construction

In [7]:
## The Dataset
class NetworkDataset(Dataset):
    """Torch dataset built from the packet CSV files found in one directory.

    Construction reads every ``*.csv`` file in ``directory_path``, keeps the
    first ``num_entries`` rows, fills missing values, derives extra columns
    (parsed Info fields, integer-encoded addresses, bandwidth/throughput/
    latency) and standardises a subset of the numeric columns.

    The CSV files are expected to provide at least the columns ``Time``,
    ``Source``, ``Destination``, ``Protocol``, ``Length`` and ``Info``.

    Attributes:
        data: pandas DataFrame holding the processed rows.
        scaler: the StandardScaler fitted in ``scale_features``.
    """

    def __init__(self, directory_path, num_entries=20000):
        """Load, truncate, clean, enrich and scale the data.

        Args:
            directory_path: Directory containing the CSV files.
            num_entries: Maximum number of rows kept (taken from the start of
                the concatenated frame).

        Raises:
            ValueError: If the directory contains no ``.csv`` file.
        """
        # os.listdir returns names in arbitrary order; sorting makes the
        # concatenation (and therefore the "first num_entries rows") deterministic.
        csv_files = sorted(
            file for file in os.listdir(directory_path) if file.endswith('.csv')
        )
        if not csv_files:
            raise ValueError(f"No CSV files found in {directory_path!r}")

        # Load and concatenate all CSV files
        self.data = pd.concat(
            [pd.read_csv(os.path.join(directory_path, file)) for file in csv_files],
            ignore_index=True
        )

        # Limit the data to the first num_entries. .copy() detaches the slice from
        # the concatenated frame so later assignments never hit a view.
        self.data = self.data.iloc[:num_entries].copy()

        # Sort data by 'Time' if necessary
        #self.data.sort_values(by='Time', inplace=True)

        # Preprocess data (fill missing values, extract features, encode data)
        self.data = self.data.fillna(self.get_default_fill_values())
        self.extract_and_encode_features()

        # Scaling
        # NOTE(review): the scaler is fitted on every loaded row, so any
        # train/test split made afterwards shares these statistics.
        self.scaler = StandardScaler()
        self.scale_features()

    def get_default_fill_values(self):
        """Return the per-column replacement values used for missing data.

        ``Time`` falls back to the column median, or 0 when the whole column
        is missing. Keys that are not columns of ``self.data`` have no effect
        when passed to ``DataFrame.fillna``.
        """
        return {
            'latency': 0,
            'traffic_type': 'unknown',
            'anomaly_label': 0,
            'bottleneck': 'no',
            'Length': 0,
            'Source': '0.0.0.0',
            'Destination': '0.0.0.0',
            'Protocol': 'unknown',
            'Time': self.data['Time'].median() if not self.data['Time'].isna().all() else 0,
            'Info': 'No Information'
        }

    def extract_and_encode_features(self):
        """Add parsed, encoded and derived columns to ``self.data``."""
        # Parse the Info text of every row. Building one DataFrame from the list
        # of dicts avoids the slow per-row Series construction of
        # ``.apply(pd.Series)``.
        self.data['Info'] = self.data['Info'].astype(str)
        parsed_rows = [extract_network_info(info) for info in self.data['Info']]
        extracted_info = pd.DataFrame(parsed_rows, index=self.data.index)
        self.data = pd.concat([self.data, extracted_info], axis=1)

        # Addresses become integers; protocol names become integer codes.
        self.data['source_encoded'] = self.data['Source'].apply(encode_ip)
        self.data['destination_encoded'] = self.data['Destination'].apply(encode_ip)
        self.data['Protocol'] = pd.factorize(self.data['Protocol'])[0]
        self.data['Time'] = self.data['Time'].astype(float)
        self.data['Length'] = self.data['Length'].astype(float)

        # Precompute BW & Throughput. The first row has no predecessor (diff is
        # NaN -> 0) and the lower clip keeps the divisor strictly positive.
        self.data['Time_diff'] = self.data['Time'].diff().fillna(0).clip(lower=1e-5)
        self.data['Bandwidth'] = self.data['Length'] / self.data['Time_diff']
        self.data['Throughput'] = self.data['Bandwidth']  # Simplified, adjust if different logic needed
        # Derived purely from packet size, not measured.
        self.data['Latency'] = self.data['Length'] / 1000  # Assuming Length is in bytes and Latency is in ms
        # All derived columns are already float (float / float), so no further casts are needed.

    def scale_features(self):
        """Standardise the numeric model columns in place with ``self.scaler``."""
        # NOTE(review): 'source_encoded' and 'destination_encoded' are not in this
        # list, so __getitem__ emits them as raw integers next to standardised
        # values. Confirm this is intended.
        features_to_scale = ['Time', 'Length', 'Time_diff', 'Bandwidth', 'Throughput', 'Latency']
        self.data[features_to_scale] = self.scaler.fit_transform(self.data[features_to_scale])

    def __len__(self):
        """Number of rows in the processed frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return one sample as a dict of float32 tensors.

        Keys:
            ``features``: 1-D tensor ``[Time, source_encoded,
                destination_encoded, Protocol, Length]``.
            ``bandwidth``, ``throughput``, ``latency``: 1-element tensors
                (regression targets).
        """
        row = self.data.iloc[idx]
        features = torch.tensor(
            [row['Time'], row['source_encoded'], row['destination_encoded'], row['Protocol'], row['Length']],
            dtype=torch.float32
        )

        ## Reg Features
        bandwidth = torch.tensor([row['Bandwidth']], dtype=torch.float32)
        throughput = torch.tensor([row['Throughput']], dtype=torch.float32)
        latency = torch.tensor([row['Latency']], dtype=torch.float32)
        # network_flow = Client -> Port -> Switch -> Master Switch -> Firewall -> Inline Network Encrypter -> Router -> Outbound
        # resource_usage = no RAM/CPU data available

        ## Classification targets (not emitted yet)
        #traffic_type = torch.tensor(pd.factorize(data['traffic_type'])[0], dtype=torch.long)
        #anomaly_detection = torch.tensor(data['anomaly_label'].values, dtype=torch.long)
        #network_resource_status = torch.tensor(pd.factorize(data['resource_status'])[0], dtype=torch.long)
        #bottlenecks = torch.tensor(pd.factorize(data['bottleneck'])[0], dtype=torch.long)

        return {
            'features': features,
            'bandwidth': bandwidth,
            'throughput': throughput,
            'latency': latency,
            #'traffic_type': traffic_type,
            #'anomaly_detection': anomaly_detection,
            #'network_resource_status': network_resource_status,
            #'bottlenecks': bottlenecks,
        }


In [8]:
## Using the Dataset
batch_size = 256
# os.path.join instead of the 'data\pcap' literal: '\p' is an invalid escape sequence
# and a backslash is only a path separator on Windows.
dataset = NetworkDataset(os.path.join('data', 'pcap'))
loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

# Training and Test set splits (80 / 20)
train_size = int(len(dataset) * 0.8)
test_size = len(dataset) - train_size
# A seeded generator makes the split identical on every Restart & Run All.
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset, [train_size, test_size], generator=split_generator
)

# On Windows, DataLoader workers are spawned and must re-import the dataset class;
# a class defined in a notebook cell is not importable from a worker, so loading
# in the main process is the safe choice there. Other platforms keep 4 workers.
num_workers = 0 if os.name == 'nt' else 4
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size, num_workers=num_workers)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size, num_workers=num_workers)

In [9]:
# Preview: print the tensors of the first batch only, then stop iterating.
for data in loader:
    features, bandwidth, throughput, latency = (
        data['features'],
        data['bandwidth'],
        data['throughput'],
        data['latency'],
    )
    print(features, bandwidth, throughput, latency)
    break

tensor([[-1.5510e+00,  3.9003e+08,  6.5837e+08,  0.0000e+00,  7.9091e-01],
        [-1.5510e+00,  3.9003e+08,  6.5837e+08,  1.0000e+00,  7.8577e-01],
        [-1.5510e+00,  1.7289e+09,  2.3501e+08,  0.0000e+00,  7.2417e-01],
        ...,
        [-1.5458e+00,  2.3514e+08,  1.7329e+09,  2.0000e+00, -1.0137e+00],
        [-1.5458e+00,  2.3514e+08,  1.7329e+09,  2.0000e+00, -1.0240e+00],
        [-1.5458e+00,  2.3501e+08,  1.7329e+09,  1.0000e+00, -1.0009e+00]]) tensor([[ 1.0861],
        [ 1.0806],
        [ 0.5605],
        [ 1.0152],
        [ 1.0261],
        [-0.2765],
        [ 1.0915],
        [ 1.0861],
        [ 1.0806],
        [-0.4924],
        [ 1.0915],
        [-0.4490],
        [ 0.8054],
        [-0.3480],
        [ 0.8108],
        [-0.9497],
        [-0.8488],
        [-0.8733],
        [-0.8433],
        [ 0.2670],
        [ 2.9501],
        [-0.9367],
        [-0.8651],
        [-0.8336],
        [ 0.8054],
        [ 0.8163],
        [-0.7164],
        [ 0.8054],
    

In [10]:
# Hyperparameters shared by the model cells below.
# input_size matches the 5-element 'features' tensor produced by the dataset.
input_size, hidden_size, output_size = 5, 128, 1
dropout, bidirectional = 0.2, False

# Training setup
epochs = 10
criterion = nn.MSELoss()

# RNN model, moved to the selected device, with its own Adam optimizer
model = project_models.ntwkRNN(
    input_size, hidden_size, output_size, dropout, bidirectional
).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Presumably returns (train loss, validation loss, validation accuracy) histories;
# confirm against project_models.train_RNN.
rnn_tloss, rnn_vloss, rnn_vacc = project_models.train_RNN(
    epochs, model, criterion, optimizer, train_loader, test_loader, device
)



In [None]:
# GRU: new model and optimizer, trained through project_models.train_RNN
model = project_models.ntwkGRU(
    input_size, hidden_size, output_size, dropout, bidirectional
).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)

GRU_tloss, GRU_vloss, GRU_vacc = project_models.train_RNN(
    epochs, model, criterion, optimizer, train_loader, test_loader, device
)

In [None]:
# LSTM: new model and optimizer, trained through project_models.train_RNN
model = project_models.ntwkLSTM(
    input_size, hidden_size, output_size, dropout, bidirectional
).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)

lstm_tloss, lstm_vloss, lstm_vacc = project_models.train_RNN(
    epochs, model, criterion, optimizer, train_loader, test_loader, device
)

In [None]:
# Transformer: new model and optimizer, trained through project_models.train_Transformer
# NOTE(review): nhead = 3 divides neither input_size (5) nor hidden_size (128); if the
# model feeds either into multi-head attention as the embedding size this will fail.
# Confirm against ntwkPETransformer.
model = project_models.ntwkPETransformer(
    input_size, hidden_size, output_size, num_layers = 5, nhead = 3
).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)

# NOTE(review): argument order here is (model, epochs, ...) whereas train_RNN is called
# with (epochs, model, ...). Verify this matches train_Transformer's signature.
Trans_tloss, Trans_vloss, Trans_vacc = project_models.train_Transformer(
    model, epochs, criterion, optimizer, train_loader, test_loader, device
)