In [8]:
# Libraries
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import requests
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import time

from torch.utils.data import DataLoader, Dataset
import pandas as pd
import os, re, struct, socket

import import_ipynb
import project_models


In [9]:
#GPU Checking
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("GPU is not available. Using CPU.")

Using GPU: NVIDIA GeForce RTX 3060


Data Preprocessing

In [18]:
#IDS2017_Monday ==== No anomalies
IDS2017_data = 'data\TrafficLabelling\Monday-WorkingHours.pcap_ISCX.csv'
IDS2017_DF = pd.DataFrame(pd.read_csv(IDS2017_data))


IDS2017_features = IDS2017_DF.iloc[:, [6, 1, 3, 2, 4, 5, 7, 84]] # 85 Columns
# Summing the total length of forward and backward packets
IDS2017_features['Total Length'] = IDS2017_DF.iloc[:, 11] + IDS2017_DF.iloc[:, 12]

IDS2017_features.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  IDS2017_features['Total Length'] = IDS2017_DF.iloc[:, 11] + IDS2017_DF.iloc[:, 12]


Unnamed: 0,Timestamp,Source IP,Destination IP,Source Port,Destination Port,Protocol,Flow Duration,Label,Total Length
0,03/07/2017 08:55:58,8.254.250.126,192.168.10.5,80,49188,6,4,BENIGN,6.0
1,03/07/2017 08:55:58,8.254.250.126,192.168.10.5,80,49188,6,1,BENIGN,6.0
2,03/07/2017 08:55:58,8.254.250.126,192.168.10.5,80,49188,6,1,BENIGN,6.0
3,03/07/2017 08:55:58,8.254.250.126,192.168.10.5,80,49188,6,1,BENIGN,6.0
4,03/07/2017 08:56:22,8.253.185.121,192.168.10.14,80,49486,6,3,BENIGN,6.0


In [26]:
## KDDcup Corrected File
KDD_fp = 'data\kddcup\kddcup_corrected.csv'
KDD_DF = pd.DataFrame(pd.read_csv(KDD_fp))
KDD_DF.head()

Unnamed: 0,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,type
0,tcp,http,SF,215,45076,0,0,0,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal.
1,tcp,http,SF,162,4528,0,0,0,0,0,...,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,normal.
2,tcp,http,SF,236,1228,0,0,0,0,0,...,2,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,normal.
3,tcp,http,SF,233,2032,0,0,0,0,0,...,3,1.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,normal.
4,tcp,http,SF,239,486,0,0,0,0,0,...,4,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,normal.


In [27]:
# Helper function to convert IP addresses into integers
def ip_to_int(ip_address):
    try:
        return struct.unpack("!L", socket.inet_aton(ip_address))[0]
    except socket.error:
        # Handle the case for invalid IP addresses
        return 0


#Info Extracter for Pcap type files
def extract_network_info(info_str):
    # Dictionary to hold the extracted values
    extracted_info = {
        'sending_port': None,
        'receiving_port': None,
        'length': None,
        'seq': None,
        'ack': None,
        'win': None,
        'tsval': None,
        'tsecr': None,
        'sle_sre': [],
        'info_messages': [],
    }
    
    # Check for and extract port and length information
    match = re.search(r'(\S+)\s+\d+\s+(\d+)\s*>\s*(\d+)(?:.*Len=(\d+))?', entry)
    if match:
        # Protocol and port information
        extracted_info['protocol'] = match.group(1)
        extracted_info['sending_port'] = int(match.group(2))
        extracted_info['receiving_port'] = int(match.group(3))
        
        # Length information if present
        if match.group(4):
            extracted_info['length'] = int(match.group(4))

    # Extract other information if present
    for key in ['seq', 'ack', 'win', 'tsval', 'tsecr']:
        match = re.search(r'{}=(\d+)'.format(key.capitalize()), info_str)
        if match:
            extracted_info[key] = int(match.group(1))

    # Extract SLE and SRE pairs
    extracted_info['sle_sre'] = re.findall(r'SLE=(\d+)\s*SRE=(\d+)', info_str)
    
    # Capture informational messages like "Ignored Unknown Record" or "Continuation Data"
    info_msg_match = re.search(r'\[.*?\]', info_str)
    if info_msg_match:
        extracted_info['info_messages'].append(info_msg_match.group(0))

    return extracted_info





Custom Dataset Construction

In [28]:
## The Dataset
class NetworkDataset(Dataset):
    def __init__(self, directory_path):
        # Assuming CSV files are named in a sequential manner or have identifiable patterns
        self.files = [os.path.join(directory_path, file) for file in os.listdir(directory_path) if file.endswith('.csv')]
        self.files.sort()  # Ensure the order is correct

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        data = pd.read_csv(self.files[idx])
        info = data['Info'].apply(extract_network_info)
        
        # Encode source and destination IPs
        data['source_encoded'] = data['Source'].apply(lambda ip: struct.unpack("!L", socket.inet_aton(ip))[0])
        data['destination_encoded'] = data['Destination'].apply(lambda ip: struct.unpack("!L", socket.inet_aton(ip))[0])

        features = torch.tensor(data[['Time', 'source_encoded', 'destination_encoded', 'Protocol', 'Length']].values, dtype=torch.float32)


        # Regression targets
        latency = torch.tensor(data['latency'].values, dtype=torch.float32) # milliseconds usually
        bandwidth = 
        throughput = 
        # network_flow = Client -> Port -> Switch -> Master Swtich -> Firewall -> Inline Network Encryter -> Router -> Outbound                 
        # resource_usage = no RAM/CPU data available 

        # Classification targets
        traffic_type = torch.tensor(pd.factorize(data['traffic_type'])[0], dtype=torch.long)
        anomaly_detection = 
        network_resource_status = 
        bottlenecks = 


        ## Ports
        # Parsing port numbers from the Info column
        data['ports'] = data['Info'].apply(lambda x: re.findall(r'\d+', x))
       
        # Combine ports into a single feature
        data['combined_ports'] = data['ports'].apply(lambda x: f"{x[0]}-{x[1]}" if len(x) > 1 else None)
        #Factorize Ports
        combined_ports = torch.tensor(pd.factorize(data['combined_ports'])[0], dtype=torch.int64)
        
        
        
        return info, features, latency, traffic_type, combined_ports
    

## Using the Dataset
dataset = NetworkDataset('data')
loader = DataLoader(dataset, batch_size=1, shuffle=False)

# Create data loaders
'''
batch_size = 1 # BATCH SIZE MUST BE 1 --> EACH CSV IS ITS OWN BATCH
train_size = int(len(dataset) * 0.8)
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)
'''

SyntaxError: invalid syntax (2320490746.py, line 24)