In [2]:
from pathlib import Path
import random
import os

import numpy as np
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

In [3]:
def set_seed(seed):
    """
    Make runs repeatable by seeding every random number generator in use.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU, plus the CUDA
    generators when CUDA is available), then puts cuDNN into deterministic
    mode with autotuning (benchmark) switched off.

    Args:
        seed (int): The seed value handed to every generator.
    """
    seeders = [random.seed, np.random.seed, torch.manual_seed]
    if torch.cuda.is_available():
        # Current device first, then every visible device.
        seeders += [torch.cuda.manual_seed, torch.cuda.manual_seed_all]

    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seed(42)

In [4]:
# GPU index used on the original multi-GPU machine. On hosts with fewer GPUs
# "cuda:2" would name a device that does not exist and fail later, on the first
# `.to(device)`, so fall back to the first GPU in that case.
PREFERRED_GPU_INDEX = 2

if torch.cuda.is_available():
    gpu_index = PREFERRED_GPU_INDEX if torch.cuda.device_count() > PREFERRED_GPU_INDEX else 0
    device = torch.device(f"cuda:{gpu_index}")
    print("CUDA is available. Using GPU")
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU.")

CUDA is available. Using GPU


## Data

In [5]:
import os
import glob
import numpy as np
import dpkt
import socket
import torch
from torch.utils.data import Dataset, DataLoader
from FlowPic.sessions_plotter import session_2d_histogram  # Assuming FlowPic import
from tqdm import tqdm  # Progress bar
from pathlib import Path

MTU = 1500  # Maximum Transmission Unit (packet size limit)
DELTA_T = 60  # Offset between the starts of consecutive time windows of a session
TPS = 60  # Length of each time window ("time per session")
MIN_TPS = 50  # Minimum time per session to consider (not enforced by the code below)

# Traffic classes; a capture is tagged with every class whose name occurs in its file name
LABELS = ["netflix", "youtube", "rdp", "rsync", "scp", "sftp", "skype-chat", "ssh", "vimeo", "voip"]

class FlowPicDataset(Dataset):
    """
    In-memory dataset of FlowPic histograms built from a directory of PCAP files.

    All captures are processed eagerly in ``__init__``: each file is parsed into
    sessions, every session is cut into time windows, and each window becomes one
    FlowPic sample labelled from the capture's file name. Per-file results are
    cached as ``.npz`` archives (in sparse form) so later runs skip the parsing.
    """

    def __init__(self, pcap_dir, cache_dir=None, max_files=30):
        """
        Initialize the dataset with a directory of PCAP files. It reads and stores session data.
        Args:
            pcap_dir (str): Path to the directory containing PCAP files.
            cache_dir (str): Path to directory where cached .npz files are stored.
                Defaults to ``<pcap_dir>/flowpic_cache``.
            max_files (int | None): Upper bound on the number of PCAP files used
                (``None`` uses all of them). Defaults to 30.
        """
        # glob returns files in arbitrary order; sort so the same files are
        # selected on every run and on every machine.
        self.pcap_files = sorted(glob.glob(os.path.join(pcap_dir, "*.pcap")))[:max_files]
        self.Xs = []
        self.ys = []

        if not cache_dir:
            cache_dir = Path(pcap_dir) / 'flowpic_cache'
        self.cache_dir = Path(cache_dir)
        # parents=True so a nested, not-yet-existing cache location also works
        self.cache_dir.mkdir(parents=True, exist_ok=True)

        # Process each PCAP file in the directory
        for pcap_file in tqdm(self.pcap_files):
            cache_file = self.cache_dir / f"{Path(pcap_file).stem}.npz"  # Use the file name without an extension

            if cache_file.exists():
                sparse_Xs, ys = self._load_cache(cache_file)
            else:
                print(f"{cache_file} does not exist, creating it...")
                sparse_Xs, ys = self._build_cache(pcap_file, cache_file)

            # Samples are kept dense in memory; only the cache is sparse
            self.Xs.extend(self._to_dense(d) for d in sparse_Xs)
            self.ys.extend(ys)

    @staticmethod
    def _load_cache(cache_file):
        """Read the sparse FlowPic dicts and labels of one capture from its cache file."""
        # NOTE(security): the samples are stored as pickled dicts, hence
        # allow_pickle=True. Unpickling can execute arbitrary code, so only
        # load cache files that were produced by this class.
        with np.load(cache_file, allow_pickle=True) as cached:
            # Materialise inside the `with` so the archive handle can be closed
            return list(cached['Xs']), list(cached['ys'])

    def _build_cache(self, pcap_file, cache_file):
        """Parse one capture, write its cache file and return (sparse dicts, labels)."""
        label = self.extract_label(pcap_file)
        flowpics = self.sessions_to_flowpic(self.parse_pcap(pcap_file))

        sparse_Xs = [self.to_sparse_dict(flowpic) for flowpic in flowpics]
        ys = [label for _ in sparse_Xs]

        # Write under a temporary name and move it into place afterwards, so an
        # interrupted run cannot leave a truncated file under the real cache name.
        tmp_file = f"{cache_file}.npz"
        np.savez(tmp_file, Xs=sparse_Xs, ys=np.array(ys, dtype=object))
        os.replace(tmp_file, cache_file)
        return sparse_Xs, ys

    @staticmethod
    def _to_dense(sparse):
        """Rebuild the dense numpy FlowPic from a dict produced by `to_sparse_dict`."""
        sparse_tensor = torch.sparse_coo_tensor(sparse['indices'], sparse['values'], sparse['size'])
        return sparse_tensor.to_dense().numpy()

    def extract_label(self, file_path):
        """
        Extract the multi-label based on the file name.

        Only the file name is inspected (case-insensitively); parent directory
        names are ignored so a folder such as ``ssh_backup/`` cannot tag every
        capture stored inside it.

        Args:
            file_path (str | Path): Path to the pcap file.
        Returns:
            label (list): Binary list corresponding to LABELS.
        """
        file_name = os.path.basename(str(file_path)).lower()
        return [1 if name in file_name else 0 for name in LABELS]

    def parse_pcap(self, pcap_path):
        """
        Parse a pcap file assuming raw IP packets (no Ethernet headers).

        Packets that do not unpack as IP, or that carry neither TCP nor UDP,
        are skipped. Sessions are directional: the key contains source and
        destination in packet order.

        Args:
            pcap_path (str): Path to the pcap file.
        Returns:
            dict: Maps ``(src_ip, src_port, dst_ip, dst_port, protocol_name)`` to
            ``(first_timestamp, [time offsets from first packet], [packet sizes])``.
        """
        sessions = {}

        with open(pcap_path, 'rb') as f:
            for ts, packet in dpkt.pcap.Reader(f):
                try:
                    ip = dpkt.ip.IP(packet)  # Directly treat the packet as an IP packet
                except dpkt.UnpackError:
                    continue  # Skip if it's not a valid IP packet

                proto = ip.data
                if not isinstance(proto, (dpkt.tcp.TCP, dpkt.udp.UDP)):
                    continue  # Only TCP and UDP are handled

                session_key = (
                    socket.inet_ntoa(ip.src), proto.sport,
                    socket.inet_ntoa(ip.dst), proto.dport,
                    type(proto).__name__,
                )
                # The first packet of a session fixes the reference timestamp
                start_ts, offsets, sizes = sessions.setdefault(session_key, (ts, [], []))
                offsets.append(ts - start_ts)
                sizes.append(len(ip))

        return sessions

    def sessions_to_flowpic(self, sessions):
        """
        Convert session data to FlowPic format.

        Each session with more than one packet is cut into windows of length
        TPS whose starts are DELTA_T apart; every window holding more than one
        packet is passed to ``session_2d_histogram`` and the result collected.

        Args:
            sessions (dict): Output of `parse_pcap`.
        Returns:
            list: One entry per window, as returned by ``session_2d_histogram``
            (presumably a 2-D histogram -- confirm against FlowPic.sessions_plotter).
        """
        dataset = []
        for session_key, (start_ts, ts_list, sizes) in sessions.items():
            ts = np.array(ts_list)
            sizes = np.array(sizes)

            if len(ts) <= 1:  # Consider sessions with at least 2 packets
                continue

            # int() truncates toward zero, so a session shorter than TPS (but
            # with a non-zero duration) still yields exactly one window.
            n_windows = int(ts[-1] / DELTA_T - TPS / DELTA_T) + 1
            for t in range(n_windows):
                mask = (ts >= t * DELTA_T) & (ts <= (t * DELTA_T + TPS))
                ts_mask = ts[mask]
                sizes_mask = sizes[mask]

                if len(ts_mask) > 1:  # Adjust packet count check here
                    dataset.append(session_2d_histogram(ts_mask, sizes_mask))

        return dataset

    def to_sparse_dict(self, flowpic):
        """
        Convert a dense FlowPic matrix to a dictionary of sparse representation (indices, values, size).

        Args:
            flowpic (array-like): Dense FlowPic; converted to float32.
        Returns:
            dict: ``'indices'`` (array of shape ``(ndim, nnz)``), ``'values'``
            (the non-zero entries, float32) and ``'size'`` (the dense shape).
        """
        flowpic = np.array(flowpic, dtype=np.float32)
        indices = np.nonzero(flowpic)
        values = flowpic[indices]
        size = flowpic.shape
        return {'indices': np.array(indices), 'values': values, 'size': size}

    def __len__(self):
        """
        Return the total number of FlowPic samples in the dataset.
        """
        return len(self.Xs)

    def __getitem__(self, idx):
        """
        Return the FlowPic representation of a session along with its label.

        Args:
            idx (int): Sample index.
        Returns:
            tuple: ``(flowpic, label)``, both float32 tensors.
        """
        flowpic, label = self.Xs[idx], self.ys[idx]

        # Ensure the flowpic is a numpy array of type float32 before converting to tensor
        flowpic = np.array(flowpic, dtype=np.float32)
        label = np.array(label, dtype=np.float32)

        return torch.tensor(flowpic), torch.tensor(label, dtype=torch.float32)


# Helper function to create DataLoader
def create_dataloader(pcap_dir, batch_size=32, shuffle=True, num_workers=4, cache_dir=None):
    """
    Build a FlowPicDataset from a PCAP directory and wrap it in a DataLoader.
    Args:
        pcap_dir (str): Path to the directory containing PCAP files.
        batch_size (int): Number of samples per batch.
        shuffle (bool): Whether to shuffle the data.
        num_workers (int): Number of subprocesses to use for data loading.
        cache_dir (str): Directory holding the cached .npz files; forwarded to FlowPicDataset.
    Returns:
        DataLoader: Loader over the freshly constructed FlowPicDataset.
    """
    return DataLoader(
        FlowPicDataset(pcap_dir, cache_dir=cache_dir),
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
    )


# Example usage
pcap_dir = Path('/home/anatbr/students/noamshakedc/da4etc/data/vpnnonvpn')  # Directory holding the PCAP captures
dataloader = create_dataloader(pcap_dir, batch_size=16)

# Smoke test: pull the first six batches and keep their label tensors
labelss = []
for batch_idx, (batch, labels) in enumerate(dataloader):
    labelss.append(labels)
    if batch_idx >= 5:
        break


100%|███████████████████████████████████████████████████████████████████████████████████████████| 30/30 [02:42<00:00,  5.40s/it]


## Model

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import random_split

# Number of classes based on the LABELS list
num_classes = len(LABELS)  # This will be the output size of the model
learning_rate = 1e-3

class TrafficCNN(nn.Module):
    """
    Two-layer convolutional classifier for single-channel FlowPic images.

    Pipeline: conv(10x10, stride 5) -> ReLU -> max-pool -> conv(10x10, stride 5)
    -> ReLU -> max-pool -> resize to 15x15 -> dense(64) -> dropout -> dense ->
    log-softmax.

    The dense layer is sized for a 20 x 15 x 15 feature map. An adaptive
    max-pool brings whatever the convolutions produce to that size: it is an
    exact identity when the map is already 15 x 15 and it has no parameters,
    so existing checkpoints stay loadable. Without it, any other input
    resolution either crashed in the flatten step or silently mixed samples
    across the batch.

    Args:
        num_classes (int | None): Number of output classes. ``None`` (default)
            uses ``len(LABELS)``, resolved when the model is constructed.
    """

    # Spatial size expected by fc1
    FEATURE_MAP_SIZE = (15, 15)

    def __init__(self, num_classes=None):
        super(TrafficCNN, self).__init__()
        if num_classes is None:
            num_classes = len(LABELS)
        self.conv1 = nn.Conv2d(1, 10, kernel_size=10, stride=5, padding=1)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=10, stride=5, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fit = nn.AdaptiveMaxPool2d(self.FEATURE_MAP_SIZE)
        self.fc1 = nn.Linear(20 * 15 * 15, 64)
        self.fc2 = nn.Linear(64, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """
        Args:
            x (Tensor): Batch of images shaped ``(batch, 1, height, width)``.
        Returns:
            Tensor: Log-probabilities shaped ``(batch, num_classes)``.
        """
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.fit(x)      # guarantee the 15x15 map fc1 was sized for
        x = x.flatten(1)     # keep the batch dimension fixed, flatten the rest
        x = self.dropout(F.relu(self.fc1(x)))
        return F.log_softmax(self.fc2(x), dim=1)  # Multi-class classification

# Create train and validation split
# Reuse the dataset that `create_dataloader` already loaded instead of relying on a
# `dataset` variable left over from an earlier kernel session (a fresh
# "Restart & Run All" would otherwise stop here with a NameError).
dataset = dataloader.dataset
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, num_workers=4)

# Model initialization
# The model stays on the CPU here; call `model.to(device)` only if the training
# loop also moves every batch to that device.
model = TrafficCNN(num_classes=num_classes)

# Optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()  # For multi-class classification

# Training loop
def train(model, train_loader, val_loader, num_epochs, optimizer=None, criterion=None):
    """
    Train `model` and report validation loss/accuracy after every epoch.

    Batches are moved to the device the model's parameters live on, so the
    loop works unchanged for a CPU or a GPU model. Labels arrive as multi-hot
    vectors and are reduced to one class index per sample with ``argmax``
    (the first set class wins when several are set).

    Args:
        model (nn.Module): Network taking ``(batch, 1, H, W)`` inputs.
        train_loader (iterable): Yields ``(inputs, labels)`` with inputs shaped
            ``(batch, H, W)``; must support ``len()``.
        val_loader (iterable): Same format; may be empty, in which case the
            validation report is skipped.
        num_epochs (int): Number of passes over `train_loader`.
        optimizer (torch.optim.Optimizer | None): Optimizer to step. ``None``
            falls back to the notebook-level ``optimizer``.
        criterion (callable | None): Loss taking ``(outputs, class_indices)``.
            ``None`` falls back to the notebook-level ``criterion``.
    """
    # Preserve the original behaviour of using the notebook globals by default
    if optimizer is None:
        optimizer = globals()["optimizer"]
    if criterion is None:
        criterion = globals()["criterion"]

    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            targets = torch.argmax(labels, dim=1)  # CrossEntropyLoss expects class indices, not one-hot
            optimizer.zero_grad()

            # Forward pass
            outputs = model(inputs.unsqueeze(1))  # Add channel dimension for Conv2D
            loss = criterion(outputs, targets)

            # Backward pass and optimization
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # max(..., 1) avoids a ZeroDivisionError for an empty training loader
        print(f'Epoch {epoch+1}, Loss: {running_loss / max(len(train_loader), 1)}')

        # Validation phase
        model.eval()
        correct = 0
        total = 0
        val_loss = 0.0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                targets = torch.argmax(labels, dim=1)
                outputs = model(inputs.unsqueeze(1))  # Add channel dimension for Conv2D
                val_loss += criterion(outputs, targets).item()

                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == targets).sum().item()

        if total == 0:
            # A tiny dataset can leave the 20% validation split empty
            print('Validation skipped: val_loader produced no samples.')
            continue

        print(f'Validation Loss: {val_loss / len(val_loader)}, Accuracy: {100 * correct / total}%')

# Running the training
# `num_epochs` was never defined in this notebook; use the same value as the
# generated main.py so a fresh "Restart & Run All" does not stop with a NameError.
num_epochs = 10
train(model, train_loader, val_loader, num_epochs)


KeyboardInterrupt



In [None]:
import os

# Export the notebook's pieces as a small Python package plus an entry script.
# The templates are defined first; everything is written to disk in one loop below.
flowpic_dir = 'FlowPic'

# Template for FlowPic/utils.py (set_seed and device setup)
utils_py = """
import random
import numpy as np
import torch

def set_seed(seed):
    \"\"\"
    Sets the random seed for reproducibility.
    Args:
        seed (int): The seed value to set.
    \"\"\"
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def get_device():
    if torch.cuda.is_available():
        device = torch.device("cuda:2")
        print("CUDA is available. Using GPU")
    else:
        device = torch.device("cpu")
        print("CUDA is not available. Using CPU.")
    return device
"""

# Template for FlowPic/model.py (model definition)
model_py = """
import torch.nn as nn
import torch.nn.functional as F

class FlowPicCNN(nn.Module):
    def __init__(self, num_classes):
        super(FlowPicCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(64 * 37 * 37, 128)  # Adjust based on input image size
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 64 * 37 * 37)  # Flatten the tensor
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x
"""

# Template for FlowPic/train.py (training and validation logic)
train_py = """
import torch
import torch.nn as nn
import torch.optim as optim

def train(model, train_loader, val_loader, num_epochs, device):
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()

            # Forward pass
            outputs = model(inputs.unsqueeze(1))  # Add channel dimension for Conv2D
            loss = criterion(outputs, torch.argmax(labels, dim=1))  # CrossEntropyLoss expects class indices

            # Backward pass and optimization
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        print(f'Epoch {epoch+1}, Loss: {running_loss / len(train_loader)}')

        # Validation phase
        model.eval()
        correct = 0
        total = 0
        val_loss = 0.0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs.unsqueeze(1))  # Add channel dimension for Conv2D
                loss = criterion(outputs, torch.argmax(labels, dim=1))
                val_loss += loss.item()

                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == torch.argmax(labels, dim=1)).sum().item()

        print(f'Validation Loss: {val_loss / len(val_loader)}, Accuracy: {100 * correct / total}%')
"""

# Template for the top-level main.py entry script
main_py = """
import torch
from torch.utils.data import DataLoader, TensorDataset
from FlowPic.utils import set_seed, get_device
from FlowPic.model import FlowPicCNN
from FlowPic.train import train

def main():
    # Set the seed
    set_seed(42)

    # Get the device (GPU or CPU)
    device = get_device()

    # Assuming data is loaded somewhere here (train_loader, val_loader)
    # Example:
    # train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    # val_loader = DataLoader(val_dataset, batch_size=32)

    # Initialize the model and move it to the device
    num_classes = 10  # Adjust based on your problem
    model = FlowPicCNN(num_classes).to(device)

    # Run training
    num_epochs = 10  # Adjust as needed
    train(model, train_loader, val_loader, num_epochs, device)

if __name__ == "__main__":
    main()
"""

# Make sure the package directory exists, then write each template in turn:
# the three modules go inside FlowPic/, main.py goes next to it.
os.makedirs(flowpic_dir, exist_ok=True)

generated_files = [
    (os.path.join(flowpic_dir, 'utils.py'), utils_py),
    (os.path.join(flowpic_dir, 'model.py'), model_py),
    (os.path.join(flowpic_dir, 'train.py'), train_py),
    ('main.py', main_py),
]
for target_path, source_text in generated_files:
    with open(target_path, 'w') as f:
        f.write(source_text)

# Cell output: confirmation message shown to the user
"Files have been successfully created in the FlowPic directory and main.py."
