<a href="https://colab.research.google.com/github/SanyaKapoor/netFound/blob/main/Part_III_NetFound_vs_DeepPacket.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Comparison with DeepPacket

### DeepPacket expects Parquet files; however, we are working with pcaps, so we will convert them to the desired format and then evaluate the pre-trained model.

In [1]:
# Colab setup: install scapy, which the packet-parsing code below imports.
!pip install scapy

Collecting scapy
  Downloading scapy-2.6.1-py3-none-any.whl.metadata (5.6 kB)
Downloading scapy-2.6.1-py3-none-any.whl (2.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scapy
Successfully installed scapy-2.6.1


In [2]:
# Colab setup: pytorch-lightning is pinned to 1.8.5; `datasets` is left unpinned.
# Both are imported by the model/training code in the next cell.
!pip install pytorch-lightning==1.8.5
!pip install datasets

Collecting pytorch-lightning==1.8.5
  Downloading pytorch_lightning-1.8.5-py3-none-any.whl.metadata (25 kB)
Collecting tensorboardX>=2.2 (from pytorch-lightning==1.8.5)
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting torchmetrics>=0.7.0 (from pytorch-lightning==1.8.5)
  Downloading torchmetrics-1.7.1-py3-none-any.whl.metadata (21 kB)
Collecting lightning-utilities!=0.4.0,>=0.3.0 (from pytorch-lightning==1.8.5)
  Downloading lightning_utilities-0.14.3-py3-none-any.whl.metadata (5.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.9.0->pytorch-lightning==1.8.5)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.9.0->pytorch-lightning==1.8.5)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.9.0->pytorch-lightning==1.8.5)
  D

In [3]:
def train_cnn(
    c1_kernel_size,
    c1_output_dim,
    c1_stride,
    c2_kernel_size,
    c2_output_dim,
    c2_stride,
    output_dim,
    data_path,
    epoch,
    model_path,
    signal_length,
    logger,
):
    """Train a ``CNN`` model and, if a destination is given, save a checkpoint.

    Args:
        c1_kernel_size, c1_output_dim, c1_stride: settings of the first conv layer.
        c2_kernel_size, c2_output_dim, c2_stride: settings of the second conv layer.
        output_dim: number of output classes.
        data_path: dataset location forwarded to ``CNN`` (used by its dataloader).
        epoch: maximum number of training epochs.
        model_path: checkpoint destination; when falsy (``None`` / ``""``) the
            model is trained but no checkpoint is written.
        signal_length: length of each input signal.
        logger: logger object handed to the ``Trainer``.
    """
    # prepare dir for model path
    if model_path:
        model_path = Path(model_path)
        model_path.parent.mkdir(parents=True, exist_ok=True)

    # seed everything
    seed_everything(seed=9876, workers=True)

    model = CNN(
        c1_kernel_size=c1_kernel_size,
        c1_output_dim=c1_output_dim,
        c1_stride=c1_stride,
        c2_kernel_size=c2_kernel_size,
        c2_output_dim=c2_output_dim,
        c2_stride=c2_stride,
        output_dim=output_dim,
        data_path=data_path,
        signal_length=signal_length,
    ).float()
    trainer = Trainer(
        val_check_interval=1.0,
        max_epochs=epoch,
        devices="auto",
        accelerator="auto",
        logger=logger,
        callbacks=[
            # stop early once the logged training loss stops improving
            EarlyStopping(
                monitor="training_loss", mode="min", check_on_train_epoch_end=True
            )
        ],
    )
    trainer.fit(model)

    # save model -- only when a destination was supplied; previously a falsy
    # model_path crashed here (after the full training run) on `.absolute()`
    if model_path:
        trainer.save_checkpoint(str(model_path.absolute()))


def train_resnet(
    c1_kernel_size,
    c1_output_dim,
    c1_stride,
    c1_groups,
    c1_n_block,
    output_dim,
    data_path,
    epoch,
    model_path,
    signal_length,
    logger,
):
    """Train a ``ResNet`` model and, if a destination is given, save a checkpoint.

    Args:
        c1_kernel_size, c1_output_dim, c1_stride, c1_groups, c1_n_block:
            settings forwarded to the ``ResNet`` constructor.
        output_dim: number of output classes.
        data_path: dataset location forwarded to ``ResNet`` (used by its dataloader).
        epoch: maximum number of training epochs.
        model_path: checkpoint destination; when falsy (``None`` / ``""``) the
            model is trained but no checkpoint is written.
        signal_length: length of each input signal.
        logger: logger object handed to the ``Trainer``.
    """
    # prepare dir for model path
    if model_path:
        model_path = Path(model_path)
        model_path.parent.mkdir(parents=True, exist_ok=True)

    # seed everything
    seed_everything(seed=9876, workers=True)

    model = ResNet(
        c1_kernel_size=c1_kernel_size,
        c1_output_dim=c1_output_dim,
        c1_stride=c1_stride,
        c1_groups=c1_groups,
        c1_n_block=c1_n_block,
        output_dim=output_dim,
        data_path=data_path,
        signal_length=signal_length,
    ).float()
    trainer = Trainer(
        val_check_interval=1.0,
        max_epochs=epoch,
        devices="auto",
        accelerator="auto",
        logger=logger,
        callbacks=[
            # stop early once the logged training loss stops improving
            EarlyStopping(
                monitor="training_loss", mode="min", check_on_train_epoch_end=True
            )
        ],
    )
    trainer.fit(model)

    # save model -- only when a destination was supplied; previously a falsy
    # model_path crashed here (after the full training run) on `.absolute()`
    if model_path:
        trainer.save_checkpoint(str(model_path.absolute()))


def train_application_classification_cnn_model(data_path, model_path):
    """Train the CNN with the application-classification preset (17 classes, 20 epochs)."""
    tb_logger = TensorBoardLogger(
        "application_classification_cnn_logs", "application_classification_cnn"
    )
    # fixed hyperparameters of this preset
    preset = dict(
        c1_kernel_size=4,
        c1_output_dim=200,
        c1_stride=3,
        c2_kernel_size=5,
        c2_output_dim=200,
        c2_stride=1,
        output_dim=17,
        epoch=20,
        signal_length=1500,
    )
    train_cnn(data_path=data_path, model_path=model_path, logger=tb_logger, **preset)


def train_application_classification_resnet_model(data_path, model_path):
    """Train the ResNet with the application-classification preset (17 classes, 40 epochs)."""
    tb_logger = TensorBoardLogger(
        "application_classification_resnet_logs", "application_classification_resnet"
    )
    # fixed hyperparameters of this preset
    preset = dict(
        c1_kernel_size=4,
        c1_output_dim=16,
        c1_stride=3,
        c1_groups=1,
        c1_n_block=4,
        output_dim=17,
        epoch=40,
        signal_length=1500,
    )
    train_resnet(data_path=data_path, model_path=model_path, logger=tb_logger, **preset)


def train_traffic_classification_cnn_model(data_path, model_path):
    """Train the CNN with the traffic-classification preset (12 classes, 20 epochs)."""
    tb_logger = TensorBoardLogger(
        "traffic_classification_cnn_logs", "traffic_classification_cnn"
    )
    # fixed hyperparameters of this preset
    preset = dict(
        c1_kernel_size=5,
        c1_output_dim=200,
        c1_stride=3,
        c2_kernel_size=4,
        c2_output_dim=200,
        c2_stride=3,
        output_dim=12,
        epoch=20,
        signal_length=1500,
    )
    train_cnn(data_path=data_path, model_path=model_path, logger=tb_logger, **preset)


def train_traffic_classification_resnet_model(data_path, model_path):
    """Train the ResNet with the traffic-classification preset (12 classes, 40 epochs)."""
    tb_logger = TensorBoardLogger(
        "traffic_classification_resnet_logs", "traffic_classification_resnet"
    )
    # fixed hyperparameters of this preset
    preset = dict(
        c1_kernel_size=5,
        c1_output_dim=16,
        c1_stride=3,
        c1_groups=1,
        c1_n_block=4,
        output_dim=12,
        epoch=40,
        signal_length=1500,
    )
    train_resnet(data_path=data_path, model_path=model_path, logger=tb_logger, **preset)


def load_cnn_model(model_path, gpu):
    """Restore a ``CNN`` from a checkpoint and return it in eval mode.

    The model is mapped to and placed on "cuda" when ``gpu`` is truthy,
    otherwise on "cpu".
    """
    device = "cuda" if gpu else "cpu"
    checkpoint_file = str(Path(model_path).absolute())

    restored = CNN.load_from_checkpoint(
        checkpoint_file, map_location=torch.device(device)
    )
    model = restored.float().to(device)
    model.eval()
    return model


def load_resnet_model(model_path, gpu):
    """Restore a ``ResNet`` from a checkpoint and return it in eval mode.

    The model is mapped to and placed on "cuda" when ``gpu`` is truthy,
    otherwise on "cpu".
    """
    device = "cuda" if gpu else "cpu"
    checkpoint_file = str(Path(model_path).absolute())

    restored = ResNet.load_from_checkpoint(
        checkpoint_file, map_location=torch.device(device)
    )
    model = restored.float().to(device)
    model.eval()
    return model


def load_application_classification_cnn_model(model_path, gpu=False):
    """Load the application-classification CNN checkpoint (delegates to ``load_cnn_model``)."""
    model = load_cnn_model(model_path, gpu)
    return model


def load_application_classification_resnet_model(model_path, gpu=False):
    """Load the application-classification ResNet checkpoint (delegates to ``load_resnet_model``)."""
    model = load_resnet_model(model_path, gpu)
    return model


def load_traffic_classification_cnn_model(model_path, gpu=False):
    """Load the traffic-classification CNN checkpoint (delegates to ``load_cnn_model``)."""
    model = load_cnn_model(model_path, gpu)
    return model


def load_traffic_classification_resnet_model(model_path, gpu=False):
    """Load the traffic-classification ResNet checkpoint (delegates to ``load_resnet_model``)."""
    model = load_resnet_model(model_path, gpu)
    return model


def normalise_cm(cm):
    """Divide each row of ``cm`` by its row sum; NaNs from empty rows become 0."""
    # division warnings (e.g. 0/0 for all-zero rows) are silenced on purpose
    with np.errstate(all="ignore"):
        row_totals = cm.sum(axis=1, keepdims=True)
        return np.nan_to_num(cm / row_totals)

def dataset_collate_function(batch):
    """Collate samples (dicts with "feature" and "label") into batched tensors.

    Each feature is wrapped in an extra list before stacking, so the stacked
    "feature" tensor has a size-1 axis right after the batch axis.
    """
    per_sample_features = []
    labels = []
    for sample in batch:
        per_sample_features.append(torch.tensor([sample["feature"]]))
        labels.append(sample["label"])
    return {
        "feature": torch.stack(per_sample_features),
        "label": torch.tensor(labels),
    }

import multiprocessing

import datasets
import torch
from pytorch_lightning import LightningModule
from torch import nn as nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

class CNN(LightningModule):
    """1-D CNN classifier: two conv layers, a max pool, three dense layers and
    a linear output layer, trained with cross-entropy loss and Adam."""

    def __init__(
        self,
        c1_output_dim,
        c1_kernel_size,
        c1_stride,
        c2_output_dim,
        c2_kernel_size,
        c2_stride,
        output_dim,
        data_path,
        signal_length,
    ):
        """Build the layers; all constructor arguments are stored in ``self.hparams``."""
        super().__init__()
        # save parameters to checkpoint
        self.save_hyperparameters()

        # two convolution, then one max pool
        self.conv1 = nn.Sequential(
            nn.Conv1d(
                in_channels=1,
                out_channels=self.hparams.c1_output_dim,
                kernel_size=self.hparams.c1_kernel_size,
                stride=self.hparams.c1_stride,
            ),
            nn.ReLU(),
        )
        self.conv2 = nn.Sequential(
            nn.Conv1d(
                in_channels=self.hparams.c1_output_dim,
                out_channels=self.hparams.c2_output_dim,
                kernel_size=self.hparams.c2_kernel_size,
                stride=self.hparams.c2_stride,
            ),
            nn.ReLU(),
        )

        self.max_pool = nn.MaxPool1d(kernel_size=2)

        # flatten, calculate the output size of max pool
        # use a dummy input to calculate
        dummy_x = torch.rand(1, 1, self.hparams.signal_length, requires_grad=False)
        dummy_x = self.conv1(dummy_x)
        dummy_x = self.conv2(dummy_x)
        dummy_x = self.max_pool(dummy_x)
        max_pool_out = dummy_x.view(1, -1).shape[1]

        # followed by 3 dense layers (200 -> 100 -> 50 features)
        self.fc1 = nn.Sequential(
            nn.Linear(in_features=max_pool_out, out_features=200),
            nn.Dropout(p=0.05),
            nn.ReLU(),
        )
        self.fc2 = nn.Sequential(
            nn.Linear(in_features=200, out_features=100), nn.Dropout(p=0.05), nn.ReLU()
        )
        self.fc3 = nn.Sequential(
            nn.Linear(in_features=100, out_features=50), nn.Dropout(p=0.05), nn.ReLU()
        )

        # finally, output layer
        self.out = nn.Linear(in_features=50, out_features=self.hparams.output_dim)

    def forward(self, x):
        """Return the output-layer activations (no softmax applied) for ``x``."""
        # make sure the input is in [batch_size, channel, signal_length]
        # where channel is 1
        # signal_length is 1500 by default
        batch_size = x.shape[0]

        # 2 conv 1 max
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.max_pool(x)

        x = x.reshape(batch_size, -1)

        # 3 fc
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)

        # output
        x = self.out(x)

        return x

    def train_dataloader(self):
        """Build a shuffled DataLoader (batch size 16) over the first split
        found at ``self.hparams.data_path``."""
        # expect to get train folder
        dataset_dict = datasets.load_dataset(self.hparams.data_path)
        dataset = dataset_dict[list(dataset_dict.keys())[0]]
        try:
            num_workers = multiprocessing.cpu_count()
        except NotImplementedError:
            # cpu_count() raises NotImplementedError when the count cannot be
            # determined; catch only that instead of swallowing every error
            num_workers = 1
        dataloader = DataLoader(
            dataset,
            batch_size=16,
            num_workers=num_workers,
            collate_fn=dataset_collate_function,
            shuffle=True,
        )

        return dataloader

    def configure_optimizers(self):
        """Use Adam with its default settings over all parameters."""
        return torch.optim.Adam(self.parameters())

    def training_step(self, batch, batch_idx):
        """Compute and log the cross-entropy loss for one batch.

        Returns a dict ``{"loss": <cross-entropy tensor>}``.
        """
        x = batch["feature"].float()
        y = batch["label"].long()
        y_hat = self(x)

        entropy = F.cross_entropy(y_hat, y)
        # logged as "training_loss", the metric EarlyStopping monitors in train_cnn
        self.log(
            "training_loss",
            entropy,
            prog_bar=True,
            logger=True,
            on_step=True,
            on_epoch=True,
        )
        loss = {"loss": entropy}

        return loss


class CustomConv1d(nn.Module):
    """
    nn.Conv1d wrapper that zero-pads its input so the output length is
    ceil(input_length / stride) (SAME padding)
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, groups=1):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.groups = groups
        self.conv = nn.Conv1d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            groups=groups,
        )

    def forward(self, x):
        length = x.shape[-1]

        # target output length, then the total padding needed to reach it
        target = (length + self.stride - 1) // self.stride
        total_pad = max(0, (target - 1) * self.stride + self.kernel_size - length)

        # split the padding; any odd remainder goes to the right side
        left = total_pad // 2
        right = total_pad - left
        padded = F.pad(x, (left, right), "constant", 0)

        return self.conv(padded)


class CustomMaxPool1d(nn.Module):
    """
    nn.MaxPool1d wrapper that zero-pads the input by (kernel_size - 1) in total
    before pooling
    """

    def __init__(self, kernel_size):
        super().__init__()
        self.kernel_size = kernel_size
        # only used for the padding arithmetic; the pooling layer keeps its own default stride
        self.stride = 1
        self.max_pool = nn.MaxPool1d(kernel_size=kernel_size)

    def forward(self, x):
        length = x.shape[-1]

        # same SAME-padding arithmetic as the conv wrapper, with stride fixed to 1
        target = (length + self.stride - 1) // self.stride
        total_pad = max(0, (target - 1) * self.stride + self.kernel_size - length)

        # split the padding; any odd remainder goes to the right side
        left = total_pad // 2
        right = total_pad - left
        padded = F.pad(x, (left, right), "constant", 0)

        return self.max_pool(padded)


class BasicBlock(nn.Module):
    """
    Residual block: two (BN -> ReLU -> Dropout -> conv) stages plus a shortcut.

    BN and dropout are applied only when ``use_bn`` / ``use_do`` are set, and
    the first stage's BN/ReLU/dropout is skipped when ``is_first_block`` is true.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride,
        groups,
        downsample,
        use_bn,
        use_do,
        is_first_block=False,
    ):
        super().__init__()

        self.in_channels = in_channels
        self.kernel_size = kernel_size
        self.out_channels = out_channels
        self.groups = groups
        self.downsample = downsample
        # the requested stride is only honoured by downsampling blocks
        self.stride = stride if self.downsample else 1
        self.is_first_block = is_first_block
        self.use_bn = use_bn
        self.use_do = use_do

        # stage 1
        self.bn1 = nn.BatchNorm1d(in_channels)
        self.relu1 = nn.ReLU()
        self.do1 = nn.Dropout(p=0.5)
        self.conv1 = CustomConv1d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=self.stride,
            groups=self.groups,
        )

        # stage 2 (always stride 1)
        self.bn2 = nn.BatchNorm1d(out_channels)
        self.relu2 = nn.ReLU()
        self.do2 = nn.Dropout(p=0.5)
        self.conv2 = CustomConv1d(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=1,
            groups=self.groups,
        )

        # pools the shortcut by the same factor as conv1's stride
        self.max_pool = CustomMaxPool1d(kernel_size=self.stride)

    def forward(self, x):
        shortcut = x
        out = x

        # stage 1: pre-activation is skipped for the very first block
        if not self.is_first_block:
            if self.use_bn:
                out = self.bn1(out)
            out = self.relu1(out)
            if self.use_do:
                out = self.do1(out)
        out = self.conv1(out)

        # stage 2
        if self.use_bn:
            out = self.bn2(out)
        out = self.relu2(out)
        if self.use_do:
            out = self.do2(out)
        out = self.conv2(out)

        # keep the shortcut the same length as the main path
        if self.downsample:
            shortcut = self.max_pool(shortcut)

        # keep the shortcut the same width: zero-pad along the channel axis
        extra_channels = self.out_channels - self.in_channels
        if extra_channels != 0:
            front = extra_channels // 2
            back = extra_channels - front
            shortcut = shortcut.transpose(-1, -2)
            shortcut = F.pad(shortcut, (front, back), "constant", 0)
            shortcut = shortcut.transpose(-1, -2)

        # residual connection
        out += shortcut

        return out


class ResNet1d(nn.Module):
    """
    1-D residual network: a stem convolution, ``n_block`` ``BasicBlock``s, then
    BN/ReLU, a mean over the last axis and a linear layer.

    Input:
        X: (n_samples, n_channel, n_length)

    Output:
        out: (n_samples, n_classes), the raw output of the final linear layer

    Parameters:
        in_channels: dim of input, the same as n_channel
        base_filters: number of filters in the first Conv layers; it doubles
            every ``increasefilter_gap`` blocks
        kernel_size: width of kernel
        stride: stride used by the downsampling blocks
        groups: forwarded to the convolutions inside each block
        n_block: number of blocks
        n_classes: number of output features of the final linear layer
        downsample_gap: a block downsamples when ``i_block % downsample_gap == 1``
        increasefilter_gap: number of blocks between filter doublings
        use_bn: apply batch normalisation
        use_do: apply dropout inside the blocks
        verbose: print tensor shapes during the forward pass

    """

    def __init__(
        self,
        in_channels,
        base_filters,
        kernel_size,
        stride,
        groups,
        n_block,
        n_classes,
        downsample_gap=2,
        increasefilter_gap=4,
        use_bn=True,
        use_do=True,
        verbose=False,
    ):
        super().__init__()

        self.verbose = verbose
        self.n_block = n_block
        self.kernel_size = kernel_size
        self.stride = stride
        self.groups = groups
        self.use_bn = use_bn
        self.use_do = use_do

        self.downsample_gap = downsample_gap  # 2 for base model
        self.increasefilter_gap = increasefilter_gap  # 4 for base model

        # stem: one stride-1 convolution followed by BN and ReLU
        self.first_block_conv = CustomConv1d(
            in_channels=in_channels,
            out_channels=base_filters,
            kernel_size=self.kernel_size,
            stride=1,
        )
        self.first_block_bn = nn.BatchNorm1d(base_filters)
        self.first_block_relu = nn.ReLU()
        out_channels = base_filters

        # residual stack
        self.basicblock_list = nn.ModuleList()
        for i_block in range(self.n_block):
            is_first_block = i_block == 0
            # downsample at every self.downsample_gap blocks
            downsample = i_block % self.downsample_gap == 1

            if is_first_block:
                block_in = base_filters
                out_channels = block_in
            else:
                # width reached after the doublings that happened before this block
                block_in = int(
                    base_filters * 2 ** ((i_block - 1) // self.increasefilter_gap)
                )
                # i_block is never 0 in this branch, so only the modulo matters
                widen = i_block % self.increasefilter_gap == 0
                out_channels = block_in * 2 if widen else block_in

            self.basicblock_list.append(
                BasicBlock(
                    in_channels=block_in,
                    out_channels=out_channels,
                    kernel_size=self.kernel_size,
                    stride=self.stride,
                    groups=self.groups,
                    downsample=downsample,
                    use_bn=self.use_bn,
                    use_do=self.use_do,
                    is_first_block=is_first_block,
                )
            )

        # head: sized from the last block's output width
        self.final_bn = nn.BatchNorm1d(out_channels)
        self.final_relu = nn.ReLU(inplace=True)
        # self.do = nn.Dropout(p=0.5)
        self.dense = nn.Linear(out_channels, n_classes)
        # self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        def trace(*items):
            # shape tracing is emitted only in verbose mode
            if self.verbose:
                print(*items)

        out = x

        # stem
        trace("input shape", out.shape)
        out = self.first_block_conv(out)
        trace("after first conv", out.shape)
        if self.use_bn:
            out = self.first_block_bn(out)
        out = self.first_block_relu(out)

        # residual blocks, every block has two conv
        for i_block in range(self.n_block):
            net = self.basicblock_list[i_block]
            if self.verbose:
                print(
                    "i_block: {0}, in_channels: {1}, out_channels: {2}, downsample: {3}".format(
                        i_block, net.in_channels, net.out_channels, net.downsample
                    )
                )
            out = net(out)
            trace(out.shape)

        # head
        if self.use_bn:
            out = self.final_bn(out)
        out = self.final_relu(out)
        out = out.mean(-1)
        trace("final pooling", out.shape)
        # out = self.do(out)
        out = self.dense(out)
        trace("dense", out.shape)
        # out = self.softmax(out)
        # note: softmax is disabled above, so this prints the dense output's shape again
        trace("softmax", out.shape)

        return out


class ResNet(LightningModule):
    """Classifier built from a ``ResNet1d`` feature extractor, a max pool,
    three dense layers and a linear output layer, trained with cross-entropy
    loss and Adam."""

    def __init__(
        self,
        c1_output_dim,
        c1_kernel_size,
        c1_stride,
        c1_groups,
        c1_n_block,
        output_dim,
        data_path,
        signal_length,
    ):
        """Build the layers; all constructor arguments are stored in ``self.hparams``."""
        super().__init__()
        # save parameters to checkpoint
        self.save_hyperparameters()

        # ResNet1d feature extractor (its head emits c1_output_dim features), then ReLU
        self.conv1 = nn.Sequential(
            ResNet1d(
                in_channels=1,
                base_filters=self.hparams.c1_output_dim,
                kernel_size=self.hparams.c1_kernel_size,
                stride=self.hparams.c1_stride,
                groups=self.hparams.c1_groups,
                n_block=self.hparams.c1_n_block,
                n_classes=self.hparams.c1_output_dim,
            ),
            nn.ReLU(),
        )

        self.max_pool = nn.MaxPool1d(kernel_size=2)

        # flatten, calculate the output size of max pool
        # use a dummy input to calculate
        dummy_x = torch.rand(1, 1, self.hparams.signal_length, requires_grad=False)
        dummy_x = self.conv1(dummy_x)
        dummy_x = self.max_pool(dummy_x)
        max_pool_out = dummy_x.view(1, -1).shape[1]

        # followed by 3 dense layers (200 -> 100 -> 50 features)
        self.fc1 = nn.Sequential(
            nn.Linear(in_features=max_pool_out, out_features=200),
            nn.Dropout(p=0.05),
            nn.ReLU(),
        )
        self.fc2 = nn.Sequential(
            nn.Linear(in_features=200, out_features=100), nn.Dropout(p=0.05), nn.ReLU()
        )
        self.fc3 = nn.Sequential(
            nn.Linear(in_features=100, out_features=50), nn.Dropout(p=0.05), nn.ReLU()
        )

        # finally, output layer
        self.out = nn.Linear(in_features=50, out_features=self.hparams.output_dim)

    def forward(self, x):
        """Return the output-layer activations (no softmax applied) for ``x``."""
        # make sure the input is in [batch_size, channel, signal_length]
        # where channel is 1
        # signal_length is 1500 by default
        batch_size = x.shape[0]

        # 1 conv 1 max
        x = self.conv1(x)
        x = self.max_pool(x)

        x = x.reshape(batch_size, -1)

        # 3 fc
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)

        # output
        x = self.out(x)

        return x

    def train_dataloader(self):
        """Build a shuffled DataLoader (batch size 16) over the first split
        found at ``self.hparams.data_path``."""
        # expect to get train folder
        dataset_dict = datasets.load_dataset(self.hparams.data_path)
        dataset = dataset_dict[list(dataset_dict.keys())[0]]
        try:
            num_workers = multiprocessing.cpu_count()
        except NotImplementedError:
            # cpu_count() raises NotImplementedError when the count cannot be
            # determined; catch only that instead of swallowing every error
            num_workers = 1
        dataloader = DataLoader(
            dataset,
            batch_size=16,
            num_workers=num_workers,
            collate_fn=dataset_collate_function,
            shuffle=True,
        )

        return dataloader

    def configure_optimizers(self):
        """Use Adam with its default settings over all parameters."""
        return torch.optim.Adam(self.parameters())

    def training_step(self, batch, batch_idx):
        """Compute and log the cross-entropy loss for one batch.

        Returns a dict ``{"loss": <cross-entropy tensor>}``.
        """
        x = batch["feature"].float()
        y = batch["label"].long()
        y_hat = self(x)

        entropy = F.cross_entropy(y_hat, y)
        # logged as "training_loss", the metric EarlyStopping monitors in train_resnet
        self.log(
            "training_loss",
            entropy,
            prog_bar=True,
            logger=True,
            on_step=True,
            on_epoch=True,
        )
        loss = {"loss": entropy}

        return loss


from pathlib import Path

import numpy as np
import torch
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.utilities.seed import seed_everything

from pathlib import Path

from scapy.layers.dns import DNS
from scapy.layers.inet import TCP
from scapy.packet import Padding
from scapy.utils import PcapReader

In [4]:
import os
import glob
import numpy as np
import torch
import pyarrow.parquet as pq
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
import gzip
import json
from joblib import Parallel, delayed
from scapy.compat import raw
from scapy.layers.inet import IP, UDP
from scapy.layers.l2 import Ether
from scapy.packet import Padding
from scipy import sparse
from google.colab import drive

# Mount Google Drive at /content/drive (Colab only).
drive.mount('/content/drive')

from pathlib import Path

from scapy.layers.dns import DNS
from scapy.layers.inet import TCP
from scapy.packet import Padding
from scapy.utils import PcapReader

def read_pcap(path: Path):
    """Open the capture file at ``path`` and return the scapy ``PcapReader`` for it."""
    return PcapReader(str(path))

def should_omit_packet(packet):
    """Return True for packets that should be dropped before feature extraction.

    Dropped packets are:
      * TCP segments with any of the FIN, SYN or ACK flags set whose TCP payload
        is empty or consists solely of padding;
      * any packet containing a DNS layer.
    """
    if TCP in packet:
        tcp_layer = packet[TCP]
        # Read the flags from the TCP layer explicitly. `packet.flags` resolves
        # to the first layer that defines a `flags` field, which for IPv4
        # traffic is the IP header (DF/MF bits), not TCP.
        # 0x13 = FIN (0x01) | SYN (0x02) | ACK (0x10)
        if int(tcp_layer.flags) & 0x13:
            # no payload or contains only padding
            layers = tcp_layer.payload.layers()
            if not layers or (Padding in layers and len(layers) == 1):
                return True

    # DNS segment
    if DNS in packet:
        return True

    return False

# Install Trustee if not already installed
# !pip install trustee

# from trustee.explainers import IntegratedGradients, Saliency
# import trustee.visualizers as vis

# Helper functions for PCAP preprocessing
def remove_ether_header(packet):
    """Return the payload of the Ethernet layer, or the packet itself if it has none."""
    return packet[Ether].payload if Ether in packet else packet

def mask_ip(packet):
    """Overwrite the IP source and destination with "0.0.0.0" (in place) and return the packet."""
    if IP not in packet:
        return packet
    ip_layer = packet[IP]
    ip_layer.src = "0.0.0.0"
    ip_layer.dst = "0.0.0.0"
    return packet

def pad_udp(packet):
    """Insert 12 zero bytes of ``Padding`` between the UDP header and its payload.

    Packets without a UDP layer are returned unchanged; otherwise a new packet
    is assembled from copies, leaving the input untouched.
    """
    if UDP not in packet:
        return packet

    # copy of everything that followed the UDP header
    layer_after = packet[UDP].payload.copy()

    pad = Padding()
    pad.load = "\x00" * 12

    # copy of the packet truncated right after the UDP header
    layer_before = packet.copy()
    layer_before[UDP].remove_payload()

    return layer_before / pad / layer_after

def packet_to_sparse_array(packet, max_length=1500):
    """Encode the packet's raw bytes as a sparse row of ``max_length`` values in [0, 1].

    Bytes beyond ``max_length`` are dropped; shorter packets are zero-padded on the right.
    """
    scaled = np.frombuffer(raw(packet), dtype=np.uint8)[0:max_length] / 255
    missing = max_length - len(scaled)
    if missing > 0:
        scaled = np.pad(scaled, pad_width=(0, missing), constant_values=0)
    return sparse.csr_matrix(scaled)

def transform_packet(packet):
    """Run the preprocessing pipeline on one packet.

    Returns ``None`` for packets rejected by ``should_omit_packet``; otherwise
    the sparse array produced after stripping the Ethernet header, padding UDP
    and masking IP addresses (applied in that order).
    """
    if should_omit_packet(packet):
        return None
    for step in (remove_ether_header, pad_udp, mask_ip):
        packet = step(packet)
    return packet_to_sparse_array(packet)

# Colab setup: fastparquet is the Parquet engine used by process_pcap_to_parquet below.
!pip install fastparquet

def _write_rows_to_parquet(rows, output_path):
    """Write ``rows`` to ``output_path``, appending only when the file already exists."""
    pd.DataFrame(rows).to_parquet(
        output_path,
        index=False,
        engine='fastparquet',
        compression='snappy',
        # appending to a file that does not exist yet fails, so the first
        # write must create it
        append=Path(output_path).exists(),
    )


def process_pcap_to_parquet(pcap_path, output_dir, batch_size=10000):
    """Process a PCAP file directly to Parquet format in batches.

    The label is looked up from the file stem (lower-cased) in the module-level
    ``ID_TO_APP`` mapping. Each kept packet becomes one row with the columns
    ``app_label`` and ``feature``.

    Args:
        pcap_path: path of the capture file to convert.
        output_dir: directory that receives ``<pcap file name>.parquet``.
        batch_size: number of rows buffered in memory before each write.

    Returns:
        The output path as a string, or ``None`` when no label could be
        determined for the file. An already existing output file is left
        untouched and its path is returned.
    """
    print(f"Processing {pcap_path}")
    output_path = Path(output_dir) / f"{Path(pcap_path).name}.parquet"

    if output_path.exists():
        print(f"{output_path} already exists, skipping")
        return str(output_path)

    # Reverse mapping from app name to ID
    APP_NAME_TO_ID = {v.lower(): k for k, v in ID_TO_APP.items()}
    filename_prefix = Path(pcap_path).stem.lower()
    app_label = APP_NAME_TO_ID.get(filename_prefix)

    if app_label is None:
        print(f"Warning: Could not determine label for {filename_prefix}")
        return None

    rows = []
    batch_id = 0
    packets = read_pcap(pcap_path)
    try:
        for packet in packets:
            arr = transform_packet(packet)
            if arr is not None:
                rows.append(
                    {
                        "app_label": app_label,
                        "feature": arr.todense().tolist()[0],
                    }
                )

            # Write a batch to Parquet
            if len(rows) >= batch_size:
                _write_rows_to_parquet(rows, output_path)
                print(f"Wrote batch {batch_id} with {len(rows)} rows")
                rows = []  # Clear memory
                batch_id += 1
    finally:
        # release the capture file handle even if a packet fails to process
        packets.close()

    # Write remaining rows; this may be the very first write when the capture
    # yields fewer than batch_size rows, so it must not force append mode
    if rows:
        _write_rows_to_parquet(rows, output_path)
        print(f"Wrote final batch with {len(rows)} rows")

    print(f"Finished processing {pcap_path}")
    return str(output_path)


# Function to process multiple PCAP files in parallel
def process_pcap_files(pcap_paths, output_dir, n_jobs=-1):
    """Convert each PCAP file to Parquet, one after another.

    ``output_dir`` is created if missing. ``n_jobs`` is accepted but not used:
    the files are processed sequentially. Returns one result per input path,
    as returned by ``process_pcap_to_parquet``.
    """
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    return [process_pcap_to_parquet(pcap_path, output_dir) for pcap_path in pcap_paths]

# Load your model
def load_model(model_path):
    """Load the pretrained deep packet model.

    The model is placed on the GPU when CUDA is available and on the CPU
    otherwise, so the notebook also runs on a CPU-only runtime.
    """
    use_gpu = torch.cuda.is_available()
    model = load_application_classification_cnn_model(model_path, gpu=use_gpu)
    return model

# Create PyTorch model wrapper for Trustee
class TorchModelWrapper:
    """Expose a PyTorch classifier through ``predict`` / ``predict_proba``.

    Both methods take a NumPy array and return class probabilities as a NumPy
    array, regardless of the device the wrapped model lives on.
    """

    def __init__(self, model):
        self.model = model
        self.model.eval()

    def _model_device(self):
        """Return the device of the model's first parameter (CPU if it has none)."""
        try:
            return next(self.model.parameters()).device
        except StopIteration:
            return torch.device("cpu")

    def predict(self, X):
        """Return softmax probabilities for ``X``.

        A 2-D input gets a channel axis inserted at position 1 before it is
        passed to the model.
        """
        # Handle reshaping if needed for CNN
        if len(X.shape) == 2:
            X = X[:, np.newaxis, :]  # Ensures shape (batch, channels=1, sequence_length)

        # Convert to a tensor on the model's device; a CPU tensor fed to a
        # CUDA model raises a device-mismatch error
        X_tensor = torch.tensor(X, dtype=torch.float32).to(self._model_device())

        # Get predictions
        with torch.no_grad():
            outputs = self.model(X_tensor)
            # .numpy() only works on CPU tensors, so move the result back first
            probas = torch.softmax(outputs, dim=1).cpu().numpy()

        return probas

    def predict_proba(self, X):
        """Alias of ``predict``."""
        return self.predict(X)

# Load parquet data
def load_parquet_data(parquet_path, max_samples=1000):
    """Read features and application labels from a parquet file.

    Args:
        parquet_path: Parquet file with a ``feature`` column (one sequence per
            row) and an ``app_label`` column.
        max_samples: If truthy and smaller than the row count, a random subset
            of that many rows (drawn without replacement) is returned.

    Returns:
        Tuple ``(X, y)``: float32 feature array and label array, row-aligned.
    """
    frame = pd.read_parquet(parquet_path)

    features = np.array(frame['feature'].tolist(), dtype=np.float32)
    labels = np.array(frame['app_label'].tolist())

    needs_subsample = bool(max_samples) and len(features) > max_samples
    if not needs_subsample:
        return features, labels

    # Same random row selection for features and labels keeps them aligned.
    keep = np.random.choice(len(features), max_samples, replace=False)
    return features[keep], labels[keep]

# # Analyze feature importance with Trustee
# def analyze_feature_importance(model, X, y, class_names, output_dir):
#     """Analyze feature importance using Trustee"""
#     # Create output directory
#     Path(output_dir).mkdir(parents=True, exist_ok=True)

#     # Create model wrapper
#     wrapped_model = TorchModelWrapper(model)

#     # Define feature names
#     feature_names = [f"Feature_{i}" for i in range(X.shape[1])]

#     # Create explainers
#     ig_explainer = IntegratedGradients(wrapped_model,
#                                       feature_names=feature_names,
#                                       class_names=class_names)

#     saliency_explainer = Saliency(wrapped_model,
#                                  feature_names=feature_names,
#                                  class_names=class_names)

#     # Calculate importances
#     results = {}

#     # Use a smaller subset for explanation
#     explanation_samples = min(20, len(X))
#     X_explain = X[:explanation_samples]

#     # For each class
#     for class_id, app_name in class_names.items():
#         print(f"Analyzing class: {app_name} (ID: {class_id})")

#         # Calculate importances using different methods
#         ig_importances = ig_explainer.explain(X_explain, y=np.ones(explanation_samples) * class_id)
#         saliency_importances = saliency_explainer.explain(X_explain, y=np.ones(explanation_samples) * class_id)

#         # Store results
#         results[app_name] = {
#             'integrated_gradients': ig_importances,
#             'saliency': saliency_importances
#         }

#     # Visualize the results
#     for app_name, methods in results.items():
#         print(f"Visualizing results for {app_name}...")

#         # Create a figure for this class
#         plt.figure(figsize=(15, 12))

#         # Plot Integrated Gradients results
#         plt.subplot(2, 1, 1)
#         ig_imp = methods['integrated_gradients'].mean(axis=0)
#         top_n = 20
#         sorted_idx = np.argsort(ig_imp)[-top_n:]
#         plt.barh(range(len(sorted_idx)), ig_imp[sorted_idx])
#         plt.yticks(range(len(sorted_idx)), [feature_names[i] for i in sorted_idx])
#         plt.title(f"Top {top_n} features for {app_name} (Integrated Gradients)")
#         plt.xlabel("Importance Score")

#         # Plot Saliency results
#         plt.subplot(2, 1, 2)
#         saliency_imp = methods['saliency'].mean(axis=0)
#         sorted_idx = np.argsort(saliency_imp)[-top_n:]
#         plt.barh(range(len(sorted_idx)), saliency_imp[sorted_idx])
#         plt.yticks(range(len(sorted_idx)), [feature_names[i] for i in sorted_idx])
#         plt.title(f"Top {top_n} features for {app_name} (Saliency Maps)")
#         plt.xlabel("Importance Score")

#         plt.tight_layout()
#         plt.savefig(f'{output_dir}/{app_name}_feature_importance.png', dpi=300)
#         plt.close()

#     # Calculate and visualize global importance
#     global_importance = np.zeros((len(feature_names),))
#     method_counts = 0

#     for app_name, methods in results.items():
#         for method_name, imp in methods.items():
#             if len(imp.shape) > 1:
#                 imp_avg = imp.mean(axis=0)
#             else:
#                 imp_avg = imp
#             global_importance += imp_avg
#             method_counts += 1

#     global_importance /= method_counts

#     plt.figure(figsize=(12, 8))
#     top_n = 30
#     sorted_idx = np.argsort(global_importance)[-top_n:]
#     plt.barh(range(len(sorted_idx)), global_importance[sorted_idx])
#     plt.yticks(range(len(sorted_idx)), [feature_names[i] for i in sorted_idx])
#     plt.title(f"Top {top_n} Global Feature Importance (Across All Classes and Methods)")
#     plt.xlabel("Importance Score")
#     plt.tight_layout()
#     plt.savefig(f'{output_dir}/global_feature_importance.png', dpi=300)
#     plt.show()

#     # Save feature importance to CSV
#     importance_df = pd.DataFrame(index=feature_names)

#     # Save individual method results
#     for app_name, methods in results.items():
#         for method_name, imp in methods.items():
#             if len(imp.shape) > 1:
#                 imp_avg = imp.mean(axis=0)
#             else:
#                 imp_avg = imp
#             importance_df[f"{app_name}_{method_name}"] = imp_avg

#     # Add global importance
#     importance_df['global'] = global_importance

#     # Save to CSV
#     importance_df.to_csv(f'{output_dir}/feature_importances.csv')
#     print(f"Feature importances saved to '{output_dir}/feature_importances.csv'")

#     return importance_df

# Class index -> application name. The position of a name in the tuple below
# is its class index (0 = "AIM Chat", ..., 14 = "Youtube").
ID_TO_APP = dict(enumerate((
    "AIM Chat",
    "Email",
    "Facebook",
    "FTPS",
    "Gmail",
    "Hangouts",
    "ICQ",
    "Netflix",
    "SCP",
    "SFTP",
    "Skype",
    "Spotify",
    "Vimeo",
    "Voipbuster",
    "Youtube",
)))

# for app identification
# File-name prefix -> application class index. Prefixes are grouped per
# application; the position of a group in the outer tuple is the class index
# assigned to every prefix in that group.
PREFIX_TO_APP_ID = {
    prefix: app_id
    for app_id, prefixes in enumerate((
        # 0: AIM chat
        ("aim_chat_3a", "aim_chat_3b", "aimchat1", "aimchat2"),
        # 1: Email
        ("email1a", "email1b", "email2a", "email2b"),
        # 2: Facebook
        (
            "facebook_audio1a", "facebook_audio1b", "facebook_audio2a",
            "facebook_audio2b", "facebook_audio3", "facebook_audio4",
            "facebook_chat_4a", "facebook_chat_4b",
            "facebook_video1a", "facebook_video1b", "facebook_video2a",
            "facebook_video2b",
            "facebookchat1", "facebookchat2", "facebookchat3",
        ),
        # 3: FTPS
        ("ftps_down_1a", "ftps_down_1b", "ftps_up_2a", "ftps_up_2b"),
        # 4: Gmail
        ("gmailchat1", "gmailchat2", "gmailchat3"),
        # 5: Hangouts
        (
            "hangout_chat_4b",
            "hangouts_audio1a", "hangouts_audio1b", "hangouts_audio2a",
            "hangouts_audio2b", "hangouts_audio3", "hangouts_audio4",
            "hangouts_chat_4a",
            "hangouts_video1b", "hangouts_video2a", "hangouts_video2b",
        ),
        # 6: ICQ
        ("icq_chat_3a", "icq_chat_3b", "icqchat1", "icqchat2"),
        # 7: Netflix
        ("netflix1", "netflix2", "netflix3", "netflix4"),
        # 8: SCP
        (
            "scp1",
            "scpdown1", "scpdown2", "scpdown3", "scpdown4", "scpdown5", "scpdown6",
            "scpup1", "scpup2", "scpup3", "scpup5", "scpup6",
        ),
        # 9: SFTP
        (
            "sftp1",
            "sftp_down_3a", "sftp_down_3b", "sftp_up_2a", "sftp_up_2b",
            "sftpdown1", "sftpdown2", "sftpup1",
        ),
        # 10: Skype
        (
            "skype_audio1a", "skype_audio1b", "skype_audio2a", "skype_audio2b",
            "skype_audio3", "skype_audio4",
            "skype_chat1a", "skype_chat1b",
            "skype_file1", "skype_file2", "skype_file3", "skype_file4",
            "skype_file5", "skype_file6", "skype_file7", "skype_file8",
            "skype_video1a", "skype_video1b", "skype_video2a", "skype_video2b",
        ),
        # 11: Spotify
        ("spotify1", "spotify2", "spotify3", "spotify4"),
        # 12: Vimeo
        ("vimeo1", "vimeo2", "vimeo3", "vimeo4"),
        # 13: Voipbuster
        ("voipbuster1b", "voipbuster2b", "voipbuster3b", "voipbuster_4a", "voipbuster_4b"),
        # 14: Youtube
        (
            "youtube1", "youtube2", "youtube3", "youtube4", "youtube5", "youtube6",
            "youtubehtml5_1",
        ),
    ))
    for prefix in prefixes
}

def load_cnn_model(model_path, gpu):
    """Load a ``CNN`` checkpoint and return it ready for inference.

    Args:
        model_path: Path to the checkpoint; resolved to an absolute path.
        gpu: Truthy to place the model on ``"cuda"``, otherwise ``"cpu"``.

    Returns:
        The loaded model, cast to float32, moved to the chosen device and
        switched to eval mode.
    """
    device = "cuda" if gpu else "cpu"
    checkpoint_path = str(Path(model_path).absolute())

    model = CNN.load_from_checkpoint(checkpoint_path, map_location=torch.device(device))
    model = model.float().to(device)
    model.eval()
    return model

# Main function to orchestrate everything
def main(pcap_paths, model_path, output_dir, n_jobs=-1):
    """Convert PCAPs to parquet, load the model, and load each parquet file.

    The feature-importance analysis step is currently disabled (its call is
    commented out), so the per-file loop only loads the data.

    Args:
        pcap_paths: PCAP files to convert.
        model_path: Checkpoint passed to ``load_model``.
        output_dir: Root directory; parquet files go to ``parquet_files``
            beneath it.
        n_jobs: Forwarded to ``process_pcap_files``.
    """
    # Step 1: PCAP -> parquet
    parquet_paths = process_pcap_files(
        pcap_paths, os.path.join(output_dir, "parquet_files"), n_jobs
    )

    # Step 2: pretrained model
    model = load_model(model_path)

    # Step 3: class index -> name mapping
    class_names = ID_TO_APP

    # Step 4: walk over the generated parquet files
    analysis_root = os.path.join(output_dir, "analysis")
    for parquet_path in parquet_paths:
        print(f"Analyzing {parquet_path}")

        # Per-file output location (only needed by the disabled analysis step)
        file_output_dir = os.path.join(analysis_root, Path(parquet_path).stem)

        X, y = load_parquet_data(parquet_path)

        # analyze_feature_importance(model, X, y, class_names, file_output_dir)

    print("Analysis complete!")

def load_application_classification_cnn_model(model_path, gpu=False):
    """Load the application-classification CNN checkpoint via ``load_cnn_model``.

    Args:
        model_path: Path to the checkpoint file.
        gpu: Forwarded to ``load_cnn_model``; defaults to CPU loading.
    """
    cnn = load_cnn_model(model_path=model_path, gpu=gpu)
    return cnn

# # Example usage
# if __name__ == "__main__":
#     # Define paths to your 6 large PCAP files
#     pcap_paths = [
#         "/content/drive/My Drive/NetFound/Shiva_Folder/netFound/data/test/test_finetuning/raw/0/Netflix.pcap",
#         "/content/drive/My Drive/NetFound/Shiva_Folder/netFound/data/test/test_finetuning/raw/1/Youtube.pcap",
#         "/content/drive/My Drive/NetFound/Shiva_Folder/netFound/data/test/test_finetuning/raw/2/Gmail.pcap",
#         "/content/drive/My Drive/NetFound/Shiva_Folder/netFound/data/test/test_finetuning/raw/3/Skype.pcap",
#         "/content/drive/My Drive/NetFound/Shiva_Folder/netFound/data/test/test_finetuning/raw/4/Linkedin.pcap",
#         "/content/drive/My Drive/NetFound/Shiva_Folder/netFound/data/test/test_finetuning/raw/5/Twitter.pcap"
#     ]

#     # Path to your pretrained model
#     model_path = "/content/drive/My Drive/COL867_Assignment2/application_classification.cnn.model"

#     # Output directory
#     output_dir = "/content/drive/My Drive/NetFound/Parquet_Part3"

#     # Run the analysis
#     main(pcap_paths, model_path, output_dir, n_jobs=-1)  # Adjust n_jobs based on your CPU

Mounted at /content/drive
Collecting fastparquet
  Downloading fastparquet-2024.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Downloading fastparquet-2024.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fastparquet
Successfully installed fastparquet-2024.11.0


### Evaluation

In [None]:
import os
import numpy as np
import torch
import pandas as pd
from pathlib import Path
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# Monkey patch: re-create the deprecated `np.float` alias as the builtin float
# so code that still references `np.float` keeps working.
# NOTE(review): presumably needed by the parquet reader in use — confirm which
# dependency requires it and drop the patch once that dependency is updated.
np.float = float

def load_parquet_data(parquet_path, max_samples=None):
    """Load features and labels from a parquet file.

    Labels come from the ``app_label`` column when present, otherwise from
    ``label``. A short summary of the returned labels is printed.

    Args:
        parquet_path: Parquet file with a ``feature`` column (one sequence per
            row) and an ``app_label`` or ``label`` column.
        max_samples: If truthy and smaller than the row count, a random subset
            of that many rows (drawn without replacement) is returned.

    Returns:
        Tuple ``(X, y)``: float32 feature array and label array. The two are
        always row-aligned, including after subsampling.

    Raises:
        KeyError: If ``feature`` or both label columns are missing.
    """
    df = pd.read_parquet(parquet_path)

    # Pick the label column once, before any subsampling.
    has_app_label = 'app_label' in df.columns
    label_column = 'app_label' if has_app_label else 'label'

    X = np.array(df['feature'].tolist(), dtype=np.float32)
    y = np.array(df[label_column].tolist())

    # Limit samples if needed; index X and y with the same rows so they stay
    # aligned (y must not be re-read from the full frame afterwards).
    if max_samples and len(X) > max_samples:
        indices = np.random.choice(len(X), max_samples, replace=False)
        X = X[indices]
        y = y[indices]

    print("Using 'app_label' as target." if has_app_label else "Using 'label' as target.")

    # Print some information to verify
    print(f"Sample labels (first 10): {y[:10]}")
    print(f"Unique labels in this parquet: {np.unique(y)}")
    print(f"Label distribution:\n{pd.Series(y).value_counts()}")

    return X, y

def evaluate_model(model, parquet_path, batch_size=32):
    """Evaluate model performance on a parquet file.

    Args:
        model: PyTorch classifier; called on float32 batches shaped
            (batch, 1, sequence_length) when the stored features are 2-D.
        parquet_path: File passed to ``load_parquet_data``.
        batch_size: Number of rows per forward pass.

    Returns:
        Dict with ``accuracy``, ``f1_macro``, ``f1_weighted``,
        ``confusion_matrix`` and ``classification_report`` (the report covers
        the labels present in the ground truth).
    """
    X, y_true = load_parquet_data(parquet_path)

    # Prepare data for model
    if len(X.shape) == 2:
        X = X[:, np.newaxis, :]  # Reshape for CNN if needed (batch, channels=1, sequence_length)

    model.eval()

    # Feed batches on the device the model lives on (CPU if it has no
    # parameters) so the function also works with a CUDA-loaded model.
    first_param = next(iter(model.parameters()), None)
    device = torch.device("cpu") if first_param is None else first_param.device

    # Make predictions in batches to avoid memory issues
    y_pred = []
    with torch.no_grad():
        for i in range(0, len(X), batch_size):
            batch_X = torch.tensor(X[i:i+batch_size], dtype=torch.float32, device=device)
            outputs = model(batch_X)
            _, predicted = torch.max(outputs, 1)
            y_pred.extend(predicted.cpu().numpy())

    y_pred = np.array(y_pred)

    # Calculate metrics
    accuracy = accuracy_score(y_true, y_pred)
    f1_macro = f1_score(y_true, y_pred, average='macro')
    f1_weighted = f1_score(y_true, y_pred, average='weighted')

    # Generate confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    # Report only the classes present in the ground truth. Labels missing from
    # ID_TO_APP fall back to their string form instead of raising KeyError.
    labels = sorted(set(y_true))
    target_names = [ID_TO_APP.get(i, str(i)) for i in labels]

    report = classification_report(y_true, y_pred, labels=labels, target_names=target_names)

    return {
        'accuracy': accuracy,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted,
        'confusion_matrix': cm,
        'classification_report': report
    }

def evaluate_all_parquets(model_path, parquet_paths, output_dir):
    """Evaluate model on multiple parquet files and save results.

    For every file the metrics are printed and a confusion-matrix heatmap is
    saved as ``<stem>_confusion_matrix.png`` in ``output_dir``. A per-file
    summary plus an ``AVERAGE`` row is written to ``evaluation_summary.csv``.

    Relies on the module-level names ``plt`` and ``sns`` being imported.

    Args:
        model_path: Checkpoint to load (on CPU).
        parquet_paths: Parquet files to evaluate.
        output_dir: Directory for the figures and the CSV (created if missing).

    Returns:
        Dict mapping each file stem to the result dict of ``evaluate_model``.
    """
    # Create output directory
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Load model
    model = load_application_classification_cnn_model(model_path, gpu=False)

    # Overall results dictionary
    all_results = {}

    all_class_names = [ID_TO_APP[i] for i in range(len(ID_TO_APP))]

    # Evaluate each parquet file
    for parquet_path in parquet_paths:
        file_name = Path(parquet_path).stem
        print(f"Evaluating model on {file_name}...")

        results = evaluate_model(model, parquet_path)
        all_results[file_name] = results

        # Print results
        print(f"Results for {file_name}:")
        print(f"Accuracy: {results['accuracy']:.4f}")
        print(f"F1 Score (Macro): {results['f1_macro']:.4f}")
        print(f"F1 Score (Weighted): {results['f1_weighted']:.4f}")
        print("\nClassification Report:")
        print(results['classification_report'])
        print("\nConfusion Matrix:")
        print(results['confusion_matrix'])
        print("-" * 50)

        # Save confusion matrix visualization
        plt.figure(figsize=(12, 10))
        cm = results['confusion_matrix']

        # The matrix only has rows/columns for classes that occur in the data
        # or the predictions. Application names can be attached safely only
        # when it spans every class; otherwise fall back to positional tick
        # labels rather than putting the wrong names on the axes.
        if cm.shape[0] == len(all_class_names):
            tick_labels = all_class_names
        else:
            tick_labels = "auto"

        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=tick_labels, yticklabels=tick_labels)
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.title(f'Confusion Matrix - {file_name}')
        plt.savefig(f'{output_dir}/{file_name}_confusion_matrix.png', dpi=300, bbox_inches='tight')
        plt.close()

    # Aggregate results across all files
    print("Aggregating results across all files...")

    # Calculate average metrics
    avg_accuracy = np.mean([results['accuracy'] for results in all_results.values()])
    avg_f1_macro = np.mean([results['f1_macro'] for results in all_results.values()])
    avg_f1_weighted = np.mean([results['f1_weighted'] for results in all_results.values()])

    # Save summary to CSV
    summary_df = pd.DataFrame({
        'file': list(all_results.keys()),
        'accuracy': [results['accuracy'] for results in all_results.values()],
        'f1_macro': [results['f1_macro'] for results in all_results.values()],
        'f1_weighted': [results['f1_weighted'] for results in all_results.values()]
    })

    # Add average row
    summary_df.loc[len(summary_df)] = ['AVERAGE', avg_accuracy, avg_f1_macro, avg_f1_weighted]

    # Save to CSV
    summary_df.to_csv(f'{output_dir}/evaluation_summary.csv', index=False)

    print("Evaluation complete!")
    print(f"Average Accuracy: {avg_accuracy:.4f}")
    print(f"Average F1 Score (Macro): {avg_f1_macro:.4f}")
    print(f"Average F1 Score (Weighted): {avg_f1_weighted:.4f}")
    print(f"Detailed results saved to {output_dir}")

    return all_results

# Run the evaluation
if __name__ == "__main__":
    # Plotting dependencies. Imported here at module scope, they become the
    # globals `plt` and `sns` that evaluate_all_parquets uses.
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Path to the pretrained application-classification checkpoint
    model_path = "/content/drive/My Drive/COL867_Assignment2/application_classification.cnn.model"

    # List of parquet files to evaluate (only Netflix is enabled; uncomment
    # the other entries to include them)
    parquet_paths = [
        "/content/drive/My Drive/NetFound/Parquet_Part3/parquet_files/Netflix.pcap.parquet",
        # "/content/drive/My Drive/NetFound/Parquet_Part3/parquet_files/Youtube.pcap.parquet",
        # "/content/drive/My Drive/NetFound/Parquet_Part3/parquet_files/Gmail.pcap.parquet",
        # "/content/drive/My Drive/NetFound/Parquet_Part3/parquet_files/Skype.pcap.parquet",
        # "/content/drive/My Drive/NetFound/Parquet_Part3/parquet_files/Linkedin.pcap.parquet",
        # "/content/drive/My Drive/NetFound/Parquet_Part3/parquet_files/Twitter.pcap.parquet"
    ]

    # Output directory for the confusion-matrix images and the summary CSV
    output_dir = "/content/drive/My Drive/NetFound/"

    # Run evaluation
    evaluate_all_parquets(model_path, parquet_paths, output_dir)

Evaluating model on Netflix.pcap...


In [None]:
import os
import torch
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Re-create the deprecated `np.float` alias as the builtin float.
# NOTE(review): presumably required by the parquet reader for these files —
# confirm which dependency still references np.float.
np.float = float  # Monkey patch for code that still uses np.float

def load_parquet_data(parquet_path, max_samples=None):
    """Read the feature matrix and target vector from a parquet file.

    The target is the ``app_label`` column when it exists, else ``label``.
    When ``max_samples`` is truthy and smaller than the row count, the same
    random rows (drawn without replacement) are kept for features and labels.
    A short label summary is printed before returning.

    Returns:
        Tuple ``(X, y)``: float32 feature array and row-aligned label array.
    """
    frame = pd.read_parquet(parquet_path)

    has_app_label = 'app_label' in frame.columns
    labels = frame['app_label' if has_app_label else 'label'].to_numpy()
    print("Using 'app_label' as target." if has_app_label else "Using 'label' as target.")

    features = np.stack(frame['feature'].values).astype(np.float32)

    if max_samples and len(features) > max_samples:
        keep = np.random.choice(len(features), max_samples, replace=False)
        features = features[keep]
        labels = labels[keep]

    print(f"Sample labels (first 10): {labels[:10]}")
    print(f"Unique labels: {np.unique(labels)}")
    print(f"Label distribution:\n{pd.Series(labels).value_counts()}")
    return features, labels


def predict_in_batches(model, X, batch_size=32):
    """Return the arg-max class for every row of ``X``, computed batch-wise.

    The model is put in eval mode and called without gradient tracking on
    consecutive float32 slices of at most ``batch_size`` rows.

    Returns:
        1-D NumPy array with one predicted class index per input row.
    """
    model.eval()
    per_batch = []

    with torch.no_grad():
        for start in range(0, len(X), batch_size):
            stop = start + batch_size
            logits = model(torch.tensor(X[start:stop], dtype=torch.float32))
            per_batch.append(logits.argmax(dim=1).cpu().numpy())

    # Flatten the per-batch results into one prediction per row.
    return np.array([pred for chunk in per_batch for pred in chunk])


def evaluate_model(model, parquet_path, batch_size=32, label_map=None):
    """Compute classification metrics for ``model`` on one parquet file.

    Args:
        model: Classifier passed to ``predict_in_batches``.
        parquet_path: File passed to ``load_parquet_data``.
        batch_size: Rows per forward pass.
        label_map: Optional mapping from label to display name, used for the
            classification report.

    Returns:
        Dict with ``accuracy``, ``f1_macro``, ``f1_weighted``,
        ``confusion_matrix`` and ``classification_report``.
    """
    X, y_true = load_parquet_data(parquet_path)

    # Insert a singleton channel axis for 2-D feature matrices.
    if X.ndim == 2:
        X = X[:, np.newaxis, :]

    y_pred = predict_in_batches(model, X, batch_size)

    accuracy = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    weighted_f1 = f1_score(y_true, y_pred, average='weighted')
    cm = confusion_matrix(y_true, y_pred)

    # The report is restricted to the labels present in the ground truth.
    present_labels = sorted(set(y_true))
    if label_map:
        display_names = [label_map[i] for i in sorted(set(y_true))]
    else:
        display_names = None
    report = classification_report(
        y_true,
        y_pred,
        labels=present_labels,
        target_names=display_names,
    )

    return {
        'accuracy': accuracy,
        'f1_macro': macro_f1,
        'f1_weighted': weighted_f1,
        'confusion_matrix': cm,
        'classification_report': report,
    }


def plot_confusion_matrix(cm, labels, title, save_path):
    """Render ``cm`` as an annotated heatmap and write it to ``save_path``.

    ``labels`` is used for both the x (predicted) and y (true) tick labels.
    The figure is closed after saving.
    """
    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=labels,
        yticklabels=labels,
        ax=ax,
    )
    ax.set_title(title)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    fig.tight_layout()
    fig.savefig(save_path, dpi=300)
    plt.close(fig)


def _confusion_matrix_tick_labels(cm, label_map):
    """Return tick labels for ``cm``: class names if it spans every class, else positions."""
    n_classes = cm.shape[0]
    if n_classes == len(label_map):
        return [label_map[key] for key in sorted(label_map)]
    # The matrix covers only a subset of classes and does not say which ones,
    # so label rows/columns by position instead of guessing names.
    return [str(position) for position in range(n_classes)]


def evaluate_all_parquets(model_path, parquet_paths, output_dir, label_map):
    """Evaluate model on all parquet files and save results.

    For every file the metrics are printed and a confusion-matrix heatmap is
    saved as ``<stem>_confusion_matrix.png``. A per-file summary (plus an
    ``AVERAGE`` row when at least one file was evaluated) is written to
    ``evaluation_summary.csv`` in ``output_dir``.

    Args:
        model_path: Checkpoint to load (on CPU).
        parquet_paths: Parquet files to evaluate.
        output_dir: Directory for the figures and the CSV (created if missing).
        label_map: Mapping from class label to display name.

    Returns:
        Dict mapping each file stem to the result dict of ``evaluate_model``.
    """
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    model = load_application_classification_cnn_model(model_path, gpu=False)

    all_results = {}

    for parquet_path in parquet_paths:
        file_name = Path(parquet_path).stem
        print(f"Evaluating {file_name}...")

        results = evaluate_model(model, parquet_path, label_map=label_map)
        all_results[file_name] = results

        print(f"Accuracy: {results['accuracy']:.4f}")
        print(f"F1 Macro: {results['f1_macro']:.4f}")
        print(f"F1 Weighted: {results['f1_weighted']:.4f}")
        print("\nClassification Report:")
        print(results['classification_report'])

        cm = results['confusion_matrix']
        cm_path = os.path.join(output_dir, f"{file_name}_confusion_matrix.png")
        # Tick labels must come from the matrix dimensions. The cell values
        # are sample counts, not class ids, and cannot index label_map.
        plot_confusion_matrix(cm,
                              _confusion_matrix_tick_labels(cm, label_map),
                              f"Confusion Matrix - {file_name}",
                              cm_path)

    # Save summary; explicit columns keep the frame well-formed when no file
    # was evaluated.
    summary_df = pd.DataFrame(
        [
            {
                "file": k,
                "accuracy": v['accuracy'],
                "f1_macro": v['f1_macro'],
                "f1_weighted": v['f1_weighted']
            } for k, v in all_results.items()
        ],
        columns=["file", "accuracy", "f1_macro", "f1_weighted"],
    )
    if len(summary_df):
        summary_df.loc[len(summary_df)] = {
            "file": "AVERAGE",
            "accuracy": summary_df['accuracy'].mean(),
            "f1_macro": summary_df['f1_macro'].mean(),
            "f1_weighted": summary_df['f1_weighted'].mean()
        }
    summary_path = os.path.join(output_dir, "evaluation_summary.csv")
    summary_df.to_csv(summary_path, index=False)

    print("Evaluation complete!")
    print(summary_df)
    return all_results


if __name__ == "__main__":

    # Pretrained application-classification checkpoint
    model_path = "/content/drive/My Drive/COL867_Assignment2/application_classification.cnn.model"
    # Parquet files to evaluate
    parquet_paths = [
        "/content/drive/My Drive/NetFound/Parquet_Part3/parquet_files/Netflix.pcap.parquet",
        # Add more if needed
    ]
    # Destination for the confusion-matrix images and the summary CSV
    output_dir = "/content/drive/My Drive/NetFound/Evaluation_Output"

    # ID_TO_APP supplies the class-index -> application-name mapping
    evaluate_all_parquets(model_path, parquet_paths, output_dir, label_map=ID_TO_APP)


Evaluating Netflix.pcap...


KeyboardInterrupt: 

In [None]:
import pyarrow.parquet as pq
import numpy as np
import torch
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import ast
from pathlib import Path
import os
import gc  # For garbage collection

def read_parquet_in_batches(parquet_path, batch_size):
    """Yield ``(X, y)`` batches of at most ``batch_size`` rows from a parquet file.

    Only the ``feature`` and ``app_label`` columns are read. Batches are
    consecutive and non-overlapping; the last one may be shorter.

    Note: a function with the same name is defined again later in this cell
    and replaces this one once the cell has run.

    Yields:
        Tuple ``(X, y)``: float32 array stacked from the ``feature`` values
        and the matching ``app_label`` array.
    """
    table = pq.read_table(parquet_path, columns=["feature", "app_label"])
    num_rows = table.num_rows
    for start in range(0, num_rows, batch_size):
        # Table.slice takes (offset, length), not (start, end).
        length = min(batch_size, num_rows - start)
        batch = table.slice(start, length).to_pandas()
        X = np.stack(batch["feature"].values).astype(np.float32)
        y = batch["app_label"].to_numpy()
        yield X, y

# def predict_and_evaluate_streamed(model, parquet_path, batch_size=64, max_rows=None):
#     y_true, y_pred = [], []

#     with torch.no_grad():
#         for X_batch, y_batch in read_parquet_in_batches(parquet_path, batch_size, max_rows):
#             inputs = torch.from_numpy(X_batch)  # Faster than torch.tensor
#             outputs = model(inputs)
#             preds = outputs.argmax(dim=1).cpu().numpy()

#             y_true.extend(y_batch)
#             y_pred.extend(preds)

def predict_and_evaluate_streamed(model, parquet_path, batch_size=64, max_rows=None):
    """Run ``model`` over a parquet file batch by batch and score the predictions.

    Batches come from ``read_parquet_in_batches``; each feature batch gets a
    channel axis inserted at dim 1 before the forward pass.

    Returns:
        Tuple ``(results, y_true, y_pred)`` where ``results`` is a dict with
        ``accuracy``, ``f1_macro``, ``f1_weighted``, ``confusion_matrix`` and
        ``classification_report``, and the other two are NumPy arrays.
    """
    truths = []
    predictions = []

    with torch.no_grad():
        batch_iter = read_parquet_in_batches(parquet_path, batch_size, max_rows)
        for X_batch, y_batch in batch_iter:
            # (batch, seq) -> (batch, 1, seq)
            model_input = torch.tensor(X_batch, dtype=torch.float32).unsqueeze(1)
            batch_preds = model(model_input).argmax(dim=1).cpu().numpy()

            truths.extend(y_batch)
            predictions.extend(batch_preds)

    y_true = np.array(truths)
    y_pred = np.array(predictions)

    results = dict(
        accuracy=accuracy_score(y_true, y_pred),
        f1_macro=f1_score(y_true, y_pred, average="macro"),
        f1_weighted=f1_score(y_true, y_pred, average="weighted"),
        confusion_matrix=confusion_matrix(y_true, y_pred),
        classification_report=classification_report(y_true, y_pred),
    )

    return results, y_true, y_pred


def plot_confusion_matrix(cm, labels, title, save_path):
    """Save a heatmap of ``cm`` to ``save_path``.

    ``labels`` is used for both axes. Cell counts are annotated only when
    there are at most 50 labels, to keep large matrices readable.
    """
    show_counts = len(labels) <= 50  # Avoid annotating huge matrices

    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(
        cm,
        annot=show_counts,
        fmt='d',
        cmap='Blues',
        xticklabels=labels,
        yticklabels=labels,
        ax=ax,
    )
    ax.set_title(title)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    fig.tight_layout()
    fig.savefig(save_path, dpi=300)
    plt.close(fig)

def evaluate_parquet_files_streamed(model, parquet_paths, output_dir, label_map, batch_size=64, max_rows=5000):
    """Evaluate ``model`` on several parquet files using streamed batches.

    For every file the accuracy and classification report are printed and a
    confusion-matrix heatmap is saved as ``<stem>_confusion_matrix.png``.
    A per-file summary (plus an ``AVERAGE`` row when at least one file was
    evaluated) is written to ``streamed_evaluation_summary.csv``.

    Args:
        model: Classifier passed to ``predict_and_evaluate_streamed``.
        parquet_paths: Parquet files to evaluate.
        output_dir: Directory for the figures and the CSV (created if missing).
        label_map: Dict mapping class label to display name.
        batch_size: Rows per forward pass.
        max_rows: Row limit forwarded to ``predict_and_evaluate_streamed``.

    Returns:
        The summary DataFrame.
    """
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    all_results = []

    for parquet_path in parquet_paths:
        file_name = Path(parquet_path).stem
        print(f"\nStreaming evaluation for {file_name}...")

        # predict_and_evaluate_streamed returns (metrics, y_true, y_pred);
        # unpack it instead of indexing the tuple like a dict.
        results, y_true, y_pred = predict_and_evaluate_streamed(
            model, parquet_path, batch_size, max_rows=max_rows
        )

        print(f"\n{file_name} Accuracy: {results['accuracy']:.4f}")
        print(results["classification_report"])

        cm_path = os.path.join(output_dir, f"{file_name}_confusion_matrix.png")
        # confusion_matrix was called without `labels`, so its rows/columns
        # are the sorted labels occurring in y_true or y_pred. Name exactly
        # those, not every entry of label_map.
        class_ids = np.unique(np.concatenate([y_true, y_pred])).tolist()
        labels = [label_map.get(i, str(i)) for i in class_ids]
        plot_confusion_matrix(results["confusion_matrix"], labels, f"Confusion Matrix - {file_name}", cm_path)

        all_results.append({
            "file": file_name,
            "accuracy": results["accuracy"],
            "f1_macro": results["f1_macro"],
            "f1_weighted": results["f1_weighted"]
        })

        # Release memory
        del results, y_true, y_pred
        torch.cuda.empty_cache()
        gc.collect()

    # Summary; explicit columns keep the frame well-formed when no file was
    # evaluated.
    df = pd.DataFrame(all_results, columns=["file", "accuracy", "f1_macro", "f1_weighted"])
    if len(df):
        df.loc[len(df)] = {
            "file": "AVERAGE",
            "accuracy": df["accuracy"].mean(),
            "f1_macro": df["f1_macro"].mean(),
            "f1_weighted": df["f1_weighted"].mean()
        }
    df.to_csv(os.path.join(output_dir, "streamed_evaluation_summary.csv"), index=False)
    print(df)
    return df

def read_parquet_in_batches(parquet_path, batch_size, max_rows=5000):
    """Yield ``(X, y)`` batches from the first ``max_rows`` rows of a parquet file.

    Only the ``feature`` and ``app_label`` columns are read. Batches are
    consecutive and non-overlapping; the last one may be shorter.

    Args:
        parquet_path: Parquet file to read.
        batch_size: Maximum number of rows per yielded batch.
        max_rows: Upper bound on the rows read; ``None`` means no limit.

    Yields:
        Tuple ``(X, y)``: float32 feature array and the matching
        ``app_label`` array. Feature values stored as strings are parsed with
        ``ast.literal_eval``; values that are already sequences are used as-is.
    """
    table = pq.read_table(parquet_path, columns=["feature", "app_label"])
    num_rows = table.num_rows if max_rows is None else min(table.num_rows, max_rows)
    for start in range(0, num_rows, batch_size):
        # Table.slice takes (offset, length), not (start, end).
        length = min(batch_size, num_rows - start)
        batch = table.slice(start, length).to_pandas()
        features = batch["feature"].apply(
            lambda value: ast.literal_eval(value) if isinstance(value, str) else value
        )
        X = np.stack(features.tolist()).astype(np.float32)
        y = batch["app_label"].to_numpy()
        yield X, y

if __name__ == "__main__":
    # Pretrained application-classification checkpoint
    model_path = "/content/drive/My Drive/COL867_Assignment2/application_classification.cnn.model"
    # Parquet files to evaluate
    parquet_paths = [
        "/content/drive/My Drive/NetFound/Parquet_Part3/parquet_files/Netflix.pcap.parquet"
    ]
    # Destination for the confusion-matrix images and the summary CSV
    output_dir = "/content/drive/My Drive/NetFound/Evaluation_Streamed"

    # Load the model on CPU, then run the streamed evaluation
    model = load_application_classification_cnn_model(model_path, gpu=False)
    evaluate_parquet_files_streamed(model, parquet_paths, output_dir, label_map=ID_TO_APP, batch_size=64)



Streaming evaluation for Netflix.pcap...


In [None]:
import pyarrow.parquet as pq
import numpy as np
import torch
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import ast
from pathlib import Path
import os
import gc  # For garbage collection

def read_parquet_in_batches(parquet_path, batch_size, max_rows=10000):
    """Yield ``(X, y)`` batches from the first ``max_rows`` rows of a parquet file.

    Only the ``feature`` and ``app_label`` columns are read. Batches are
    consecutive and non-overlapping; the last one may be shorter.

    Args:
        parquet_path: Parquet file to read.
        batch_size: Maximum number of rows per yielded batch.
        max_rows: Upper bound on the rows read (10,000 by default); ``None``
            means no limit.

    Yields:
        Tuple ``(X, y)``: float32 feature array and the matching
        ``app_label`` array. Feature values stored as strings are parsed with
        ``ast.literal_eval``; values that are already sequences are used as-is.
    """
    table = pq.read_table(parquet_path, columns=["feature", "app_label"])
    if max_rows is None:
        num_rows = table.num_rows
    else:
        num_rows = min(table.num_rows, max_rows)

    for start in range(0, num_rows, batch_size):
        # pyarrow's Table.slice expects (offset, length); passing an end index
        # as the second argument makes consecutive batches overlap.
        rows_in_batch = min(batch_size, num_rows - start)
        batch = table.slice(start, rows_in_batch).to_pandas()
        parsed = [
            ast.literal_eval(value) if isinstance(value, str) else value
            for value in batch["feature"]
        ]
        X = np.stack(parsed).astype(np.float32)
        y = batch["app_label"].to_numpy()
        yield X, y

def predict_and_evaluate_streamed(model, parquet_path, batch_size=64, max_rows=10000):
    """Score ``model`` on a parquet file, reading it in streamed batches.

    Batches come from ``read_parquet_in_batches``; each feature batch gets a
    channel axis inserted at dim 1 before the forward pass.

    Returns:
        Tuple ``(results, y_true, y_pred)`` where ``results`` is a dict with
        ``accuracy``, ``f1_macro``, ``f1_weighted``, ``confusion_matrix`` and
        ``classification_report``, and the other two are NumPy arrays.
    """
    collected_true = []
    collected_pred = []

    with torch.no_grad():
        for features, targets in read_parquet_in_batches(parquet_path, batch_size, max_rows):
            # Add channel dimension: (batch, seq) -> (batch, 1, seq)
            batch_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(1)
            logits = model(batch_tensor)

            collected_true.extend(targets)
            collected_pred.extend(logits.argmax(dim=1).cpu().numpy())

    y_true = np.array(collected_true)
    y_pred = np.array(collected_pred)

    results = {}
    results["accuracy"] = accuracy_score(y_true, y_pred)
    results["f1_macro"] = f1_score(y_true, y_pred, average="macro")
    results["f1_weighted"] = f1_score(y_true, y_pred, average="weighted")
    results["confusion_matrix"] = confusion_matrix(y_true, y_pred)
    results["classification_report"] = classification_report(y_true, y_pred)

    return results, y_true, y_pred

def plot_confusion_matrix(cm, labels, title, save_path):
    """Draw ``cm`` as a heatmap labelled with ``labels`` and save it to ``save_path``.

    Per-cell counts are drawn only for matrices with at most 50 labels.
    The figure is closed once it has been written.
    """
    annotate_cells = not len(labels) > 50  # Avoid annotating huge matrices

    figure, axes = plt.subplots(figsize=(12, 10))
    heatmap_options = dict(
        annot=annotate_cells,
        fmt='d',
        cmap='Blues',
        xticklabels=labels,
        yticklabels=labels,
    )
    sns.heatmap(cm, ax=axes, **heatmap_options)
    axes.set_title(title)
    axes.set_xlabel("Predicted")
    axes.set_ylabel("True")
    figure.tight_layout()
    figure.savefig(save_path, dpi=300)
    plt.close(figure)

def evaluate_parquet_files_streamed(model, parquet_paths, output_dir, label_map, batch_size=64):
    """Evaluate ``model`` on several parquet files and summarise the metrics.

    For every file this prints the accuracy and classification report, saves
    a confusion-matrix image named ``<stem>_confusion_matrix.png`` into
    ``output_dir`` and records accuracy / macro-F1 / weighted-F1. A final
    ``AVERAGE`` row (unweighted mean over files) is appended and the table is
    written to ``streamed_evaluation_summary.csv`` in ``output_dir``.

    Args:
        model: Model forwarded to ``predict_and_evaluate_streamed``.
        parquet_paths: Iterable of parquet file paths to evaluate.
        output_dir: Directory for the images and the CSV; created if missing.
        label_map: Mapping from class id to display name. Ids without an
            entry are shown as their string form.
        batch_size: Rows per forward pass.

    Returns:
        The summary ``pandas.DataFrame`` with the columns ``file``,
        ``accuracy``, ``f1_macro`` and ``f1_weighted``. It is empty (no
        ``AVERAGE`` row) when ``parquet_paths`` is empty.
    """
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    summary_columns = ["file", "accuracy", "f1_macro", "f1_weighted"]
    all_results = []

    for parquet_path in parquet_paths:
        file_name = Path(parquet_path).stem
        print(f"\nStreaming evaluation for {file_name}...")

        # Process only first 10,000 rows.
        # The helper returns (results, y_true, y_pred); unpack it instead of
        # indexing the tuple as if it were the results dict.
        results, y_true, y_pred = predict_and_evaluate_streamed(
            model, parquet_path, batch_size, max_rows=10000
        )

        print(f"\n{file_name} Accuracy: {results['accuracy']:.4f}")
        print(results["classification_report"])

        cm_path = os.path.join(output_dir, f"{file_name}_confusion_matrix.png")
        # The confusion matrix was built without an explicit label list, so
        # its rows/columns are the sorted union of the labels that actually
        # occur. Name the ticks from that same set so they line up with it.
        present_ids = np.unique(np.concatenate([y_true, y_pred])).tolist()
        labels = [label_map.get(label_id, str(label_id)) for label_id in present_ids]
        plot_confusion_matrix(results["confusion_matrix"], labels, f"Confusion Matrix - {file_name}", cm_path)

        all_results.append({
            "file": file_name,
            "accuracy": results["accuracy"],
            "f1_macro": results["f1_macro"],
            "f1_weighted": results["f1_weighted"]
        })

        # Release memory
        del results, y_true, y_pred
        torch.cuda.empty_cache()
        gc.collect()

    # Summary. Explicit columns keep the frame well-formed even with no files.
    df = pd.DataFrame(all_results, columns=summary_columns)
    if all_results:
        df.loc[len(df)] = {
            "file": "AVERAGE",
            "accuracy": df["accuracy"].mean(),
            "f1_macro": df["f1_macro"].mean(),
            "f1_weighted": df["f1_weighted"].mean()
        }
    df.to_csv(os.path.join(output_dir, "streamed_evaluation_summary.csv"), index=False)
    print(df)
    return df

if __name__ == "__main__":
    # Where the confusion-matrix images and the summary CSV are written.
    output_dir = "/content/drive/My Drive/NetFound/Evaluation_Streamed"

    # Parquet files to evaluate.
    parquet_paths = [
        "/content/drive/My Drive/NetFound/Parquet_Part3/parquet_files/Netflix.pcap.parquet",
    ]

    # Pre-trained application-classification CNN checkpoint, loaded with gpu=False.
    model_path = "/content/drive/My Drive/COL867_Assignment2/application_classification.cnn.model"
    model = load_application_classification_cnn_model(model_path, gpu=False)

    evaluate_parquet_files_streamed(
        model=model,
        parquet_paths=parquet_paths,
        output_dir=output_dir,
        label_map=ID_TO_APP,
        batch_size=64,
    )


Streaming evaluation for Netflix.pcap...
