In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
file_path = "./processed_metrics.csv"

df = pd.read_csv(file_path)

In [3]:
df.columns

Index(['switch', 'timestamp', 'Average Flow Duration ',
       'Average Packets Per Flow ', 'Bridge Controller Status',
       'CPU Utilization', 'Rate of Packet in Messages ',
       'Rate of Port Flapping ', 'Interface Utilization', 'label'],
      dtype='object')

In [4]:
X = df.drop(columns=["label", "timestamp", "switch", "Interface Utilization"])
Y = df["label"]

In [5]:
for index, row in X.iterrows():
    min_len = 99999
    for col in X.columns:
        a = row[col][1:-1].split(",")
        if "" in a:
            data_points_len = a.index("")
            min_len = min(data_points_len, min_len)
        else:
            min_len = min(data_points_len, len(a))
    
    for col in X.columns:
        try:
            row[col] = [float(x) for x in row[col][1:-1].split(",")[:min_len]]
        except Exception as e:
            print(e)
            print(col, index)

In [6]:
X.iloc[0]["Average Packets Per Flow "]

[0.0,
 0.0,
 5.666666666666667,
 11.0,
 20.6,
 25.266666666666666,
 20.583333333333332,
 15.875,
 18.75,
 11.909090909090908,
 18.153846153846153,
 18.923076923076923,
 18.923076923076923,
 19.818181818181817,
 27.181818181818183,
 26.454545454545453,
 15.375,
 18.125,
 12.363636363636363,
 14.181818181818182,
 4.2,
 0.0,
 0.0]

In [7]:
time_series_data = []

for _, row in X.iterrows():
    stacked = np.stack([row[col] for col in X.columns])

    time_series_data.append(stacked.T)

len(time_series_data), len(time_series_data[10]), len(time_series_data[10][0])

(95, 24, 6)

In [8]:
import numpy as np

label_map = {0: 0, 2: 1, 3: 2}
y = np.array([label_map[label] for label in df["label"].astype(int).values])

y


array([0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2])

In [17]:
import torch
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    time_series_data,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

y_train = torch.tensor(y_train, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)

In [10]:

# batch_X_list = [torch.tensor(x, dtype=torch.float32) for x in time_series_data]

# lengths = [len(x) for x in batch_X_list]

# X_padded = torch.nn.utils.rnn.pad_sequence(batch_X_list, batch_first=True)
# packed = torch.nn.utils.rnn.pack_padded_sequence(X_padded, lengths, batch_first=True, enforce_sorted=False)

In [18]:
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

class TimeSeriesDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = [torch.tensor(seq, dtype=torch.float32) for seq in sequences]
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]

# Collate function for padding
def collate_fn(batch):
    batch.sort(key=lambda x: len(x[0]), reverse=True)  # sort by sequence length
    sequences, labels = zip(*batch)

    lengths = [len(seq) for seq in sequences]
    padded_seqs = pad_sequence(sequences, batch_first=True)
    labels = torch.tensor(labels, dtype=torch.long)

    return padded_seqs, lengths, labels


In [19]:
train_dataset = TimeSeriesDataset(X_train, y_train)
test_dataset = TimeSeriesDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)


In [20]:
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence

class RNNClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes, num_layers=2, dropout=0.3, bidirectional=True):
        super().__init__()
        self.bidirectional = bidirectional
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0,
            bidirectional=bidirectional
        )

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size * (2 if bidirectional else 1), num_classes)

    def forward(self, x, lengths):
        # Pack padded sequence
        packed = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=True)
        packed_output, (hn, cn) = self.lstm(packed)

        # hn shape: (num_layers * num_directions, batch, hidden_size)
        # Get the last layer's hidden states for both directions if bidirectional
        if self.bidirectional:
            # Concatenate the final forward and backward hidden states
            last_hidden = torch.cat((hn[-2], hn[-1]), dim=1)
        else:
            last_hidden = hn[-1]

        out = self.dropout(last_hidden)
        out = self.fc(out)
        return out


In [21]:
input_size = X_train[0].shape[1]      # Number of features
hidden_size = 64
num_classes = len(set(y_train.numpy()))   # Or however many classes you have

model = RNNClassifier(input_size, hidden_size, num_classes+1)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [22]:
for epoch in range(10):
    model.train()
    total_loss = 0

    for batch_x, lengths, batch_y in train_loader:
        optimizer.zero_grad()
        output = model(batch_x, lengths)
        loss = criterion(output, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")


Epoch 1, Loss: 4.2227
Epoch 2, Loss: 3.8716
Epoch 3, Loss: 3.4819
Epoch 4, Loss: 3.0215
Epoch 5, Loss: 2.6432
Epoch 6, Loss: 2.2366
Epoch 7, Loss: 1.6954
Epoch 8, Loss: 1.4837
Epoch 9, Loss: 1.1050
Epoch 10, Loss: 1.2251


In [23]:
from sklearn.metrics import f1_score

model.eval()
correct, total = 0, 0
all_preds = []
all_labels = []

with torch.no_grad():
    for batch_x, lengths, batch_y in test_loader:
        output = model(batch_x, lengths)
        preds = output.argmax(dim=1)

        correct += (preds == batch_y).sum().item()
        total += batch_y.size(0)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch_y.cpu().numpy())

accuracy = 100 * correct / total
f1 = f1_score(all_labels, all_preds, average='macro')  # use 'weighted' if you have imbalanced classes

print(f"Test Accuracy: {accuracy:.2f}%")
print(f"F1 Score: {f1:.4f}")


Test Accuracy: 89.47%
F1 Score: 0.8457
