In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import re

### Prepare Dataset

In [3]:
def read_data(algo='cm', row=3, width=4096, level=1, seed=1, count=1, flowkey='srcIP', 
              epochs=['10'], dataset='zipf2a_3_caida20180517_7.pcap', type='accumulate.txt', window_size=500):
    
    res = []
    cnt = 0
    for epoch in epochs:
        
        path = f"../SketchPatternQuery/{algo}/{dataset}/"\
                f"{flowkey}/row_{row}_width_{width}_level_{level}_epoch_{epoch}_count_{count}_seed_{seed}/"
        
        for dir in sorted(os.listdir(path)):
            p = os.path.join(path, dir)
            if os.path.isdir(p): 
                window_dir = "window_" + str(window_size)
                full_path = os.path.join(path, dir, window_dir, type)    
                with open(full_path, 'r') as f:
                    for line in f:
                        num_str = re.match(r'\(srcIP = +(\d+\.\d+\.\d+\.\d+)\) +(.+)', line).group(2)
                        num_list = [int(num) for num in num_str.split()]
                        # print(num_list)
                        res.append(num_list)
                        
                        cnt += 1
                        if cnt == 5:
                            break
                        
    bp = int(dataset.split("_")[1])*(int(1000/window_size))+1
                        
    np1 = np.array(res)
    np2 = np.rot90(np1)
    np2 = np.rot90(np2)
    np2 = np.rot90(np2)
            
    extract_x = []
    extract_x.append(np2[bp-6:bp].tolist())
    extract_x.append(np2[bp-5:bp+1].tolist())
    
    extract_y = [0,1]
    
    return extract_x, extract_y

read_data()

([[[1077, 1, 2, 2, 0],
   [12104, 3, 5, 4, 5],
   [14541, 3, 5, 8, 6],
   [16717, 4, 8, 10, 10],
   [21013, 4, 13, 11, 12],
   [21736, 4, 17, 12, 24]],
  [[12104, 3, 5, 4, 5],
   [14541, 3, 5, 8, 6],
   [16717, 4, 8, 10, 10],
   [21013, 4, 13, 11, 12],
   [21736, 4, 17, 12, 24],
   [21783, 2367, 1778, 892, 2585]]],
 [0, 1])

In [4]:
def get_training_data():

    data = []
    label = []

    algo = 'cm'
    widths = [2048, 4096, 8192, 16384, 32768, 65536, 131072]
    dataset = []

    path = "/home/ming/SketchMercator/pattern_detection/lstm/SketchPatternQuery/" + algo + "/"
    for dir in sorted(os.listdir(path)):
        dataset.append(dir)
        
    # print(dataset)

    for d in dataset:
        for w in widths:
            da, la = read_data(algo=algo, row=3, width=w, level=1, seed=1, count=1, flowkey='srcIP', 
                    epochs=['10'], dataset=d, type='accumulate.txt', window_size=500)
            
            data.append(da[0])
            data.append(da[1])
            label.append(la[0])
            label.append(la[1])
            
    # print(data)
    print(np.array(data).shape)
    # print(label)
    print(np.array(label).shape)
    
    return data, label

get_training_data()


(756, 6, 5)
(756,)


([[[241, 60, 221, 702, 644],
   [521, 126, 604, 945, 1124],
   [810, 188, 1096, 1384, 1628],
   [1060, 240, 1430, 1516, 2040],
   [1323, 295, 1748, 1899, 3050],
   [1667, 356, 2322, 2175, 3398]],
  [[521, 126, 604, 945, 1124],
   [810, 188, 1096, 1384, 1628],
   [1060, 240, 1430, 1516, 2040],
   [1323, 295, 1748, 1899, 3050],
   [1667, 356, 2322, 2175, 3398],
   [2832, 775, 3287, 3905, 4925]],
  [[235, 35, 205, 667, 445],
   [500, 68, 563, 863, 887],
   [751, 103, 1058, 1219, 1350],
   [998, 126, 1381, 1294, 1721],
   [1254, 147, 1730, 1631, 2701],
   [1478, 175, 2301, 1873, 2923]],
  [[500, 68, 563, 863, 887],
   [751, 103, 1058, 1219, 1350],
   [998, 126, 1381, 1294, 1721],
   [1254, 147, 1730, 1631, 2701],
   [1478, 175, 2301, 1873, 2923],
   [2643, 586, 3132, 3586, 4000]],
  [[212, 13, 188, 651, 437],
   [457, 23, 501, 834, 873],
   [688, 32, 952, 1175, 1333],
   [913, 39, 1228, 1243, 1692],
   [1149, 51, 1514, 1569, 2653],
   [1362, 54, 2050, 1805, 2759]],
  [[457, 23, 501, 834, 8

### Splitting the dataset

In [5]:
from sklearn.preprocessing import MinMaxScaler

data, label = get_training_data()

npData = np.array(data)
npLabel = np.array(label)

scaler = MinMaxScaler(feature_range=(0, 1))

# Reshape the data to 2D array
data_reshaped = npData.reshape(-1, npData.shape[-1])
# print(data_reshaped.shape)

rescaledData = scaler.fit_transform(data_reshaped)
# print(data_reshaped)
# print(rescaledData)

normalizedData  = rescaledData.reshape(npData.shape)

x_train = normalizedData[:600]
y_train = npLabel[:600]

x_valid  = normalizedData[600:]
y_valid  = npLabel[600:]

# x_test  = normalizedData[675:]
# y_test  = npLabel[675:]

print(x_train.shape)
print(x_valid.shape)
# print(x_test.shape)
# print(x_test)

(756, 6, 5)
(756,)
(600, 6, 5)
(156, 6, 5)


### Model

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torchsummary import summary

x_train = torch.tensor(x_train).float()
y_train = torch.tensor(y_train).float()

x_valid = torch.tensor(x_valid).float()
y_valid = torch.tensor(y_valid).float()

train_dataset = TensorDataset(x_train,y_train)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, pin_memory=True)

valid_dataset = TensorDataset(x_valid,y_valid)
valid_dataloader = DataLoader(valid_dataset, batch_size=32, shuffle=False)

In [7]:
from torch import nn

class NeuralNetwork(nn.Module):
    def __init__(self, num_feature, hidden_size):
        super(NeuralNetwork, self).__init__()
        self.lstm  = nn.LSTM(num_feature, hidden_size, batch_first=True, bidirectional=True)
        self.fc    = nn.Linear(hidden_size*2,1)  # Output size changed to 1 for binary classification
        
    def forward(self, x):
        output, (hidden, cell) = self.lstm(x)
        
        # Concatenate the hidden states from both directions
        # print(hidden.shape)
        hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        # print(hidden.shape)
        
        x = self.fc(hidden)  # Extract the hidden state of the last time step
        x = torch.sigmoid(x)  # Apply sigmoid activation function for binary classification
        return x

model = NeuralNetwork(5, 64)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  
model = model.to(device)

# summary(model, (5, ))


In [8]:
# optimizer = optim.Adam(model.parameters())
optimizer = optim.AdamW(model.parameters(), lr=0.001)
criterion  = nn.BCEWithLogitsLoss()

In [9]:
from sklearn.metrics import accuracy_score

def train(dataloader):
    labels, preds = [], []
    epoch_loss = 0
    model.train()  
    
    for batch in dataloader:
        optimizer.zero_grad()          
        x,y= batch
        x = x.to(device)
        y = y.to(device)
        pred = model(x)
        
        pred = pred.view(-1)
        # print(pred)
        # print(y)
        
        loss = criterion(pred, y)        
        loss.backward()               
        optimizer.step()      
        epoch_loss += loss.item()  

        labels += y.cpu().tolist()
        preds += torch.round(pred).cpu().tolist()
        # print(len(labels), len(preds))
        
    # print(labels)
    # print(preds)
    train_acc = accuracy_score(labels, preds)
        
    return train_acc, epoch_loss

# train(train_dataloader)

In [10]:
def evaluate(dataloader):
    labels, preds = [], []
    epoch_loss = 0
    model.eval()  
    
    with torch.no_grad():
        for batch in dataloader:   
            x,y= batch
            x = x.to(device)
            y = y.to(device)
            
            pred = model(x)
            pred = pred.view(-1)
            
            loss = criterion(pred,y)              
            epoch_loss += loss.item()  
          
            labels += y.cpu().tolist()
            preds += torch.round(pred).cpu().tolist()
            
    train_acc = accuracy_score(labels, preds)
        
    # return train_acc, (epoch_loss / len(dataloader))
    return train_acc, epoch_loss

In [11]:
n_epochs = 150
best_valid_loss = float('inf')

for epoch in range(1, n_epochs + 1):

    train_accuracy, train_loss = train(train_dataloader)
    vaild_accuracy, valid_loss = evaluate(valid_dataloader)
    
    #save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model, 'saved_weights.pt')

    # print("Epoch ",epoch+1)
    print(f'\tTrain Loss: {train_loss:.5f} | ' + f'\tVal Loss: {valid_loss:.5f}\n')
    print(f'\tTrain Accuracy: {train_accuracy:.5f} | ' + f'\tVal Accuracy: {vaild_accuracy:.5f}\n')

	Train Loss: 13.65987 | 	Val Loss: 3.57022

	Train Accuracy: 0.51333 | 	Val Accuracy: 0.50000

	Train Loss: 13.38609 | 	Val Loss: 3.49361

	Train Accuracy: 0.50000 | 	Val Accuracy: 0.50000

	Train Loss: 13.21267 | 	Val Loss: 3.46834

	Train Accuracy: 0.50000 | 	Val Accuracy: 0.50000

	Train Loss: 13.17414 | 	Val Loss: 3.46697

	Train Accuracy: 0.50000 | 	Val Accuracy: 0.50000

	Train Loss: 13.17335 | 	Val Loss: 3.46650

	Train Accuracy: 0.50000 | 	Val Accuracy: 0.50000

	Train Loss: 13.17222 | 	Val Loss: 3.46623

	Train Accuracy: 0.50000 | 	Val Accuracy: 0.50000

	Train Loss: 13.17133 | 	Val Loss: 3.46616

	Train Accuracy: 0.50000 | 	Val Accuracy: 0.50000

	Train Loss: 13.17113 | 	Val Loss: 3.46607

	Train Accuracy: 0.50000 | 	Val Accuracy: 0.50000

	Train Loss: 13.17070 | 	Val Loss: 3.46602

	Train Accuracy: 0.50000 | 	Val Accuracy: 0.50000

	Train Loss: 13.17071 | 	Val Loss: 3.46596

	Train Accuracy: 0.50000 | 	Val Accuracy: 0.50000

	Train Loss: 13.17059 | 	Val Loss: 3.46596

	Train

In [152]:
model=torch.load('saved_weights.pt')

In [154]:
x_test= torch.tensor(x_test).float()

with torch.no_grad():
  y_test_pred = model(x_test)
  
print(y_test_pred)
print(len(y_test_pred))

y_test_pred = y_test_pred.numpy()[0]
print(y_test_pred)

tensor([[0.4907],
        [0.4850],
        [0.4906],
        [0.4850],
        [0.4906],
        [0.4850],
        [0.4907],
        [0.4850],
        [0.4907],
        [0.4850],
        [0.4907],
        [0.4881],
        [0.4895],
        [0.4880],
        [0.4894],
        [0.4880],
        [0.4894],
        [0.4880],
        [0.4893],
        [0.4880],
        [0.4893],
        [0.4880],
        [0.4893],
        [0.4880],
        [0.4893],
        [0.4819],
        [0.4825],
        [0.4818],
        [0.4827],
        [0.4818],
        [0.4827],
        [0.4817],
        [0.4827],
        [0.4817],
        [0.4826],
        [0.4817],
        [0.4826],
        [0.4817],
        [0.4826],
        [0.4854],
        [0.4867],
        [0.4853],
        [0.4865],
        [0.4852],
        [0.4865],
        [0.4852],
        [0.4864],
        [0.4852],
        [0.4864],
        [0.4852],
        [0.4864],
        [0.4852],
        [0.4864],
        [0.4914],
        [0.4932],
        [0

  x_test= torch.tensor(x_test).float()
