In [63]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import re

### Prepare Dataset

In [64]:
def read_data(algo='cm', row=3, width=4096, level=1, seed=1, count=1, flowkey='srcIP', 
              epochs=['10'], dataset='zipf2a_4_caida20180517_6.pcap', type='accumulate.txt', window_size=200):
    
    res = []
    cnt = 0
    for epoch in epochs:
        
        path = f"../SketchPatternQuery/{algo}/{dataset}/"\
                f"{flowkey}/row_{row}_width_{width}_level_{level}_epoch_{epoch}_count_{count}_seed_{seed}/"
        
        for dir in sorted(os.listdir(path)):
            p = os.path.join(path, dir)
            if os.path.isdir(p): 
                window_dir = "window_" + str(window_size)
                full_path = os.path.join(path, dir, window_dir, type)    
                with open(full_path, 'r') as f:
                    for line in f:
                        num_str = re.match(r'\(srcIP = +(\d+\.\d+\.\d+\.\d+)\) +(.+)', line).group(2)
                        num_list = [int(num) for num in num_str.split()]
                        # print(num_list)
                        res.append(num_list)
                        
                        cnt += 1
                        if cnt == 5:
                            break
                        
    bp = int(dataset.split("_")[1])*(int(1000/window_size))+1
    # print(bp)
                        
    np1 = np.array(res)
    np2 = np.rot90(np1)
    np2 = np.rot90(np2)
    np2 = np.rot90(np2)
    # print(np2)
            
    extract_x = []
    extract_x.append(np2[bp-20:bp].tolist())
    extract_x.append(np2[bp-19:bp+1].tolist())
    extract_x.append(np2[bp-18:bp+2].tolist())
    extract_x.append(np2[bp-17:bp+3].tolist())
    
    extract_y = [0,1,1,0]
    
    return extract_x, extract_y

read_data()

([[[0, 0, 0, 578, 0],
   [0, 2, 1, 889, 0],
   [0, 2, 1, 1534, 1],
   [0, 2, 1, 8071, 4],
   [1, 5, 3, 12104, 5],
   [1, 5, 3, 12547, 5],
   [2, 5, 3, 14119, 5],
   [6, 5, 3, 15513, 7],
   [7, 6, 4, 16170, 8],
   [7, 8, 4, 16717, 10],
   [8, 11, 4, 19393, 10],
   [10, 12, 4, 20740, 12],
   [11, 14, 4, 21239, 12],
   [11, 15, 4, 21519, 13],
   [12, 15, 4, 21736, 14],
   [14, 17, 4, 22052, 15],
   [16, 17, 4, 23310, 15],
   [16, 17, 6, 26064, 16],
   [16, 18, 9, 26420, 16],
   [16, 31, 12, 26631, 29]],
  [[0, 2, 1, 889, 0],
   [0, 2, 1, 1534, 1],
   [0, 2, 1, 8071, 4],
   [1, 5, 3, 12104, 5],
   [1, 5, 3, 12547, 5],
   [2, 5, 3, 14119, 5],
   [6, 5, 3, 15513, 7],
   [7, 6, 4, 16170, 8],
   [7, 8, 4, 16717, 10],
   [8, 11, 4, 19393, 10],
   [10, 12, 4, 20740, 12],
   [11, 14, 4, 21239, 12],
   [11, 15, 4, 21519, 13],
   [12, 15, 4, 21736, 14],
   [14, 17, 4, 22052, 15],
   [16, 17, 4, 23310, 15],
   [16, 17, 6, 26064, 16],
   [16, 18, 9, 26420, 16],
   [16, 31, 12, 26631, 29],
   [538, 67

In [65]:
def get_training_data():

    data = []
    label = []

    algo = 'cm'
    widths = [2048, 4096, 8192, 16384, 32768, 65536, 131072]
    dataset = []
    window_size = 200
    sliding_window_size = 20 / (1000/window_size)

    path = "/home/ming/SketchMercator/pattern_detection/lstm/SketchPatternQuery/" + algo + "/"
    for dir in sorted(os.listdir(path)):
        dataset.append(dir)
        
    # print(dataset)

    for d in dataset:
        if int(d.split("_")[1]) < sliding_window_size:
            continue
        for w in widths:
            da, la = read_data(algo=algo, row=3, width=w, level=1, seed=1, count=1, flowkey='srcIP', 
                    epochs=['10'], dataset=d, type='accumulate.txt', window_size=window_size)
            
            # print(d, w)
            # print(da)
            
            for dd in da:
                data.append(dd)
            for ll in la:
                label.append(ll)
            
            # data.append(da[0])
            # data.append(da[1])
            # label.append(la[0])
            # label.append(la[1])
            
    # print(data)
    print(np.array(data).shape)
    # print(label)
    print(np.array(label).shape)
    
    return data, label

get_training_data()


(1260, 20, 5)
(1260,)


([[[24, 15, 262, 1533, 390],
   [40, 29, 683, 2515, 469],
   [74, 239, 806, 3337, 1062],
   [96, 390, 896, 4217, 1099],
   [126, 604, 945, 5371, 1124],
   [151, 732, 1095, 6731, 1150],
   [181, 926, 1251, 8123, 1617],
   [199, 1120, 1402, 9215, 1655],
   [223, 1320, 1468, 10433, 1738],
   [240, 1430, 1516, 11373, 2040],
   [261, 1606, 1546, 12284, 2587],
   [282, 1736, 1893, 13586, 2986],
   [312, 1940, 2035, 14391, 3091],
   [337, 2131, 2089, 15505, 3263],
   [356, 2322, 2175, 16414, 3398],
   [389, 2579, 2229, 18228, 3689],
   [413, 2895, 2399, 19275, 4879],
   [444, 3266, 2453, 20685, 6152],
   [485, 3637, 2666, 22016, 6329],
   [511, 3964, 2794, 23097, 6457]],
  [[40, 29, 683, 2515, 469],
   [74, 239, 806, 3337, 1062],
   [96, 390, 896, 4217, 1099],
   [126, 604, 945, 5371, 1124],
   [151, 732, 1095, 6731, 1150],
   [181, 926, 1251, 8123, 1617],
   [199, 1120, 1402, 9215, 1655],
   [223, 1320, 1468, 10433, 1738],
   [240, 1430, 1516, 11373, 2040],
   [261, 1606, 1546, 12284, 2587],

### Splitting the dataset

In [66]:
from sklearn.preprocessing import MinMaxScaler

data, label = get_training_data()

npData = np.array(data)
npLabel = np.array(label)

scaler = MinMaxScaler(feature_range=(0, 1))

# Reshape the data to 2D array
data_reshaped = npData.reshape(-1, npData.shape[-1])
# print(data_reshaped.shape)

rescaledData = scaler.fit_transform(data_reshaped)
# print(data_reshaped)
# print(rescaledData)

normalizedData  = rescaledData.reshape(npData.shape)

data_size =  round(len(npLabel)*0.8)
# print(len(normalizedData))
x_train = normalizedData[:data_size]
y_train = npLabel[:data_size]

x_valid  = normalizedData[data_size:]
y_valid  = npLabel[data_size:]
# x_test  = normalizedData[675:]
# y_test  = npLabel[675:]

print(x_train.shape)
print(x_valid.shape)
# print(x_test.shape)
# print(x_train)

(1260, 20, 5)
(1260,)
(1008, 20, 5)
(252, 20, 5)


### Model

In [67]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torchsummary import summary

x_train = torch.tensor(x_train).float()
y_train = torch.tensor(y_train).float()

x_valid = torch.tensor(x_valid).float()
y_valid = torch.tensor(y_valid).float()

train_dataset = TensorDataset(x_train,y_train)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, pin_memory=True)

valid_dataset = TensorDataset(x_valid,y_valid)
valid_dataloader = DataLoader(valid_dataset, batch_size=32, shuffle=False)

In [74]:
from torch import nn

class NeuralNetwork(nn.Module):
    def __init__(self, num_feature, hidden_size):
        super(NeuralNetwork, self).__init__()
        self.lstm  = nn.LSTM(num_feature, hidden_size, batch_first=True, bidirectional=True)
        self.fc    = nn.Linear(hidden_size*2,1)  # Output size changed to 1 for binary classification
        
    def forward(self, x):
        output, (hidden, cell) = self.lstm(x)
        
        # Concatenate the hidden states from both directions
        # print(hidden.shape)
        hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        # print(hidden.shape)
        
        x = self.fc(hidden)  # Extract the hidden state of the last time step
        x = torch.sigmoid(x)  # Apply sigmoid activation function for binary classification
        return x

model = NeuralNetwork(5, 64)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  
model = model.to(device)



In [69]:
# optimizer = optim.Adam(model.parameters())
optimizer = optim.AdamW(model.parameters(), lr=0.001)
criterion  = nn.BCEWithLogitsLoss()

In [70]:
from sklearn.metrics import accuracy_score

def train(dataloader):
    labels, preds = [], []
    epoch_loss = 0
    model.train()  
    
    for batch in dataloader:
        optimizer.zero_grad()          
        x,y= batch
        x = x.to(device)
        y = y.to(device)
        pred = model(x)
        
        pred = pred.view(-1)
        # print(pred)
        # print(y)
        
        loss = criterion(pred, y)        
        loss.backward()               
        optimizer.step()      
        epoch_loss += loss.item()  

        labels += y.cpu().tolist()
        preds += torch.round(pred).cpu().tolist()
        # print(len(labels), len(preds))
        
    # print(labels)
    # print(preds)
    train_acc = accuracy_score(labels, preds)
        
    return train_acc, epoch_loss

# train(train_dataloader)

In [71]:
def evaluate(dataloader):
    labels, preds = [], []
    epoch_loss = 0
    model.eval()  
    
    with torch.no_grad():
        for batch in dataloader:   
            x,y= batch
            x = x.to(device)
            y = y.to(device)
            
            pred = model(x)
            pred = pred.view(-1)
            
            loss = criterion(pred,y)              
            epoch_loss += loss.item()  
          
            labels += y.cpu().tolist()
            preds += torch.round(pred).cpu().tolist()
            
    train_acc = accuracy_score(labels, preds)
        
    # return train_acc, (epoch_loss / len(dataloader))
    return train_acc, epoch_loss

In [72]:
n_epochs = 50
best_valid_loss = float('inf')

for epoch in range(1, n_epochs + 1):

    train_accuracy, train_loss = train(train_dataloader)
    vaild_accuracy, valid_loss = evaluate(valid_dataloader)
    
    #save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model, 'saved_weights.pt')

    # print("Epoch ",epoch+1)
    print(f'epoch: {epoch}\n')
    print(f'\tTrain Loss: {train_loss:.5f} | ' + f'\tVal Loss: {valid_loss:.5f}\n')
    print(f'\tTrain Accuracy: {train_accuracy:.5f} | ' + f'\tVal Accuracy: {vaild_accuracy:.5f}\n')

epoch: 1

	Train Loss: 22.68836 | 	Val Loss: 5.55603

	Train Accuracy: 0.50099 | 	Val Accuracy: 0.50000

epoch: 2

	Train Loss: 22.18039 | 	Val Loss: 5.54536

	Train Accuracy: 0.50000 | 	Val Accuracy: 0.50000

epoch: 3

	Train Loss: 22.18223 | 	Val Loss: 5.54527

	Train Accuracy: 0.50000 | 	Val Accuracy: 0.50000

epoch: 4

	Train Loss: 22.18233 | 	Val Loss: 5.54522

	Train Accuracy: 0.50000 | 	Val Accuracy: 0.50000

epoch: 5

	Train Loss: 22.18080 | 	Val Loss: 5.54520

	Train Accuracy: 0.50000 | 	Val Accuracy: 0.50000

epoch: 6

	Train Loss: 22.18081 | 	Val Loss: 5.54521

	Train Accuracy: 0.50000 | 	Val Accuracy: 0.50000

epoch: 7

	Train Loss: 22.18141 | 	Val Loss: 5.54520

	Train Accuracy: 0.50000 | 	Val Accuracy: 0.50000

epoch: 8

	Train Loss: 22.18139 | 	Val Loss: 5.54520

	Train Accuracy: 0.50000 | 	Val Accuracy: 0.50000

epoch: 9

	Train Loss: 22.18078 | 	Val Loss: 5.54520

	Train Accuracy: 0.50000 | 	Val Accuracy: 0.50000

epoch: 10

	Train Loss: 22.18105 | 	Val Loss: 5.54519



In [152]:
model=torch.load('saved_weights.pt')

In [154]:
x_test= torch.tensor(x_test).float()

with torch.no_grad():
  y_test_pred = model(x_test)
  
print(y_test_pred)
print(len(y_test_pred))

y_test_pred = y_test_pred.numpy()[0]
print(y_test_pred)

tensor([[0.4907],
        [0.4850],
        [0.4906],
        [0.4850],
        [0.4906],
        [0.4850],
        [0.4907],
        [0.4850],
        [0.4907],
        [0.4850],
        [0.4907],
        [0.4881],
        [0.4895],
        [0.4880],
        [0.4894],
        [0.4880],
        [0.4894],
        [0.4880],
        [0.4893],
        [0.4880],
        [0.4893],
        [0.4880],
        [0.4893],
        [0.4880],
        [0.4893],
        [0.4819],
        [0.4825],
        [0.4818],
        [0.4827],
        [0.4818],
        [0.4827],
        [0.4817],
        [0.4827],
        [0.4817],
        [0.4826],
        [0.4817],
        [0.4826],
        [0.4817],
        [0.4826],
        [0.4854],
        [0.4867],
        [0.4853],
        [0.4865],
        [0.4852],
        [0.4865],
        [0.4852],
        [0.4864],
        [0.4852],
        [0.4864],
        [0.4852],
        [0.4864],
        [0.4852],
        [0.4864],
        [0.4914],
        [0.4932],
        [0

  x_test= torch.tensor(x_test).float()
