In [11]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import re

### Prepare Dataset

In [12]:
def read_data(algo='cm', row=3, width=4096, level=1, seed=1, count=1, flowkey='srcIP', 
              epochs=['10'], dataset='zipf2a_4_caida20180517_6.pcap', type='accumulate.txt', window_size=500):
    
    res = []
    cnt = 0
    for epoch in epochs:
        
        path = f"../SketchPatternQuery/{algo}/{dataset}/"\
                f"{flowkey}/row_{row}_width_{width}_level_{level}_epoch_{epoch}_count_{count}_seed_{seed}/"
        
        for dir in sorted(os.listdir(path)):
            p = os.path.join(path, dir)
            if os.path.isdir(p): 
                window_dir = "window_" + str(window_size)
                full_path = os.path.join(path, dir, window_dir, type)    
                with open(full_path, 'r') as f:
                    for line in f:
                        num_str = re.match(r'\(srcIP = +(\d+\.\d+\.\d+\.\d+)\) +(.+)', line).group(2)
                        num_list = [int(num) for num in num_str.split()]
                        # print(num_list)
                        res.append(num_list)
                        
                        cnt += 1
                        if cnt == 5:
                            break
                        
    bp = int(dataset.split("_")[1])*(int(1000/window_size))+1
    # print(bp)
                        
    np1 = np.array(res)
    np2 = np.rot90(np1)
    np2 = np.rot90(np2)
    np2 = np.rot90(np2)
            
    extract_x = []
    extract_x.append(np2[bp-8:bp].tolist())
    extract_x.append(np2[bp-7:bp+1].tolist())
    # extract_x.append(np2[bp-6:bp+2].tolist())
    # extract_x.append(np2[bp-5:bp+3].tolist())
    
    extract_y = [0,1]
    # extract_y = [0,1,1,0]
    
    return extract_x, extract_y

read_data()

([[[0, 2, 1, 1077, 0],
   [1, 5, 3, 12104, 5],
   [4, 5, 3, 14541, 6],
   [7, 8, 4, 16717, 10],
   [11, 13, 4, 21013, 12],
   [12, 15, 4, 21736, 14],
   [16, 17, 6, 25871, 16],
   [16, 21, 12, 26631, 29]],
  [[1, 5, 3, 12104, 5],
   [4, 5, 3, 14541, 6],
   [7, 8, 4, 16717, 10],
   [11, 13, 4, 21013, 12],
   [12, 15, 4, 21736, 14],
   [16, 17, 6, 25871, 16],
   [16, 21, 12, 26631, 29],
   [1985, 1781, 2373, 26678, 2590]]],
 [0, 1])

In [13]:
def get_training_data():

    data = []
    label = []

    algo = 'cm'
    widths = [2048, 4096, 8192, 16384, 32768, 65536, 131072]
    dataset = []
    window_size = 500
    sliding_window_size = 8 / (1000/window_size)

    path = "/home/ming/SketchMercator/pattern_detection/lstm/SketchPatternQuery/" + algo + "/"
    for dir in sorted(os.listdir(path)):
        dataset.append(dir)
        
    # print(dataset)

    for d in dataset:
        if int(d.split("_")[1]) < sliding_window_size:
            continue
        for w in widths:
            da, la = read_data(algo=algo, row=3, width=w, level=1, seed=1, count=1, flowkey='srcIP', 
                    epochs=['10'], dataset=d, type='accumulate.txt', window_size=window_size)
            
            # print(d, w)
            # print(da)
            
            for dd in da:
                data.append(dd)
            for ll in la:
                label.append(ll)
            
            # data.append(da[0])
            # data.append(da[1])
            # label.append(la[0])
            # label.append(la[1])
            
    # print(data)
    print(np.array(data).shape)
    # print(label)
    print(np.array(label).shape)
    
    return data, label

get_training_data()


(630, 8, 5)
(630,)


([[[60, 221, 702, 3020, 644],
   [126, 604, 945, 5371, 1124],
   [188, 1096, 1384, 8604, 1628],
   [240, 1430, 1516, 11373, 2040],
   [295, 1748, 1899, 13974, 3050],
   [356, 2322, 2175, 16414, 3398],
   [431, 2898, 2431, 19857, 6096],
   [511, 3964, 2794, 23097, 6457]],
  [[126, 604, 945, 5371, 1124],
   [188, 1096, 1384, 8604, 1628],
   [240, 1430, 1516, 11373, 2040],
   [295, 1748, 1899, 13974, 3050],
   [356, 2322, 2175, 16414, 3398],
   [431, 2898, 2431, 19857, 6096],
   [511, 3964, 2794, 23097, 6457],
   [930, 4963, 4524, 23120, 7984]],
  [[35, 205, 667, 3014, 445],
   [68, 563, 863, 5358, 887],
   [103, 1058, 1219, 8584, 1350],
   [126, 1381, 1294, 11336, 1721],
   [147, 1730, 1631, 13930, 2701],
   [175, 2301, 1873, 16359, 2923],
   [199, 2870, 2098, 19788, 5323],
   [223, 3936, 2454, 23025, 5637]],
  [[68, 563, 863, 5358, 887],
   [103, 1058, 1219, 8584, 1350],
   [126, 1381, 1294, 11336, 1721],
   [147, 1730, 1631, 13930, 2701],
   [175, 2301, 1873, 16359, 2923],
   [199, 287

### Splitting the dataset

In [14]:
from sklearn.preprocessing import MinMaxScaler

data, label = get_training_data()

npData = np.array(data)
npLabel = np.array(label)

scaler = MinMaxScaler(feature_range=(0, 1))

# Reshape the data to 2D array
data_reshaped = npData.reshape(-1, npData.shape[-1])
# print(data_reshaped.shape)

rescaledData = scaler.fit_transform(data_reshaped)
# print(data_reshaped)
# print(rescaledData)

normalizedData  = rescaledData.reshape(npData.shape)

data_size =  round(len(npLabel)*0.8)
# print(len(normalizedData))
x_train = normalizedData[:data_size]
y_train = npLabel[:data_size]

x_valid  = normalizedData[data_size:]
y_valid  = npLabel[data_size:]

# x_test  = normalizedData[675:]
# y_test  = npLabel[675:]

print(x_train.shape)
print(x_valid.shape)
# print(x_test.shape)
# print(x_train)

(630, 8, 5)
(630,)
(504, 8, 5)
(126, 8, 5)


### Model

In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torchsummary import summary

x_train = torch.tensor(x_train).float()
y_train = torch.tensor(y_train).float()

x_valid = torch.tensor(x_valid).float()
y_valid = torch.tensor(y_valid).float()

train_dataset = TensorDataset(x_train,y_train)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, pin_memory=True)

valid_dataset = TensorDataset(x_valid,y_valid)
valid_dataloader = DataLoader(valid_dataset, batch_size=32, shuffle=False)

In [16]:
from torch import nn

class NeuralNetwork(nn.Module):
    def __init__(self, num_feature, hidden_size):
        super(NeuralNetwork, self).__init__()
        self.lstm  = nn.LSTM(num_feature, hidden_size, batch_first=True, bidirectional=True)
        self.fc    = nn.Linear(hidden_size*2,1)  # Output size changed to 1 for binary classification
        
    def forward(self, x):
        output, (hidden, cell) = self.lstm(x)
        
        # Concatenate the hidden states from both directions
        # print(hidden.shape)
        hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        # print(hidden.shape)
        
        x = self.fc(hidden)  # Extract the hidden state of the last time step
        x = torch.sigmoid(x)  # Apply sigmoid activation function for binary classification
        return x

model = NeuralNetwork(5, 64)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  
model = model.to(device)

# summary(model, (5, ))


In [17]:
# optimizer = optim.Adam(model.parameters())
optimizer = optim.AdamW(model.parameters(), lr=0.001)
criterion  = nn.BCEWithLogitsLoss()

In [21]:
from sklearn.metrics import accuracy_score

def train(dataloader):
    labels, preds = [], []
    epoch_loss = 0
    model.train()  
    
    for batch in dataloader:
        optimizer.zero_grad()          
        x,y= batch
        x = x.to(device)
        y = y.to(device)
        pred = model(x)
        
        pred = pred.view(-1)
        # print(pred)
        # print(y)
        
        loss = criterion(pred, y)        
        loss.backward()               
        optimizer.step()      
        epoch_loss += loss.item()  

        labels += y.cpu().tolist()
        preds += torch.round(pred).cpu().tolist()
        # print(len(labels), len(preds))
        
    # print(labels)
    # print(preds)
    train_acc = accuracy_score(labels, preds)
        
    return train_acc, epoch_loss

# train(train_dataloader)

In [19]:
def evaluate(dataloader):
    labels, preds = [], []
    epoch_loss = 0
    model.eval()  
    
    with torch.no_grad():
        for batch in dataloader:   
            x,y= batch
            x = x.to(device)
            y = y.to(device)
            
            pred = model(x)
            pred = pred.view(-1)
            
            loss = criterion(pred,y)              
            epoch_loss += loss.item()  
          
            labels += y.cpu().tolist()
            preds += torch.round(pred).cpu().tolist()
            
    train_acc = accuracy_score(labels, preds)
        
    # return train_acc, (epoch_loss / len(dataloader))
    return train_acc, epoch_loss

In [22]:
n_epochs = 50
best_valid_loss = float('inf')

for epoch in range(1, n_epochs + 1):

    train_accuracy, train_loss = train(train_dataloader)
    vaild_accuracy, valid_loss = evaluate(valid_dataloader)
    
    #save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model, 'saved_weights.pt')

    # print("Epoch ",epoch+1)
    print(f'\tTrain Loss: {train_loss:.5f} | ' + f'\tVal Loss: {valid_loss:.5f}\n')
    print(f'\tTrain Accuracy: {train_accuracy:.5f} | ' + f'\tVal Accuracy: {vaild_accuracy:.5f}\n')

	Train Loss: 11.09036 | 	Val Loss: 2.77260

	Train Accuracy: 0.50000 | 	Val Accuracy: 0.50000

	Train Loss: 11.09036 | 	Val Loss: 2.77260

	Train Accuracy: 0.50000 | 	Val Accuracy: 0.50000

	Train Loss: 11.09040 | 	Val Loss: 2.77260

	Train Accuracy: 0.50000 | 	Val Accuracy: 0.50000

	Train Loss: 11.09034 | 	Val Loss: 2.77260

	Train Accuracy: 0.50000 | 	Val Accuracy: 0.50000

	Train Loss: 11.09038 | 	Val Loss: 2.77260

	Train Accuracy: 0.50000 | 	Val Accuracy: 0.50000

	Train Loss: 11.09036 | 	Val Loss: 2.77260

	Train Accuracy: 0.50000 | 	Val Accuracy: 0.50000

	Train Loss: 11.09038 | 	Val Loss: 2.77260

	Train Accuracy: 0.50000 | 	Val Accuracy: 0.50000

	Train Loss: 11.09037 | 	Val Loss: 2.77260

	Train Accuracy: 0.50000 | 	Val Accuracy: 0.50000

	Train Loss: 11.09037 | 	Val Loss: 2.77260

	Train Accuracy: 0.50000 | 	Val Accuracy: 0.50000

	Train Loss: 11.09032 | 	Val Loss: 2.77260

	Train Accuracy: 0.50000 | 	Val Accuracy: 0.50000

	Train Loss: 11.09031 | 	Val Loss: 2.77260

	Train

In [152]:
model=torch.load('saved_weights.pt')

In [154]:
x_test= torch.tensor(x_test).float()

with torch.no_grad():
  y_test_pred = model(x_test)
  
print(y_test_pred)
print(len(y_test_pred))

y_test_pred = y_test_pred.numpy()[0]
print(y_test_pred)

tensor([[0.4907],
        [0.4850],
        [0.4906],
        [0.4850],
        [0.4906],
        [0.4850],
        [0.4907],
        [0.4850],
        [0.4907],
        [0.4850],
        [0.4907],
        [0.4881],
        [0.4895],
        [0.4880],
        [0.4894],
        [0.4880],
        [0.4894],
        [0.4880],
        [0.4893],
        [0.4880],
        [0.4893],
        [0.4880],
        [0.4893],
        [0.4880],
        [0.4893],
        [0.4819],
        [0.4825],
        [0.4818],
        [0.4827],
        [0.4818],
        [0.4827],
        [0.4817],
        [0.4827],
        [0.4817],
        [0.4826],
        [0.4817],
        [0.4826],
        [0.4817],
        [0.4826],
        [0.4854],
        [0.4867],
        [0.4853],
        [0.4865],
        [0.4852],
        [0.4865],
        [0.4852],
        [0.4864],
        [0.4852],
        [0.4864],
        [0.4852],
        [0.4864],
        [0.4852],
        [0.4864],
        [0.4914],
        [0.4932],
        [0

  x_test= torch.tensor(x_test).float()
