In [1]:
from sklearn.model_selection import train_test_split
from hexdump import hexdump
from PIL import Image
from glob import glob

import numpy as np
import pandas as pd
import os, sys
import time
import math
import text_to_image

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.init as init
import matplotlib.pyplot as plt
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torch.utils.data import random_split

In [2]:
print(torch.__version__)
print(torch.backends.mps.is_available())

1.12.1
True


In [3]:
print(torch.backends.mps.is_built())

True


In [4]:
device = torch.device('mps')

In [5]:
train = pd.read_csv("./track_b_learn.csv", index_col = [0])
label = pd.read_csv("./track_b_learn_label.csv", index_col = [0])
payload = train['payload']

In [6]:
train[train['s_port'] <= 1024]['s_port'].value_counts()

80     4849
68        3
432       1
951       1
278       1
Name: s_port, dtype: int64

In [7]:
train = pd.concat([train, label], axis = 1)

In [8]:
train['class'].value_counts()

2_exploit           19691
4_unknown           16089
3_post              11718
1_reconnaissance     2502
Name: class, dtype: int64

In [9]:
def convert_1d_to_2d(list, cols):
    return [list[j:j + cols] for j in range(0, len(list), cols)]

def preprocess(train):
    if (len(train) != 0):
        size = 100
        payload_images = list(range(0,len(train)))
        image_buff = np.zeros([len(train), size, size])
        for id, idx in enumerate(train, start = 0):
            ln = len(str(idx).encode('utf-8'))
            wid = math.pow(ln, 0.5)
            rem = ln%wid  # line 10
            hex_payload = [alpha for alpha in str(idx)]
            
            for i in range(ln): # a -> array('B')
                hex_payload[i] = hex_payload[i].encode('utf-8').hex()
                hex_payload[i] = int(hex_payload[i], 16)
            hex_payload = np.array(hex_payload)
            grayscale = convert_1d_to_2d(hex_payload, int((ln/wid)))
            grayscale = pd.DataFrame(grayscale)
            gray_row = len(grayscale.index)
            gray_col = len(grayscale.columns)
            grayscale = pd.DataFrame(grayscale).fillna(0)
            grayscale = np.pad(grayscale, ((0, size-gray_row), (0, size-gray_col)), 'constant', constant_values=0)
            grayscale = grayscale.astype(np.uint8)
            image_buff[id] = grayscale
            
    return image_buff

def buff_to_grayscale_image(image_buff):
    for i, row in enumerate(image_buff, start = 0):
        grayscale_image = Image.fromarray(row, 'L')
        grayscale_image.save("/Users/timkh/Desktop/kisa/contest/grayscale_class/1_reconnaissance/" + str(i) + ".jpg", "JPEG")

In [10]:
# image_buff = preprocess(payload)
# buff_to_grayscale_image(image_buff)

In [11]:
# image_dump_train_dataset_list = glob('/Users/timkh/Desktop/kisa/contest/grayscale_image/*.png')

In [12]:
class_label = np.zeros(len(label))
for i in range(len(label)):
    class_label[i] = label['class'][i][:1]
class_label = pd.DataFrame(class_label).astype(str)

In [13]:
trans = transforms.Compose([transforms.Resize((100, 100)),
                            transforms.ToTensor(),
                            transforms.Normalize((0),(0.5))
                           ])
trainset = torchvision.datasets.ImageFolder( root="/Users/timkh/Desktop/kisa/contest/grayscale_class", transform = trans)

In [14]:
classes = trainset.classes
print(classes)
print(trainset)

['1_reconnaissance', '2_exploit', '3_post', '4_unknown']
Dataset ImageFolder
    Number of datapoints: 50012
    Root location: /Users/timkh/Desktop/kisa/contest/grayscale_class
    StandardTransform
Transform: Compose(
               Resize(size=(100, 100), interpolation=bilinear, max_size=None, antialias=None)
               ToTensor()
               Normalize(mean=0, std=0.5)
           )


In [15]:
train_size = int(0.8 * len(trainset))
test_size = len(trainset) - train_size

train_dataset, test_dataset = torch.utils.data.random_split(trainset, [train_size, test_size])

In [16]:
# X_train, X_test, y_train, y_test = train_test_split(image_dump_train_dataset_list, class_label, test_size = 0.4, shuffle = True, random_state = 1004)

batch_size = 16
learning_rate = 1e-4
num_epoch = 50

train_loader = torch.utils.data.DataLoader(train_dataset,batch_size=batch_size,shuffle=True,num_workers=4,drop_last=True)
test_loader = torch.utils.data.DataLoader(test_dataset,batch_size=batch_size,shuffle=False,num_workers=4,drop_last=True)

class Net(nn.Module):
  def __init__(self):
    super(Net, self).__init__()
    self.layer = nn.Sequential(
      nn.Conv2d(3, 16, 3), 
      nn.MaxPool2d(2,2),
      nn.Conv2d(16, 32, 3),
      nn.MaxPool2d(2, 2),
      nn.Conv2d(32, 64, 3),
      nn.MaxPool2d(2, 2)
    )
    self.fc_layer = nn.Sequential(nn.Linear(6400,16))
    
  def forward(self, x):
    # x = x.permute(0, 
    out = self.layer(x)
    out = out.view(batch_size, -1)
    out = self.fc_layer(out)
    return out

model = Net()
model = model.to(device)
print(model)
loss_func = nn.CrossEntropyLoss()
optimizer = optim.RMSprop(model.parameters(), lr= learning_rate)

Net(
  (layer): Sequential(
    (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1))
    (1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1))
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc_layer): Sequential(
    (0): Linear(in_features=6400, out_features=16, bias=True)
  )
)


In [17]:
loss_arr = []

for i in range(num_epoch):
    for j,[image,label] in enumerate(train_loader):
        x = image
        y_ = label

        x = x.to(device)
        y_ = y_.to(device)
        
        optimizer.zero_grad() #optimizer
        output = model.forward(x) # CNN
        loss = loss_func(output,y_)
        loss.backward()#Back Propagation
        optimizer.step()
        if j % 1000 == 0 :
            print(loss)
            loss_arr.append(loss.cpu().detach().numpy())
            
correct = 0
total = 0
with torch.no_grad():
    for image,label in test_loader : 
        x = image
        y_ = label
        
        x = x.to(device)
        y_ = y_.to(device)
        
        output = model.forward(x)
        _,output_index = torch.max(output,1)
        total += label.size(0)
        correct += (output_index == y_).sum().float()
    print("Accuracy of Test Data : {}".format(100*correct/total))

  nonzero_finite_vals = torch.masked_select(tensor_view, torch.isfinite(tensor_view) & tensor_view.ne(0))


tensor(2.7491, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(0.4588, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(0.6638, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(0.1690, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(0.0787, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(0.0482, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(0.1029, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(0.0110, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(0.3838, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(0.2643, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(0.0401, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(0.0047, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(0.6276, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(0.0121, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(0.1534, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(0.0293, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(0.0071, device='mps:0', grad_fn=<NllLossBackward0

In [18]:
correct

tensor(9837., device='mps:0')

In [19]:
total

10000

In [20]:
# from torchmetrics.classification import MulticlassPrecision

correct = 0
total = 0
y_pred_list = []
y_true_list = []
with torch.no_grad():
    for image,label in test_loader : 
        x = image.to(device)
        # y_ = label
        y_test_pred = model(x)
        _, y_pred_tags = torch.max(y_test_pred, 1)
        y_pred_list.append(y_pred_tags.cpu().numpy())
        y_true_list.append(label.cpu().numpy())

# y_pred_list = [a.squeeze().tolist() for a in y_pred_list]
# y_true_list = [a.squeeze().tolist() for a in y_true_list]

#         output = model.forward(x)
#         _,output_index = torch.max(output,1)
#         total += label.size(0)
#         correct += (output_index == y_).sum().float()
    # print("Accuracy of Test Data : {}".format(100*correct/total))
    # print(precision_score(y_true, y_pred, average='macro'))

In [34]:
from torchmetrics import AveragePrecision
from torchmetrics.classification import MulticlassPrecision
# from iterable import chain

y_pred_list = torch.Tensor(np.array(y_pred_list).flatten())
y_true_list = torch.Tensor(np.array(y_true_list).flatten())

metrics = MulticlassPrecision(num_classes=4, average = 'macro')
print(metrics(y_true_list, y_pred_list))

tensor(0.9565)


In [41]:
# cleverhans adversarial example add  
# !pip3 install cleverhans
# from absl import app, flags
from cleverhans.torch.attacks.projected_gradient_descent import(
    projected_gradient_descent, fast_gradient_method)
from easydict import EasyDict

# FLAGS = flags.FLAGS
eps = 0.3 #epsilon
nb_epochs = 8
adv_train = False

# flags.DEFINE_float("eps", 0.3, "Total epsilon for FGM and PGD attacks")
# flags.DEFINE_bool("adv_train", False, "User adversarial training on PGD adversarial examples")

# Evaluate on clean and adversarial data
model.eval()
report = EasyDict(nb_test=0, correct=0, correct_fgm=0, correct_pgd=0)

for x, y in test_loader:
    x, y = x.to(device), y.to(device)
    x_fgm = fast_gradient_method(model, x, eps, np.inf)
    x_pgd = projected_gradient_descent(model, x, eps, 0.01, 40, np.inf)
    _, y_pred = model(x).max(1) # pure
    _, y_pred_fgm = model(x_fgm).max(1) # FGM adv example
    _, y_pred_pgd = model(x_pgd).max(1) # PGD adv example
    
    report.nb_test += y.size(0)
    report.correct += y_pred.eq(y).sum().item()
    report.correct_fgm += y_pred_fgm.eq(y).sum().item()
    report.correct_pgd += y_pred_pgd.eq(y).sum().item()

print("test acc on clean (%) : {:.3f}".format(report.correct / report.nb_test * 100.0))
print("test acc on FGM (%) : {:.3f}".format(report.correct_fgm / report.nb_test * 100.0))
print("test acc on PGD (%) : {:.3f}".format(report.correct_pgd / report.nb_test * 100.0))



test acc on clean (%) : 98.370
test acc on FGM (%) : 1.340
test acc on PGD (%) : 1.200


In [None]:
# loss_arr = []

# for i in range(num_epoch):
#     for j,[image,label] in enumerate(train_loader):
#         x = image
#         y_ = label

#         x = x.to(device)
#         y_ = y_.to(device)
        
#         optimizer.zero_grad() #optimizer
#         output = model.forward(x) # CNN
#         loss = loss_func(output,y_)
#         loss.backward()#Back Propagation
#         optimizer.step()
#         if j % 1000 == 0 :
#             print(loss)
#             loss_arr.append(loss.cpu().detach().numpy())
            
# correct = 0
# total = 0
# with torch.no_grad():
#     for image,label in test_loader : 
#         x = image
#         y_ = label
        
#         x = x.to(device)
#         y_ = y_.to(device)
        
#         output = model.forward(x)
#         _,output_index = torch.max(output,1)
#         total += label.size(0)
#         correct += (output_index == y_).sum().float()
#     print("Accuracy of Test Data : {}".format(100*correct/total))

In [None]:
# cross validation