Yea we train in Jupyter now, deal with it.

In [1]:
import numpy as np

import torch
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch.nn.functional import one_hot
from torch.utils.data import WeightedRandomSampler

from assembly_extract import OPS_LENGTH

import random

import networkx as nx
import matplotlib.pyplot as plt

import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool
from torch_geometric.nn import GATv2Conv

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def visualize_graph(G, color = None):
    plt.figure(figsize=(7,7))
    plt.xticks([])
    plt.yticks([])
    nx.draw_networkx(G, pos = nx.spring_layout(G, seed = 42), with_labels = False,
                     node_size = 32, node_color = color, cmap = "Set2", alpha = 0.5)
    plt.show()


def visualize_embedding(h, color = None, epoch = None, loss = None):
    plt.figure(figsize = (7,7))
    plt.xticks([])
    plt.yticks([])
    h = h.detach().cpu().numpy()
    plt.scatter(h[:, 0], h[:, 1], s = 140, cmap = "Set2")
    if epoch is not None and loss is not None:
        plt.xlabel(f'Epoch: {epoch}, Loss: {loss.item():.4f}', fontsize = 16)
    plt.show()

In [3]:
dataset = torch.load('data/dataset16N.pt')
random.shuffle(dataset)

print(f"Amount of items: {len(dataset)}")

bn = 0
rs = 0

for d in dataset:
    if d.y.item() == 1:
        rs += 1
    elif d.y.item() == 0:
        bn += 1

print(f"Amount of ransomware: {rs}")
print(f"Amount of benign: {bn}")

data_len = len(dataset)
train_amo = round(data_len * 0.8)

train_data = dataset[:train_amo]
test_data = dataset[train_amo:]

bn = 0
rs = 0

for d in train_data:
    if d.y.item() == 1:
        rs += 1
    elif d.y.item() == 0:
        bn += 1
        
class_weight = [1 / bn, 1 / rs]
sample_weight = torch.from_numpy(np.array([class_weight[data.y] for data in train_data]))
sampler = WeightedRandomSampler(sample_weight.type(torch.DoubleTensor), len(sample_weight))

train_loader = DataLoader(train_data, batch_size=64, sampler = sampler)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

Amount of items: 6980
Amount of ransomware: 5001
Amount of benign: 1979


In [4]:
from torch_geometric.utils import to_networkx

print(f'Number of graphs: {len(dataset)}')

nodes = []
for d in dataset:
    nodes.append(d.x.shape[0])
    
nodes = np.array(nodes)
nodes_avg = np.mean(nodes)
nodes_sd  = np.std(nodes)

print(f"Avg nodes count: {nodes_avg:.2f}")
print(f"Avg nodes sd: {nodes_sd:.2f}")

Number of graphs: 6980
Avg nodes count: 11008.85
Avg nodes sd: 27661.78


In [5]:
#G = to_networkx(data[9], to_undirected = False)
#visualize_graph(G)

In [6]:
class GCN(torch.nn.Module):
    def __init__(self, hidden_channels, embed_dim = 16):
        super(GCN, self).__init__()
        
        #self.embed = nn.Embedding(OPS_LENGTH, embed_dim)
        self.conv1 = GCNConv(OPS_LENGTH, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        #self.lin = nn.Linear(hidden_channels, 2)
        self.lin = nn.Sequential(
            nn.Linear(hidden_channels, 128),
            nn.Sigmoid(),
            nn.Linear(128, 64),
            nn.Sigmoid(),
            nn.Linear(64, 2)
        ) 

    def forward(self, x, edge_index, batch):
        #print(f"input shape : {x.shape}")
        #print(f"edge shape : {edge_index.shape}")
        #x = self.embed(x)
        
        #print(f"embeded shape : {x.shape}")
        
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)

        x = global_mean_pool(x, batch)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)
        
        return x

In [33]:
class GAT(torch.nn.Module):
    def __init__(self, hidden_channels):
        super(GAT, self).__init__()
        self.conv1 = GATv2Conv(OPS_LENGTH, hidden_channels)
        self.conv2 = GATv2Conv(hidden_channels, hidden_channels * 2)
        self.conv3 = GATv2Conv(hidden_channels * 2, hidden_channels)
        # self.lin = nn.Sequential(
        #     nn.Linear(hidden_channels, 128),
        #     nn.ReLU(),
        #     nn.Linear(128, 64),
        #     nn.ReLU(),
        #     nn.Linear(64, 2),
        #     nn.Sigmoid(),
        # ) 
        
        self.lin1 = nn.Linear(hidden_channels, 128)
        self.lin2 = nn.Linear(128, 64)
        self.lin3 = nn.Linear(64, 2)
        #self.lin3 = nn.Linear(64, 32)
        #self.lin4 = nn.Linear(32, 2)
        
        self.bng = nn.BatchNorm1d(hidden_channels)
        self.bn1 = nn.BatchNorm1d(128)
        self.bn2 = nn.BatchNorm1d(64)
        #self.bn3 = nn.BatchNorm1d(32)

    def forward(self, x, edge_index, batch):
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)
        
        x = global_mean_pool(x, batch)
        
        x = F.dropout(x, p = 0.5, training = self.training)
        x = self.bng(x)
        
        #print(f"x1 = {x}")
        x = self.lin1(x)
        x = x.relu()
        x = self.bn1(x)
        
        #print(f"x2 = {x}")
        x = self.lin2(x)
        x = x.relu()
        x = self.bn2(x)
        
        #print(f"x3 = {x}")
        x = self.lin3(x)
        x = x.relu()
        #x = self.bn3(x)
        
        #print(f"x4 = {x}")
        #x = self.lin4(x)
        #x = x.relu()
        
        
        return x

In [40]:
from tqdm import tqdm
import matplotlib.pyplot as plt

EPOCH = 1
latent_dim = 32
embed_dim = 1260
lr = 1e-4
WEIGHT_PATH = "data/weightGAT2.pt"

#model = GCN(hidden_channels = latent_dim, embed_dim = embed_dim)
model = GAT(hidden_channels = latent_dim)
optimizer = torch.optim.Adam(model.parameters(), lr = lr)
criterion = torch.nn.CrossEntropyLoss()

print(model)

GAT(
  (conv1): GATv2Conv(16, 32, heads=1)
  (conv2): GATv2Conv(32, 64, heads=1)
  (conv3): GATv2Conv(64, 32, heads=1)
  (lin1): Linear(in_features=32, out_features=128, bias=True)
  (lin2): Linear(in_features=128, out_features=64, bias=True)
  (lin3): Linear(in_features=64, out_features=2, bias=True)
  (bng): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [41]:
model.load_state_dict(torch.load(WEIGHT_PATH))

<All keys matched successfully>

In [42]:
from torchmetrics import ConfusionMatrix

f1 = []
acc = []
best = 0

def train():
    model.train()
    test_loop = 0

    for data in tqdm(train_loader):
        out = model(data.x, data.edge_index, data.batch)
        loss = criterion(out, data.y)
        loss.backward()  
        optimizer.step()  
        optimizer.zero_grad()  
        
        test_loop += 1
        if test_loop >= 10:
            test_loop = 0
            test()

def test():
    model.eval()

    Apred = []
    Atarg = []
    
    for data in test_loader:  
        out = model(data.x, data.edge_index, data.batch)
        pred = out.argmax(dim = 1)  
        
        Apred.append(pred)
        Atarg.append(data.y)
        
    Apred = torch.concat(Apred)
    Atarg = torch.concat(Atarg)
    
    confmat = ConfusionMatrix(num_classes = 2)
    conf = confmat(Apred, Atarg).numpy()
    
    test_f1 = conf[0][0] / (conf[0][0] + 1/2 * (conf[0][1] + conf[1][0]))
    test_acc = (conf[0][0] + conf[1][1]) / (conf[0][0] + conf[0][1] + conf[1][0] + conf[1][1])
    
    print(f"Confusion matrix: {conf}")
    
    f1.append(test_f1)
    acc.append(test_acc)
    
    return test_f1, test_acc

for e in range(EPOCH):
    train()
    test_f1, test_acc = test()
    
    if test_f1 > best:
        best = test_f1
        torch.save(model.state_dict(), WEIGHT_PATH)
    
    print(f'Epoch: {e+1:03d}, Test F1: {test_f1:.4f}, Test acc: {test_acc:.4f}')

plt.plot(f1)
plt.plot(acc)
plt.show()

 10%|█         | 9/88 [01:37<14:24, 10.94s/it]

tensor([[1.0209, 0.9209],
        [2.2658, 0.0000],
        [1.0046, 0.9846],
        [0.2230, 1.7072],
        [0.0875, 1.6944],
        [1.1493, 0.0671],
        [2.3403, 0.0000],
        [0.5291, 1.2010],
        [2.1904, 0.0000],
        [1.7966, 0.0000],
        [2.2631, 0.0000],
        [2.6676, 0.0000],
        [1.1968, 0.6930],
        [0.0000, 1.7254],
        [0.0000, 1.0143],
        [2.8355, 0.0000],
        [0.0000, 2.1896],
        [0.0000, 2.2371],
        [0.0000, 0.9144],
        [0.0000, 0.0000],
        [0.4405, 1.0888],
        [0.0000, 0.9882],
        [0.0000, 1.8506],
        [0.4673, 0.9766],
        [1.0014, 0.8560],
        [2.7674, 0.0000],
        [0.2230, 1.7072],
        [0.2230, 1.7072],
        [2.1170, 0.0000],
        [0.0000, 2.2544],
        [0.0000, 1.1123],
        [0.0000, 0.6376],
        [1.7279, 0.0000],
        [1.3145, 0.6158],
        [1.7918, 0.0000],
        [1.6309, 0.0000],
        [0.3282, 1.0899],
        [0.0000, 0.4024],
        [0.0

 10%|█         | 9/88 [01:56<17:04, 12.97s/it]


KeyboardInterrupt: 

## Load & Interence

In [43]:
WEIGHT_PATH = "data/weightGAT2.pt"
latent_dim = 32
inference_model = GAT(hidden_channels = latent_dim)
inference_model.load_state_dict(torch.load(WEIGHT_PATH))
inference_model.eval()
print(inference_model)

GAT(
  (conv1): GATv2Conv(16, 32, heads=1)
  (conv2): GATv2Conv(32, 64, heads=1)
  (conv3): GATv2Conv(64, 32, heads=1)
  (lin1): Linear(in_features=32, out_features=128, bias=True)
  (lin2): Linear(in_features=128, out_features=64, bias=True)
  (lin3): Linear(in_features=64, out_features=2, bias=True)
  (bng): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [48]:
from dismgrp import dismgrp
from torch.nn.functional import softmax

def isRansom(path):
    try:
        data = dismgrp(path)
    except Exception as e:
        raise e
    
    with torch.no_grad():
        res = inference_model(data.x, data.edge_index, data.batch)
        res = res[0]
    
    pre = torch.argmax(res).item()
    sm = softmax(res, dim = 0)
    print(f"\t{data.x}")
    print(f"\t{res = }")
    print(f"\t{sm = }")
    prob = sm[pre].item() * 100
    return pre, prob

In [50]:
import os

p = "D:/Work/SIIT/4th year/1 - 2022/CSS453 Cyber crime/project/pe/benign/"
i = 40
lst = os.listdir(p)
for f in lst[i : i + 10]:
    pth = p + f
    try:
        pre, prob = isRansom(pth)
        res = "Ransom" if pre == 1 else "Safe"
        print(f"File: {f}: {prob:.0f}% {res}")
    except Exception as e:
        print(f"File: {f}: Error, " + str(e))

	tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 1.0000e+00,
         2.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 3.0000e+00, 4.0000e+00,
         5.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        ...,
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 9.4840e+04, 9.4841e+04,
         9.4842e+04],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 9.4856e+04, 9.4857e+04,
         9.4858e+04],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 9.4872e+04, 9.4873e+04,
         9.4874e+04]])
	res = tensor([0., 0.])
	sm = tensor([0.5000, 0.5000])
File: AcrobatNotificationClient.exe: 50% Safe
	tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 1.0000e+00,
         2.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 3.0000e+00,
         4.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 7.0000e+00, 8.0000e+00,
         9.0000e+00],
        ..

In [None]:
pre, prob = isRansom("D:/Work/SIIT/4th year/1 - 2022/CSS453 Cyber crime/project/pe/benign/7z.exe")
res = "Ransom" if pre == 1 else "Safe"
print(f"{prob:.0f}% {res}")

81% Safe
