In [11]:
# import angr
# import torch
# from sentence_transformers import SentenceTransformer
# from torch_geometric.data import Data

# proj = angr.Project('./binaries/game.exe', auto_load_libs=False)
# cfg = proj.analyses.CFG()

# model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# # Store nodes and edges
# nodes = []
# edge_index = []

# # Iterate over functions
# for func in proj.kb.functions.values():
#     for block in func.blocks:
#         # Extract assembly instructions or use function names/APIs as node text
#         node_text = f"Block at {block.addr}"
#         # Get embedding from MiniLM
#         embedding = model.encode(node_text)
#         nodes.append(embedding)  # Add embedding for each block
    
#         block_node = cfg.get_any_node(block.addr)
#         if block_node and block_node.successors:
#             for succ in block_node.successors:
#                 edge_index.append([block.addr, succ.addr])

# # Convert to tensor
# node_embeddings = torch.tensor(nodes, dtype=torch.float)
# edge_index_tensor = torch.tensor(edge_index, dtype=torch.long).t().contiguous()

# # Prepare PyTorch Geometric data
# data = Data(x=node_embeddings, edge_index=edge_index_tensor)

In [12]:
import os
import angr
import torch
from sentence_transformers import SentenceTransformer
from torch_geometric.data import Data

binary_dir = './binaries'

data_list = []

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

for binary_file in os.listdir(binary_dir):
    if binary_file.endswith('.exe'):  
        angr_project = angr.Project(os.path.join(binary_dir, binary_file), auto_load_libs=False)
        config = angr_project.analyses.CFG()

        nodes = []
        edge_index = []
        addess_to_index = {}

        index = 0
        for function in angr_project.kb.functions.values():
            for block in function.blocks:
                node_text = f"Block at {block.addr}"
                embedding = model.encode(node_text)
                nodes.append(embedding)

                addess_to_index[block.addr] = index
                index += 1

        for function in angr_project.kb.functions.values():
            for block in function.blocks:
                block_node = config.get_any_node(block.addr)
                if block_node and block_node.successors:
                    for successor in block_node.successors:
                        if block.addr in addess_to_index and successor.addr in addess_to_index:
                            edge_index.append([addess_to_index[block.addr], addess_to_index[successor.addr]])

        node_embeddings = torch.tensor(nodes, dtype=torch.float)
        edge_index_tensor = torch.tensor(edge_index, dtype=torch.long).t().contiguous()

        label = 10 if "malware" in binary_file else 12
        data = Data(x=node_embeddings, edge_index=edge_index_tensor, y=torch.tensor([label], dtype=torch.long))
        data_list.append(data)

ERROR    | 2024-09-23 12:28:23,953 | angr.analyses.propagator.engine_vex.SimEnginePropagatorVEX | Unsupported statement type CAS.
ERROR    | 2024-09-23 12:28:30,145 | angr.analyses.propagator.engine_vex.SimEnginePropagatorVEX | Unsupported statement type CAS.


In [13]:
data_list

[Data(x=[19079, 384], edge_index=[2, 29348], y=[1]),
 Data(x=[3355, 384], edge_index=[2, 5788], y=[1])]

In [14]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GINConv, global_add_pool
from torch_geometric.nn import GINConv

class GIN(torch.nn.Module):
    def __init__(self):
        super(GIN, self).__init__()
        self.conv1 = GINConv(
            torch.nn.Sequential(
                torch.nn.Linear(384, 64),  # Input: 384-dim embeddings, Output: 64-dim
                torch.nn.ReLU(),
                torch.nn.Linear(64, 64)
            )
        )
        self.conv2 = GINConv(
            torch.nn.Sequential(
                torch.nn.Linear(64, 64),
                torch.nn.ReLU(),
                torch.nn.Linear(64, 64)
            )
        )
        self.fc1 = torch.nn.Linear(64, 32)
        self.fc2 = torch.nn.Linear(32, 2)  # Binary classification: malware or benign

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        
        x = global_add_pool(x, data.batch)
        
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

model = GIN()

output = model(data)

In [15]:
from torch_geometric.loader import DataLoader

train_loader = DataLoader(data_list, batch_size=32, shuffle=True)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = torch.nn.CrossEntropyLoss()

epochs = 10

for epoch in range(epochs):
    for batch in train_loader:
        optimizer.zero_grad()
        output = model(batch)
        loss = loss_fn(output, batch.y)
        loss.backward()
        optimizer.step()

IndexError: Target 10 is out of bounds.

In [6]:
output

tensor([[   0.0000,  -18.1997],
        [   0.0000, -215.7819]], grad_fn=<LogSoftmaxBackward0>)