In [6]:
# import angr
# import torch
# from sentence_transformers import SentenceTransformer
# from torch_geometric.data import Data

# proj = angr.Project('./binaries/game.exe', auto_load_libs=False)
# cfg = proj.analyses.CFG()

# model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# # Store nodes and edges
# nodes = []
# edge_index = []

# # Iterate over functions
# for func in proj.kb.functions.values():
#     for block in func.blocks:
#         # Extract assembly instructions or use function names/APIs as node text
#         node_text = f"Block at {block.addr}"
#         # Get embedding from MiniLM
#         embedding = model.encode(node_text)
#         nodes.append(embedding)  # Add embedding for each block
    
#         block_node = cfg.get_any_node(block.addr)
#         if block_node and block_node.successors:
#             for succ in block_node.successors:
#                 edge_index.append([block.addr, succ.addr])

# # Convert to tensor
# node_embeddings = torch.tensor(nodes, dtype=torch.float)
# edge_index_tensor = torch.tensor(edge_index, dtype=torch.long).t().contiguous()

# # Prepare PyTorch Geometric data
# data = Data(x=node_embeddings, edge_index=edge_index_tensor)

In [8]:
import os
import angr
import torch
from sentence_transformers import SentenceTransformer
from torch_geometric.data import Data

# Directory containing multiple binaries
binary_dir = './binaries'

# Placeholder to store data for all binaries
data_list = []

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

for binary_file in os.listdir(binary_dir):
    if binary_file.endswith('.exe'):  # Only process .exe files
        # Load binary and extract CFG
        proj = angr.Project(os.path.join(binary_dir, binary_file), auto_load_libs=False)
        cfg = proj.analyses.CFG()

        nodes = []
        edge_index = []

        # Process nodes and edges as before
        for func in proj.kb.functions.values():
            for block in func.blocks:
                node_text = f"Block at {block.addr}"
                embedding = model.encode(node_text)
                nodes.append(embedding)
                block_node = cfg.get_any_node(block.addr)
                if block_node and block_node.successors:
                    for succ in block_node.successors:
                        edge_index.append([block.addr, succ.addr])

        # Convert to tensor
        node_embeddings = torch.tensor(nodes, dtype=torch.float)
        edge_index_tensor = torch.tensor(edge_index, dtype=torch.long).t().contiguous()

        # Prepare graph data for each binary
        data = Data(x=node_embeddings, edge_index=edge_index_tensor)
        data_list.append(data)

ERROR    | 2024-09-11 11:04:16,749 | angr.analyses.propagator.engine_vex.SimEnginePropagatorVEX | Unsupported statement type CAS.
ERROR    | 2024-09-11 11:04:21,693 | angr.analyses.propagator.engine_vex.SimEnginePropagatorVEX | Unsupported statement type CAS.


In [9]:
data_list

[Data(x=[3355, 384], edge_index=[2, 5788]),
 Data(x=[19079, 384], edge_index=[2, 29348])]

In [5]:
from sentence_transformers import SentenceTransformer
import torch
import torch.nn.functional as F
from torch_geometric.nn import GINConv, global_add_pool
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GINConv

class GIN(torch.nn.Module):
    def __init__(self):
        super(GIN, self).__init__()
        self.conv1 = GINConv(
            torch.nn.Sequential(
                torch.nn.Linear(384, 64),  # Input: 384-dim embeddings, Output: 64-dim
                torch.nn.ReLU(),
                torch.nn.Linear(64, 64)
            )
        )
        self.conv2 = GINConv(
            torch.nn.Sequential(
                torch.nn.Linear(64, 64),
                torch.nn.ReLU(),
                torch.nn.Linear(64, 64)
            )
        )
        self.fc1 = torch.nn.Linear(64, 32)
        self.fc2 = torch.nn.Linear(32, 2)  # Binary classification: malware or benign

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        
        # Global pooling (summing node embeddings to create a graph embedding)
        x = global_add_pool(x, torch.zeros(x.size(0), dtype=torch.long))
        
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

# Initialize the GIN model
model = GIN()

# Example forward pass with the constructed graph
output = model(data)

Data(x=[3355, 384], edge_index=[2, 5788])

In [10]:
from torch_geometric.loader import DataLoader

# Create a DataLoader for your graph dataset
train_loader = DataLoader(data_list, batch_size=32, shuffle=True)

# Example training loop
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = torch.nn.CrossEntropyLoss()

epochs = 10

labels = [1 if "malware" in binary_file else 0 for binary_file in os.listdir(binary_dir)]

for epoch in range(epochs):
    for batch in train_loader:
        optimizer.zero_grad()
        output = model(batch)
        loss = loss_fn(output, labels)
        loss.backward()
        optimizer.step()


KeyError: 'input_ids'