## Extract Graphs from JSON

In [None]:
import json
import networkx as nx
import torch
from torch_geometric.data import Data
import os

def process_syscall_file(file_path):
    """
    Processes a JSON file containing system calls and creates a directed graph with sequential edges.
    
    Parameters:
    - file_path: str, path to the JSON file.
    
    Returns:
    - graph_data: PyTorch Geometric Data object, the graph formatted for GNN use.
    """
    # Initialize directed graph
    G = nx.DiGraph()

    try:
        # Load the JSON file
        with open(file_path, 'r') as f:
            data = json.load(f)

        # Extract system call details
        syscalls = []
        if 'behaviors' in data and 'dynamic' in data['behaviors']:
            dynamic_behavior = data['behaviors']['dynamic']['host']
            
            # Loop over system call records and gather relevant information
            for record in dynamic_behavior:
                if 'low' in record:
                    for syscall in record['low']:
                        # Collect essential details
                        syscall_id = syscall.get('id')
                        syscall_name = syscall.get('sysname')
                        timestamp = syscall.get('ts')
                        parameters = syscall.get('parameters', [])
                        
                        # Skip syscalls with None sysname
                        if syscall_name is None:
                            continue
                        
                        # Add to syscalls list
                        syscalls.append({
                            'id': syscall_id,
                            'name': syscall_name,
                            'timestamp': float(timestamp),
                            'parameters': parameters
                        })

        # Sort the syscalls based on the timestamp to create sequential relationships
        syscalls.sort(key=lambda x: x['timestamp'])

        # Add nodes and sequential edges to the graph
        node_features = []
        node_index_map = {}  # Mapping from syscall_id to node index for edge creation
        for i, syscall in enumerate(syscalls):
            # Create a unique identifier for each syscall node
            node_id = syscall['id']
            node_index_map[node_id] = i  # Store the index for later edge creation
            
            # Example: Create a feature vector (this can be expanded)
            # Here, we use a simple feature: length of syscall name (as a placeholder)
            feature = len(syscall['name'])
            node_features.append([feature])
            
            # Add node to the graph
            G.add_node(node_id, name=syscall['name'], timestamp=syscall['timestamp'], parameters=syscall['parameters'])

        # Add sequential edges based on temporal order
        for i in range(len(syscalls) - 1):
            current_id = syscalls[i]['id']
            next_id = syscalls[i + 1]['id']
            G.add_edge(current_id, next_id, type='sequential')

        # Prepare data for PyTorch Geometric
        edge_index = []
        for u, v in G.edges():
            edge_index.append([node_index_map[u], node_index_map[v]])
        
        # Convert to PyTorch Geometric tensors
        edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
        x = torch.tensor(node_features, dtype=torch.float)

        # Create PyTorch Geometric Data object
        graph_data = Data(x=x, edge_index=edge_index)

        return graph_data

    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        return None

def save_graph_data(graph_data, output_path):
    """
    Serializes a processed graph with torch.save.

    Parameters:
    - graph_data: PyTorch Geometric Data object to serialize.
    - output_path: str, destination handed to torch.save as the target file.
    """
    torch.save(obj=graph_data, f=output_path)

def process_benign_folder(input_folder, output_folder):
    """
    Processes all JSON files in a given folder, converts each to a graph, and saves it.

    Only entries whose name ends with ".json" are considered. Each successfully
    processed file is written to output_folder under the same base name with the
    extension replaced by ".pt"; files that fail to process are reported and skipped.

    Parameters:
    - input_folder: str, path to the folder containing JSON files.
    - output_folder: str, path to the folder to save the .pt graph files
      (created, including parents, if it does not exist).
    """
    # exist_ok avoids the check-then-create race and tolerates repeated runs
    os.makedirs(output_folder, exist_ok=True)

    # Process each JSON file in the input folder
    for file_name in os.listdir(input_folder):
        if not file_name.endswith(".json"):
            continue

        file_path = os.path.join(input_folder, file_name)
        graph = process_syscall_file(file_path)

        # Failure is signalled by None; compare explicitly instead of relying on
        # the truthiness of the graph object.
        if graph is None:
            print(f"Failed to process {file_name}")
            continue

        # splitext swaps only the trailing extension; str.replace would also
        # rewrite any ".json" occurring earlier in the file name.
        stem, _ = os.path.splitext(file_name)
        output_file = os.path.join(output_folder, stem + ".pt")
        save_graph_data(graph, output_file)
        print(f"Saved graph for {file_name} to {output_file}")

# Script entry point: convert one folder of JSON logs into saved graphs
if __name__ == "__main__":
    # Both locations are machine-specific; point them at your own directories.
    # Folder scanned for *.json syscall logs
    input_folder = "/home/belief/Desktop/MalwareDetection/JSONs/Capturing-logs/Unlabled"
    # Folder that receives one .pt file per processed log
    output_folder = "/home/belief/Desktop/MalwareDetection/Graphs/Unlabled"

    process_benign_folder(input_folder=input_folder, output_folder=output_folder)
