## Extract Graphs from JSON files

In [None]:
def load_text_file_as_set(file_path):
    """
    Read a text file into a set of its whitespace-stripped lines.

    Reading is best-effort: any error is printed rather than raised, and
    whatever was collected before the failure (possibly nothing) is returned.

    :param file_path: Path to the text file
    :return: A set with one stripped string per line; duplicate lines collapse
    """
    entries = set()
    try:
        with open(file_path, 'r') as handle:
            # update() inserts items while it iterates, so lines read before
            # a mid-file error stay in the set.
            entries.update(raw_line.strip() for raw_line in handle)
    except Exception as err:
        print(f"Error reading file {file_path}: {err}")
    return entries

# Demo: load the lines of unique_sysnames_procnames.txt into a set
file_path = '/workspace/JSONs/Capturing-logs/unique_sysnames_procnames.txt'
loaded_set = load_text_file_as_set(file_path)
# Single call, newline-separated: the set itself, then its element count
print(loaded_set, len(loaded_set), sep='\n')

In [None]:
import os
import json
import torch
from torch_geometric.data import Data

# Class label for each category folder under root_dir:
# the Benign folder maps to 0, every other folder maps to 1.
categories = {
    "AdwareJsonMini": 1,
    "BankingJsonMini": 1,
    "BenignJsonMini": 0,
    "RiskWareJsonMini": 1,
    "SmsJsonMini": 1,
}

# Input tree that holds one sub-directory of JSON logs per category
root_dir = "/workspace/JSONs/Capturing-logs"

# Destination for the saved graphs; created up front if it does not exist
output_dir = "/workspace/Dynamic 2/ProcessedGraphs"
os.makedirs(output_dir, exist_ok=True)

def build_sysname_encoder(names):
    """
    Map each distinct name to a stable integer index, plus an "UNKNOWN" fallback.

    Names are indexed in sorted order. Enumerating a set of strings directly
    follows its hash order, which changes between interpreter runs (hash
    randomization), so the same sysname would receive different codes in
    different runs and graphs saved by separate runs would disagree.

    :param names: Iterable of sysname strings (duplicates are ignored)
    :return: Dict of name -> index (0..n-1 in sorted order) with
             "UNKNOWN" mapped to -1
    """
    encoder = {name: index for index, name in enumerate(sorted(set(names)))}
    encoder["UNKNOWN"] = -1  # Assign -1 to unknown sysnames
    return encoder


# Load unique sysnames from file into a set.
# Best-effort: on a read error the set stays empty or partial, and every
# sysname missing from it is later encoded as "UNKNOWN".
file_path = '/workspace/JSONs/Capturing-logs/unique_sysnames_procnames.txt'
loaded_set = set()
try:
    with open(file_path, 'r') as file:
        for line in file:
            loaded_set.add(line.strip())
except (OSError, UnicodeError) as e:
    # Only the failures that opening/decoding the file can produce
    print(f"Error reading file {file_path}: {e}")

# Integer encoding dictionary for sysnames with a fallback for unknown sysnames
sysname_encoder = build_sysname_encoder(loaded_set)

def parse_json_to_graph(file_path, label, save_path):
    """
    Convert one JSON log into a PyTorch Geometric graph and save it to disk.

    Every entry of ``data['behaviors']['dynamic']['host']`` becomes one node
    whose single feature is the integer code of its ``sysname`` (read from
    the first element of the entry's ``low`` list, looked up in the
    module-level ``sysname_encoder``). Consecutive entries are linked by an
    edge, and an entry carrying an ``xref`` gets an extra edge from the
    referenced earlier node.

    Failures are reported with ``print`` and never raised, so one bad file
    does not stop a batch run. Nothing is saved when the file has no entries.

    :param file_path: Path of the JSON file to read
    :param label: Integer class label stored in the graph's ``y``
    :param save_path: Destination path for the saved ``Data`` object
    :return: None
    """
    try:
        # JSON text is UTF-8 by specification; do not depend on the locale
        with open(file_path, encoding='utf-8') as f:
            data = json.load(f)

        # Initialize node features and edges
        node_features = []
        edges = []

        # Extract syscall information
        syscalls = data.get('behaviors', {}).get('dynamic', {}).get('host', [])
        for i, syscall in enumerate(syscalls):
            # A missing, empty or malformed 'low' list yields an UNKNOWN node
            # instead of raising and discarding the whole file. The node is
            # still emitted so node indices keep matching entry positions.
            low_entries = syscall.get('low') if isinstance(syscall, dict) else None
            syscall_info = {}
            if isinstance(low_entries, list) and low_entries:
                if isinstance(low_entries[0], dict):
                    syscall_info = low_entries[0]

            # Retrieve 'sysname' or use "UNKNOWN" if not available
            sysname = syscall_info.get('sysname', 'UNKNOWN')
            if not isinstance(sysname, str):
                # Non-string values (possibly unhashable) cannot be looked up
                sysname = 'UNKNOWN'
            sysname_index = sysname_encoder.get(sysname, sysname_encoder['UNKNOWN'])

            # Add the integer-encoded sysname as a feature
            node_features.append([sysname_index])

            # Sequential edge
            if i > 0:
                edges.append([i - 1, i])

            # Cross-reference edge using 'xref'.
            # Assumes 'xref' is the position of the referenced entry in the
            # host list -- TODO confirm against the log format.
            # Only an integer pointing at an earlier node is accepted: a
            # negative value would index from the end of the node tensor, and
            # null/string values would raise on comparison.
            xref_id = syscall_info.get('xref')
            if (
                isinstance(xref_id, int)
                and not isinstance(xref_id, bool)
                and 0 <= xref_id < i
            ):
                edges.append([xref_id, i])

        # Skip saving if no valid syscalls
        if not node_features:
            print(f"No valid syscalls with 'sysname' found in file: {file_path}")
            return

        # Convert node features and edges to PyTorch tensors
        x = torch.tensor(node_features, dtype=torch.float)
        if edges:
            edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
        else:
            # Single-node graph: an empty list would produce a 1-D tensor of
            # shape [0]; keep the [2, num_edges] layout with zero edges.
            edge_index = torch.empty((2, 0), dtype=torch.long)

        # Create Data object
        graph_data = Data(x=x, edge_index=edge_index, y=torch.tensor([label], dtype=torch.long))

        # Save graph data as .pt file
        torch.save(graph_data, save_path)
        print(f"Saved graph to {save_path}")

    except json.JSONDecodeError:
        print(f"Skipping file due to JSONDecodeError: {file_path}")
    except Exception as e:
        # Best-effort batch processing: report and move on to the next file
        print(f"Error processing file {file_path}: {e}")

# Process each file in categories and save immediately to prevent memory overload
for category, label in categories.items():
    category_dir = os.path.join(root_dir, category)
    if not os.path.isdir(category_dir):
        # A missing category folder should not abort the remaining categories
        print(f"Skipping missing category directory: {category_dir}")
        continue
    # sorted() makes the processing order reproducible; os.listdir order is arbitrary
    for file_name in sorted(os.listdir(category_dir)):
        if not file_name.endswith('.json'):
            continue
        file_path = os.path.join(category_dir, file_name)
        # Swap only the trailing '.json' for '.pt'; str.replace would also
        # rewrite any earlier '.json' inside the name.
        stem = file_name[:-len('.json')]
        save_path = os.path.join(output_dir, f"{category}_{stem}.pt")
        parse_json_to_graph(file_path, label, save_path)

print("All graphs have been processed and saved.")
