In [1]:
import os
import json

# Input directory holding the JSON logs and output directory for the
# encoded sequences; the output directory is created if it is missing.
root_dir = '/workspace/JSONs/Capturing-logs/SmsJson'
output_dir = '/workspace/ProcessedSequences/Smsware'
os.makedirs(output_dir, exist_ok=True)

# Syscall vocabulary: one name per line of the text file. A name's index is
# its zero-based line position; if a name repeats, the later position wins.
with open('/workspace/Barebones/unique_syscalls.txt', 'r') as f:
    unique_syscalls = [raw_line.strip() for raw_line in f]
syscall_to_index = dict(zip(unique_syscalls, range(len(unique_syscalls))))

def _encode_syscalls(data, syscall_map):
    """Return the syscall indices found in ``data``, ordered by their ``id``.

    Walks ``data['behaviors']['dynamic']['host']`` and, for every entry whose
    ``class`` is ``"SYSCALL"``, collects each ``low`` item whose ``sysname`` is
    a key of ``syscall_map``. Any node that does not have the expected type
    (dict / list) is skipped instead of raising.

    Args:
        data: Parsed JSON document (any type; non-dicts yield an empty list).
        syscall_map: Mapping from syscall name to integer index.

    Returns:
        List of indices sorted by the items' ``id``. Items without an ``id``
        are placed after all items that have one, in encounter order.

    Raises:
        TypeError: If the ``id`` values present cannot be compared with each
            other (e.g. a mix of numbers and strings).
    """
    node = data
    for key in ('behaviors', 'dynamic', 'host'):
        if not isinstance(node, dict):
            return []
        node = node.get(key)
    if not isinstance(node, list):
        return []

    pairs = []
    for entry in node:
        if not isinstance(entry, dict) or entry.get("class") != "SYSCALL":
            continue
        low_entries = entry.get("low")
        if not isinstance(low_entries, list):
            continue
        for low_entry in low_entries:
            if not isinstance(low_entry, dict):
                continue
            sysname = low_entry.get("sysname")
            if isinstance(sysname, str) and sysname in syscall_map:
                pairs.append((low_entry.get("id"), syscall_map[sysname]))

    # None cannot be ordered against numbers in Python 3, so give id-less
    # items their own bucket that sorts last; list.sort is stable, so ties
    # (and the whole id-less bucket) keep their encounter order.
    pairs.sort(key=lambda pair: (1, 0) if pair[0] is None else (0, pair[0]))
    return [index for _, index in pairs]


def process_json_file(file_path, out_dir=None, syscall_map=None):
    """Encode the syscalls of one JSON log and write them as a text file.

    The file is parsed as UTF-8 (a leading BOM is tolerated) and, if that
    fails, once more as latin-1. The encoded sequence is written as
    space-separated integers to ``<name>_sequence.txt`` in ``out_dir``, where
    ``<name>`` is the input's base name with a trailing ``.json`` removed.
    No file is written when the sequence is empty or the input is skipped;
    every skip is reported with ``print``.

    Args:
        file_path: Path of the JSON file to process.
        out_dir: Directory for the output file. Defaults to the module-level
            ``output_dir``.
        syscall_map: Mapping from syscall name to integer index. Defaults to
            the module-level ``syscall_to_index``.

    Returns:
        None.
    """
    if out_dir is None:
        out_dir = output_dir
    if syscall_map is None:
        syscall_map = syscall_to_index

    data = None

    # 'utf-8-sig' decodes plain UTF-8 unchanged but also strips a leading BOM,
    # which json.load would otherwise reject as invalid JSON.
    try:
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            data = json.load(f)
    except (UnicodeDecodeError, json.JSONDecodeError):
        print(f"UTF-8 decoding or JSON error for {file_path}. Trying with 'latin-1' encoding.")
        try:
            with open(file_path, 'r', encoding='latin-1') as f:
                data = json.load(f)
        except (UnicodeDecodeError, json.JSONDecodeError):
            print(f"Skipping file due to encoding or JSON format issues: {file_path}")
            return  # Skip this file if both attempts fail

    if not isinstance(data, dict):
        print(f"Skipping file due to unexpected JSON structure: {file_path}")
        return

    try:
        encoded_sequence = _encode_syscalls(data, syscall_map)
    except TypeError as e:
        print(f"Skipping file due to unsortable syscall ids ({e}): {file_path}")
        return

    if encoded_sequence:
        # Strip only a trailing '.json'; str.replace would also rewrite any
        # earlier '.json' occurring inside the file name.
        base_name = os.path.basename(file_path)
        if base_name.endswith('.json'):
            base_name = base_name[:-len('.json')]
        output_file = os.path.join(out_dir, base_name + '_sequence.txt')
        with open(output_file, 'w') as out_f:
            out_f.write(" ".join(map(str, encoded_sequence)))

# Convert every '.json' entry found directly inside root_dir; anything else
# in the listing is ignored.
for filename in os.listdir(root_dir):
    if not filename.endswith('.json'):
        continue
    file_path = os.path.join(root_dir, filename)
    process_json_file(file_path)

print("Smsware sequence extraction complete.")


UTF-8 decoding or JSON error for /workspace/JSONs/Capturing-logs/SmsJson/EBD0E5816B4CD4252A94A7700A3431A5FC0E1D6FE1EB36A198064AFCB533AB82.json. Trying with 'latin-1' encoding.
Skipping file due to encoding or JSON format issues: /workspace/JSONs/Capturing-logs/SmsJson/EBD0E5816B4CD4252A94A7700A3431A5FC0E1D6FE1EB36A198064AFCB533AB82.json
UTF-8 decoding or JSON error for /workspace/JSONs/Capturing-logs/SmsJson/B1698885C3D85E3C6F7E2AB94751DC394C0EB08F17093A9D7327FDED76D1E0A3.json. Trying with 'latin-1' encoding.
Skipping file due to encoding or JSON format issues: /workspace/JSONs/Capturing-logs/SmsJson/B1698885C3D85E3C6F7E2AB94751DC394C0EB08F17093A9D7327FDED76D1E0A3.json
UTF-8 decoding or JSON error for /workspace/JSONs/Capturing-logs/SmsJson/4951131DA6682276DA6EF866AA5929D7B0E854F2E95442642934DC810C271331.json. Trying with 'latin-1' encoding.
Skipping file due to encoding or JSON format issues: /workspace/JSONs/Capturing-logs/SmsJson/4951131DA6682276DA6EF866AA5929D7B0E854F2E95442642934D