In [1]:
import os
import json
import pandas as pd
from concurrent.futures import ProcessPoolExecutor, as_completed
from collections import Counter

# Names of the per-category sub-folders expected under the dataset root
malware_types = [
    "AdwareJson",
    "BenignJson",
    "BankingwareJson",
    "RiskwareJson",
    "SmswareJson",
]

# Dataset root: the directory that holds the category folders listed above
root_dir = "/home/belief/Desktop/MalwareDetection/JSONs/Capturing-logs-new"

# Define function to extract features from JSON file
def extract_features(file_info):
    """Build a flat feature dictionary from one JSON report file.

    Args:
        file_info: A ``(filepath, malware_type)`` tuple. Both values arrive
            as one argument so the function can be submitted to an executor
            with a single positional parameter.

    Returns:
        dict: Feature name -> value, in this insertion order:
            * ``"num_permissions"``: ``behaviors.static.num_permissions``
              from the report, or 0 when that key is absent.
            * ``"malware_type"``: the label passed in ``file_info``.
            * ``"syscall_<name>"``: occurrence count for each of the (at
              most) 50 most frequent ``sysname`` values.
            * ``"syscall_2gram_<a>_<b>"``: occurrence count for each of the
              (at most) 50 most frequent pairs of consecutive ``sysname``
              values.

    Raises:
        OSError: The file cannot be opened or read.
        json.JSONDecodeError: The file is not valid JSON (e.g. truncated).
        KeyError: The report has no ``"behaviors"`` section, or that section
            lacks ``"static"`` or ``"dynamic"``.
    """
    filepath, malware_type = file_info

    # Only the parse needs the file handle, so release it before doing any
    # further work. JSON text is UTF-8 by specification; do not depend on the
    # locale's default encoding.
    with open(filepath, "r", encoding="utf-8") as f:
        json_data = json.load(f)

    # Extract features similar to original code
    # (shortened here for clarity)
    num_permissions = json_data["behaviors"]["static"].get("num_permissions", 0)
    # ... other feature extraction steps ...

    # Count syscall frequencies and 2-grams in a single pass. Remembering
    # only the previous name avoids buffering the whole trace in a list.
    # Entries without a (truthy) "sysname" are skipped and do not break a
    # pair, and pairs may span consecutive host entries.
    syscall_frequencies = Counter()
    syscall_2grams = Counter()
    previous_name = None
    for host in json_data["behaviors"]["dynamic"].get("host", []):
        if "low" not in host:
            continue
        for syscall in host["low"]:
            syscall_name = syscall.get("sysname")
            if not syscall_name:
                continue
            syscall_frequencies[syscall_name] += 1
            if previous_name is not None:
                syscall_2grams[(previous_name, syscall_name)] += 1
            previous_name = syscall_name

    # Build dictionary for features
    features = {
        "num_permissions": num_permissions,
        # ... other static features ...
        "malware_type": malware_type,
    }

    # Keep only the 50 most common syscalls to bound the size of each row
    for syscall, count in syscall_frequencies.most_common(50):
        features[f"syscall_{syscall}"] = count

    # Keep only the 50 most common 2-grams to bound the size of each row
    for (syscall1, syscall2), count in syscall_2grams.most_common(50):
        features[f"syscall_2gram_{syscall1}_{syscall2}"] = count

    return features

# Process each malware type folder in smaller batches.
#
# Every file yields its own set of "syscall_*" / "syscall_2gram_*" keys (its
# personal top 50), so two batches rarely share the same columns. Appending
# each batch to the CSV without a header would therefore write rows whose
# values do not line up with the header of the first batch. To keep the CSV
# consistent, rows are gathered for the whole folder and written once, so the
# header is the union of all feature names. The rows themselves are small
# dictionaries, so holding them for one folder is cheap.

# Number of files handed to the worker pool at a time, to limit memory usage
batch_size = 100

for malware_type in malware_types:
    folder_path = os.path.join(root_dir, malware_type)
    files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith(".json")]

    # Feature rows for every successfully parsed file of this malware type
    data = []

    for i in range(0, len(files), batch_size):
        batch_files = files[i:i + batch_size]

        # Process files in the current batch in parallel. A fresh pool per
        # batch means a pool broken by a crashed worker only costs that batch.
        with ProcessPoolExecutor() as executor:
            future_to_file = {executor.submit(extract_features, (file, malware_type)): file for file in batch_files}
            for future in as_completed(future_to_file):
                try:
                    data.append(future.result())
                except Exception as exc:
                    # Best effort: report the unreadable file and carry on
                    print(f"Error processing file {future_to_file[future]}: {exc}")

    csv_filename = f"{malware_type}.csv"
    if not data:
        # Nothing parsed successfully; do not leave a header-less empty CSV
        print(f"No features extracted for {malware_type}; {csv_filename} not written")
        continue

    # Write the file in one go (overwriting any previous run) so that the
    # header matches every row and re-running does not duplicate rows.
    df = pd.DataFrame(data)
    df.to_csv(csv_filename, mode='w', header=True, index=False)
    print(f"Features for {len(data)} files written to {csv_filename}")


Error processing file /home/belief/Desktop/MalwareDetection/JSONs/Capturing-logs-new/AdwareJson/8143D45CB4EDF607996E7889CE8F94827B93C16CA5FF18CA007BBBBFFDBB0BB7.json: Unterminated string starting at: line 1 column 305142 (char 305141)
Error processing file /home/belief/Desktop/MalwareDetection/JSONs/Capturing-logs-new/AdwareJson/89B3095CFEAC18BADA832C7C0197AD4D3A420312D64839C1E21DAA092DA1C1BB.json: Unterminated string starting at: line 1 column 16941931 (char 16941930)
Error processing file /home/belief/Desktop/MalwareDetection/JSONs/Capturing-logs-new/AdwareJson/F5BE9DFC198DF5ABCFBA9B697AC5FA6AEEBE787557777A3D557C758A7BE55962.json: Unterminated string starting at: line 1 column 57203434 (char 57203433)
Error processing file /home/belief/Desktop/MalwareDetection/JSONs/Capturing-logs-new/AdwareJson/C8BEB967C2359C90F80BADEF6F14CC4C52B53FC8EBF3F6C1A1F06FAA1E6F4DD4.json: Unterminated string starting at: line 1 column 109662124 (char 109662123)
Error processing file /home/belief/Desktop/Mal