**Step 1: Setup and Install Dependencies**

In [1]:
%%capture
!apt install python3.10 python3.10-distutils python3-pip -y
!update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
!pip install pandas


**Step 2: Mount Google Drive**

In [2]:
# Import the Colab Drive helper and mount Google Drive at /content/drive,
# the root under which the project and dataset paths used below live.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Step 3: Set Path to Your Dataset Folder**

In [3]:
# Change directory to your project folder
%cd /content/drive/MyDrive/Ransomware-Detection

# Set the full path to your ember2018 folder
EMBER_PATH = '/content/drive/MyDrive/Ransomware-Detection/Dataset/ember2018'

# Define the output folder for the processed CSV files.
OUTPUT_PATH = '/content/drive/MyDrive/Ransomware-Detection/Dataset/ProcessedMetadata'
# Create the output folder if it doesn't exist.
import os
if not os.path.exists(OUTPUT_PATH):
    os.makedirs(OUTPUT_PATH)


/content/drive/MyDrive/Ransomware-Detection


**Step 4: Load and Merge Train Feature JSONL Files**

In [4]:
import os
import glob
import json
import pandas as pd

# Abort early when the ember2018 directory is missing.
if not os.path.exists(EMBER_PATH):
    raise FileNotFoundError(f"Directory {EMBER_PATH} not found. Ensure the EMBER dataset is extracted.")

# Collect every train feature shard and order the paths alphabetically.
train_files = glob.glob(os.path.join(EMBER_PATH, "train_features_*.jsonl"))
train_files.sort()
print("Found train feature files:", train_files)

# Columns of the flattened metadata table, in the order they are written out.
new_metadata_columns = [
    "sha256",
    "md5",
    "appeared",
    "label",
    "avclass",
    "general_size",
    "general_exports",
    "general_imports",
    "strings_numstrings",
]

# Define a function to flatten each JSON record.
def flatten_record(record):
    """Flatten one feature record into a flat metadata dictionary.

    Copies the top-level identification fields and pulls a few values out of
    the nested "general" and "strings" objects.

    Args:
        record: Mapping for a single JSONL record. Top-level keys read are
            "sha256", "md5", "appeared", "label", "avclass", "general" and
            "strings".

    Returns:
        A dict with the keys "sha256", "md5", "appeared", "label", "avclass",
        "general_size", "general_exports", "general_imports" and
        "strings_numstrings". Any value that is absent from the record is None.
    """

    def _as_mapping(value):
        # A nested section may be absent, None, or NaN (pandas fills missing
        # cells with NaN when a chunk is converted back to records). Treat
        # anything that is not a dict as "no data" instead of crashing on .get.
        return value if isinstance(value, dict) else {}

    general = _as_mapping(record.get("general"))
    strings = _as_mapping(record.get("strings"))

    return {
        "sha256": record.get("sha256"),
        "md5": record.get("md5"),
        "appeared": record.get("appeared"),
        "label": record.get("label"),
        "avclass": record.get("avclass"),
        "general_size": general.get("size"),
        "general_exports": general.get("exports"),
        "general_imports": general.get("imports"),
        "strings_numstrings": strings.get("numstrings"),
    }

# Set the output CSV path for train metadata in the new folder.
train_csv_path = os.path.join(OUTPUT_PATH, "train_metadata.csv")

# Chunks are appended (mode="a"), so a CSV left over from an earlier run must be
# removed first; otherwise re-running this cell duplicates every row and embeds
# a second header line in the middle of the file.
if os.path.exists(train_csv_path):
    os.remove(train_csv_path)

first_chunk = True          # Write header only once.
flat_chunk_df = None        # Stays None if no chunk is ever processed.
train_rows_written = 0      # Running total of rows appended to the CSV.
failed_train_files = []     # Files whose processing raised an error.

# Process each train JSONL file in chunks.
for file in train_files:
    print(f"Processing {file}...")
    try:
        for chunk in pd.read_json(file, lines=True, chunksize=10000):
            # Convert the chunk to a list of dictionaries.
            records = chunk.to_dict(orient="records")
            # Flatten each record.
            flattened_records = [flatten_record(r) for r in records]
            # Create a new DataFrame from the flattened records.
            flat_chunk_df = pd.DataFrame(flattened_records, columns=new_metadata_columns)
            # Append the chunk to the CSV file.
            flat_chunk_df.to_csv(train_csv_path, mode="a", header=first_chunk, index=False)
            first_chunk = False
            train_rows_written += len(flat_chunk_df)
    except Exception as e:
        # Best effort: report the failure and continue with the next file, but
        # remember it so the summary below does not look like a clean run.
        print(f"Error processing {file}: {e}")
        failed_train_files.append(file)

if failed_train_files:
    print("WARNING: the following train files failed and may be only partially written:", failed_train_files)

if flat_chunk_df is None:
    # Nothing was written (no input files matched, or every file failed).
    print("No train metadata was written; check the input files and any errors reported above.")
else:
    print("Train metadata processed; last processed chunk shape:", flat_chunk_df.shape)
    print("Total train rows written:", train_rows_written)
    print("Train metadata saved to:", train_csv_path)


Found train feature files: ['/content/drive/MyDrive/Ransomware-Detection/Dataset/ember2018/train_features_0.jsonl', '/content/drive/MyDrive/Ransomware-Detection/Dataset/ember2018/train_features_1.jsonl', '/content/drive/MyDrive/Ransomware-Detection/Dataset/ember2018/train_features_2.jsonl', '/content/drive/MyDrive/Ransomware-Detection/Dataset/ember2018/train_features_3.jsonl', '/content/drive/MyDrive/Ransomware-Detection/Dataset/ember2018/train_features_4.jsonl', '/content/drive/MyDrive/Ransomware-Detection/Dataset/ember2018/train_features_5.jsonl']
Processing /content/drive/MyDrive/Ransomware-Detection/Dataset/ember2018/train_features_0.jsonl...
Processing /content/drive/MyDrive/Ransomware-Detection/Dataset/ember2018/train_features_1.jsonl...
Processing /content/drive/MyDrive/Ransomware-Detection/Dataset/ember2018/train_features_2.jsonl...
Processing /content/drive/MyDrive/Ransomware-Detection/Dataset/ember2018/train_features_3.jsonl...
Processing /content/drive/MyDrive/Ransomware-Det

**Step 5: Load and Process Test Feature JSONL File in Chunks**

In [5]:
# Define the test features JSONL file path.
test_file = os.path.join(EMBER_PATH, "test_features.jsonl")
if not os.path.exists(test_file):
    raise FileNotFoundError(f"Test file {test_file} not found.")

# Set the output CSV path for test metadata in the new folder.
test_csv_path = os.path.join(OUTPUT_PATH, "test_metadata.csv")

# Chunks are appended (mode="a"), so remove any CSV from a previous run first;
# otherwise re-running this cell duplicates rows and repeats the header.
if os.path.exists(test_csv_path):
    os.remove(test_csv_path)

first_chunk = True    # Reset flag for test file.
# Reset as well, so that a chunk left over from the train cell is never
# reported below as if it were test data.
flat_chunk_df = None

print(f"Processing {test_file}...")
try:
    for chunk in pd.read_json(test_file, lines=True, chunksize=10000):
        # Same pipeline as for the train files: records -> flat dicts -> CSV rows.
        records = chunk.to_dict(orient="records")
        flattened_records = [flatten_record(r) for r in records]
        flat_chunk_df = pd.DataFrame(flattened_records, columns=new_metadata_columns)
        flat_chunk_df.to_csv(test_csv_path, mode="a", header=first_chunk, index=False)
        first_chunk = False
except Exception as e:
    # Best effort: report the failure; the CSV may then be only partially written.
    print(f"Error processing {test_file}: {e}")

if flat_chunk_df is None:
    print("No test metadata was written; check the input file and any error reported above.")
else:
    print("Test metadata processed; last processed chunk shape:", flat_chunk_df.shape)
    print("Test metadata saved to:", test_csv_path)


Processing /content/drive/MyDrive/Ransomware-Detection/Dataset/ember2018/test_features.jsonl...
Test metadata processed; last processed chunk shape: (10000, 9)
Test metadata saved to: /content/drive/MyDrive/Ransomware-Detection/Dataset/ProcessedMetadata/test_metadata.csv


**Step 6: Verify That the Output CSV Files Were Written**

In [6]:
# List the contents of the output folder (long format, human-readable sizes)
# to check that the CSV files were saved. {OUTPUT_PATH} is filled in from the
# Python variable of the same name before the shell command runs.
!ls -lh {OUTPUT_PATH}


total 124M
-rw------- 1 root root 25M Apr 10 12:44 test_metadata.csv
-rw------- 1 root root 99M Apr 10 12:42 train_metadata.csv


**Step 7: Load CSV Files and Print First & Last Few Rows**

In [7]:
# Read both processed metadata CSVs back into DataFrames.
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

# Each entry pairs a heading with the bound method that yields the rows to show
# (head/tail default to 5 rows). Headings after the first start with a newline
# so the printed sections are separated by a blank line.
preview_steps = [
    ("Train Metadata - First 5 Rows:", train_df.head),
    ("\nTrain Metadata - Last 5 Rows:", train_df.tail),
    ("\nTest Metadata - First 5 Rows:", test_df.head),
    ("\nTest Metadata - Last 5 Rows:", test_df.tail),
]

for heading, take_rows in preview_steps:
    print(heading)
    print(take_rows())


Train Metadata - First 5 Rows:
                                              sha256  \
0  0abb4fda7d5b13801d63bee53e5e256be43e141faa077a...   
1  c9cafff8a596ba8a80bafb4ba8ae6f2ef3329d95b85f15...   
2  eac8ddb4970f8af985742973d6f0e06902d42a3684d791...   
3  7f513818bcc276c531af2e641c597744da807e21cc1160...   
4  ca65e1c387a4cc9e7d8a8ce12bf1bcf9f534c9032b9d95...   

                                md5 appeared  label avclass  general_size  \
0  63956d6417f8f43357d9a8e79e52257e  2006-12      0     NaN       3101705   
1  6f7bde7a1126debf0cc359a54953efc1  2007-01      0     NaN        504320   
2  7520c8f9534ca818726a4feaebf49e2b  2007-02      0     NaN        180224   
3  e435a536968941854bcec3b902c439f6  2007-02      0     NaN       2377730   
4  e93049e2df82ab26f35ad0049173cb14  2007-02      0     NaN       1153808   

   general_exports  general_imports  strings_numstrings  
0                0              156               14573  
1                0              619                18