In [None]:
import os
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.utils.data import Subset
import torch.optim as optim
import torch.nn.functional as F

In [None]:
torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
torch.cuda.device_count() # returns 1 in my case


if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)} is available.")
else:
    print("No GPU available. Training will run on CPU.")

In [None]:
BASE = '/home/HardDisk/Satang/thesis_proj'
DATAPATH = 'original' #(Ransap path)
FOLDER = "win7-120gb-hdd"
os.chdir(f'{BASE}/{DATAPATH}')
folders = sorted(os.listdir())
print(folders)
os.chdir(f'{BASE}/{DATAPATH}/{FOLDER}')
labels = sorted(os.listdir())
print(labels)
benign = ['AESCrypt', 'Zip', 'SDelete', 'Excel', 'Firefox']
ransomware = ['TeslaCrypt', 'Cerber', 'WannaCry', 'GandCrab4', 'Ryuk', 'Sodinokibi', 'Darkside']


In [None]:
import numpy as np
import os

def save_numpy_array(array, label, base_dir, file_format='npy'):
    """
    Save a Numpy array to a file in a class-named folder, appending a dynamic suffix to the label based on the number of existing files.

    Parameters:
    - array (np.ndarray): The Numpy array to save.
    - label (str): The base label to use as the filename and folder name for the class.
    - base_dir (str): The base directory where the class folder will be created.
    - file_format (str): The format to save the file in ('npy' or 'csv'). Default is 'npy'.
    
    Returns:
    - str: The full path to the saved file.
    """
    # Create class folder within the base directory
    class_dir = os.path.join(base_dir, label)
    os.makedirs(class_dir, exist_ok=True)  # Create the folder if it doesn't exist

    # Count existing files in the class folder
    existing_files = os.listdir(class_dir)
    file_count = sum(1 for file in existing_files if file.endswith(f".{file_format}"))
    
    # Generate suffix based on the file count
    suffix = f"_{file_count + 1}"  # Start from _1 if no files exist

    # Define the full file path with the dynamic suffix
    file_name = f"{label}{suffix}.{file_format}"
    file_path = os.path.join(class_dir, file_name)

    # Save the file in the specified format
    if file_format == 'npy':
        np.save(file_path, array)
    elif file_format == 'csv':
        np.savetxt(file_path, array, delimiter=",", comments="")
    else:
        raise ValueError("Unsupported file format. Use 'npy' or 'csv'.")

    return file_path

In [None]:
import numpy as np

df_r = pd.read_csv("/home/HardDisk/Satang/thesis_proj/dataset/dataset/original/win7-120gb-hdd/AESCrypt/AESCrypt-20200427_16-23-28/ata_write.csv",header=None)
df_r = np.array(df_r)

# Get unique values in order of appearance and their counts
column_1 = df_r[:,0].astype(int)
column_2 = df_r[:,3]
# Get unique values in column 1 and their counts
unique_values = np.unique(column_1)

def count_operations_and_sum(column_1, column_2, window_size=10):
    # Step 1: Count unique values and occurrences
    unique_values, counts = np.unique(column_1, return_counts=True)
    unique_counts_array = np.array(counts)  # Store unique counts in an array

    # Step 2: Compute sums and counts for `column_2`
    sum_dict = {}  # Store sum of column_2 values for each operation
    count_dict = {}  # Store count of occurrences for each operation
    operations = []  # Store the sequences (as NumPy arrays)
    sliding_sums = []  # Store sum of unique counts in each sliding window

    # Identify the valid steps between start and end
    for i in range(len(unique_values) - (window_size-1)):  # Move one step at a time
        operation = unique_values[i:i+window_size]  # Extract 3 consecutive values
        operation_key = str(operation)  # Convert to string for dictionary key storage

        if operation_key not in sum_dict:
            mask = np.isin(column_1, operation)  # Find rows where column_1 matches the operation
            sum_dict[operation_key] = np.nansum(column_2[mask])  # Sum corresponding column_2 values
            count_dict[operation_key] = np.sum(mask)  # Count occurrences
            operations.append(operation)  # Append as a NumPy array



    return operations, sum_dict, count_dict, unique_counts_array



operations, sum_dict, count_dict, unique_counts = count_operations_and_sum(column_1, column_2)

print("Operations:", len(operations))
print("Sum Dict:", sum_dict)
print("Count Dict:", count_dict)
print("\nUnique Counts Array:", unique_counts)

# operation_r = str(operations[1])
# print(sum_dict[operation_r])

In [None]:
SEED = 42
output_dir = "/home/HardDisk/Satang/thesis_proj/30/10/raw_data_8"
np.random.seed(SEED)
window_size = 10 # change the window size
for folder in folders:
    for label in labels:
        os.chdir(f'{BASE}/{DATAPATH}/{folder}/{label}')
        dirs = sorted(os.listdir())
        dirs = np.array(dirs)
        # Shuffle directory
        np.random.seed(SEED)
        np.random.shuffle(dirs)

        for dir_idx in range(len(dirs)):
            print(dirs[dir_idx])
            os.chdir(f'{BASE}/{DATAPATH}/{folder}/{label}/{dirs[dir_idx]}')
            files = sorted(os.listdir())
            tmp = []
            tmp_train = []
            
            df_r = pd.read_csv(f'{BASE}/{DATAPATH}/{folder}/{label}/{dirs[dir_idx]}/{files[0]}', header=None)
            df_w = pd.read_csv(f'{BASE}/{DATAPATH}/{folder}/{label}/{dirs[dir_idx]}/{files[1]}', header=None)
            # df_r = np.array(df_r)
            # df_w = np.array(df_w)

            # Given column 1 (numbers) and column 2 (values to sum)
            column_r = df_r.iloc[:, 0]
            column_w = df_w.iloc[:, 0]

            values_r = df_r.iloc[:, 3]
            values_w = df_w.iloc[:, 3]

            # Get unique operations for read and write
            operations_r, sum_dict_r, count_dict_r, unique_count_r = count_operations_and_sum(column_r, values_r,window_size)
            operations_w, sum_dict_w, count_dict_w, unique_count_w = count_operations_and_sum(column_w, values_w,window_size)
            # Ensure operations exist before processing

            # Ensure operations exist before processing
            step = min(len(operations_r), len(operations_w))
            i_r = 0
            i_w = 0
            for k in range(step):
                operation_r = str(operations_r[k])  # Convert NumPy array to string key
                operation_w = str(operations_w[k])  # Convert NumPy array to string key

                # Get operation sums and counts separately
                operation_sum_r = sum_dict_r[operation_r]
                operation_sum_w = sum_dict_w[operation_w]

                count_r = count_dict_r[operation_r]
                count_w = count_dict_w[operation_w]

                # print(count_r)
                # print(count_w)

                unique_r = unique_count_r[k:k+window_size]
                unique_w = unique_count_w[k:k+window_size]
                # print(k)
                # print(unique_r)
                # print(unique_w)
                # Average read/write throughput [byte/s]
                T_read = operation_sum_r / window_size
                T_write = operation_sum_w / window_size

                # print(T_read)
                # print(T_write)
                
                # Variance of logical block addresses (read)
                filtered_read = df_r.iloc[i_r:i_r + np.sum(unique_r), 2]
                filtered_read = filtered_read[~np.isnan(filtered_read)]  # Remove NaN
                # print(filtered_read.shape)
                V_read_mean = np.mean(filtered_read)
                # print(V_read_mean)
                V_read = (1 / (count_r - 1)) * np.sum((filtered_read - V_read_mean) ** 2)

                # Variance of logical block addresses (write)
                filtered_write = df_w.iloc[i_w:i_w + np.sum(unique_w), 2]
                filtered_write = filtered_write[~np.isnan(filtered_write)]  # Remove NaN
                V_write_mean = np.mean(filtered_write)
                V_write = (1 / (count_w - 1)) * np.sum((filtered_write - V_write_mean) ** 2)

                # Average normalized Shannon entropy (write)
                filtered_entropy = df_w.iloc[i_w:i_w + np.sum(unique_w), 4]
                filtered_entropy = filtered_entropy[~np.isnan(filtered_entropy)]  # Remove NaN
                entropy_mean = np.mean(filtered_entropy)
                num_row_ent = filtered_entropy.shape[0]
                if num_row_ent > 0:
                    H_write = (1 / count_w) * np.sum(df_w.iloc[i_w:i_w + np.sum(unique_w), 4])
                else:
                    H_write = 0
                # Variance normalized Shannon Entropy (write)
                Var_H_write = (1/ (count_w - 1)) * np.sum((filtered_entropy - entropy_mean) ** 2)

                # Spatial Locality Ratio on write access
                delta = 128
                lba_values_write = filtered_write.values
                if len(lba_values_write) < 2:
                    SLR_write = 0.0
                else:
                    lba_diffs_write = np.abs(np.diff(lba_values_write))
                    SLR_write = np.sum(lba_diffs_write <= delta) / len(lba_diffs_write)

                # Spatial Locality Ratio on read access
                delta = 128
                lba_values_read = filtered_read.values
                if len(lba_values_read) < 2:
                    SLR_read = 0.0
                else:
                    lba_diffs_read = np.abs(np.diff(lba_values_read))
                    SLR_read = np.sum(lba_diffs_read <= delta) / len(lba_diffs_read)

                tmp.append([T_write, T_read, V_write, V_read, H_write, Var_H_write, SLR_write, SLR_read])

                # Move window based on read/write count
                i_r += unique_count_r[k]+1
                i_w += unique_count_w[k]+1

            tmp_train.append(tmp)
            tmp_train = np.array(tmp_train)
            transposed_array = tmp_train.transpose(1, 2, 0)

            # Combine the groups into a single array
            result_array = transposed_array.reshape(transposed_array.shape[0], -1)
            save_numpy_array(result_array, label, output_dir, file_format='csv')


In [None]:
import os
import pandas as pd

def calculate_global_min_max_per_class(base_folder):
    """
    Calculate the global minimum and maximum values for the first 5 columns across all CSV files
    in each class folder inside the given base folder.
    
    Args:
        base_folder (str): Path to the base folder containing class subfolders with CSV files.
    """
    for class_folder in sorted(os.listdir(base_folder)):
        class_path = os.path.join(base_folder, class_folder)
        
        # Ensure it's a directory before processing
        if not os.path.isdir(class_path):
            continue
        
        global_min = {}
        global_max = {}
        
        for root, _, files in os.walk(class_path):
            for file in files:
                if file.endswith(".csv"):
                    file_path = os.path.join(root, file)
                    # Read CSV without headers, considering only the first 5 columns
                    df = pd.read_csv(file_path, header=None, usecols=range(8))
                    
                    # Calculate min and max for each column
                    for column in df.columns:
                        col_min = df[column].min()
                        col_max = df[column].max()
                        
                        if column not in global_min or col_min < global_min[column]:
                            global_min[column] = col_min
                        if column not in global_max or col_max > global_max[column]:
                            global_max[column] = col_max
        
        # Print results for each class folder
        print(f"Class: {class_folder}")
        print("  Global Minimum Values:", global_min)
        print("  Global Maximum Values:", global_max)
        print("----------------------------------")

# Example usage
base_folder = "/home/HardDisk/Satang/thesis_proj/New_30/10/raw_data_8"  # Replace with the path to your main folder
calculate_global_min_max_per_class(base_folder)


In [None]:
import os
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np

base_dir = '/home/HardDisk/Satang/thesis_proj/30/10/raw_data_8'
output_dir = '/home/HardDisk/Satang/thesis_proj/30/10/raw_data_normalized_8'
os.makedirs(output_dir, exist_ok=True)

for class_name in os.listdir(base_dir):
    class_path = os.path.join(base_dir, class_name)
    if not os.path.isdir(class_path):
        continue

    dfs = []
    filenames = []

    # Step 1: Load all data
    for file in os.listdir(class_path):
        if file.endswith('.csv'):
            file_path = os.path.join(class_path, file)
            df = pd.read_csv(file_path, header=None)
            dfs.append(df)
            filenames.append((file, df))
    
    if not dfs:
        continue

    # Step 2: Concatenate and inspect
    all_data = pd.concat(dfs, ignore_index=True)

    # DEBUG: Check actual min/max per column
    print(f"\nClass: {class_name}")
    print("Original min values:\n", all_data.min().values)
    print("Original max values:\n", all_data.max().values)

    # Step 3: Fit scaler on the combined data
    scaler = MinMaxScaler()
    scaler.fit(all_data)

    # DEBUG: Verify scaler fit
    print("Scaler min_:\n", scaler.data_min_)
    print("Scaler max_:\n", scaler.data_max_)

    # Step 4: Normalize each file
    class_output_path = os.path.join(output_dir, class_name)
    os.makedirs(class_output_path, exist_ok=True)

    for file, df in filenames:
        norm_data = scaler.transform(df)

        # DEBUG: Log first row before and after
        print(f"File: {file}")
        print("Original first row:", df.iloc[0].values)
        print("Normalized first row:", norm_data[0])

        pd.DataFrame(norm_data).to_csv(
            os.path.join(class_output_path, file),
            index=False,
            header=False
        )

print("✅ Normalization complete.")


In [None]:
import os
import random
from pathlib import Path
from shutil import copy2

# === Setup ===
INPUT_DIR = "/home/HardDisk/Satang/thesis_proj/New_30/15/raw_data_normalized_8" # directory of normlaized data
OUTPUT_BASE = "/home/HardDisk/Satang/github"+ "/30"+"/15"  # You can change this if needed
SPLIT_RATIOS = (0.7, 0.15, 0.15)  # Train, Val, Test
SEED = 42
window_size = 15
# === Prepare output folders ===
splits = ['train', 'val', 'test']
for split in splits:
    for cls in os.listdir(INPUT_DIR):
        os.makedirs(os.path.join(OUTPUT_BASE, split, cls), exist_ok=True)

# === Perform the split ===
random.seed(SEED)

for cls in sorted(os.listdir(INPUT_DIR)):
    class_path = os.path.join(INPUT_DIR, cls)
    if not os.path.isdir(class_path):
        continue

    files = sorted([f for f in os.listdir(class_path) if f.endswith(".csv")])
    random.shuffle(files)

    n_total = len(files)
    n_train = int(n_total * SPLIT_RATIOS[0])
    n_val = int(n_total * SPLIT_RATIOS[1])
    n_test = n_total - n_train - n_val

    train_files = files[:n_train]
    val_files = files[n_train:n_train + n_val]
    test_files = files[n_train + n_val:]

    for f in train_files:
        src = os.path.join(class_path, f)
        dst = os.path.join(OUTPUT_BASE, "train", cls, f)
        copy2(src, dst)

    for f in val_files:
        src = os.path.join(class_path, f)
        dst = os.path.join(OUTPUT_BASE, "val", cls, f)
        copy2(src, dst)

    for f in test_files:
        src = os.path.join(class_path, f)
        dst = os.path.join(OUTPUT_BASE, "test", cls, f)
        copy2(src, dst)

print("✅ Done! Files have been split and copied to:", OUTPUT_BASE)


✅ Done! Files have been split and copied to: /home/HardDisk/Satang/github/30/15


In [31]:
from pathlib import Path
import numpy as np
import pandas as pd

# Config
DATA_ROOT = Path("/home/HardDisk/Satang/github/30/15")  # Folder structure: split/train/class/*.csv

INNER_STRIDE = 1
NUM_FEATURES = 8

def extract_x_vectors(csv_path, t_window, inner_stride):
    df = pd.read_csv(csv_path, header=None).values  # shape: (time, 8)
    x_vectors = []

    for i in range(0, len(df) - t_window + 1, inner_stride):
        window = df[i:i + t_window, :]  # shape: (t_window, 8)
        vector = np.mean(window, axis=0)
        x_vectors.append(vector)

    return np.array(x_vectors)  # shape: (N_x, 8)

def extract_time_aware_samples(x_vectors, t_d, t_window):
    sample_len = t_d - t_window + 1
    outer_stride = max(1, int(0.25 * sample_len))  # 75% overlap → 25% stride

    samples = []
    for i in range(0, len(x_vectors) - sample_len + 1, outer_stride):
        chunk = x_vectors[i:i + sample_len]
        samples.append(chunk)

    return np.stack(samples) if samples else np.empty((0, sample_len, NUM_FEATURES))

def load_dataset_by_split(split, t_window, t_d):
    data = []
    labels = []
    split_path = DATA_ROOT / split

    for class_dir in sorted(split_path.iterdir()):
        if not class_dir.is_dir():
            continue

        for file in sorted(class_dir.glob("*.csv")):
            x_vecs = extract_x_vectors(file, t_window, INNER_STRIDE)
            samples = extract_time_aware_samples(x_vecs, t_d, t_window)

            data.extend(samples)
            labels.extend([class_dir.name] * len(samples))

    return np.array(data), np.array(labels)

# Test configuration (adjust as needed)

T_D = 30 
sample_len = T_D - window_size + 1

X_train, y_train = load_dataset_by_split("train", window_size, T_D)
X_train.shape, y_train.shape, sample_len
# Example call: load training data
X_train, y_train =load_dataset_by_split("train", window_size, T_D)
X_val, y_val = load_dataset_by_split("val", window_size,T_D)
X_test, y_test = load_dataset_by_split("test", window_size,T_D)


X_train.shape, y_train.shape



((11343, 16, 8), (11343,))

In [32]:
def split_csv_into_chunks(input_root, output_root, window_size, t_d, num_features=8):
    input_root = Path(input_root)
    output_root = Path(output_root)
    sample_counter = {}

    chunk_len = t_d - window_size + 1
    stride = max(1, int(0.25 * chunk_len))  # 75% overlap

    for split in ['train', 'val', 'test']:
        for class_dir in (input_root / split).iterdir():
            if not class_dir.is_dir():
                continue

            class_name = class_dir.name
            out_class_dir = output_root / split / class_name
            out_class_dir.mkdir(parents=True, exist_ok=True)

            for file in sorted(class_dir.glob("*.csv")):
                df = pd.read_csv(file, header=None).values
                if df.shape[1] != num_features:
                    continue

                sample_counter.setdefault(class_name, 0)
                for i in range(0, len(df) - chunk_len + 1, stride):
                    chunk = df[i:i + chunk_len]
                    sample_counter[class_name] += 1

                    out_path = out_class_dir / f"{class_name}_sample_{sample_counter[class_name]:04d}.csv"
                    pd.DataFrame(chunk).to_csv(out_path, index=False, header=False)

    print(f"✅ Done: All files split into ({chunk_len}, {num_features}) samples with stride={stride}.")

split_csv_into_chunks(
    input_root="/home/HardDisk/Satang/github/30/15",
    output_root="/home/HardDisk/Satang/github/30/15/split_16",
    window_size=10,
    t_d=30,
    num_features=8
)

✅ Done: All files split into (21, 8) samples with stride=5.
