In [18]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import timm

from collections import Counter
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from keras.models import Sequential
from keras.layers import LSTM, Dense, Conv2D, MaxPooling2D, Flatten, Dropout
from keras.utils import to_categorical
from imblearn.over_sampling import SMOTE
from pyts.image import GramianAngularField, MarkovTransitionField
from io import BytesIO
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

In [19]:
# For shuffle
SEED = 42

In [20]:
# For feature engineering
t_d = 30 # detection time
t_w = 10 # time windows

In [21]:
# For model
EPOCH = 30
BATCHSIZE = 32
VAL = 0.2

In [22]:
BASE = '/media/eab301b/b60ac77c-0185-406e-8549-95e9a295e4ca/thesis/thesis_proj/RanSAP/dataset'

In [23]:
DATAPATH = 'original'

In [24]:
FOLDER = "win7-250gb-ssd"

In [25]:
os.chdir(f'{BASE}/{DATAPATH}')
folders = sorted(os.listdir())
print(folders)

['win7-120gb-hdd', 'win7-120gb-ssd', 'win7-250gb-hdd', 'win7-250gb-ssd']


In [26]:
os.chdir(f'{BASE}/{DATAPATH}/{FOLDER}')
labels = sorted(os.listdir())
print(labels)

['AESCrypt', 'Cerber', 'Cerber-largefiles', 'Cerber-w10dirs', 'Darkside', 'Darkside-largefiles', 'Darkside-w10dirs', 'Excel', 'Firefox', 'GandCrab4', 'GandCrab4-largefiles', 'GandCrab4-w10dirs', 'Ryuk', 'Ryuk-largefiles', 'Ryuk-w10dirs', 'SDelete', 'Sodinokibi', 'Sodinokibi-largefiles', 'Sodinokibi-w10dirs', 'TeslaCrypt', 'TeslaCrypt-largefiles', 'TeslaCrypt-w10dirs', 'WannaCry', 'WannaCry-largefiles', 'WannaCry-w10dirs', 'Zip']


In [27]:
benign = ['AESCrypt', 'Zip', 'SDelete', 'Excel', 'Firefox']
ransomware = ['TeslaCrypt', 'TeslaCrypt-largefiles', 'TeslaCrypt-w10dirs',
              'Cerber', 'Cerber-largefiles', 'Cerber-w10dirs', 
              'Darkside', 'Darkside-largefiles', 'Darkside-w10dirs',
              'WannaCry', 'WannaCry-largefiles', 'WannaCry-w10dirs', 
              'GandCrab4', 'GandCrab4-largefiles', 'GandCrab4-w10dirs',
              'Ryuk', 'Ryuk-largefiles', 'Ryuk-w10dirs',
              'Sodinokibi', 'Sodinokibi-largefiles', 'Sodinokibi-w10dirs']

In [28]:
X_train = []
X_test = []
y1_train = []
y1_test = []
window_size = 10

In [None]:
import numpy as np

def adjust_rows_to_multiple(array1, array2, divisor):
    """
    Adjust the number of rows in two 2D arrays such that:
    - The smaller array's row count is rounded down to the nearest multiple of the given divisor.
    - The larger array's row count is truncated to match the adjusted smaller array.

    Parameters:
        array1 (np.ndarray): First 2D array.
        array2 (np.ndarray): Second 2D array.
        divisor (int): The number to which rows should be divisible.

    Returns:
        np.ndarray, np.ndarray: Arrays with adjusted row counts.
    """
    rows1, rows2 = array1.shape[0], array2.shape[0]
    
    # Find the smaller number of rows and round down to the nearest multiple of the divisor
    min_rows = min(rows1, rows2)
    adjusted_rows = (min_rows // divisor) * divisor  # Round down to nearest multiple of divisor
    
    # Truncate both arrays to the adjusted row count
    array1 = array1[:adjusted_rows, :]
    array2 = array2[:adjusted_rows, :]
    
    return array1, array2




Before adjustment:
Array 1 shape: (46, 3)
Array 2 shape: (74, 3)

After adjustment:
Array 1 shape: (40, 3)
Array 2 shape: (40, 3)


In [30]:
import numpy as np
import os

def save_numpy_array(array, label, base_dir, file_format='npy'):
    """
    Save a Numpy array to a file in a class-named folder, appending a dynamic suffix to the label based on the number of existing files.

    Parameters:
    - array (np.ndarray): The Numpy array to save.
    - label (str): The base label to use as the filename and folder name for the class.
    - base_dir (str): The base directory where the class folder will be created.
    - file_format (str): The format to save the file in ('npy' or 'csv'). Default is 'npy'.
    
    Returns:
    - str: The full path to the saved file.
    """
    # Create class folder within the base directory
    class_dir = os.path.join(base_dir, label)
    os.makedirs(class_dir, exist_ok=True)  # Create the folder if it doesn't exist

    # Count existing files in the class folder
    existing_files = os.listdir(class_dir)
    file_count = sum(1 for file in existing_files if file.endswith(f".{file_format}"))
    
    # Generate suffix based on the file count
    suffix = f"_{file_count + 1}"  # Start from _1 if no files exist

    # Define the full file path with the dynamic suffix
    file_name = f"{label}{suffix}.{file_format}"
    file_path = os.path.join(class_dir, file_name)

    # Save the file in the specified format
    if file_format == 'npy':
        np.save(file_path, array)
    elif file_format == 'csv':
        np.savetxt(file_path, array, delimiter=",", comments="")
    else:
        raise ValueError("Unsupported file format. Use 'npy' or 'csv'.")

    return file_path



In [31]:
df_r = pd.read_csv(r"/media/eab301b/b60ac77c-0185-406e-8549-95e9a295e4ca/thesis/thesis_proj/RanSAP/dataset/original/win7-120gb-hdd/AESCrypt/AESCrypt-20200427_17-28-53/ata_read.csv", header=None)
df_w = pd.read_csv(r"/media/eab301b/b60ac77c-0185-406e-8549-95e9a295e4ca/thesis/thesis_proj/RanSAP/dataset/original/win7-120gb-hdd/AESCrypt/AESCrypt-20200427_17-28-53/ata_write.csv", header=None)
df_r = np.array(df_r)
df_w = np.array(df_w)

i = 0
window_size = 10
V_write_mean = np.mean(df_w[i:i+window_size,2])
V_write = (1/(window_size-1)) * np.sum((df_w[i:i+window_size,2]-V_write_mean)**2)
V_read_mean = np.mean(df_r[i:i+window_size,2])
V_read = (1/(window_size-1)) * np.sum((df_r[i:i+window_size,2]-V_read_mean)**2)
print(df_w[0:10:2])
print(V_read)
print(V_write)
i = 1
window_size = 10
V_write_mean = np.mean(df_w[i:i+window_size,2])
V_write = (1/(window_size-1)) * np.sum((df_w[i:i+window_size,2]-V_write_mean)**2)
V_read_mean = np.mean(df_r[i:i+window_size,2])
V_read = (1/(window_size-1)) * np.sum((df_r[i:i+window_size,2]-V_read_mean)**2)
print(V_read)
print(V_write)

[[1.58797588e+09 7.79844026e+08 6.51912800e+06 4.09600000e+03
  3.31856426e-01 2.62084225e-01]
 [1.58797588e+09 7.79845099e+08 6.51914400e+06 4.09600000e+03
  3.94940333e-01 3.40999690e-01]
 [1.58797588e+09 7.80105960e+08 4.50410800e+07 4.09600000e+03
  5.16920602e-01 4.58237374e-01]
 [1.58797588e+09 7.81520833e+08 8.71308160e+07 4.09600000e+03
  2.77433311e-01 2.20048575e-01]
 [1.58797588e+09 7.81777399e+08 6.30371200e+06 4.09600000e+03
  3.89769618e-01 3.18499028e-01]]
586.6666666666666
732912521713579.2
586.6666666666666
713865029286235.6


In [32]:
# window_size = 10
# tmp = []
# tmp_train = []
# df_r = pd.read_csv(r"/media/eab301b/b60ac77c-0185-406e-8549-95e9a295e4ca/thesis/thesis_proj/RanSAP/dataset/original/win7-120gb-hdd/AESCrypt/AESCrypt-20200427_17-28-53/ata_read.csv", header=None)
# df_w = pd.read_csv(r"/media/eab301b/b60ac77c-0185-406e-8549-95e9a295e4ca/thesis/thesis_proj/RanSAP/dataset/original/win7-120gb-hdd/AESCrypt/AESCrypt-20200427_17-28-53/ata_write.csv", header=None)
# df_r = np.array(df_r)
# df_w = np.array(df_w)
# # df_r = np.sum(df_r[0:10,3])
# # df_w = np.sum(df_w[0:10,3])
# # print(df_w)
# # print(df_r)
# i= 0
# # print(df_r.shape)
# # print(df_w.shape)
# # print(df_w[0:10,3])
# df_r, df_w = adjust_rows_to_multiple_of_ten(df_r, df_w)
# print(df_r.shape)
# while i < df_r.shape[0]-10:
#     # timestamp_r = np.unique(df_r[0])
#     # timestamp_w = np.unique(df_w[0])

#     # filtered_df_r = df_r[(df_r[0] >= timestamp_r[i]) & (df_r[0] < timestamp_r[i+t_w])]
#     # filtered_df_w = df_w[(df_w[0] >= timestamp_w[i]) & (df_w[0] < timestamp_w[i+t_w])]

#     # average write throughput [byte/s]
#     T_write = np.sum(df_w[i:i+window_size,3])/(np.sum(df_w[i:i+window_size,1])/1000000) #1000000 is to convert from microsecond to second
#     # average read throughput [byte/s]
#     T_read = np.sum(df_r[i:i+window_size,3])/(np.sum(df_r[i:i+window_size,1])/1000000)

#     # variance of logical block addresses (written)
#     V_write_mean = np.mean(df_w[i:i+window_size,2])
#     V_write = (1/(window_size-1)) * np.sum((df_w[i:i+window_size,2]- V_write_mean)**2)

#     # variance of logical block addresses (read)
#     V_read_mean = np.mean(df_r[i:i+window_size,2])
#     V_read = (1/(window_size-1)) * np.sum((df_r[i:i+window_size,2]- V_read_mean)**2)

#     # average normalized Shannon entropy
#     H_write = (1/window_size) * np.sum(df_w[i:i+window_size,4])

#     tmp.append([T_write, T_read, V_write, V_read, H_write])
#     # print(len(tmp)
#     i = i+1
#     print(i)
# print("loop finished")
# # print(len(tmp))
# # print(tmp[0])
# tmp_train.append(tmp)
# tmp_train = np.array(tmp_train)
# transposed_array = tmp_train.transpose(1, 2, 0)

# # Combine the groups into a single array
# result_array = transposed_array.reshape(transposed_array.shape[0], -1)

# # Print the resulting array
# print(result_array)
# print(result_array.shape)
# #print(tmp_train.shape)

In [37]:
window_size = 20
step_size = 5
output_dir = "/media/eab301b/b60ac77c-0185-406e-8549-95e9a295e4ca/thesis/thesis_proj/raw data (new_feature)"
np.random.seed(SEED)
for folder in folders:
    print("folder finished")
    for label in labels:
        os.chdir(f'{BASE}/{DATAPATH}/{folder}/{label}')
        dirs = sorted(os.listdir())
        dirs = np.array(dirs)
        # Shuffle directory
        np.random.seed(SEED)
        np.random.shuffle(dirs)
        # train_idx = int(len(dirs)*0.8)
        # print(f"Train index: {train_idx}")

        for dir_idx in range(len(dirs)):
            print(dirs[dir_idx])
            os.chdir(f'{BASE}/{DATAPATH}/{folder}/{label}/{dirs[dir_idx]}')
            files = sorted(os.listdir())
            tmp = []
            tmp_train =[]
            i = 0
            df_r = pd.read_csv(f'{BASE}/{DATAPATH}/{folder}/{label}/{dirs[dir_idx]}/{files[0]}', header=None)
            df_w = pd.read_csv(f'{BASE}/{DATAPATH}/{folder}/{label}/{dirs[dir_idx]}/{files[1]}', header=None)
            df_r = np.array(df_r)
            df_w = np.array(df_w)
            df_r, df_w = adjust_rows_to_multiple(df_r, df_w, window_size)
            step = int((df_r.shape[0]-window_size)/ step_size) + 1
            for k in range (step):
                # print(df_r.shape, df_w.shape)
                # print(df_r[:5], df_w[:5])
                # timestamp_r = np.unique(df_r[0])
                # timestamp_w = np.unique(df_w[0])

                # filtered_df_r = df_r[(df_r[0] >= timestamp_r[i]) & (df_r[0] < timestamp_r[i+t_w])]
                # filtered_df_w = df_w[(df_w[0] >= timestamp_w[i]) & (df_w[0] < timestamp_w[i+t_w])]

                # average write throughput [byte/s]
                T_write = np.sum(df_w[i:i+window_size,3])/(np.sum(df_w[i:i+window_size,1])/1000000) #1000000 is to convert from microsecond to second
                # average read throughput [byte/s]
                T_read = np.sum(df_r[i:i+window_size,3])/(np.sum(df_r[i:i+window_size,1])/1000000)

                # variance of logical block addresses (written)
                V_write_mean = np.mean(df_w[i:i+window_size,2])
                V_write = (1/(window_size-1)) * np.sum((df_w[i:i+window_size,2]-V_write_mean)**2)

                # variance of logical block addresses (read)
                V_read_mean = np.mean(df_r[i:i+window_size,2])
                V_read = (1/(window_size-1)) * np.sum((df_r[i:i+window_size,2]-V_read_mean)**2)

                # average normalized Shannon entropy
                H_write = (1/window_size) * np.sum(df_w[i:i+window_size,4])

                tmp.append([T_write, T_read, V_write, V_read, H_write])
                i = i+5
                # print(i)
            
            tmp_train.append(tmp)
            tmp_train = np.array(tmp_train)
            transposed_array = tmp_train.transpose(1, 2, 0)
            # print(f"Transposed array shape: {transposed_array.shape}")
            # Combine the groups into a single array
            result_array = transposed_array.reshape(transposed_array.shape[0], -1)
            save_numpy_array(result_array, label, output_dir, file_format='csv')
            # # Train-Test split
            # if dir_idx < train_idx:
            #     X_train.append(tmp)
            #     y1_train.append(label)
            # else:
            #     X_test.append(tmp)
            #     y1_test.append(label)

folder finished
AESCrypt-20200427_17-28-53
AESCrypt-20200427_16-29-10
AESCrypt-20200427_17-00-04
AESCrypt-20200427_16-23-28
AESCrypt-20200427_17-22-00
AESCrypt-20200427_16-36-30
AESCrypt-20200427_17-40-13
AESCrypt-20200427_16-52-26
AESCrypt-20200427_16-43-56
AESCrypt-20200427_17-04-45
Cerber-20200806_18-11-08
Cerber-20200805_23-25-39
Cerber-20200806_17-56-03
Cerber-20200805_23-19-47
Cerber-20200806_18-06-08
Cerber-20200806_17-38-41
Cerber-20200806_20-08-06
Cerber-20200806_17-49-48
Cerber-20200806_17-44-07
Cerber-20200806_18-01-07
Cerber-20200805_19-39-46
Cerber-20200805_18-57-22
Cerber-20200805_19-21-30
Cerber-20200805_18-49-19
Cerber-20200805_19-34-10
Cerber-20200805_19-03-16
Cerber-20200805_19-51-24
Cerber-20200805_19-15-08
Cerber-20200805_19-09-09
Cerber-20200805_19-28-41
Cerber-20200807_19-44-01
Cerber-20200807_18-46-27
Cerber-20200807_19-21-28
Cerber-20200807_18-39-45
Cerber-20200807_19-38-48
Cerber-20200807_18-53-05
Cerber-20200807_19-49-13
Cerber-20200807_19-15-45
Cerber-2020080

In [1]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import os
import re

def normalize_csv_files_in_chunks(directory, output_file, chunksize=10000):
    # Initialize scaler and output file
    scaler = MinMaxScaler(feature_range=(-1, 1))
    first_file = True

    # Function to extract numbers for sorting
    def extract_number(file_name):
        numbers = re.findall(r'\d+', file_name)
        return int(numbers[0]) if numbers else float('inf')

    # Traverse the directory and sort folders and files
    for root, dirs, files in os.walk(directory):
        # Sort directories and files for predictable numerical order
        dirs.sort(key=extract_number)
        files.sort(key=extract_number)

        for file in files:
            if file.endswith('.csv'):
                file_path = os.path.join(root, file)
                print(f"Processing file: {file_path}")

                # Process the file in chunks
                for chunk in pd.read_csv(file_path, chunksize=chunksize):
                    # Select numeric columns for normalization
                    numeric_cols = chunk.select_dtypes(include=['float64', 'int64']).columns

                    # Normalize numeric data
                    chunk_numeric = scaler.fit_transform(chunk[numeric_cols])
                    chunk_normalized = pd.DataFrame(chunk_numeric, columns=numeric_cols)

                    # Write to output file (append mode after the first chunk)
                    chunk_normalized.to_csv(output_file, mode='a', index=False, header=first_file)
                    first_file = False

    print(f"Normalization complete. Output saved to {output_file}.")


In [2]:
# Directory path
directory_path = "/media/eab301b/b60ac77c-0185-406e-8549-95e9a295e4ca/thesis/thesis_proj/test_one_folder/raw_data_10_2_250_ssd"

# Output file
output_file_path = "/media/eab301b/b60ac77c-0185-406e-8549-95e9a295e4ca/thesis/thesis_proj/test_one_folder/normalized_output_10_2_250.csv"

# Process and normalize
normalize_csv_files_in_chunks(directory_path, output_file_path)


Processing file: /media/eab301b/b60ac77c-0185-406e-8549-95e9a295e4ca/thesis/thesis_proj/test_one_folder/raw_data_10_2_250_ssd/GandCrab4-w10dirs/GandCrab4-w10dirs_5.csv
Processing file: /media/eab301b/b60ac77c-0185-406e-8549-95e9a295e4ca/thesis/thesis_proj/test_one_folder/raw_data_10_2_250_ssd/GandCrab4-w10dirs/GandCrab4-w10dirs_8.csv
Processing file: /media/eab301b/b60ac77c-0185-406e-8549-95e9a295e4ca/thesis/thesis_proj/test_one_folder/raw_data_10_2_250_ssd/GandCrab4-w10dirs/GandCrab4-w10dirs_1.csv
Processing file: /media/eab301b/b60ac77c-0185-406e-8549-95e9a295e4ca/thesis/thesis_proj/test_one_folder/raw_data_10_2_250_ssd/GandCrab4-w10dirs/GandCrab4-w10dirs_7.csv
Processing file: /media/eab301b/b60ac77c-0185-406e-8549-95e9a295e4ca/thesis/thesis_proj/test_one_folder/raw_data_10_2_250_ssd/GandCrab4-w10dirs/GandCrab4-w10dirs_4.csv
Processing file: /media/eab301b/b60ac77c-0185-406e-8549-95e9a295e4ca/thesis/thesis_proj/test_one_folder/raw_data_10_2_250_ssd/GandCrab4-w10dirs/GandCrab4-w10dir

In [3]:
def split_normalized_data_with_sorted_order(normalized_file, raw_data_folder, output_folder, chunksize=10000):
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Initialize an empty DataFrame to hold leftover rows
    buffer = pd.DataFrame()

    # Traverse the raw data folder structure
    for root, dirs, files in os.walk(raw_data_folder):
        # Sort directories and files for consistent order
        dirs.sort()
        files.sort()

        # Maintain the relative path structure
        relative_path = os.path.relpath(root, raw_data_folder)
        new_folder_path = os.path.join(output_folder, relative_path)
        os.makedirs(new_folder_path, exist_ok=True)

        for file in files:
            if file.endswith('.csv'):
                raw_file_path = os.path.join(root, file)

                # Load the raw file to determine row and column structure
                raw_file = pd.read_csv(raw_file_path)
                num_rows, num_cols = raw_file.shape

                # Load chunks of the normalized file until enough rows are read
                while len(buffer) < num_rows:
                    for chunk in pd.read_csv(normalized_file, chunksize=chunksize):
                        buffer = pd.concat([buffer, chunk], ignore_index=True)
                        if len(buffer) >= num_rows:
                            break

                # Extract corresponding rows and columns
                split_data = buffer.iloc[:num_rows, :num_cols]
                buffer = buffer.iloc[num_rows:]  # Keep remaining rows in the buffer

                # Save the split data into the corresponding folder
                output_file_path = os.path.join(new_folder_path, file)
                split_data.to_csv(output_file_path, index=False)
                print(f"Saved split data to: {output_file_path}")

    print("Splitting complete!")


In [5]:
# File paths
normalized_file = "/media/eab301b/b60ac77c-0185-406e-8549-95e9a295e4ca/thesis/thesis_proj/test_one_folder/normalized_output_10_2_250.csv"  # Path to the large normalized file
raw_data_folder = "/media/eab301b/b60ac77c-0185-406e-8549-95e9a295e4ca/thesis/thesis_proj/test_one_folder/raw_data_10_2_250_ssd"  # Original raw data folder
output_folder = "/media/eab301b/b60ac77c-0185-406e-8549-95e9a295e4ca/thesis/thesis_proj/test_one_folder/raw_data_normalized_10_2_250"  # Destination for split files

# Run the splitting process
split_normalized_data_with_sorted_order(normalized_file, raw_data_folder, output_folder)


Saved split data to: /media/eab301b/b60ac77c-0185-406e-8549-95e9a295e4ca/thesis/thesis_proj/test_one_folder/raw_data_normalized_10_2_250/AESCrypt/AESCrypt_1.csv
Saved split data to: /media/eab301b/b60ac77c-0185-406e-8549-95e9a295e4ca/thesis/thesis_proj/test_one_folder/raw_data_normalized_10_2_250/AESCrypt/AESCrypt_10.csv
Saved split data to: /media/eab301b/b60ac77c-0185-406e-8549-95e9a295e4ca/thesis/thesis_proj/test_one_folder/raw_data_normalized_10_2_250/AESCrypt/AESCrypt_2.csv
Saved split data to: /media/eab301b/b60ac77c-0185-406e-8549-95e9a295e4ca/thesis/thesis_proj/test_one_folder/raw_data_normalized_10_2_250/AESCrypt/AESCrypt_3.csv
Saved split data to: /media/eab301b/b60ac77c-0185-406e-8549-95e9a295e4ca/thesis/thesis_proj/test_one_folder/raw_data_normalized_10_2_250/AESCrypt/AESCrypt_4.csv
Saved split data to: /media/eab301b/b60ac77c-0185-406e-8549-95e9a295e4ca/thesis/thesis_proj/test_one_folder/raw_data_normalized_10_2_250/AESCrypt/AESCrypt_5.csv
Saved split data to: /media/eab30