In [165]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import timm

from collections import Counter
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from keras.models import Sequential
from keras.layers import LSTM, Dense, Conv2D, MaxPooling2D, Flatten, Dropout
from keras.utils import to_categorical
from imblearn.over_sampling import SMOTE
from pyts.image import GramianAngularField, MarkovTransitionField
from io import BytesIO
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

In [166]:
# For shuffle
SEED = 42

In [167]:
# For feature engineering
t_d = 26 # detection time
t_w = 10 # time windows

In [168]:
# For model
EPOCH = 30
BATCHSIZE = 32
VAL = 0.2

In [169]:
BASE = 'E:/thesis project/thesis_proj/RanSAP-main/dataset'

In [170]:
DATAPATH = 'original'

In [171]:
FOLDER = "win7-250gb-ssd"

In [172]:
os.chdir(f'{BASE}/{DATAPATH}')
folders = sorted(os.listdir())
print(folders)

['win7-120gb-hdd', 'win7-120gb-ssd', 'win7-250gb-hdd', 'win7-250gb-ssd']


In [173]:
os.chdir(f'{BASE}/{DATAPATH}/{FOLDER}')
labels = sorted(os.listdir())
print(labels)

['AESCrypt', 'Cerber', 'Cerber-largefiles', 'Cerber-w10dirs', 'Darkside', 'Darkside-largefiles', 'Darkside-w10dirs', 'Excel', 'Firefox', 'GandCrab4', 'GandCrab4-largefiles', 'GandCrab4-w10dirs', 'Ryuk', 'Ryuk-largefiles', 'Ryuk-w10dirs', 'SDelete', 'Sodinokibi', 'Sodinokibi-largefiles', 'Sodinokibi-w10dirs', 'TeslaCrypt', 'TeslaCrypt-largefiles', 'TeslaCrypt-w10dirs', 'WannaCry', 'WannaCry-largefiles', 'WannaCry-w10dirs', 'Zip']


In [174]:
benign = ['AESCrypt', 'Zip', 'SDelete', 'Excel', 'Firefox']
ransomware = ['TeslaCrypt', 'Cerber', 'WannaCry', 'GandCrab4', 'Ryuk', 'Sodinokibi', 'Darkside']

In [175]:
X_train = []
X_test = []
y1_train = []
y1_test = []

In [176]:
df_r = pd.read_csv('E:/thesis project/thesis_proj/RanSAP-main/dataset/original/win7-120gb-ssd/TeslaCrypt/TeslaCrypt-20210616_19-34-53/ata_read.csv', header=None)
#df_w = pd.read_csv(f'{BASE}/{DATAPATH}/{folder}/{label}/{dirs[dir_idx]}/{files[1]}', header=None)
timestamp_r = np.unique(df_r[0])
print(df_r.shape)
filtered_df_r = df_r[(df_r[0] >= timestamp_r[0]) & (df_r[0] < timestamp_r[0+t_w])]
print(filtered_df_r.shape)

(109304, 4)
(65646, 4)


In [177]:
np.random.seed(SEED)
for folder in folders:
    for label in labels:
        os.chdir(f'{BASE}/{DATAPATH}/{folder}/{label}')
        dirs = sorted(os.listdir())
        dirs = np.array(dirs)
        # Shuffle directory
        np.random.seed(SEED)
        np.random.shuffle(dirs)
        train_idx = int(len(dirs)*0.8)
        print(f"Train index: {train_idx}")

        for dir_idx in range(len(dirs)):
            print(dirs[dir_idx])
            os.chdir(f'{BASE}/{DATAPATH}/{folder}/{label}/{dirs[dir_idx]}')
            files = sorted(os.listdir())
            tmp = []
            
            for i in range(t_d - t_w):
                #if (dirs[dir_idx]=="TeslaCrypt-20210616_22-22-34"):
                df_r = pd.read_csv(f'{BASE}/{DATAPATH}/{folder}/{label}/{dirs[dir_idx]}/{files[0]}', header=None)
                df_w = pd.read_csv(f'{BASE}/{DATAPATH}/{folder}/{label}/{dirs[dir_idx]}/{files[1]}', header=None)
                timestamp_r = np.unique(df_r[0])
                print(f"Current i: {i}, timestamp_r length: {len(timestamp_r)}, accessing index: {i + t_w}")
                timestamp_w = np.unique(df_w[0])

                filtered_df_r = df_r[(df_r[0] >= timestamp_r[i]) & (df_r[0] < timestamp_r[i+t_w])]
                filtered_df_w = df_w[(df_w[0] >= timestamp_w[i]) & (df_w[0] < timestamp_w[i+t_w])]

                # average write throughput [byte/s]
                T_write = np.sum(filtered_df_w[3])/t_w

                # average read throughput [byte/s]
                T_read = np.sum(filtered_df_r[3])/t_w

                # variance of logical block addresses (written)
                V_write_mean = np.mean(filtered_df_w[2])
                V_write = (1/(t_w-1)) * np.sum((filtered_df_w[2]-V_write_mean)**2)

                # variance of logical block addresses (read)
                V_read_mean = np.mean(filtered_df_r[2])
                V_read = (1/(t_w-1)) * np.sum((filtered_df_r[2]-V_read_mean)**2)

                # average normalized Shannon entropy
                H_write = (1/t_w) * np.sum(filtered_df_w[4])

                tmp.append([T_write, T_read, V_write, V_read, H_write])

            # Train-Test split
            if dir_idx < train_idx:
                X_train.append(tmp)
                y1_train.append(label)
            else:
                X_test.append(tmp)
                y1_test.append(label)

Train index: 8
AESCrypt-20200427_17-28-53
Current i: 0, timestamp_r length: 97, accessing index: 10
Current i: 1, timestamp_r length: 97, accessing index: 11
Current i: 2, timestamp_r length: 97, accessing index: 12
Current i: 3, timestamp_r length: 97, accessing index: 13
Current i: 4, timestamp_r length: 97, accessing index: 14
Current i: 5, timestamp_r length: 97, accessing index: 15
Current i: 6, timestamp_r length: 97, accessing index: 16
Current i: 7, timestamp_r length: 97, accessing index: 17
Current i: 8, timestamp_r length: 97, accessing index: 18
Current i: 9, timestamp_r length: 97, accessing index: 19
Current i: 10, timestamp_r length: 97, accessing index: 20
Current i: 11, timestamp_r length: 97, accessing index: 21
Current i: 12, timestamp_r length: 97, accessing index: 22
Current i: 13, timestamp_r length: 97, accessing index: 23
Current i: 14, timestamp_r length: 97, accessing index: 24
Current i: 15, timestamp_r length: 97, accessing index: 25
AESCrypt-20200427_16-29-

In [178]:
X_train = np.array(X_train)
y1_train = np.array(y1_train)
X_test = np.array(X_test)
y1_test = np.array(y1_test)
print(type(X_train))
print(X_train.shape)
print(y1_train.shape)
print(X_test.shape)
print(y1_test.shape)


<class 'numpy.ndarray'>
(832, 16, 5)
(832,)
(212, 16, 5)
(212,)


In [179]:
X_train_reshaped = X_train.reshape(-1, X_train.shape[-1])
X_train_reshaped.shape

(13312, 5)

In [180]:
X_test_reshaped = X_test.reshape(-1, X_test.shape[-1])
X_test_reshaped.shape

(3392, 5)

In [181]:
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train_reshaped)
X_test_normalized = scaler.transform(X_test_reshaped)

In [182]:
X_train_normalized = X_train_normalized.reshape(X_train.shape)
X_test_normalized = X_test_normalized.reshape(X_test.shape)

In [183]:
print(X_train_normalized.shape)
print(X_test_normalized.shape)

(832, 16, 5)
(212, 16, 5)


In [184]:
np.save('E:/thesis project/thesis_proj/X_train_normalized.npy',X_train_normalized)
np.save('E:/thesis project/thesis_proj/y1_train.npy',y1_train)
np.save('E:/thesis project/thesis_proj/X_test_normalized.npy',X_test_normalized)
np.save('E:/thesis project/thesis_proj/y1_test.npy',y1_test)

## Label Encoder

In [185]:
y_train = []
for y in y1_train:
    if y in benign:
        y_train.append('Benign')
    else:
        y_train.append('Ransomware')
print(len(y_train))

832


In [186]:
y_test = []

for y in y1_test:
    if y in benign:
        y_test.append('Benign')
    else:
        y_test.append('Ransomware')

In [187]:
y_train = np.array(y_train)
y_test = np.array(y_test)

In [188]:
lbl = LabelEncoder()

In [189]:
classes, count = np.unique(y_train, return_counts=True)
print(classes)
print(count)

['Benign' 'Ransomware']
[160 672]


In [190]:
print(lbl.fit_transform(classes), classes)
y_train_int = lbl.fit_transform(y_train)
y_train_int.shape

[0 1] ['Benign' 'Ransomware']


(832,)

In [191]:
classes_test, count_test = np.unique(y_test, return_counts=True)
print(classes_test)
print(count_test)

['Benign' 'Ransomware']
[ 41 171]


In [192]:
y_test_int = lbl.transform(y_test)
y_test_int.shape

(212,)

## Data Augmentation (SMOTE)

In [193]:
X_train_normalized = np.load('E:/thesis project/thesis_proj/X_train_normalized.npy')
X_test_normalized = np.load('E:/thesis project/thesis_proj/X_test_normalized.npy')
X_train_normalized = np.load('E:/thesis project/thesis_proj/X_train_normalized.npy')
y1_test = np.load('E:/thesis project/thesis_proj/y1_test.npy')
y1_train = np.load('E:/thesis project/thesis_proj/y1_train.npy')
# For model
EPOCH = 30
BATCHSIZE = 32
VAL = 0.2
BASE = 'E:/thesis project/thesis_proj/'

In [194]:
data_train_flattened = X_train_normalized.reshape(X_train_normalized.shape[0], -1)
data_train_flattened.shape

(832, 80)

In [195]:
SEED =42
np.random.seed(SEED)
smote = SMOTE(sampling_strategy='auto', random_state=SEED) # Resample benign class to be equal to ransomware class
data_train_resampled_flattened, y_train_int_aug = smote.fit_resample(data_train_flattened, y_train_int)
print(y_train_int_aug.shape)
np.save('E:/thesis project/thesis_proj/y_train_int_aug.npy',y_train_int_aug)

(1344,)


In [196]:
X_train_normalized_aug = data_train_resampled_flattened.reshape(data_train_resampled_flattened.shape[0], X_train_normalized.shape[1], X_train_normalized.shape[2])
X_train_normalized_aug.shape
np.save('E:/thesis project/thesis_proj/X_train_normalized_aug.npy',X_train_normalized_aug)