# Ransomware Early Detection based on pre-attack activities
This notebook contains all the models the authors have developed.

First of all we have to mount the Google Drive volume for dataset loading.

In [1]:
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [10]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

#ML auxiliary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn

# Classifiers
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Keras
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.utils.np_utils import to_categorical
from keras.metrics import TopKCategoricalAccuracy

#SkLearn auxiliary libraries
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay, f1_score, precision_score, recall_score, make_scorer

We import the dataset and we reduce the number of samples per family (max. 450 items).

In [3]:
dataset = pd.read_csv('/content/drive/MyDrive/dataset_cleaned_wo_blacklisted_families.csv')
dataset = dataset.groupby('Family').head(450)
family_count = dataset['Family'].sort_values()
print(family_count.value_counts())

2     450
5     450
14    450
15    450
17    450
23    446
9     443
0     432
6     377
3     359
13    331
8     295
7     266
Name: Family, dtype: int64


## Binary classification
In this section we build the models for binary classification. The models we have chosen are:
- Bernoulli Naive Bayes
- K-Nearest Neighbors
- Random Forest

### Dataset preparation
First of all we transform the dataset to a two-class one by replacing all families identifiers with *Ransomware* or *Benign* classes.

In [4]:
binary_class_dataset = dataset.copy(deep=True)
binary_class_dataset['Family'] = np.where(binary_class_dataset.Family <= 22, 'Ransomware', 'Benign')

In [5]:
binary_class_dataset.shape

(5199, 40)

We then split the labels from the features in order to run the classification tasks.

In [6]:
data = binary_class_dataset.to_numpy()
x = data[:, :-1].astype(str)
y = data[:, -1].astype(str)

### ML Models
In this section we train and validate the three models which have been already mentioned above for what concerns binary classification

#### Bernoulli Naive Bayes

In [32]:
bern_na_bay = BernoulliNB()

scores = cross_val_score(bern_na_bay, x, y, cv=10)
print(scores.mean())

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=75)
bern_na_bay.fit(x_train, y_train)

predict=bern_na_bay.predict(x_test) 

print(classification_report(predict, y_test))

0.9924929598340004
              precision    recall  f1-score   support

      Benign       0.94      1.00      0.97        95
  Ransomware       1.00      0.99      1.00       945

    accuracy                           0.99      1040
   macro avg       0.97      1.00      0.98      1040
weighted avg       0.99      0.99      0.99      1040



#### K-Nearest Neighbors

In [35]:
knn = KNeighborsClassifier(n_neighbors=3)

scores = cross_val_score(knn, x, y, cv=10)
print(scores.mean())

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=75)
knn.fit(x_train, y_train)

predict=knn.predict(x_test) 

print(classification_report(predict, y_test))

0.9915314213724619
              precision    recall  f1-score   support

      Benign       1.00      0.99      1.00       102
  Ransomware       1.00      1.00      1.00       938

    accuracy                           1.00      1040
   macro avg       1.00      1.00      1.00      1040
weighted avg       1.00      1.00      1.00      1040



#### Random Forest

In [37]:
rand_for_classifier=RandomForestClassifier()

scores = cross_val_score(rand_for_classifier, x, y, cv=10)
print(scores.mean())

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=75)
rand_for_classifier.fit(x_train, y_train)

predict=rand_for_classifier.predict(x_test) 

print(classification_report(predict, y_test))

0.9990377204683563
              precision    recall  f1-score   support

      Benign       1.00      1.00      1.00       101
  Ransomware       1.00      1.00      1.00       939

    accuracy                           1.00      1040
   macro avg       1.00      1.00      1.00      1040
weighted avg       1.00      1.00      1.00      1040



## Multi-label classification

### Dataset preparation

We then split the labels from the features in order to run the classification tasks. We also remove the first row with column heading.

In [38]:
dataset = dataset.iloc[1: , :]
data = dataset.to_numpy()
x = data[:, :-1].astype(float)
y = data[:, -1].astype(float)

### ML model
For multilabel classification we have chosen to use a Convolutional Neural Network

In [39]:
# Model creation and training
y_cat = to_categorical(y)
x_train, x_test, y_train, y_test = train_test_split(x, y_cat, test_size=0.2)
model = Sequential()
model.add(Dense(512, input_dim=39, activation = "relu"))
model.add(Dense(256, activation = "relu"))
model.add(Dense(128, activation = "relu"))
model.add(Dropout(0.2))
model.add(Dense(24, activation = "softmax"))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', TopKCategoricalAccuracy(k=2)])
model.summary()
model.fit(x_train, y_train, verbose=1, epochs=100, batch_size=30)

predict_x=model.predict(x_test) 
y_pred_class=np.argmax(predict_x,axis=1)

y_pred = model.predict(x_test)
y_test_class = np.argmax(y_test, axis=1)
print(confusion_matrix(y_test_class, y_pred_class))

print(classification_report(y_test_class, y_pred_class))

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 512)               20480     
                                                                 
 dense_1 (Dense)             (None, 256)               131328    
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_3 (Dense)             (None, 24)                3096      
                                                                 
Total params: 187,800
Trainable params: 187,800
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Ep

## Comparison with previous works

### On Ransomware Family Attribution Using Pre-Attack Paranoia Activities

In [None]:
# Dataset loading
paranoia_dataset = pd.read_csv('/content/drive/MyDrive/paranoia_dataset.csv')
data_para = paranoia_dataset.to_numpy()
xp = data_para[:, :-1].astype(float)
yp = data_para[:, -1].astype(float)
yp_cat = to_categorical(yp)
xp_train, xp_test, yp_train, yp_test = train_test_split(xp, yp_cat, test_size=0.2)

We first check the accuracy of their model using their dataset

In [None]:
randclf = RandomForestClassifier(n_estimators=50, random_state=10)
randclf.fit(xp_train, yp_train)
yp_train_pred = randclf.predict(xp_train)
print('train precision: ' + str(precision_score(yp_train, yp_train_pred, average='weighted')))
print('train recall: ' + str(recall_score(yp_train, yp_train_pred, average='weighted')))
print('train accuracy: ' + str(accuracy_score(yp_train, yp_train_pred)))
yp_test_pred = randclf.predict(xp_test)
print('test precision: ' + str(precision_score(yp_test, yp_test_pred, average='weighted')))
print('test recall: ' + str(recall_score(yp_test, yp_test_pred, average='weighted')))
print('test accuracy: ' + str(accuracy_score(yp_test, yp_test_pred)))

We then use our model on their dataset to check the performances

In [None]:
model = Sequential()
model.add(Dense(512, input_dim=23, activation = "relu"))
model.add(Dense(256, activation = "relu"))
model.add(Dense(128, activation = "relu"))
model.add(Dropout(0.2))
model.add(Dense(6, activation = "softmax"))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', TopKCategoricalAccuracy(k=2)])
model.summary()
model.fit(xp_train, yp_train, verbose=1, epochs=100, batch_size=30)

predict_xp=model.predict(xp_test) 
yp_pred_class=np.argmax(predict_xp,axis=1)

yp_pred = model.predict(xp_test)
yp_test_class = np.argmax(yp_test, axis=1)

print(classification_report(yp_test_class, yp_pred_class))