# Классификатор DGA доменов

## Ссылки:

- https://habr.com/ru/companies/pt/articles/282349/
- https://www.securitylab.ru/blog/personal/Morning/347957.php
- https://underdefense.com/guides/detecting-dga-domains-machine-learning-approach/
- https://habr.com/ru/companies/wunderfund/articles/331310/
- https://github.com/ryancollingwood/DGAClassifier

## Наборы данных

- https://www.kaggle.com/datasets/gtkcyber/dga-dataset
- https://github.com/chrmor/DGA_domains_dataset


In [1]:
!nvidia-smi

Wed Mar 27 08:21:53 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.65                 Driver Version: 551.86         CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3070 Ti     On  |   00000000:01:00.0  On |                  N/A |
| 53%   32C    P8             20W /  310W |    2486MiB /   8192MiB |      7%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import keras

from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding
from keras.layers import LSTM

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

dir_path = 'work/models/dga/'

2024-03-27 08:22:01.896010: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-27 08:22:02.097983: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-27 08:22:02.098037: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-27 08:22:02.133287: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-27 08:22:02.208471: I tensorflow/core/platform/cpu_feature_guar

In [3]:
!pip install pycaret
!pip install mlflow
%pip install scipy==1.11.4

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.


In [4]:
df1 = pd.read_csv('work/datasets/dga_data.csv')
df2 = pd.read_csv('work/datasets/dga_domains_full.csv')
df3 = pd.read_csv('work/datasets/dga_domains.csv')

In [5]:
## Очистка и преобразование исходных данных
df1.loc[(df1['isDGA'] == 'dga'), 'isDGA'] = 1
df1.loc[(df1['isDGA'] == 'legit'), 'isDGA'] = 0

df1 = df1.drop(columns=df1.columns[1], axis=1)
df1 = df1.rename(columns={'host': 'domain'})

df2.loc[(df2['isDGA'] == 'legit'), 'isDGA'] = 0
df2.loc[(df2['isDGA'] == 'dga'), 'isDGA'] = 1

df3.loc[(df3['isDGA'] == 'legit'), 'isDGA'] = 0
df3.loc[(df3['isDGA'] == 'dga'), 'isDGA'] = 1

df3 = df3.drop(columns=df1.columns[2], axis=1)
df3 = df3.drop(columns=df1.columns[1], axis=1)

df3.head()

df3 = df3.rename(columns={'host': 'domain'})

In [6]:
df = pd.concat([df1, df2, df3])

df = df.astype({'domain': 'string', 'isDGA': 'int8', 'subclass': 'category'})

In [7]:
# df.head(10)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 968824 entries, 0 to 133925
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype   
---  ------    --------------   -----   
 0   isDGA     968824 non-null  int8    
 1   domain    968824 non-null  string  
 2   subclass  834898 non-null  category
dtypes: category(1), int8(1), string(1)
memory usage: 16.6 MB


In [8]:
X, y = df['domain'], df['isDGA']

In [9]:
ascii_ranges = ["_", "-", ".", "0", "1", "9", "a", "z"]
ascii_ord_ranges = list()
for c in ascii_ranges:
    ascii_ord_ranges.append(ord(c))

expected_ords = [ord("-"), ord("_"), ord(".")] + list(range(ord("0"), ord("9") + 1)) + list(
    range(ord("a"), ord("z") + 1))

for i in ascii_ord_ranges:
    assert (i in expected_ords)

ord_data_dfs = dict()

for column in ['domain']:
    ord_data_dfs[column] = df[column].apply(lambda x: [ord(w) for w in x.lower()]).apply(pd.Series)
    display(ord_data_dfs[column].head())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,63,64,65,66,67,68,69,70,71,72
0,54.0,120.0,122.0,120.0,115.0,119.0,51.0,115.0,111.0,107.0,...,,,,,,,,,,
1,103.0,108.0,98.0,116.0,108.0,120.0,119.0,119.0,104.0,98.0,...,,,,,,,,,,
2,120.0,120.0,109.0,97.0,109.0,111.0,112.0,121.0,105.0,112.0,...,,,,,,,,,,
3,122.0,102.0,100.0,53.0,115.0,122.0,112.0,105.0,49.0,56.0,...,,,,,,,,,,
4,106.0,112.0,113.0,102.0,116.0,121.0,109.0,105.0,117.0,118.0,...,,,,,,,,,,


In [10]:
for column in ['domain']:
    ord_data_dfs[column] = ord_data_dfs[column].fillna(-1)

In [12]:
unknown_ord_columns = dict()

for column in ['domain']:
    unknown_ord_columns[column] = set()

    for ord_column in ord_data_dfs[column].columns:
        try:
            assert (all(ord_data_dfs[column][ord_column].isin(expected_ords + [-1])))
        except AssertionError:
            print(column, "- column:", ord_column, "contained unknown ordinal value")
            unknown_ord_columns[column].add(ord_column)
            continue

    print()

for column in ['domain']:
    for ord_column in unknown_ord_columns[column]:
        unknown_ord_index = (~ord_data_dfs[column][ord_column].isin(expected_ords + [-1]))
        if any(unknown_ord_index):
            print(ord_data_dfs[column].loc[unknown_ord_index][ord_column])
            display(df.loc[unknown_ord_index][column])

    print()



In [13]:
validTokens = {x: idx + 1 for idx, x in enumerate(set(''.join(X)))}

maxFeatures = len(validTokens) + 1
maxLength = np.max([len(x) for x in X])

In [14]:
X = [[validTokens[y] for y in x] for x in X]
X = pad_sequences(X, maxlen=maxLength)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=36)

In [16]:
# LSTM модель
model = Sequential()
model.add(Embedding(maxFeatures, 128, input_length=maxLength))
model.add(LSTM(128))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

2024-03-27 08:28:07.789185: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-03-27 08:28:08.303184: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-03-27 08:28:08.303262: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-03-27 08:28:08.305231: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-03-27 08:28:08.305295: I external/local_xla/xla/stream_executor

In [17]:
model.fit(X_train, y_train, batch_size=15, epochs=5)

Epoch 1/5


2024-03-27 08:28:13.441640: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8902
2024-03-27 08:28:15.628770: I external/local_xla/xla/service/service.cc:168] XLA service 0x7fcb743d2630 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-03-27 08:28:15.628847: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3070 Ti, Compute Capability 8.6
2024-03-27 08:28:15.661338: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1711528095.802062    2369 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7fcd2ba922d0>

In [23]:
# https://www.tensorflow.org/guide/keras/serialization_and_saving
model.save(dir_path + 'dga_lstm.pkl')
model.save(dir_path + 'dga_lstm.keras')

INFO:tensorflow:Assets written to: work/models/dga/dga_lstm.pkl/assets


INFO:tensorflow:Assets written to: work/models/dga/dga_lstm.pkl/assets


In [None]:
# model = keras.models.load_model(path)

In [116]:
predictions = model.predict(X_test).round()



In [117]:
acc = accuracy_score(y_test, predictions)
classification_report = classification_report(y_test, predictions)

In [118]:
print(classification_report)

              precision    recall  f1-score   support

           0       0.97      0.98      0.97     99677
           1       0.97      0.97      0.97     94088

    accuracy                           0.97    193765
   macro avg       0.97      0.97      0.97    193765
weighted avg       0.97      0.97      0.97    193765


In [20]:
def tokenize_domain(domain):
    d_ = [[validTokens[char] for char in domain]]
    d_ = pad_sequences(d_, maxlen=maxLength)

    return d_

In [21]:
model.predict(tokenize_domain('example.com'))



array([[1.7835644e-05]], dtype=float32)

In [19]:
model.predict(tokenize_domain('cvyh1po636avyrsxebwbkn7.ddns.net'))

NameError: name 'tokenize_domain' is not defined

In [22]:
validTokens

{'8': 1,
 'r': 2,
 '6': 3,
 '9': 4,
 '7': 5,
 'c': 6,
 'y': 7,
 '2': 8,
 'h': 9,
 '.': 10,
 'l': 11,
 'm': 12,
 'n': 13,
 'j': 14,
 '_': 15,
 'z': 16,
 'w': 17,
 '1': 18,
 'q': 19,
 'd': 20,
 'u': 21,
 '3': 22,
 'g': 23,
 's': 24,
 'f': 25,
 'b': 26,
 't': 27,
 'x': 28,
 '-': 29,
 '4': 30,
 'p': 31,
 'e': 32,
 '5': 33,
 'a': 34,
 'o': 35,
 'i': 36,
 'v': 37,
 'k': 38,
 '0': 39}

# PyCaret example

In [14]:
from pycaret.nlp import *

nlp_df = setup(df, target='domain', session_id=126, log_experiment=False, log_plots=True, experiment_name='dga_nlp_1')

ModuleNotFoundError: No module named 'pycaret.nlp'

In [None]:
s = ClassificationExperiment()
s.setup(df, target='is_legit', session_id=123, use_gpu=True, log_experiment=False,
        experiment_name='semantic_classification')

best_regression_model = s.compare_models()