# Классификатор DGA доменов

## Ссылки:

- https://habr.com/ru/companies/pt/articles/282349/
- https://www.securitylab.ru/blog/personal/Morning/347957.php
- https://underdefense.com/guides/detecting-dga-domains-machine-learning-approach/
- https://habr.com/ru/companies/wunderfund/articles/331310/
- https://github.com/ryancollingwood/DGAClassifier

## Наборы данных

- https://www.kaggle.com/datasets/gtkcyber/dga-dataset
- https://github.com/chrmor/DGA_domains_dataset


In [1]:
!nvidia-smi

Tue Mar 19 18:54:38 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.06              Driver Version: 545.92       CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3070 Ti     On  | 00000000:01:00.0  On |                  N/A |
| 78%   30C    P5              23W / 331W |   1547MiB /  8192MiB |      3%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                         

In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import keras

from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding
from keras.layers import LSTM

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

dir_path = './work/models/dga/'

In [3]:
df1 = pd.read_csv('./work/datasets/dga_data.csv')

In [4]:
df2 = pd.read_csv('./work/datasets/dga_domains_full.csv')

In [5]:
## Очистка и преобразование исходных данных

df1.loc[(df1['isDGA'] == 'dga'), 'isDGA'] = 1
df1.loc[(df1['isDGA'] != 'dga'), 'isDGA'] = 0

df1 = df1.drop(columns=df1.columns[1], axis=1)
df1 = df1.rename(columns={'host': 'domain'})

df2.loc[(df2['isDGA'] != 'dga'), 'isDGA'] = 0
df2.loc[(df2['isDGA'] == 'dga'), 'isDGA'] = 1

In [6]:
df = pd.concat([df1, df2])
df = df.reset_index(drop=True)

df = df.astype({'domain': 'string', 'isDGA': 'int8', 'subclass': 'category'})

In [7]:
df.sample(frac=1).reset_index(drop=True)

Unnamed: 0,isDGA,domain,subclass
0,0,ihyztoftwmbpvydauaudiskvxobjb.ru,goz
1,0,buscagrupos.com.br,alexa
2,1,iraxi872o4m2143d5ngxuru.ddns.net,corebot
3,1,ksoxukphifgxepaaxvva.pro,necurs
4,1,mbafocecodfcbndd.co,padcrypt
...,...,...,...
834893,1,i4ms28qseois.top,qadars
834894,0,villagehatshop.com,legit
834895,0,cabinetbank.com,alexa
834896,0,dragonballarchives.com,alexa


In [8]:
X, y = df['domain'], df['isDGA']

In [9]:
validTokens = {x: idx + 1 for idx, x in enumerate(set(''.join(X)))}

maxFeatures = len(validTokens) + 1
maxLength = np.max([len(x) for x in X])

In [10]:
X = [[validTokens[y] for y in x] for x in X]
X = pad_sequences(X, maxlen=maxLength)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# LSTM модель

model = Sequential()
model.add(Embedding(maxFeatures, 128, input_length=maxLength))
model.add(LSTM(128))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

2024-03-19 18:54:45.409382: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-03-19 18:54:45.424183: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-03-19 18:54:45.424240: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-03-19 18:54:45.426127: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-03-19 18:54:45.426210: I external/local_xla/xla/stream_executor

In [13]:
model.fit(X_train, y_train, batch_size=15, epochs=5)

2024-03-19 18:54:47.014813: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8902
2024-03-19 18:54:47.720844: I external/local_xla/xla/service/service.cc:168] XLA service 0x7fd0800eae90 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-03-19 18:54:47.720917: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3070 Ti, Compute Capability 8.6
2024-03-19 18:54:47.731412: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1710874487.781257   11791 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.




In [14]:
# https://www.tensorflow.org/guide/keras/serialization_and_saving
model.save(path + 'dga_lstm.keras')

In [15]:
# model = keras.models.load_model(path)

In [29]:
predictions = model.predict(X_test).round()



In [33]:
acc = accuracy_score(y_test, predictions)
classification_report = classification_report(y_test, predictions)

In [35]:
print(classification_report)

              precision    recall  f1-score   support

           0       0.94      0.95      0.95     99488
           1       0.93      0.92      0.92     67492

    accuracy                           0.94    166980
   macro avg       0.94      0.93      0.94    166980
weighted avg       0.94      0.94      0.94    166980


In [41]:
def tokenize_domain(domain):
    d_ = [[validTokens[char] for char in domain]]
    d_ = pad_sequences(d_, maxlen=maxLength)

    return d_

In [65]:
model.predict(tokenize_domain('example.com'))



array([[0.0001923]], dtype=float32)

In [57]:
model.predict(tokenize_domain('cvyh1po636avyrsxebwbkn7.ddns.net'))



array([[1.]], dtype=float32)

In [51]:
validTokens

{'5': 1,
 'g': 2,
 '1': 3,
 '6': 4,
 't': 5,
 '9': 6,
 'd': 7,
 'c': 8,
 'q': 9,
 'x': 10,
 'a': 11,
 'j': 12,
 '.': 13,
 'w': 14,
 'f': 15,
 'r': 16,
 '-': 17,
 '2': 18,
 '0': 19,
 'z': 20,
 'b': 21,
 '3': 22,
 '_': 23,
 'v': 24,
 'n': 25,
 'm': 26,
 'l': 27,
 'i': 28,
 '8': 29,
 'h': 30,
 's': 31,
 '7': 32,
 '\n': 33,
 'y': 34,
 '4': 35,
 'e': 36,
 'u': 37,
 'p': 38,
 'k': 39,
 'o': 40}