# Классификатор DGA доменов

## Ссылки:

- https://habr.com/ru/companies/pt/articles/282349/
- https://www.securitylab.ru/blog/personal/Morning/347957.php
- https://underdefense.com/guides/detecting-dga-domains-machine-learning-approach/
- https://habr.com/ru/companies/wunderfund/articles/331310/
- https://github.com/ryancollingwood/DGAClassifier

## Наборы данных

- https://www.kaggle.com/datasets/gtkcyber/dga-dataset
- https://github.com/chrmor/DGA_domains_dataset


In [None]:
!nvidia-smi

In [99]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import keras

from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding
from keras.layers import LSTM

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

dir_path = './work/models/dga/'

In [100]:
df1 = pd.read_csv('./work/datasets/dga_data.csv')
df2 = pd.read_csv('./work/datasets/dga_domains_full.csv')
df3 = pd.read_csv('./work/datasets/dga_domains.csv')

In [101]:
## Очистка и преобразование исходных данных
df1.loc[(df1['isDGA'] == 'dga'), 'isDGA'] = 1
df1.loc[(df1['isDGA'] == 'legit'), 'isDGA'] = 0

df1 = df1.drop(columns=df1.columns[1], axis=1)
df1 = df1.rename(columns={'host': 'domain'})

df2.loc[(df2['isDGA'] == 'legit'), 'isDGA'] = 0
df2.loc[(df2['isDGA'] == 'dga'), 'isDGA'] = 1

df3.loc[(df3['isDGA'] == 'legit'), 'isDGA'] = 0
df3.loc[(df3['isDGA'] == 'dga'), 'isDGA'] = 1

df3 = df3.drop(columns=df1.columns[2], axis=1)
df3 = df3.drop(columns=df1.columns[1], axis=1)

df3.head()

df3 = df3.rename(columns={'host': 'domain'})

In [102]:
df = pd.concat([df1, df2, df3])

df = df.astype({'domain': 'string', 'isDGA': 'int8', 'subclass': 'category'})

In [86]:
df.head(10)

Unnamed: 0,isDGA,domain,subclass
0,1,6xzxsw3sokvg1tc752y1a6p0af.com,gameoverdga
1,1,glbtlxwwhbnpxs.ru,cryptolocker
2,1,xxmamopyipbfpk.ru,cryptolocker
3,1,zfd5szpi18i85wj9uy13l69rg.net,newgoz
4,1,jpqftymiuver.ru,cryptolocker
5,1,takenroll.net,nivdort
6,1,vrrtneoswsds.ru,cryptolocker
7,1,mzapiwbcsbqiyxnlxa1hprx8n.com,newgoz
8,1,xkoihiygtanuio.ru,cryptolocker
9,1,dreamdont.net,nivdort


In [103]:
X, y = df['domain'], df['isDGA']

In [104]:
ascii_ranges = ["_", "-", ".", "0", "1", "9", "a", "z"]
ascii_ord_ranges = list()
for c in ascii_ranges:
    ascii_ord_ranges.append(ord(c))

expected_ords = [ord("-"), ord("_"), ord(".")] + list(range(ord("0"), ord("9") + 1)) + list(
    range(ord("a"), ord("z") + 1))

# a simple check to ensure the range domains we identified are present
for i in ascii_ord_ranges:
    assert (i in expected_ords)

ord_data_dfs = dict()

for column in ['domain']:
    ord_data_dfs[column] = df[column].apply(lambda x: [ord(w) for w in x.lower()]).apply(pd.Series)
    display(ord_data_dfs[column].head())

domain


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,63,64,65,66,67,68,69,70,71,72
0,54.0,120.0,122.0,120.0,115.0,119.0,51.0,115.0,111.0,107.0,...,,,,,,,,,,
1,103.0,108.0,98.0,116.0,108.0,120.0,119.0,119.0,104.0,98.0,...,,,,,,,,,,
2,120.0,120.0,109.0,97.0,109.0,111.0,112.0,121.0,105.0,112.0,...,,,,,,,,,,
3,122.0,102.0,100.0,53.0,115.0,122.0,112.0,105.0,49.0,56.0,...,,,,,,,,,,
4,106.0,112.0,113.0,102.0,116.0,121.0,109.0,105.0,117.0,118.0,...,,,,,,,,,,


In [105]:
for column in ['domain']:
    ord_data_dfs[column] = ord_data_dfs[column].fillna(-1)

In [106]:
unknown_ord_columns = dict()

for column in ['domain']:
    unknown_ord_columns[column] = set()

    for ord_column in ord_data_dfs[column].columns:
        try:
            assert (all(ord_data_dfs[column][ord_column].isin(expected_ords + [-1])))
        except AssertionError:
            print(column, "- column:", ord_column, "contained unknown ordinal value")
            unknown_ord_columns[column].add(ord_column)
            continue

    print()

for column in ['domain']:
    for ord_column in unknown_ord_columns[column]:
        unknown_ord_index = (~ord_data_dfs[column][ord_column].isin(expected_ords + [-1]))
        if any(unknown_ord_index):
            print(ord_data_dfs[column].loc[unknown_ord_index][ord_column])
            display(df.loc[unknown_ord_index][column])

    print()

domain

domain


In [109]:
validTokens = {x: idx + 1 for idx, x in enumerate(set(''.join(X)))}

maxFeatures = len(validTokens) + 1
maxLength = np.max([len(x) for x in X])

In [110]:
X = [[validTokens[y] for y in x] for x in X]
X = pad_sequences(X, maxlen=maxLength)

In [111]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [112]:
# LSTM модель

model = Sequential()
model.add(Embedding(maxFeatures, 128, input_length=maxLength))
model.add(LSTM(128))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

In [113]:
model.fit(X_train, y_train, batch_size=15, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7fe1d5b1e6d0>

In [115]:
# https://www.tensorflow.org/guide/keras/serialization_and_saving
model.save(dir_path + 'dga_lstm.keras')

In [None]:
# model = keras.models.load_model(path)

In [116]:
predictions = model.predict(X_test).round()



In [117]:
acc = accuracy_score(y_test, predictions)
classification_report = classification_report(y_test, predictions)

In [118]:
print(classification_report)

              precision    recall  f1-score   support

           0       0.97      0.98      0.97     99677
           1       0.97      0.97      0.97     94088

    accuracy                           0.97    193765
   macro avg       0.97      0.97      0.97    193765
weighted avg       0.97      0.97      0.97    193765


In [119]:
def tokenize_domain(domain):
    d_ = [[validTokens[char] for char in domain]]
    d_ = pad_sequences(d_, maxlen=maxLength)

    return d_

In [120]:
model.predict(tokenize_domain('example.com'))



array([[0.00110966]], dtype=float32)

In [123]:
model.predict(tokenize_domain('cvyh1po636avyrsxebwbkn7.ddns.net'))



array([[1.]], dtype=float32)

In [124]:
validTokens

{'0': 1,
 't': 2,
 '3': 3,
 'b': 4,
 '7': 5,
 '9': 6,
 'w': 7,
 'u': 8,
 '_': 9,
 'p': 10,
 'n': 11,
 '2': 12,
 '1': 13,
 '8': 14,
 'm': 15,
 'r': 16,
 'f': 17,
 's': 18,
 'v': 19,
 'y': 20,
 'x': 21,
 'h': 22,
 '4': 23,
 '.': 24,
 'j': 25,
 'g': 26,
 'o': 27,
 '-': 28,
 'd': 29,
 'l': 30,
 'q': 31,
 'e': 32,
 '5': 33,
 'k': 34,
 'z': 35,
 'i': 36,
 'c': 37,
 '6': 38,
 'a': 39}