In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, Input

In [21]:
# Load processed dataset
# NOTE(review): absolute Colab-style path — adjust DATA_PATH when running elsewhere.
DATA_PATH = "/content/final_dga_dataset_cleaned.csv"
# Columns this notebook reads directly from the frame.
REQUIRED_COLUMNS = {"domain", "subclass"}

df = pd.read_csv(DATA_PATH)

# Fail fast with a readable message instead of a bare KeyError further down.
missing_columns = REQUIRED_COLUMNS - set(df.columns)
if missing_columns:
    raise KeyError(f"{DATA_PATH} is missing required column(s): {sorted(missing_columns)}")

print("Dataset Shape:", df.shape)
# Per-family sample counts; kept as the last expression so the notebook renders it.
df['subclass'].value_counts()

Dataset Shape: (160000, 3)


Unnamed: 0_level_0,count
subclass,Unnamed: 1_level_1
legit,80000
cryptolocker,37254
newgoz,9276
gameoverdga,8461
nivdort,8456
necurs,8331
goz,6136
bamital,2086


In [22]:
# Normalise domains: lowercase, then keep only the text before the first dot.
# NOTE(review): for a multi-label host such as "www.example.com" this keeps
# "www", not "example" — confirm the dataset only holds "name.tld" entries.
df['domain'] = df['domain'].str.lower().str.split('.').str[0]

# Map every family name in `subclass` to an integer id stored in `family_label`.
label_encoder = LabelEncoder().fit(df['subclass'])
df['family_label'] = label_encoder.transform(df['subclass'])

# One output unit per distinct family seen by the encoder.
num_classes = len(label_encoder.classes_)
print("Classes:", label_encoder.classes_)
print("Number of Classes:", num_classes)

Classes: ['bamital' 'cryptolocker' 'gameoverdga' 'goz' 'legit' 'necurs' 'newgoz'
 'nivdort']
Number of Classes: 8


In [23]:
# --- Feature preparation -----------------------------------------------------
# X_train / X_test / y_train / y_test, MAX_LEN and vocab_size are used by this
# cell and the ones below but are not defined anywhere earlier in the notebook,
# so a fresh "Restart & Run All" stopped here with a NameError. They are
# rebuilt explicitly below.
# NOTE(review): SEED, the default padding mode and deriving MAX_LEN from the
# training domains are reconstructions — confirm against the original
# experiment. The stratified 80/20 split matches the per-class supports in the
# saved classification report.
SEED = 42
TEST_SIZE = 0.2

assert df['domain'].notna().all(), "domain column contains missing values"

domains_train, domains_test, y_train, y_test = train_test_split(
    df['domain'].tolist(),
    df['family_label'].to_numpy(),
    test_size=TEST_SIZE,
    stratify=df['family_label'],
    random_state=SEED,
)

# Character-level vocabulary, fitted on the training split only so that no
# information from the test domains leaks into preprocessing.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(domains_train)
vocab_size = len(tokenizer.word_index) + 1  # +1: index 0 is left for padding

# Longest training domain decides the padded sequence length.
MAX_LEN = max(len(domain) for domain in domains_train)

X_train = pad_sequences(tokenizer.texts_to_sequences(domains_train), maxlen=MAX_LEN)
X_test = pad_sequences(tokenizer.texts_to_sequences(domains_test), maxlen=MAX_LEN)

assert len(X_train) == len(y_train) and len(X_test) == len(y_test)

# --- Class weights -----------------------------------------------------------
# 'balanced' weights are inversely proportional to class frequency in y_train,
# so rare families contribute more to the loss than the dominant ones.
classes = np.unique(y_train)

weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=y_train
)

# Plain int/float keys and values instead of NumPy scalars.
class_weights = {int(cls): float(w) for cls, w in zip(classes, weights)}

print("Class Weights:")
for cls, w in class_weights.items():
    print(f"{label_encoder.classes_[cls]}: {w:.3f}")

Class Weights:
bamital: 9.587
cryptolocker: 0.537
gameoverdga: 2.364
goz: 3.259
legit: 0.250
necurs: 2.401
newgoz: 2.156
nivdort: 2.365


In [24]:
# Character-level BiLSTM classifier: embedding -> bidirectional LSTM ->
# dense head with dropout -> softmax with one unit per class.
model_lstm = Sequential()

# Fixed-length input of MAX_LEN positions (assumes padded integer sequences).
model_lstm.add(Input(shape=(MAX_LEN,)))
model_lstm.add(Embedding(input_dim=vocab_size, output_dim=64))

# return_sequences=False: only the final output of each direction is kept.
model_lstm.add(Bidirectional(LSTM(64, return_sequences=False)))

model_lstm.add(Dense(128, activation='relu'))
model_lstm.add(Dropout(0.5))
model_lstm.add(Dense(num_classes, activation='softmax'))

# Sparse cross-entropy: targets are expected as integer class ids, not one-hot rows.
model_lstm.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

model_lstm.summary()

In [25]:
# Training settings gathered in one place so they are easy to spot and tune.
fit_options = {
    "epochs": 6,
    "batch_size": 256,
    "validation_split": 0.2,        # share of the training arrays held out for validation
    "class_weight": class_weights,  # per-class loss weights computed earlier
}

# Train the BiLSTM and keep the per-epoch metrics in history_lstm.
history_lstm = model_lstm.fit(X_train, y_train, **fit_options)

Epoch 1/6
400/400 ━━━━━━━━━━━━━━━━━━━━ 81s 195ms/step - accuracy: 0.4609 - loss: 1.0629 - val_accuracy: 0.7736 - val_loss: 0.4900
Epoch 2/6
400/400 ━━━━━━━━━━━━━━━━━━━━ 78s 194ms/step - accuracy: 0.7645 - loss: 0.4688 - val_accuracy: 0.7921 - val_loss: 0.4558
Epoch 3/6
400/400 ━━━━━━━━━━━━━━━━━━━━ 78s 194ms/step - accuracy: 0.7927 - loss: 0.4295 - val_accuracy: 0.8138 - val_loss: 0.4273
Epoch 4/6
400/400 ━━━━━━━━━━━━━━━━━━━━ 85s 203ms/step - accuracy: 0.8057 - loss: 0.4156 - val_accuracy: 0.7806 - val_loss: 0.4673
Epoch 5/6
400/400 ━━━━━━━━━━━━━━━━━━━━ 83s 206ms/step - accuracy: 0.8126 - loss: 0.4020 - val_accuracy: 0.8224 - val_loss: 0.4074
Epoch 6/6
400/400 ━━━━━━━━━━━━━━━━━━━━ 78s 195ms/step - accuracy: 0.8158 - loss: 0.4169 - val_accuracy: 0.8052 - val_loss: 0.4408


In [26]:
# Softmax scores for every test sample; the predicted class is the
# highest-scoring column.
y_pred_lstm = model_lstm.predict(X_test)
y_pred_classes_lstm = np.argmax(y_pred_lstm, axis=1)

# Pin the label order to the encoder's ids so matrix rows/columns and report
# rows always line up with label_encoder.classes_, even when a class is absent
# from both y_test and the predictions (target_names alone would then raise a
# length-mismatch error).
label_ids = np.arange(len(label_encoder.classes_))

print(confusion_matrix(y_test, y_pred_classes_lstm, labels=label_ids))
print(classification_report(
    y_test,
    y_pred_classes_lstm,
    labels=label_ids,
    target_names=label_encoder.classes_,
    zero_division=0,  # report 0 (without a warning) for classes that are never predicted
))

[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 12ms/step
[[  417     0     0     0     0     0     0     0]
 [    0  7170     0     0   126   139     0    16]
 [    0     0  1691     1     0     0     1     0]
 [    0     0     0  1195     1    31     0     0]
 [    1   262     1     7 12507   566     0  2656]
 [    0   442     0     0    54  1138     0    32]
 [    0     0  1854     0     0     0     1     0]
 [    0     1     0     0    34     3     0  1653]]
              precision    recall  f1-score   support

     bamital       1.00      1.00      1.00       417
cryptolocker       0.91      0.96      0.94      7451
 gameoverdga       0.48      1.00      0.65      1693
         goz       0.99      0.97      0.98      1227
       legit       0.98      0.78      0.87     16000
      necurs       0.61      0.68      0.64      1666
      newgoz       0.50      0.00      0.00      1855
     nivdort       0.38      0.98      0.55      1691

    accuracy            