In [13]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Read the cleaned DGA dataset from the Colab working directory.
df = pd.read_csv("/content/final_dga_dataset_cleaned.csv")

# Normalise every domain in a single pass: lowercase it, then keep only the
# text before the first dot. This strips the TLD; for names with more than
# two labels it also discards every label after the first one.
df['domain'] = df['domain'].str.lower().str.split('.').str[0]

# Turn the subclass (family) names into integer ids. The fitted encoder is
# kept so ids can be mapped back to names when reporting results.
label_encoder = LabelEncoder()
df['family_label'] = label_encoder.fit_transform(df['subclass'])

num_classes = len(label_encoder.classes_)

# Print the id → family mapping, followed by the number of families.
print("Classes:")
for class_id, class_name in enumerate(label_encoder.classes_):
    print(class_id, "→", class_name)

print("\nTotal classes:", num_classes)

Classes:
0 → bamital
1 → cryptolocker
2 → gameoverdga
3 → goz
4 → legit
5 → necurs
6 → newgoz
7 → nivdort

Total classes: 8


In [14]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Fixed input length (in characters) fed to the model.
MAX_LEN = 40

# Raw inputs are the normalised domain strings; targets are the integer
# family ids.
X = df['domain'].values
y = df['family_label'].values

# Character-level vocabulary: each distinct character gets its own index.
# NOTE(review): the vocabulary is fitted on all rows, i.e. before the
# train/test split below.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(X)

# Encode each domain as a sequence of character indices, then pad at the end
# so every row has exactly MAX_LEN entries. Rows longer than MAX_LEN are
# truncated using pad_sequences' default truncation setting.
X_seq = tokenizer.texts_to_sequences(X)
X_pad = pad_sequences(X_seq, maxlen=MAX_LEN, padding='post')

# 80/20 hold-out split, stratified on the family id and seeded so the split
# is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_pad,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [15]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Distinct label ids that actually occur in the training split.
classes = np.unique(y_train)

# 'balanced' mode gives rarer classes larger weights; one weight is returned
# per entry of `classes`, in the same order.
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)

# Mapping label id -> weight.
class_weights = {label: weight for label, weight in zip(classes, weights)}

# Show the weights with three decimals for a quick sanity check.
print("Class Weights:")
for cls, w in class_weights.items():
    print(f"{cls}: {w:.3f}")

Class Weights:
0: 9.587
1: 0.537
2: 2.364
3: 3.259
4: 0.250
5: 2.401
6: 2.156
7: 2.365


In [16]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Balanced class weights for the training split, keyed by the actual label id.
#
# compute_class_weight returns one weight per entry of `classes`, in the same
# order as that array. The mapping is therefore built by pairing each weight
# with its own label instead of with its position in the array: positional
# keys (dict(enumerate(...))) are only correct while the labels present in
# y_train happen to be exactly 0..k-1, and would silently attach weights to
# the wrong classes otherwise.
weight_classes = np.unique(y_train)
weight_values = compute_class_weight(
    class_weight='balanced',
    classes=weight_classes,
    y=y_train
)

# Plain Python int keys / float values, so the dict handed to
# model.fit(class_weight=...) does not carry NumPy scalar types.
class_weights = {
    int(label): float(weight)
    for label, weight in zip(weight_classes, weight_values)
}


In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Input

# One embedding row per character index known to the tokenizer, plus one
# extra row for index 0, which is the value used to pad the sequences.
vocab_size = 1 + len(tokenizer.word_index)

# Character-level 1-D CNN classifier, assembled layer by layer.
model = Sequential()
model.add(Input(shape=(MAX_LEN,)))

# Learn a 64-dimensional vector for every character index.
model.add(Embedding(input_dim=vocab_size, output_dim=64))

# Two stacked width-3 convolutions, then keep the strongest activation of
# each filter across the whole sequence.
model.add(Conv1D(filters=256, kernel_size=3, activation='relu'))
model.add(Conv1D(filters=128, kernel_size=3, activation='relu'))
model.add(GlobalMaxPooling1D())

# Dense head with dropout for regularisation.
model.add(Dense(units=128, activation='relu'))
model.add(Dropout(rate=0.5))

# One softmax output per family.
model.add(Dense(units=num_classes, activation='softmax'))

# Targets are integer class ids, hence the sparse categorical loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()

In [18]:
# Train for 6 epochs in mini-batches of 256. A 20% slice of the training
# data is held out for validation, and the per-class weights are applied to
# the training loss to offset class imbalance. The returned History object
# is kept so the learning curves can be inspected afterwards.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=256,
    epochs=6,
    validation_split=0.2,
    class_weight=class_weights,
)

Epoch 1/6
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 225ms/step - accuracy: 0.4317 - loss: 1.1479 - val_accuracy: 0.6395 - val_loss: 0.6007
Epoch 2/6
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 212ms/step - accuracy: 0.7335 - loss: 0.5060 - val_accuracy: 0.8306 - val_loss: 0.4284
Epoch 3/6
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 215ms/step - accuracy: 0.7994 - loss: 0.4335 - val_accuracy: 0.8525 - val_loss: 0.3605
Epoch 4/6
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 210ms/step - accuracy: 0.8362 - loss: 0.3927 - val_accuracy: 0.8497 - val_loss: 0.3541
Epoch 5/6
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 222ms/step - accuracy: 0.8467 - loss: 0.3838 - val_accuracy: 0.8497 - val_loss: 0.3342
Epoch 6/6
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 209ms/step - accuracy: 0.8556 - loss: 0.3629 - val_accuracy: 0.8731 - val_loss: 0.3011


In [19]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict class probabilities for the held-out set and take the most likely
# class id for each sample.
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)

# Pin the label order explicitly. Without `labels`, scikit-learn derives the
# classes from whatever ids appear in y_test / y_pred_classes; if any family
# were missing from both, the rows would no longer line up with
# label_encoder.classes_ and classification_report would reject the
# mismatched target_names. Row/column i always means label_encoder.classes_[i].
label_ids = list(range(len(label_encoder.classes_)))

# Confusion matrix: rows are true families, columns are predicted families.
print(confusion_matrix(y_test, y_pred_classes, labels=label_ids))

# Per-family precision / recall / F1, reported under the family names.
print(classification_report(
    y_test,
    y_pred_classes,
    labels=label_ids,
    target_names=label_encoder.classes_,
))

[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10ms/step
[[  417     0     0     0     0     0     0     0]
 [    0  6786     0     6    91   564     0     4]
 [    0     0   102     0     0     0  1591     0]
 [    0     0     0  1201     1    25     0     0]
 [    2    97     0     8 15249   432     9   203]
 [    0   729     0    51    67   818     0     1]
 [    0     0    84     0     0     0  1771     0]
 [    0     0     0     0     4     0     0  1687]]
              precision    recall  f1-score   support

     bamital       1.00      1.00      1.00       417
cryptolocker       0.89      0.91      0.90      7451
 gameoverdga       0.55      0.06      0.11      1693
         goz       0.95      0.98      0.96      1227
       legit       0.99      0.95      0.97     16000
      necurs       0.44      0.49      0.47      1666
      newgoz       0.53      0.95      0.68      1855
     nivdort       0.89      1.00      0.94      1691

    accuracy            