In [None]:
# Attach Google Drive to the Colab filesystem; the dataset is read from it below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Path of the multi-class DGA dataset on the mounted Drive.
FILE = '/content/drive/MyDrive/msis_sem1_project/dga_multi.csv'

# Load the CSV, shuffle every row with a fixed seed so the order is repeatable,
# and renumber the index from 0.
df = (
    pd.read_csv(FILE)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,Domain,Botnet_Family
0,rdaudaep.com,szribi
1,msemzrvzglrzbuuseik.biz,qakbot
2,pbobadmvxxh.com,ramnit
3,qgiykqweseuukuiw.org,ramdo
4,ixjcbhjxex.mooo.com,kraken


In [None]:
import seaborn as sns
# Class balance: number of domains per botnet family.
# The column is named through `data=` / `y=` because seaborn >= 0.12 treats the
# first positional argument as `data`, not as the variable to count. Putting the
# families on the y-axis keeps their names legible.
fig, ax = plt.subplots(figsize=(8, 14))
sns.countplot(data=df, y='Botnet_Family', ax=ax)
ax.set(
    title='Samples per botnet family',
    xlabel='Number of domains',
    ylabel='Botnet family',
);

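## Data preparation

Tokenize each domain name at character level, pad the sequences to a fixed length, and one-hot encode the botnet family labels.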

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Character-level vocabulary: every distinct character seen in a domain name
# becomes one token id.
domain_names = df['Domain']
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(domain_names)

# Each domain becomes a list of character ids ...
sequences = tokenizer.texts_to_sequences(domain_names)

# ... which is right-padded to exactly 50 positions.
# NOTE(review): domains longer than 50 characters are truncated; with
# pad_sequences' default settings the cut is made at the front — confirm that
# losing the start of the name (rather than the end) is intended.
X = pad_sequences(sequences, maxlen=50, padding='post')

In [None]:
# Number of distinct characters the tokenizer learned from the domain names.
n_distinct_chars = len(tokenizer.word_index)
n_distinct_chars

42

In [None]:
# Map each botnet family name to an integer class id, then one-hot encode it.
# The ids live in their own variable and df['Botnet_Family'] keeps the family
# names, so this cell can be re-run safely: the encoder is always fitted on the
# names and label_encoder.classes_ (saved at the end of the notebook) can
# translate predicted ids back to families.
label_encoder = LabelEncoder()
family_ids = label_encoder.fit_transform(df['Botnet_Family'])

# One column per family known to the encoder.
y = to_categorical(family_ids, num_classes=len(label_encoder.classes_))

In [None]:
# Hold out 20% of the samples for the final test. The split is seeded so it is
# repeatable, and stratified on the labels so the train and test partitions
# keep the same family proportions.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
pip install keras-tuner --upgrade

Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl.metadata (221 bytes)
Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/129.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


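## Modelling

A 1-D convolutional network followed by a multi-head self-attention block classifies each domain into a botnet family.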

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, Dropout, Dense, Flatten, MultiHeadAttention, LayerNormalization, Add
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Sizes are derived from the prepared data so the network always matches it.
seq_len = X_train.shape[1]        # padded sequence length
num_classes = y_train.shape[1]    # width of the one-hot label matrix
# The Keras Tokenizer numbers tokens from 1 and pad_sequences fills with 0, so
# the embedding table needs len(word_index) + 1 rows. With only len(word_index)
# rows the highest token id would have no row of its own.
embedding_vocab_size = len(tokenizer.word_index) + 1

# Input layer: one padded sequence of character ids per domain
input_seq = Input(shape=(seq_len,))

# Embedding layer (the deprecated `input_length` argument is dropped; the
# sequence length already comes from the Input layer)
embedded = Embedding(input_dim=embedding_vocab_size, output_dim=192)(input_seq)

# Convolutional layers
# Try 2 to 4
conv1 = Conv1D(filters=256, kernel_size=7, activation='relu')(embedded)
pool1 = MaxPooling1D(pool_size=2)(conv1)
drop1 = Dropout(0.25)(pool1)

conv2 = Conv1D(filters=128, kernel_size=5, activation='relu')(drop1)
pool2 = MaxPooling1D(pool_size=2)(conv2)
drop2 = Dropout(0.25)(pool2)

# Multi-Head Self-Attention layer: query and value are both the conv features
attention_output = MultiHeadAttention(num_heads=4, key_dim=128)(drop2, drop2)
attention_output = Add()([attention_output, drop2])  # Residual connection
attention_output = LayerNormalization()(attention_output)  # Normalization

# Flatten and output layers
flatten = Flatten()(attention_output)
dense1 = Dense(128, activation='relu')(flatten)
dropout = Dropout(0.25)(dense1)
output = Dense(num_classes, activation='softmax')(dropout)  # one probability per family

# Create model
model1 = Model(inputs=input_seq, outputs=output)

# Compile model: one-hot targets, hence categorical cross-entropy
model1.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Stop once validation loss has not improved for 10 epochs and roll back to the
# best weights; before that, cut the learning rate to a fifth after 3 stagnant
# epochs (never below 1e-5).
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=0.00001)

# Train the model with early stopping; 20% of the training data is held out for
# validation. The history is kept so the learning curves can be inspected.
# NOTE(review): training is not seeded, so results vary from run to run.
history = model1.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=16,
    validation_split=0.2,
    callbacks=[early_stopping, reduce_lr],
)

# Evaluate the model on the untouched test split
loss, accuracy = model1.evaluate(X_test, y_test)
print('Test loss:', loss)
print('Test accuracy:', accuracy)



Epoch 1/100
[1m9600/9600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 4ms/step - accuracy: 0.5462 - loss: 1.6198 - val_accuracy: 0.8029 - val_loss: 0.5726 - learning_rate: 0.0010
Epoch 2/100
[1m9600/9600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 4ms/step - accuracy: 0.7846 - loss: 0.6327 - val_accuracy: 0.8291 - val_loss: 0.4893 - learning_rate: 0.0010
Epoch 3/100
[1m9600/9600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 4ms/step - accuracy: 0.8071 - loss: 0.5559 - val_accuracy: 0.8304 - val_loss: 0.4636 - learning_rate: 0.0010
Epoch 4/100
[1m9600/9600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 4ms/step - accuracy: 0.8141 - loss: 0.5265 - val_accuracy: 0.8333 - val_loss: 0.4739 - learning_rate: 0.0010
Epoch 5/100
[1m9600/9600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 4ms/step - accuracy: 0.8199 - loss: 0.5061 - val_accuracy: 0.8399 - val_loss: 0.4533 - learning_rate: 0.0010
Epoch 6/100
[1m9600/9600[0m [32m━━━━━━━━━━━━━━━

In [None]:
import joblib

# Persist everything needed to score new domains: the fitted tokenizer, the
# label encoder (fitted on the family names) and the trained network.
# NOTE(review): the files go to the current working directory — confirm that
# location outlives the Colab session if the artifacts are needed later.
joblib.dump(tokenizer, 'tokenizer.pkl')
joblib.dump(label_encoder, 'encoder_multi.pkl')

# Native Keras archive: unlike a pickle it is not tied to the exact library
# versions that wrote it.
model1.save('multiclass_classification_model.keras')

# Pickled copy kept so anything that already loads the .pkl keeps working.
joblib.dump(model1, 'multiclass_classification_model.pkl')

['multiclass_classification_model.pkl']