# DNS Tunneling Detection using CNN + LSTM

This notebook implements a **CNN + LSTM** hybrid model for DNS tunneling detection — a strong baseline between pure CNN and CNN+BiLSTM models for fair comparison.

Features:
- Character-level embedding of subdomain/query names
- Combined numeric + sequential input
- Early stopping
- Complete evaluation with ROC-AUC, confusion matrix, and training curves

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc

import tensorflow as tf
from tensorflow.keras import layers, Model, Input
from tensorflow.keras.callbacks import EarlyStopping

%matplotlib inline

## 1. Load Dataset

In [None]:
# Read the preprocessed DNS dataset from disk.
DATA_PATH = "dns_preprocessed_dataset_deduplicated1_(1).csv"
df = pd.read_csv(DATA_PATH)

# Quick sanity check: table size, first rows, and samples per class.
label_counts = df['label'].value_counts()
print("Dataset shape:", df.shape)
print(df.head())
print("\nLabel distribution:")
print(label_counts)

# Bar chart of the class balance, ordered by label value.
# The tick labels below name label 0 "Normal" and label 1 "Tunneled".
fig, ax = plt.subplots(figsize=(6, 4))
label_counts.sort_index().plot(kind='bar', ax=ax)
ax.set_xticks([0, 1])
ax.set_xticklabels(['Normal', 'Tunneled'], rotation=0)
ax.set_title('Class Distribution')
ax.set_ylabel('Count')
plt.show()

## 2. Feature Selection & Preprocessing

In [None]:
# Column holding the query-name text that is encoded character by character.
TEXT_COL = 'qname_clean'
# Numeric columns that are passed to the model as its second input.
NUMERIC_FEATURES = [
    'qname_len', 'label_count', 'qtype', 'entropy',
    'udp_length', 'ip_length', 'ttl', 'response_flag', 'dns_time'
]

# Remove empty queries.
# Missing values have to be tested BEFORE the string cast: astype(str) turns
# NaN into the three-character string "nan", which passes a length-only
# filter and would later be encoded as if it were a real query name.
has_qname = df[TEXT_COL].notna() & (df[TEXT_COL].astype(str).str.len() > 0)
df = df[has_qname].reset_index(drop=True)

## 3. Character-Level Encoding

In [None]:
# Work on the string form of every query name.
qname_strings = df[TEXT_COL].astype(str)

# Character vocabulary: every distinct character seen in any query name,
# in sorted order. Indices start at 1; index 0 is left unassigned here.
all_text = ''.join(qname_strings.tolist())
chars = sorted(set(all_text))
char2idx = {ch: idx for idx, ch in enumerate(chars, start=1)}
vocab_size = len(char2idx) + 1  # +1 accounts for the unassigned index 0

# Sequence length: the longest query name, capped at 100 characters.
MAX_SEQ_LEN = min(100, qname_strings.str.len().max())

def encode_qname(s, max_len=MAX_SEQ_LEN):
    """Encode a query name as a fixed-length array of character indices.

    Args:
        s: Query name; converted with ``str()`` before encoding.
        max_len: Length of the returned array. Longer inputs are truncated,
            shorter ones are right-padded with 0.

    Returns:
        A 1-D ``np.int32`` array of length ``max_len``. Characters missing
        from ``char2idx`` map to 0, the same value used for padding.
    """
    text = str(s)[:max_len]
    indices = [char2idx.get(ch, 0) for ch in text]
    # Multiplying a list by a non-positive count yields [], so an input that
    # already fills max_len gets no padding.
    padding = [0] * (max_len - len(indices))
    return np.array(indices + padding, dtype=np.int32)

# One row of character indices per query name.
X_text = np.stack([encode_qname(qname) for qname in df[TEXT_COL]])
# Numeric features (as floats) and labels (as ints) as plain NumPy arrays.
X_num = df[NUMERIC_FEATURES].astype(float).to_numpy()
y = df['label'].astype(int).to_numpy()

print(f"Vocabulary size: {vocab_size}, Max sequence length: {MAX_SEQ_LEN}")

## 4. Train / Validation / Test Split

In [None]:
# Stage 1: hold out 30% of the samples, stratified on the label.
(X_text_train, X_text_temp,
 X_num_train, X_num_temp,
 y_train, y_temp) = train_test_split(
    X_text, X_num, y,
    test_size=0.3, stratify=y, random_state=42,
)

# Stage 2: cut the held-out 30% in half -> 15% validation, 15% test.
(X_text_val, X_text_test,
 X_num_val, X_num_test,
 y_val, y_test) = train_test_split(
    X_text_temp, X_num_temp, y_temp,
    test_size=0.5, stratify=y_temp, random_state=42,
)

# Scale numeric features.
# The scaler is fitted on the training split only; the same fitted statistics
# are then applied to all three splits.
scaler = StandardScaler()
scaler.fit(X_num_train)
X_num_train, X_num_val, X_num_test = (
    scaler.transform(split) for split in (X_num_train, X_num_val, X_num_test)
)

print(f"Train: {len(y_train)}, Val: {len(y_val)}, Test: {len(y_test)}")

## 5. Build CNN + LSTM Model

In [None]:
# Hyperparameters of the text branch.
embedding_dim = 64
num_filters = 128
kernel_size = 5

# Text branch: character embedding -> 1-D convolution -> max pooling ->
# LSTM (final state only) -> dense projection.
text_input = Input(shape=(MAX_SEQ_LEN,), name="text_input")
text_stack = [
    layers.Embedding(vocab_size, embedding_dim),
    layers.Conv1D(num_filters, kernel_size, activation='relu'),
    layers.MaxPooling1D(pool_size=2),
    layers.LSTM(64, return_sequences=False),
    layers.Dense(64, activation='relu'),
]
x = text_input
for layer in text_stack:
    x = layer(x)

# Numeric branch: two small dense layers over the scaled numeric features.
num_input = Input(shape=(X_num_train.shape[1],), name="num_input")
num_stack = [
    layers.Dense(32, activation='relu'),
    layers.Dense(32, activation='relu'),
]
n = num_input
for layer in num_stack:
    n = layer(n)

# Fusion head: concatenate both branches, then a dense layer and a single
# sigmoid unit that outputs a probability for the binary label.
combined = layers.concatenate([x, n])
combined = layers.Dense(64, activation='relu')(combined)
output = layers.Dense(1, activation='sigmoid')(combined)

cnn_lstm_model = Model(inputs=[text_input, num_input], outputs=output)
cnn_lstm_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

cnn_lstm_model.summary()

## 6. Train the Model

In [None]:
# Stop once validation loss has not improved for 5 consecutive epochs and
# roll the model back to the weights of the best epoch.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Both model inputs are passed as [text, numeric], matching the input order
# used when the model was built.
train_inputs = [X_text_train, X_num_train]
val_inputs = [X_text_val, X_num_val]

history = cnn_lstm_model.fit(
    train_inputs,
    y_train,
    validation_data=(val_inputs, y_val),
    batch_size=64,
    epochs=30,
    callbacks=[early_stop],
    verbose=1,
)

## 7. Training Curves

In [None]:
# Side-by-side panels: accuracy on the left, loss on the right. Each panel
# shows the training curve and the validation curve per epoch.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

panels = [('accuracy', 'Accuracy'), ('loss', 'Loss')]
for ax, (metric, pretty) in zip(axes, panels):
    ax.plot(history.history[metric], label=f'Train {pretty}')
    ax.plot(history.history[f'val_{metric}'], label=f'Validation {pretty}')
    ax.set_title(f'CNN + LSTM {pretty}')
    ax.legend()

plt.tight_layout()
plt.show()

## 8. Evaluation on Test Set

In [None]:
# Predicted probabilities for the held-out test split, flattened to 1-D.
raw_output = cnn_lstm_model.predict([X_text_test, X_num_test])
test_probs = raw_output.flatten()

# Hard labels: probabilities of at least 0.5 become class 1.
is_positive = test_probs >= 0.5
test_preds = is_positive.astype(int)

report = classification_report(y_test, test_preds, digits=4)
print("Classification Report:")
print(report)

cm = confusion_matrix(y_test, test_preds)
print("Confusion Matrix:\n", cm)

## 9. ROC Curve

In [None]:
# ROC points are computed from the raw probabilities (not the thresholded
# labels); the area under the curve summarises them in one number.
fpr, tpr, _ = roc_curve(y_test, test_probs)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(7, 6))
ax.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.3f})', linewidth=2)
# Dashed diagonal as a visual reference line.
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve - CNN + LSTM Model')
ax.legend(loc="lower right")
ax.grid(True)
plt.show()

## 10. Save Model

In [None]:
# Write the trained model to an HDF5 file in the working directory.
MODEL_PATH = "dns_cnn_lstm_model.h5"
cnn_lstm_model.save(MODEL_PATH)
print(f"CNN + LSTM model saved as '{MODEL_PATH}'")