# DNS Tunneling Detection using CNN (No BiLSTM)

This notebook implements a **pure CNN-based model** for DNS tunneling detection (no BiLSTM) to enable fair comparison with the hybrid CNN+BiLSTM architecture.

Features:
- Character-level embedding of query names
- Combined numeric + text input
- Early stopping
- Full evaluation: accuracy, classification report, confusion matrix, ROC-AUC

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc

import tensorflow as tf
from tensorflow.keras import layers, models, Input, Model
from tensorflow.keras.callbacks import EarlyStopping

%matplotlib inline

## 1. Load Dataset

In [None]:
# Location of the preprocessed DNS dataset, relative to the notebook's working directory.
DATA_PATH = "dns_preprocessed_dataset_deduplicated1_(1).csv"
df = pd.read_csv(DATA_PATH)

# Quick sanity check: dimensions, first rows, and how many samples each class has.
print("Dataset shape:", df.shape)
print(df.head())

label_counts = df['label'].value_counts()
print("\nLabel distribution:")
print(label_counts)

# Bar chart of the class balance, bars ordered by label value.
# The tick names assume labels are 0 (normal) and 1 (tunneled) -- TODO confirm against the dataset.
plt.figure(figsize=(6, 4))
label_counts.sort_index().plot.bar()
plt.xticks(ticks=[0, 1], labels=['Normal', 'Tunneled'], rotation=0)
plt.title('Class Distribution')
plt.show()

## 2. Feature Selection & Preprocessing

In [None]:
# Column holding the query name that is fed to the character-level branch.
TEXT_COL = 'qname_clean'

# Per-query numeric columns that are fed to the dense branch.
NUMERIC_FEATURES = [
    'qname_len', 'label_count', 'qtype', 'entropy',
    'udp_length', 'ip_length', 'ttl', 'response_flag', 'dns_time'
]

# Keep only rows that actually carry a query name.
# The missing-value test must be explicit: `astype(str)` renders NaN/None as
# the literal text "nan"/"None", which has non-zero length and would otherwise
# slip through the length filter and be encoded as if it were a real domain.
has_qname = df[TEXT_COL].notna() & (df[TEXT_COL].astype(str).str.len() > 0)
df = df[has_qname].reset_index(drop=True)

## 3. Character-Level Encoding

In [None]:
# String view of the query names, shared by the vocabulary and the length cap.
qname_strings = df[TEXT_COL].astype(str)

# Character vocabulary: every distinct character seen in any query name, in
# sorted order. Indices start at 1 so that 0 stays free for padding.
all_text = ''.join(qname_strings.tolist())
chars = sorted(set(all_text))
char2idx = dict(zip(chars, range(1, len(chars) + 1)))
vocab_size = len(chars) + 1  # +1 accounts for the reserved index 0

# Sequence length fed to the model: the longest query name, capped at 100.
MAX_SEQ_LEN = min(100, qname_strings.str.len().max())

def encode_qname(s, max_len=MAX_SEQ_LEN):
    """Turn a query name into a fixed-length vector of character indices.

    Args:
        s: Query name; converted with ``str()`` before encoding.
        max_len: Output length. Longer names are truncated, shorter ones are
            right-padded with 0.

    Returns:
        ``np.ndarray`` of dtype ``int32`` holding one ``char2idx`` index per
        character. Characters missing from ``char2idx`` map to 0, the same
        value used for padding.
    """
    truncated = str(s)[:max_len]
    codes = list(map(lambda ch: char2idx.get(ch, 0), truncated))
    # A non-positive repeat count yields an empty list, so this is a no-op
    # when the name already fills the whole window.
    codes.extend([0] * (max_len - len(codes)))
    return np.asarray(codes, dtype=np.int32)

# Model inputs and target as plain NumPy arrays:
#   X_text -- one row of character indices per query name
#   X_num  -- the numeric feature columns as floats
#   y      -- the integer class label
X_text = np.stack([encode_qname(qname) for qname in df[TEXT_COL]])
X_num = df[NUMERIC_FEATURES].astype(float).to_numpy()
y = df['label'].astype(int).to_numpy()

## 4. Train / Validation / Test Split

In [None]:
# First cut: 70% for training, 30% held out. Stratified so both parts keep
# the class ratio of the full dataset.
(
    X_text_train, X_text_temp,
    X_num_train, X_num_temp,
    y_train, y_temp,
) = train_test_split(
    X_text, X_num, y,
    test_size=0.3, random_state=42, stratify=y,
)

# Second cut: halve the held-out part into validation and test sets.
(
    X_text_val, X_text_test,
    X_num_val, X_num_test,
    y_val, y_test,
) = train_test_split(
    X_text_temp, X_num_temp, y_temp,
    test_size=0.5, random_state=42, stratify=y_temp,
)

# Standardise the numeric features. The scaler is fitted on the training
# split only, then applied unchanged to all three splits, so no statistics
# from validation/test data leak into training.
scaler = StandardScaler().fit(X_num_train)
X_num_train = scaler.transform(X_num_train)
X_num_val = scaler.transform(X_num_val)
X_num_test = scaler.transform(X_num_test)

print(f"Train: {len(X_text_train)}, Val: {len(X_text_val)}, Test: {len(X_text_test)}")

## 5. Build CNN Model

In [None]:
# Seed Python, NumPy and TensorFlow before any layer is created so that weight
# initialisation and batch shuffling repeat across "Restart & Run All" runs.
# Without this, every run trains a different model, which makes the comparison
# against the CNN+BiLSTM notebook noisy. 42 matches the split's random_state.
tf.keras.utils.set_random_seed(42)

# --- Text branch: character embedding -> 1-D convolution -> global max pool ---
text_input = Input(shape=(MAX_SEQ_LEN,), name="text_input")
# The sequence length is already fixed by `text_input`; the former
# `input_length` argument was redundant and is deprecated in Keras 3.
x = layers.Embedding(vocab_size, 128)(text_input)
x = layers.Conv1D(128, 5, activation='relu')(x)
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dense(64, activation='relu')(x)

# --- Numeric branch: two small dense layers over the scaled features ---
num_input = Input(shape=(X_num_train.shape[1],), name="num_input")
n = layers.Dense(32, activation='relu')(num_input)
n = layers.Dense(32, activation='relu')(n)

# --- Fusion head: concatenate both branches, then one sigmoid unit ---
combined = layers.concatenate([x, n])
combined = layers.Dense(64, activation='relu')(combined)
output = layers.Dense(1, activation='sigmoid')(combined)

# Binary classifier: the single sigmoid output is trained with binary cross-entropy.
cnn_model = Model(inputs=[text_input, num_input], outputs=output)
cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

cnn_model.summary()

## 6. Train the CNN Model

In [None]:
# Stop when validation loss has gone 5 epochs without improving, and restore
# the weights from the best epoch.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Training settings: at most 30 epochs, mini-batches of 64, progress bar on.
fit_options = dict(
    validation_data=([X_text_val, X_num_val], y_val),
    epochs=30,
    batch_size=64,
    callbacks=[early_stop],
    verbose=1,
)

# `history` keeps the per-epoch metrics for the training-curve plots.
history = cnn_model.fit([X_text_train, X_num_train], y_train, **fit_options)

## 7. Training Curves

In [None]:
# Side-by-side training vs. validation curves: accuracy on the left, loss on
# the right. Each entry is (history key, legend suffix, panel title).
curve_panels = [
    ('accuracy', 'acc', 'CNN Accuracy'),
    ('loss', 'loss', 'CNN Loss'),
]

plt.figure(figsize=(12, 4))

for panel_idx, (metric_key, legend_suffix, panel_title) in enumerate(curve_panels, start=1):
    plt.subplot(1, 2, panel_idx)
    plt.plot(history.history[metric_key], label=f'train_{legend_suffix}')
    plt.plot(history.history[f'val_{metric_key}'], label=f'val_{legend_suffix}')
    plt.title(panel_title)
    plt.legend()

plt.show()

## 8. Evaluation on Test Set

In [None]:
# The model ends in a single sigmoid unit, so predict() yields one value per
# sample; flatten them into a 1-D vector of probabilities.
raw_outputs = cnn_model.predict([X_text_test, X_num_test])
test_probs = raw_outputs.flatten()

# Hard labels at a 0.5 decision threshold.
above_threshold = test_probs >= 0.5
test_preds = above_threshold.astype(int)

# Per-class precision / recall / F1 and overall accuracy, to 4 decimals.
print("Classification Report:")
report_text = classification_report(y_test, test_preds, digits=4)
print(report_text)

# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, test_preds)
print("Confusion Matrix:\n", cm)

## 9. ROC Curve

In [None]:
# ROC is computed from the raw probabilities (not the thresholded labels);
# the thresholds returned by roc_curve are not needed here.
fpr, tpr, _ = roc_curve(y_test, test_probs)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f'AUC = {roc_auc:.3f}')
# Dashed diagonal as a reference: the ROC of a random classifier.
ax.plot([0, 1], [0, 1], '--')
ax.set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve - CNN Only',
)
ax.legend()
plt.show()

## 10. Save the Model

In [None]:
# Persist the trained model next to the notebook.
# NOTE(review): the ".h5" extension selects Keras' legacy HDF5 format; newer
# Keras releases recommend ".keras". The name is kept because other notebooks
# may load this exact file -- confirm before changing it.
# NOTE(review): `scaler` and `char2idx` are not saved here, yet both are needed
# to preprocess new queries for this model -- consider persisting them too.
MODEL_PATH = "dns_cnn_model.h5"
cnn_model.save(MODEL_PATH)
print(f"CNN model saved as {MODEL_PATH}")