In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import PassiveAggressiveClassifier

from sklearn.metrics import accuracy_score, f1_score


In [3]:
df = pd.read_csv("DNS-testing.csv")
print(df.shape)
df.head()


(6703, 78)


Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packets Length Total,Bwd Packets Length Total,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,17,48,2,0,2944,0,1472,1472,1472.0,0.0,...,0,0.0,0.0,0,0,0,0.0,0,0,DrDoS_DNS
1,17,2,2,0,2944,0,1472,1472,1472.0,0.0,...,-1,0.0,0.0,0,0,0,0.0,0,0,DrDoS_DNS
2,17,1,2,0,2944,0,1472,1472,1472.0,0.0,...,-1,0.0,0.0,0,0,0,0.0,0,0,DrDoS_DNS
3,17,1,2,0,2944,0,1472,1472,1472.0,0.0,...,1480,0.0,0.0,0,0,0,0.0,0,0,DrDoS_DNS
4,17,1,2,0,2896,0,1448,1448,1448.0,0.0,...,0,0.0,0.0,0,0,0,0.0,0,0,DrDoS_DNS


In [4]:
X = df.drop(columns=['Label'])   # features
y = df['Label']                  # target


In [5]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_encoded = le.fit_transform(y)


In [None]:
df.columns

Index(['Protocol', 'Flow Duration', 'Total Fwd Packets',
       'Total Backward Packets', 'Fwd Packets Length Total',
       'Bwd Packets Length Total', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Min', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s',
       'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
       'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
       'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std',
       'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags',
       'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Packet Length Min', 'Packet Length Max', 'Packet Length Mean',
       'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count',
       'SYN Flag Count', 'RST Fla

In [7]:
df["Label"].unique()

array(['DrDoS_DNS', 'Benign'], dtype=object)

In [8]:
selected_features = [
    'Protocol',
    'Total Fwd Packets',
    'Fwd Packets Length Total',
    'Fwd Packet Length Max',
    'Fwd Packet Length Mean',
    'Bwd Packet Length Max',
    'Bwd Packet Length Mean',
    'Flow Bytes/s',
    'Flow IAT Mean',
    'Flow IAT Max',
    'Flow Duration',
    'Total Backward Packets',
    'Bwd Packets Length Total',
    'Fwd Packet Length Min',
    'Fwd Packet Length Std',
    'Bwd Packet Length Min',
    'Bwd Packet Length Std',
    'Flow Packets/s',
    'Flow IAT Std',
    'Flow IAT Min'
]


In [9]:
df_benign = df[df['Label'] == 'Benign']


In [10]:
df_benign_features = df_benign[selected_features]


In [11]:
print(df_benign_features.shape)
df_benign_features.head()


(3034, 20)


Unnamed: 0,Protocol,Total Fwd Packets,Fwd Packets Length Total,Fwd Packet Length Max,Fwd Packet Length Mean,Bwd Packet Length Max,Bwd Packet Length Mean,Flow Bytes/s,Flow IAT Mean,Flow IAT Max,Flow Duration,Total Backward Packets,Bwd Packets Length Total,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Min,Bwd Packet Length Std,Flow Packets/s,Flow IAT Std,Flow IAT Min
3669,6,26,616,297,23.692308,384,29.538462,12.484017,2173760.0,10007500,110861755,26,768,0,80.46006,0,104.350655,0.469053,4077410.2,1
3670,6,9,8,1,0.888889,31,6.2,1.735465,2240833.8,10018634,40335006,10,62,0,0.333333,0,13.070747,0.471055,4264280.0,1
3671,0,56,0,0,0.0,0,0.0,0.0,2058993.4,9882838,113244633,0,0,0,0.0,0,0.0,0.494505,3839703.2,0
3672,6,21,20,1,0.952381,0,0.0,0.209142,2390723.8,10016037,95628949,20,0,0,0.218218,0,0.0,0.42874,4244399.0,2
3673,6,21,20,1,0.952381,0,0.0,0.209176,2390331.0,10004365,95613243,20,0,0,0.218218,0,0.0,0.428811,4237871.5,1


In [None]:
df = df.dropna()

X = df.drop(columns=["Label"])
y = df["Label"]

le = LabelEncoder()
y = le.fit_transform(y)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)


In [None]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
selector = SelectKBest(score_func=f_classif, k=20)
X_train = selector.fit_transform(X_train, y_train)
X_test = selector.transform(X_test)


  f = msb / msw


In [None]:
# Save feature names
feature_names = X.columns

# Apply SelectKBest
selector = SelectKBest(score_func=f_classif, k=20)
X_selected = selector.fit_transform(X_train, y_train)

# Get selected feature indices
selected_idx = selector.get_support(indices=True)

# Get selected feature names
selected_features = feature_names[selected_idx]

print("Selected Features:")
for f in selected_features:
    print(f)


Selected Features:
Protocol
Flow Duration
Total Fwd Packets
Total Backward Packets
Fwd Packets Length Total
Bwd Packets Length Total
Fwd Packet Length Max
Fwd Packet Length Min
Fwd Packet Length Mean
Fwd Packet Length Std
Bwd Packet Length Max
Bwd Packet Length Min
Bwd Packet Length Mean
Bwd Packet Length Std
Flow Bytes/s
Flow Packets/s
Flow IAT Mean
Flow IAT Std
Flow IAT Max
Flow IAT Min


In [None]:
models = {
   
    
    "RF": RandomForestClassifier(
    n_estimators=300,
    max_depth=20,          # ↓ controls memorization
    min_samples_split=10,  # ↓ prevents deep splits
    min_samples_leaf=5,    # ↓ smooths decision boundaries
    max_features="sqrt",   # ↓ decorrelates trees
    bootstrap=True,
    random_state=42,
    n_jobs=-1
)
,
}

for m in models.values():
    m.fit(X_train, y_train)


In [None]:
for name, model in models.items():
    preds = model.predict(X_test)
    print(name,
          accuracy_score(y_test, preds),
          f1_score(y_test, preds, average="macro"))


RF 0.9955246146195923 0.9954851566949221


In [None]:
for name, model in models.items():
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    print(name)
    print("Train Acc:", accuracy_score(y_train, train_pred))
    print("Test Acc :", accuracy_score(y_test, test_pred))
    print()


RF
Train Acc: 0.9968030690537084
Test Acc : 0.9955246146195923



In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

for name, model in models.items():
    y_pred = model.predict(X_test)

    print(name)
    print("Precision:", precision_score(y_test, y_pred, average="macro"))
    print("Recall   :", recall_score(y_test, y_pred, average="macro"))
    print("F1-score :", f1_score(y_test, y_pred, average="macro"))
    print()


RF
Precision: 0.9953480090256706
Recall   : 0.9956268527113215
F1-score : 0.9954851566949221



In [None]:
centroids = {}
for c in np.unique(y_train):
    centroids[c] = X_train[y_train == c].mean(axis=0)


: 

In [None]:
def rpl_predict(x, threshold=5.0):
    dists = [np.linalg.norm(x - centroids[c]) for c in centroids]
    if min(dists) > threshold:
        return -1   # unknown
    return list(centroids.keys())[np.argmin(dists)]


: 

In [None]:
unknown_count = 0

for x in X_test:
    if rpl_predict(x) == -1:
        unknown_count += 1

print("Unknown detected:", unknown_count)


Unknown detected: 139


: 

: 