# Multi-Class Classification for DDoS Attacks

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, f1_score, recall_score, confusion_matrix
import time
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.compose import ColumnTransformer


In [4]:
# Read the UNSW-NB15 CSV; the path is relative to the notebook's working directory.
dataset_csv = 'unsw_nb15_dataset.csv'
df = pd.read_csv(dataset_csv)

In [5]:
# Distinct values found in the `proto` column, in order of first appearance.
pd.unique(df['proto'])

array(['udp', 'arp', 'tcp', 'igmp', 'ospf', 'sctp', 'gre', 'ggp', 'ip',
       'ipnip', 'st2', 'argus', 'chaos', 'egp', 'emcon', 'nvp', 'pup',
       'xnet', 'mux', 'dcn', 'hmp', 'prm', 'trunk-1', 'trunk-2',
       'xns-idp', 'leaf-1', 'leaf-2', 'irtp', 'rdp', 'netblt', 'mfe-nsp',
       'merit-inp', '3pc', 'idpr', 'ddp', 'idpr-cmtp', 'tp++', 'ipv6',
       'sdrp', 'ipv6-frag', 'ipv6-route', 'idrp', 'mhrp', 'i-nlsp', 'rvd',
       'mobile', 'narp', 'skip', 'tlsp', 'ipv6-no', 'any', 'ipv6-opts',
       'cftp', 'sat-expak', 'ippc', 'kryptolan', 'sat-mon', 'cpnx', 'wsn',
       'pvp', 'br-sat-mon', 'sun-nd', 'wb-mon', 'vmtp', 'ttp', 'vines',
       'nsfnet-igp', 'dgp', 'eigrp', 'tcf', 'sprite-rpc', 'larp', 'mtp',
       'ax.25', 'ipip', 'aes-sp3-d', 'micp', 'encap', 'pri-enc', 'gmtp',
       'ifmp', 'pnni', 'qnx', 'scps', 'cbt', 'bbn-rcc', 'igp', 'bna',
       'swipe', 'visa', 'ipcv', 'cphb', 'iso-tp4', 'wb-expak', 'sep',
       'secure-vmtp', 'xtp', 'il', 'rsvp', 'unas', 'fc', 'iso-ip',


In [6]:
# Drop the `id` column. Reassigning (instead of inplace=True) together with
# errors='ignore' keeps this cell re-runnable: executing it a second time no
# longer raises KeyError once 'id' is already gone.
df = df.drop(columns=['id'], errors='ignore')

# Keep the attack category as is for multi-classification.
# Rows with a missing category are treated as benign traffic. The fill value
# must use the dataset's own spelling ('Normal'); a lowercase 'normal' would be
# label-encoded as a separate, spurious class.
df['attack_cat'] = df['attack_cat'].fillna('Normal')

# Print distribution of attack categories
print("Attack Category Distribution:")
print(df['attack_cat'].value_counts())

Attack Category Distribution:
attack_cat
Normal            37000
Generic           18871
Exploits          11132
Fuzzers            6062
DoS                4089
Reconnaissance     3496
Analysis            677
Backdoor            583
Shellcode           378
Worms                44
Name: count, dtype: int64


In [7]:
# Numeric preprocessing in two passes: cap heavy upper tails at the 95th
# percentile, then log-scale columns with many distinct values.
# All decisions (max, median, nunique, min) are read from `df_numeric`, a
# snapshot taken before any column is modified; the transforms are applied to `df`.
# NOTE(review): re-running this cell transforms already-transformed columns again.
df_numeric = df.select_dtypes(include=[np.number])

# Pass 1 - a column is clamped when its max exceeds both 10 and ten times its median.
for feature in df_numeric.columns:
    snapshot = df_numeric[feature]
    peak = snapshot.max()
    if peak > 10 * snapshot.median() and peak > 10:
        cap = df[feature].quantile(0.95)
        df[feature] = np.where(df[feature] < cap, df[feature], cap)

# Pass 2 - columns with more than 50 distinct values get a natural log;
# columns whose snapshot minimum is 0 are shifted by 1 first to avoid log(0).
for feature in df_numeric.columns:
    snapshot = df_numeric[feature]
    if snapshot.nunique() > 50:
        if snapshot.min() == 0:
            df[feature] = np.log(df[feature] + 1)
        else:
            df[feature] = np.log(df[feature])

In [8]:
# Reduce the cardinality of non-numeric feature columns: when a column has more
# than 6 distinct values, keep its 5 most frequent ones (value_counts().head())
# and replace everything else with the placeholder '-'.
df_cat = df.select_dtypes(exclude=[np.number])
for feature in df_cat.columns:
    # The target column is never collapsed; low-cardinality columns are left alone.
    if feature == 'attack_cat' or df_cat[feature].nunique() <= 6:
        continue
    top_values = df[feature].value_counts().head().index
    df[feature] = np.where(df[feature].isin(top_values), df[feature], '-')


In [9]:
# Turn the attack category strings into integer class codes.
# `le` is kept so the codes can be mapped back to category names later.
le = LabelEncoder().fit(df['attack_cat'])
y = le.transform(df['attack_cat'])

In [10]:
# Category name -> integer code lookup, kept for interpreting predictions later.
class_codes = le.transform(le.classes_)
label_mapping = {name: code for name, code in zip(le.classes_, class_codes)}

print("Label Mapping:")
for category, label in label_mapping.items():
    print(f"{label}: {category}")

Label Mapping:
0: Analysis
1: Backdoor
2: DoS
3: Exploits
4: Fuzzers
5: Generic
6: Normal
7: Reconnaissance
8: Shellcode
9: Worms


In [11]:
# Apply One-Hot Encoding to categorical features (excluding attack_cat)
# Columns that must never reach the model as features: the multi-class target
# itself and the binary `label` column. `label` is presumably the attack/normal
# flag shipped with UNSW-NB15 - verify against the CSV header. Leaving it in X
# leaks the 'Normal' class directly into the features. errors='ignore' keeps
# the cell working when `label` is absent.
target_cols = ['attack_cat', 'label']
features_df = df.drop(columns=target_cols, errors='ignore')

categorical_cols = list(features_df.select_dtypes(exclude=[np.number]).columns)
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(handle_unknown='ignore'), categorical_cols)],
    remainder='passthrough',
    # Always return a dense array. With the default threshold a sufficiently
    # sparse result comes back as a scipy sparse matrix, which np.array() would
    # wrap into a useless 0-d object array.
    sparse_threshold=0,
)
X = np.asarray(ct.fit_transform(features_df))

In [12]:
# Chi-square scoring of every feature against the class labels.
# With k='all' no column is removed, so this step only records the scores on
# `best_features`; X keeps its full width.
best_features = SelectKBest(score_func=chi2, k='all').fit(X, y)
X = best_features.transform(X)

In [13]:
# Standardize every feature column with StandardScaler.
# NOTE(review): the scaler is fitted on the full matrix before the train/test
# split below, so test rows contribute to the scaling statistics - confirm
# this is acceptable.
sc = StandardScaler().fit(X)
X = sc.transform(X)

In [14]:
# Hold out 20% of the rows for testing. The split is stratified on y so each
# attack category keeps its proportion in both parts, and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [15]:
# Train XGBoost model with multi-class configurations
# NOTE(review): the output recorded for this cell is an XGBoostError raised in
# xgboost's CUDA code while the data was being handed over. That looks like an
# environment problem (GPU build of xgboost without a usable device) rather
# than a problem with these parameters - confirm on the target machine.
print("Training XGBoost model...")
start_time = time.time()
xgb_model = XGBClassifier(
    objective='multi:softmax',
    num_class=len(np.unique(y)),
    eval_metric='mlogloss',
    colsample_bytree=0.8,
    learning_rate=0.05,
    max_depth=12,
    n_estimators=400,
    subsample=0.8,
    gamma=0.1,
    reg_lambda=1.0,
    reg_alpha=0.5,
    # Row/column subsampling is random; pin the seed explicitly, matching the
    # seed used for the split and the Random Forest.
    random_state=42,
    # `use_label_encoder` was removed here: it is deprecated and current
    # xgboost releases ignore it with a warning. y is already integer-encoded
    # by LabelEncoder.
    tree_method='hist'  # Can be faster for large datasets
)
xgb_model.fit(X_train, y_train)
xgb_training_time = time.time() - start_time
print(f"XGBoost training completed in {xgb_training_time:.2f} seconds")

y_pred_xgb = xgb_model.predict(X_test)

Training XGBoost model...


XGBoostError: [14:05:16] /workspace/src/data/array_interface.cu:44: Check failed: err == cudaGetLastError() (0 vs. 46) : 
Stack trace:
  [bt] (0) /home/sagar0418/Academics/6th sem/da/project/DDoS-Attacks-Prediction/venv/lib/python3.12/site-packages/xgboost/lib/libxgboost.so(+0x25c1ac) [0x7b1688e5c1ac]
  [bt] (1) /home/sagar0418/Academics/6th sem/da/project/DDoS-Attacks-Prediction/venv/lib/python3.12/site-packages/xgboost/lib/libxgboost.so(+0xa3f4bc) [0x7b168963f4bc]
  [bt] (2) /home/sagar0418/Academics/6th sem/da/project/DDoS-Attacks-Prediction/venv/lib/python3.12/site-packages/xgboost/lib/libxgboost.so(+0x4e3a2e) [0x7b16890e3a2e]
  [bt] (3) /home/sagar0418/Academics/6th sem/da/project/DDoS-Attacks-Prediction/venv/lib/python3.12/site-packages/xgboost/lib/libxgboost.so(XGDMatrixSetInfoFromInterface+0xb2) [0x7b1688d63522]
  [bt] (4) /lib/x86_64-linux-gnu/libffi.so.8(+0x7b16) [0x7b16f6d1fb16]
  [bt] (5) /lib/x86_64-linux-gnu/libffi.so.8(+0x43ef) [0x7b16f6d1c3ef]
  [bt] (6) /lib/x86_64-linux-gnu/libffi.so.8(ffi_call+0x12e) [0x7b16f6d1f0be]
  [bt] (7) /usr/lib/python3.12/lib-dynload/_ctypes.cpython-312-x86_64-linux-gnu.so(+0xe11c) [0x7b16f6d9911c]
  [bt] (8) /usr/lib/python3.12/lib-dynload/_ctypes.cpython-312-x86_64-linux-gnu.so(+0x92af) [0x7b16f6d942af]



In [None]:
# Fit the Random Forest baseline and time the fit.
print("Training Random Forest model...")
start_time = time.time()

rf_params = {
    'n_estimators': 400,
    'max_depth': 12,
    'random_state': 42,
    'class_weight': 'balanced',
    'n_jobs': -1,  # Use all available cores
}
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)

rf_training_time = time.time() - start_time
print(f"Random Forest training completed in {rf_training_time:.2f} seconds")

# Predictions on the held-out rows, used by the evaluation cells.
y_pred_rf = rf_model.predict(X_test)

In [None]:
def _weighted_scores(y_true, y_pred):
    """Return (accuracy, precision, recall, F1) for one set of predictions.

    Precision, recall and F1 are averaged with average='weighted'.
    zero_division=0 yields the same values as the default but without a
    warning for every class that is never predicted.
    """
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, average='weighted', zero_division=0),
        recall_score(y_true, y_pred, average='weighted', zero_division=0),
        f1_score(y_true, y_pred, average='weighted', zero_division=0),
    )


def _print_performance(model_name, accuracy, precision, recall, f1_weighted, training_time):
    """Print the summary metrics of one model.

    The precision and recall lines are labelled "(Weighted)" because that is
    the averaging actually used; they were previously printed as "(Macro)".
    """
    print(f"\n{model_name} Model Performance:")
    print("Accuracy: ", "{:.2%}".format(accuracy))
    print("Precision Score (Weighted): ", "{:.2%}".format(precision))
    print("Recall Score (Weighted): ", "{:.2%}".format(recall))
    print("F1-Score (Weighted): ", "{:.2%}".format(f1_weighted))
    print("Training Time: ", "{:.2f} seconds".format(training_time))


# Evaluate XGBoost model
accuracy_xgb, precision_xgb, recall_xgb, f1_weighted_xgb = _weighted_scores(y_test, y_pred_xgb)

# Evaluate Random Forest model
accuracy_rf, precision_rf, recall_rf, f1_weighted_rf = _weighted_scores(y_test, y_pred_rf)

# Print performance metrics for both models
_print_performance("XGBoost", accuracy_xgb, precision_xgb, recall_xgb,
                   f1_weighted_xgb, xgb_training_time)
_print_performance("Random Forest", accuracy_rf, precision_rf, recall_rf,
                   f1_weighted_rf, rf_training_time)

In [None]:
# Per-class precision / recall / F1 for each model, as plain-text tables.
# Rows are labelled with the original category names held by the label encoder.
print("\nDetailed Classification Report for XGBoost:")
xgb_report = classification_report(
    y_true=y_test,
    y_pred=y_pred_xgb,
    target_names=le.classes_,
)
print(xgb_report)

print("\nDetailed Classification Report for Random Forest:")
rf_report = classification_report(
    y_true=y_test,
    y_pred=y_pred_rf,
    target_names=le.classes_,
)
print(rf_report)

# Text rendering of a confusion matrix with class-name headers
def plot_confusion_matrix(y_true, y_pred, model_name, class_names):
    """Print a tab-separated confusion matrix for one model and return it.

    Despite the name, nothing is drawn: the matrix is written to stdout with
    one header line and one line per matrix row.

    Parameters:
        y_true: true class labels.
        y_pred: predicted class labels.
        model_name: label used in the title line.
        class_names: sequence of class names; entry ``i`` labels row and
            column ``i``. Names are shortened to their first four characters.

    Returns:
        The matrix produced by ``confusion_matrix(y_true, y_pred)``.

    NOTE(review): the matrix is built without an explicit label list, so row
    ``i`` is assumed to correspond to ``class_names[i]`` - verify when a class
    may be missing from both ``y_true`` and ``y_pred``.
    """
    cm = confusion_matrix(y_true, y_pred)
    print(f"\nConfusion Matrix for {model_name}:")

    # Header: corner label followed by the abbreviated class names.
    header_cells = ["True\\Pred"] + [f"{name[:4]}" for name in class_names]
    print("\t".join(header_cells))

    # Body: abbreviated row name followed by that row's counts.
    for row_idx, counts in enumerate(cm):
        row_cells = [f"{class_names[row_idx][:4]}"]
        row_cells.extend(f"{val}" for val in counts)
        print("\t".join(row_cells))

    return cm

# Print (and keep) the confusion matrix of each model on the held-out rows
xgb_cm = plot_confusion_matrix(
    y_true=y_test,
    y_pred=y_pred_xgb,
    model_name="XGBoost",
    class_names=le.classes_,
)
rf_cm = plot_confusion_matrix(
    y_true=y_test,
    y_pred=y_pred_rf,
    model_name="Random Forest",
    class_names=le.classes_,
)

# Feature importance analysis for interpretability
def _print_top_features(model, model_name, feature_names, top_n=15):
    """Print the ``top_n`` largest entries of ``model.feature_importances_``.

    Parameters:
        model: fitted estimator; skipped silently if it exposes no
            ``feature_importances_`` attribute.
        model_name: label used in the heading line.
        feature_names: one name per column of the matrix the model was
            trained on, in column order.
        top_n: number of features to list.

    Raises:
        ValueError: if the number of names differs from the number of
            importances, since the printed names would then be misattributed.
    """
    if not hasattr(model, 'feature_importances_'):
        return
    importances = model.feature_importances_
    if len(feature_names) != len(importances):
        raise ValueError(
            f"{model_name}: got {len(feature_names)} feature names for "
            f"{len(importances)} importances"
        )
    print(f"\nTop {top_n} Features ({model_name}):")
    for i in np.argsort(importances)[::-1][:top_n]:
        print(f"{feature_names[i]}: {importances[i]:.4f}")


# The models were trained on the ColumnTransformer output (one-hot columns
# first, passthrough columns after), not on the raw DataFrame columns, so the
# names must come from the fitted transformer. The SelectKBest support mask is
# applied so the names stay aligned with the columns that step kept.
feature_names = ct.get_feature_names_out()[best_features.get_support()]

_print_top_features(xgb_model, "XGBoost", feature_names)
_print_top_features(rf_model, "Random Forest", feature_names)