# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import torch
import warnings

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from tensorflow import keras
from tensorflow.keras import layers

from pytorch_tabnet.tab_model import TabNetClassifier

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

warnings.filterwarnings('ignore')

In [None]:
# Load data
# The dataset folder is defined once so the notebook can be pointed at another
# location by editing a single line instead of every read call.
DATA_DIR = '/content/drive/MyDrive/ML_data/TSYP'
train = pd.read_parquet(f'{DATA_DIR}/UNSW_NB15_training-set.parquet')
test = pd.read_parquet(f'{DATA_DIR}/UNSW_NB15_testing-set.parquet')

In [None]:
# Display data shape: (rows, columns) of each split, as a quick check on the load
train_shape = train.shape
test_shape = test.shape
print(f'Train shape: {train_shape}')
print(f'Test shape: {test_shape}')

Train shape: (175341, 36)
Test shape: (82332, 36)


In [None]:
# Separate categorical and numerical columns.
# Anything stored as 'category' or 'object' counts as categorical; every
# remaining column is treated as numerical.
non_numeric_dtypes = ['category', 'object']
categorical_columns = train.select_dtypes(include=non_numeric_dtypes).columns
numerical_columns = train.columns.difference(categorical_columns)

In [None]:
# Plain-list versions of the column groups with the target columns removed:
# 'attack_cat' is taken out of the categorical group and 'label' out of the
# numerical group, so neither is transformed or encoded as a feature.
categorical_columns_list = categorical_columns.drop('attack_cat', errors='ignore').tolist()
numerical_columns_list = numerical_columns.drop('label', errors='ignore').tolist()

In [None]:
# Use log1p transformation for numerical columns with more than 50 unique values to reduce skewness.
# The column selection is made on the training set only and the same columns
# are then transformed in both splits.
high_cardinality_columns = [
  name for name in numerical_columns_list if train[name].nunique() > 50
]
for name in high_cardinality_columns:
  for frame in (train, test):
    frame[name] = np.log1p(frame[name])

In [None]:
# Display value counts for categorical columns.
# For each column the first printout is the training set, the second the test set.
for col in categorical_columns_list:
  for frame in (train, test):
    print(f"Value count for {col}: ")
    print(frame[col].value_counts().head())

Value count for proto: 
proto
tcp     79946
udp     63283
unas    12084
arp      2859
ospf     2595
Name: count, dtype: int64
Value count for proto: 
proto
tcp     43095
udp     29418
unas     3515
arp       987
ospf      676
Name: count, dtype: int64
Value count for service: 
service
-           94168
dns         47294
http        18724
smtp         5058
ftp-data     3995
Name: count, dtype: int64
Value count for service: 
service
-       47153
dns     21367
http     8287
smtp     1851
ftp      1552
Name: count, dtype: int64
Value count for state: 
state
INT    82275
FIN    77825
CON    13152
REQ     1991
RST       83
Name: count, dtype: int64
Value count for state: 
state
FIN    39339
INT    34163
CON     6982
REQ     1842
ACC        4
Name: count, dtype: int64


In [None]:
# Get the most frequent categories of 'proto', 'service' and 'state' from the train set
top_prop_categories = train['proto'].value_counts().head(5).index
top_service_categories = train['service'].value_counts().head(8).index
top_state_categories = train['state'].value_counts().head(4).index

# Apply the transformation to both train and test sets: any value outside the
# kept categories (chosen on train only) is replaced by the placeholder '-'.
kept_categories = {
    'proto': top_prop_categories,
    'service': top_service_categories,
    'state': top_state_categories,
}
for column_name, kept in kept_categories.items():
    for frame in (train, test):
        frame[column_name] = np.where(frame[column_name].isin(kept), frame[column_name], '-')

In [None]:
# Report the number of distinct values per categorical column
# (first line: training set, second line: test set).
for col in categorical_columns_list:
  for frame in (train, test):
    print(f"{col} has {frame[col].nunique()} unique values")

proto has 6 unique values
proto has 6 unique values
service has 8 unique values
service has 8 unique values
state has 5 unique values
state has 5 unique values


In [None]:
# Full value counts per categorical column
# (first printout: training set, second printout: test set).
for col in categorical_columns_list:
  for frame in (train, test):
    print(f"Value count for {col}: ")
    print(frame[col].value_counts())

Value count for proto: 
proto
tcp     79946
udp     63283
-       14574
unas    12084
arp      2859
ospf     2595
Name: count, dtype: int64
Value count for proto: 
proto
tcp     43095
udp     29418
-        4641
unas     3515
arp       987
ospf      676
Name: count, dtype: int64
Value count for service: 
service
-           94435
dns         47294
http        18724
smtp         5058
ftp-data     3995
ftp          3428
ssh          1302
pop3         1105
Name: count, dtype: int64
Value count for service: 
service
-           47252
dns         21367
http         8287
smtp         1851
ftp          1552
ftp-data     1396
pop3          423
ssh           204
Name: count, dtype: int64
Value count for state: 
state
INT    82275
FIN    77825
CON    13152
REQ     1991
-         98
Name: count, dtype: int64
Value count for state: 
state
FIN    39339
INT    34163
CON     6982
REQ     1842
-          6
Name: count, dtype: int64


In [None]:
# One-hot encode categorical columns
train = pd.get_dummies(train, columns=categorical_columns_list)
# The test set is encoded separately and then forced onto the training columns:
# a dummy column absent from test is added as all zeros and a column that was
# never seen in training is dropped, so both frames always expose the same
# columns in the same order.
test = pd.get_dummies(test, columns=categorical_columns_list).reindex(columns=train.columns, fill_value=0)

In [None]:
# Split data into features and labels ('label' is the prediction target;
# 'attack_cat' is dropped from the features as well)
target_columns = ['attack_cat', 'label']

X_train = train.drop(columns=target_columns)
y_train = train['label']

# Test features are selected in the training column order
X_test = test.drop(columns=target_columns)[X_train.columns]
y_test = test['label']

In [None]:
# Summarise the training features: column names, non-null counts and dtypes
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175341 entries, 0 to 175340
Data columns (total 50 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   dur                175341 non-null  float32
 1   spkts              175341 non-null  float32
 2   dpkts              175341 non-null  float32
 3   sbytes             175341 non-null  float64
 4   dbytes             175341 non-null  float64
 5   rate               175341 non-null  float32
 6   sload              175341 non-null  float32
 7   dload              175341 non-null  float32
 8   sloss              175341 non-null  float32
 9   dloss              175341 non-null  float32
 10  sinpkt             175341 non-null  float32
 11  dinpkt             175341 non-null  float32
 12  sjit               175341 non-null  float32
 13  djit               175341 non-null  float32
 14  swin               175341 non-null  int16  
 15  stcpb              175341 non-null  float64
 16  dt

In [None]:
# Fixed Variables
# Seed passed as `random_state` to the XGBoost, LightGBM and CatBoost
# classifiers created in evaluate_multiple_models.
RANDOM_STATE = 17

In [None]:
# Empty results table; each evaluation run appends one row per model
result_columns = ['model_name', 'accuracy', 'f1', 'precision', 'recall', 'auc']
model_results = pd.DataFrame(columns=result_columns)

In [None]:
# Evaluation function
def evaluate_model(model, X_test, y_test):
  """Score a fitted classifier on held-out data.

  Parameters
  ----------
  model : fitted estimator exposing ``predict`` and ``predict_proba``.
  X_test : features to score.
  y_test : true labels for ``X_test``.

  Returns
  -------
  tuple
      ``(accuracy, f1, precision, recall, auc)``. F1, precision and recall are
      class-frequency weighted averages; AUC is computed from the predicted
      probabilities, not from the hard predictions.
  """
  y_pred = model.predict(X_test)
  accuracy = accuracy_score(y_test, y_pred)
  f1 = f1_score(y_test, y_pred, average='weighted')
  precision = precision_score(y_test, y_pred, average='weighted')
  recall = recall_score(y_test, y_pred, average='weighted')

  y_pred_proba = np.asarray(model.predict_proba(X_test))
  if y_pred_proba.ndim == 2 and y_pred_proba.shape[1] == 2:
    # Binary target: roc_auc_score expects the positive-class score only.
    y_score = y_pred_proba[:, 1]
  else:
    # More than two classes: the one-vs-rest AUC needs the full
    # (n_samples, n_classes) probability matrix.
    y_score = y_pred_proba
  auc = roc_auc_score(y_test, y_score, average='weighted', multi_class='ovr')
  return accuracy, f1, precision, recall, auc

In [None]:
# Evaluate multiple models
def _metrics_row(model_name, y_true, y_pred, y_score):
    """Build one results row from hard class predictions and continuous scores."""
    return {
        'model_name': model_name,
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, average='weighted'),
        'precision': precision_score(y_true, y_pred, average='weighted'),
        'recall': recall_score(y_true, y_pred, average='weighted'),
        'auc': roc_auc_score(y_true, y_score, average='weighted'),
    }


def evaluate_multiple_models(X_train, y_train, X_test, y_test, version_name, model_results):
    """Train five classifiers on one data variant and append their test metrics.

    The models are XGBoost, LightGBM, CatBoost, a Keras dense network and
    TabNet. The network and the AUC computations assume a binary 0/1 target.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature matrices. They are not modified; boolean columns are cast to
        int8 on internal copies and ``X_test`` is put in the column order of
        ``X_train``.
    y_train, y_test : pandas.Series
        Binary labels.
    version_name : str
        Suffix added to each model name in the results (e.g. ``'baseline'``).
    model_results : pandas.DataFrame
        Existing results table; it is not modified.

    Returns
    -------
    pandas.DataFrame
        ``model_results`` followed by one new row per model, with columns
        model_name, accuracy, f1, precision, recall and auc.
    """
    # Convert boolean columns to integers. astype returns new frames, so the
    # caller's DataFrames are left untouched.
    int8_casts = {col: 'int8' for col in X_train.select_dtypes(include=['bool']).columns}
    X_train = X_train.astype(int8_casts)
    X_test = X_test[X_train.columns].astype(int8_casts)

    # Rows are collected in a list and concatenated once at the end.
    rows = []

    # Models list and names
    # NOTE(review): 'mlogloss' is XGBoost's multi-class log-loss metric while the
    # rest of this function assumes a binary target; no eval_set is passed to
    # fit() here — confirm whether 'logloss' was intended.
    models = [
        ("XGBoost", XGBClassifier(random_state=RANDOM_STATE, eval_metric='mlogloss')),
        ("LightGBM", LGBMClassifier(random_state=RANDOM_STATE)),
        ("CatBoost", CatBoostClassifier(verbose=0, random_state=RANDOM_STATE)),
    ]

    # Train and evaluate tree-based models
    for name, model in models:
        model.fit(X_train, y_train)
        accuracy, f1, precision, recall, auc = evaluate_model(model, X_test, y_test)
        rows.append({'model_name': f'{name}_{version_name}',
                     'accuracy': accuracy,
                     'f1': f1,
                     'precision': precision,
                     'recall': recall,
                     'auc': auc})

    # Deep Learning model
    # Seed Python, NumPy and TensorFlow so weight initialisation and batch
    # shuffling are repeatable, like the seeded tree models above.
    keras.utils.set_random_seed(RANDOM_STATE)
    dl_model = keras.Sequential([
        layers.InputLayer(shape=(X_train.shape[1],)),
        layers.Dense(256, activation='relu'),
        layers.BatchNormalization(),
        layers.Dense(128, activation='relu'),
        layers.BatchNormalization(),
        layers.Dense(64, activation='relu'),
        layers.BatchNormalization(),
        layers.Dense(16, activation='relu'),
        layers.BatchNormalization(),
        layers.Dense(1)  # no activation: the network outputs raw logits
    ])

    learning_rate = 3e-2
    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    dl_model.compile(optimizer=optimizer, loss=keras.losses.BinaryCrossentropy(from_logits=True))
    dl_model.fit(X_train, y_train, epochs=10, batch_size=64, verbose=1)

    # Evaluate Deep Learning model
    dl_logits = dl_model.predict(X_test).ravel()
    # The outputs are logits, so the 0.5-probability decision boundary is
    # logit 0 (sigmoid(0) == 0.5), not 0.5.
    dl_y_pred = (dl_logits > 0).astype("int32")
    # AUC depends only on the ranking of the scores, so the continuous logits
    # are used directly; hard 0/1 labels would discard that ranking.
    rows.append(_metrics_row(f'Deep_Learning_{version_name}', y_test, dl_y_pred, dl_logits))

    # TabNet Model
    clf = TabNetClassifier(
        n_d=8,
        n_a=8,
        n_steps=3,
        gamma=1.5,
        n_independent=2,
        n_shared=2,
        lambda_sparse=1e-4,
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=2e-2),
        scheduler_params={"gamma": 0.95, "step_size": 20},
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
        mask_type='entmax',
        verbose=1
    )
    clf.fit(X_train=X_train.values, y_train=y_train.values, max_epochs=10, patience=20,
            batch_size=1024, virtual_batch_size=128, num_workers=0, drop_last=False)

    # Evaluate TabNet model
    tabnet_y_pred = clf.predict(X_test.values)
    tabnet_y_pred_proba = clf.predict_proba(X_test.values)
    rows.append(_metrics_row(f'TabNet_{version_name}', y_test, tabnet_y_pred, tabnet_y_pred_proba[:, 1]))

    return pd.concat([model_results, pd.DataFrame(rows)], ignore_index=True)


In [None]:
# Train and evaluate baseline models.
# Each run of this cell appends five more rows (one per model) to model_results.
model_results = evaluate_multiple_models(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    version_name='baseline',
    model_results=model_results,
)

[LightGBM] [Info] Number of positive: 119341, number of negative: 56000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.041440 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5739
[LightGBM] [Info] Number of data points in the train set: 175341, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.680622 -> initscore=0.756633
[LightGBM] [Info] Start training from score 0.756633
Epoch 1/10
[1m2740/2740[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 4ms/step - loss: 0.2724
Epoch 2/10
[1m2740/2740[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 5ms/step - loss: 0.2207
Epoch 3/10
[1m2740/2740[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 5ms/step - loss: 0.2599
Epoch 4/10
[1m2740/2740[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 5ms/step - loss: 0.2005
Epoch 5/10
[1m2740/2740[

In [None]:
# Display model results: one row per model, named '<model>_<version_name>',
# with accuracy, F1, precision, recall and AUC measured on the test set
model_results

Unnamed: 0,model_name,accuracy,f1,precision,recall,auc
0,XGBoost_baseline,0.87152,0.868845,0.884861,0.87152,0.980165
1,LightGBM_baseline,0.862508,0.858948,0.880255,0.862508,0.980667
2,CatBoost_baseline,0.871168,0.868412,0.885045,0.871168,0.980447
3,Deep_Learning_baseline,0.792292,0.779047,0.840607,0.792292,0.770125
4,TabNet_baseline,0.819268,0.810352,0.856883,0.819268,0.972947


# Experiments: removing outliers and duplicate rows

In [None]:
# Function to remove outliers using a quantile-range fence (IQR-style rule)
def remove_outliers_iqr(data, column, lower_q=0.05, upper_q=0.95, k=1.5):
    """Drop the rows whose value in ``column`` lies outside a quantile fence.

    The fence is ``[Q_low - k * R, Q_high + k * R]`` where ``Q_low`` and
    ``Q_high`` are the ``lower_q`` / ``upper_q`` quantiles of the column and
    ``R = Q_high - Q_low``. Despite the function name, the defaults use the
    5th and 95th percentiles rather than the quartiles, which is a much wider
    fence than the classical IQR rule; pass ``lower_q=0.25, upper_q=0.75``
    for the classical rule.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Name of the numeric column to filter on.
    lower_q, upper_q : float, optional
        Quantiles in [0, 1] defining the range, with ``lower_q <= upper_q``.
    k : float, optional
        Multiple of the range added on each side of the two quantiles.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` inside the fence (bounds inclusive). Rows where
        the column is NaN fail both comparisons and are dropped too.

    Raises
    ------
    ValueError
        If the quantiles are not ordered within [0, 1].
    """
    if not 0 <= lower_q <= upper_q <= 1:
        raise ValueError(
            f"Expected 0 <= lower_q <= upper_q <= 1, got lower_q={lower_q}, upper_q={upper_q}"
        )

    values = data[column]
    q_low = values.quantile(lower_q)
    q_high = values.quantile(upper_q)
    spread = q_high - q_low
    lower_bound = q_low - k * spread
    upper_bound = q_high + k * spread
    # Return only the rows that are within the bounds
    return data[(values >= lower_bound) & (values <= upper_bound)]

In [None]:
# Start the outlier experiment from a copy so that `train` itself is left unchanged
train_without_outilers = train.copy()

In [None]:
# Apply the remove_outliers_iqr function to all numerical columns.
# The filters are cumulative: each column is filtered on the rows that survived
# the previous columns, and the per-column drop count is printed.
# NOTE(review): numerical_columns_list is not limited to continuous features, so
# flag- or count-like columns go through the same rule — confirm this is intended.
for column in numerical_columns_list:
    rows_before = len(train_without_outilers)
    train_without_outilers = remove_outliers_iqr(train_without_outilers, column)
    rows_removed = rows_before - len(train_without_outilers)
    print(f"Removed {rows_removed} outliers from {column}")

Removed 265 outliers from ackdat
Removed 46 outliers from ct_dst_sport_ltm
Removed 1070 outliers from ct_flw_http_mthd
Removed 2564 outliers from ct_ftp_cmd
Removed 0 outliers from ct_src_dport_ltm
Removed 0 outliers from dbytes
Removed 0 outliers from dinpkt
Removed 0 outliers from djit
Removed 0 outliers from dload
Removed 8 outliers from dloss
Removed 0 outliers from dmean
Removed 0 outliers from dpkts
Removed 0 outliers from dtcpb
Removed 2740 outliers from dur
Removed 0 outliers from dwin
Removed 0 outliers from is_ftp_login
Removed 2616 outliers from is_sm_ips_ports
Removed 0 outliers from rate
Removed 402 outliers from response_body_len
Removed 0 outliers from sbytes
Removed 0 outliers from sinpkt
Removed 0 outliers from sjit
Removed 0 outliers from sload
Removed 32 outliers from sloss
Removed 0 outliers from smean
Removed 0 outliers from spkts
Removed 0 outliers from stcpb
Removed 0 outliers from swin
Removed 155 outliers from synack
Removed 0 outliers from tcprtt
Removed 13 ou

In [None]:
# Features/labels for the outlier-filtered experiment; the test set is unchanged
target_columns = ['attack_cat', 'label']

X_train = train_without_outilers.drop(columns=target_columns)
y_train = train_without_outilers['label']

# Test features are selected in the training column order
X_test = test.drop(columns=target_columns)[X_train.columns]
y_test = test['label']

In [None]:
# Train and evaluate the same models on the outlier-filtered training set
model_results = evaluate_multiple_models(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    version_name='without_outliers',
    model_results=model_results,
)

[LightGBM] [Info] Number of positive: 114601, number of negative: 50829
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.118057 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5631
[LightGBM] [Info] Number of data points in the train set: 165430, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.692746 -> initscore=0.812989
[LightGBM] [Info] Start training from score 0.812989
Epoch 1/10
[1m2585/2585[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 5ms/step - loss: 0.2604
Epoch 2/10
[1m2585/2585[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 5ms/step - loss: 0.2065
Epoch 3/10
[1m2585/2585[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 5ms/step - loss: 0.2231
Epoch 4/10
[1m2585/2585[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 5ms/step - loss: 0.1790
Epoch 5/10
[1m2585/2585[

In [None]:
# Display the accumulated results table
model_results

Unnamed: 0,model_name,accuracy,f1,precision,recall,auc
0,XGBoost_baseline,0.87152,0.868845,0.884861,0.87152,0.980165
1,LightGBM_baseline,0.862508,0.858948,0.880255,0.862508,0.980667
2,CatBoost_baseline,0.871168,0.868412,0.885045,0.871168,0.980447
3,Deep_Learning_baseline,0.792292,0.779047,0.840607,0.792292,0.770125
4,TabNet_baseline,0.819268,0.810352,0.856883,0.819268,0.972947
5,XGBoost_without_outliers,0.867889,0.865065,0.88152,0.867889,0.978161
6,LightGBM_without_outliers,0.86054,0.856983,0.877733,0.86054,0.978157
7,CatBoost_without_outliers,0.870463,0.867663,0.884512,0.870463,0.979803
8,Deep_Learning_without_outliers,0.665974,0.608897,0.77887,0.665974,0.629137
9,TabNet_without_outliers,0.809965,0.800717,0.845135,0.809965,0.952734


In [None]:
# Remove duplicate rows from the train DataFrame: the first occurrence of each
# fully identical row is kept; `train` itself is not modified.
is_repeated_row = train.duplicated()
train_without_duplicates = train[~is_repeated_row]

# Print the new shape of the train DataFrame
deduplicated_shape = train_without_duplicates.shape
print(f'Train shape after removing duplicates: {deduplicated_shape}')

Train shape after removing duplicates: (93535, 52)


In [None]:
# Features/labels for the de-duplicated experiment; the test set is unchanged
X_train = train_without_duplicates.drop(['attack_cat', 'label'], axis=1)
y_train = train_without_duplicates['label']

X_test = test.drop(['attack_cat', 'label'], axis=1)
y_test = test['label']

# Put the test features in the training column order, as the other splits in
# this notebook do, so the models always see the same feature layout.
X_test = X_test[X_train.columns]

In [None]:
# Train and evaluate the same models on the de-duplicated training set
model_results = evaluate_multiple_models(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    version_name='without_duplicates',
    model_results=model_results,
)

[LightGBM] [Info] Number of positive: 44946, number of negative: 48589
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019923 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5750
[LightGBM] [Info] Number of data points in the train set: 93535, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.480526 -> initscore=-0.077935
[LightGBM] [Info] Start training from score -0.077935
Epoch 1/10
[1m1462/1462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - loss: 0.3768
Epoch 2/10
[1m1462/1462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - loss: 0.3260
Epoch 3/10
[1m1462/1462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 6ms/step - loss: 0.2769
Epoch 4/10
[1m1462/1462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - loss: 0.2641
Epoch 5/10
[1m1462/1462[0m 

In [None]:
# Display the accumulated results table
model_results

Unnamed: 0,model_name,accuracy,f1,precision,recall,auc
0,XGBoost_baseline,0.87152,0.868845,0.884861,0.87152,0.980165
1,LightGBM_baseline,0.862508,0.858948,0.880255,0.862508,0.980667
2,CatBoost_baseline,0.871168,0.868412,0.885045,0.871168,0.980447
3,Deep_Learning_baseline,0.792292,0.779047,0.840607,0.792292,0.770125
4,TabNet_baseline,0.819268,0.810352,0.856883,0.819268,0.972947
5,XGBoost_without_outliers,0.867889,0.865065,0.88152,0.867889,0.978161
6,LightGBM_without_outliers,0.86054,0.856983,0.877733,0.86054,0.978157
7,CatBoost_without_outliers,0.870463,0.867663,0.884512,0.870463,0.979803
8,Deep_Learning_without_outliers,0.665974,0.608897,0.77887,0.665974,0.629137
9,TabNet_without_outliers,0.809965,0.800717,0.845135,0.809965,0.952734
