***IMPORTS***

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree  import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
import time

Collecting xgboost
  Downloading xgboost-2.1.1-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)
Downloading xgboost-2.1.1-py3-none-manylinux_2_28_x86_64.whl (153.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.9/153.9 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl (199.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.0/199.0 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.23.4 xgboost-2.1.1


In [None]:
# Install a blanket "ignore" filter: with no category given it applies to every
# Warning subclass, so no warnings are shown by code that runs after this cell.
import warnings
warnings.filterwarnings("ignore")

***READING DATA***

In [None]:
# Attach Google Drive to the Colab runtime at a fixed mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Read the two UNSW-NB15 CSV partitions from the mounted Drive and stack them
# row-wise into a single frame.
train_data = pd.read_csv("/content/drive/MyDrive/UNSW_NB15_training-set.csv")
test_data = pd.read_csv("/content/drive/MyDrive/UNSW_NB15_testing-set.csv")
# ignore_index=True renumbers the combined rows 0..n-1. Without it each partition
# keeps its own 0-based labels, so every label in the overlap appears twice and
# label-based selection or alignment on `data` becomes ambiguous.
data = pd.concat([train_data, test_data], ignore_index=True)

***DECIDING IMPORTANT FEATURES***

In [None]:
# Names of the columns of `data` that are used as model inputs.
features = [
    'dur', 'proto', 'service', 'state', 'rate',
    'sbytes', 'dbytes', 'synack',
    'ct_srv_src', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm',
    'ct_dst_src_ltm', 'ct_src_ltm', 'ct_srv_dst',
]

# Feature matrix: a separate frame holding only the selected columns.
X = pd.DataFrame(data.loc[:, features])

***CONVERTING CATEGORICAL DATA INTO NUMERICAL***

In [None]:
def encode_by_first_appearance(column):
    """Replace each distinct value of a Series with an integer code.

    Codes are assigned in order of first appearance (0 for the first distinct
    value seen, 1 for the next, and so on).

    Parameters
    ----------
    column : pandas.Series
        Column holding the categorical values.

    Returns
    -------
    tuple
        ``(encoded, mapping)`` where ``encoded`` is the column with every value
        looked up in ``mapping``, and ``mapping`` is the ``{value: code}`` dict.
    """
    mapping = {category: code for code, category in enumerate(column.unique())}
    # Series.map is a plain dictionary lookup; unlike Series.replace it does not
    # depend on pandas inferring a numeric dtype for an object column afterwards.
    return column.map(mapping), mapping


# Keep every mapping so integer codes can be translated back to their labels.
category_mappings = {}

# Multi-class target lives on `data`; the categorical features live on `X`.
data['attack_cat'], category_mappings['attack_cat'] = encode_by_first_appearance(data['attack_cat'])

for column_name in ('proto', 'service', 'state'):
    X[column_name], category_mappings[column_name] = encode_by_first_appearance(X[column_name])

***MODELS AND FUNCTION FOR EVALUATING***

In [None]:
# Baseline classifiers keyed by the display name used in the printed reports.
# Every estimator that accepts a seed gets random_state=42 (the same value the
# train/test split uses) so repeated runs produce the same fitted models.
# KNeighborsClassifier has no random_state parameter.
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "LinearSVC": LinearSVC(random_state=42),
    "XGB Classifier": XGBClassifier(random_state=42)
}

In [None]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model``, predict on the test features, and score the predictions.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train
        Training features and targets passed to ``model.fit``.
    X_test, y_test
        Test features passed to ``model.predict`` and the true targets the
        predictions are compared against.

    Returns
    -------
    tuple
        ``(acc, report, cm, execution_time)``: the results of
        ``accuracy_score``, ``classification_report`` and ``confusion_matrix``
        on ``(y_test, y_pred)``, followed by the elapsed seconds spent in
        fitting plus predicting.
    """
    # perf_counter is monotonic, so the measured interval cannot be distorted by
    # the system clock being adjusted while the model trains.
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    execution_time = time.perf_counter() - start_time

    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    return acc, report, cm, execution_time

# ***BINARY CLASSIFICATION***

In [None]:
# Binary target: the 'label' column as a one-column frame.
y = data['label'].to_frame()

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
binary_split = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = binary_split

***NORMALIZING DATA***

In [None]:
# Learn the per-feature min/max from the training rows only, then apply that
# same transform to both partitions (replacing the unscaled versions).
scaler = MinMaxScaler()
scaler.fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Train and score each baseline classifier in turn, printing its metrics.
for model_name, model in models.items():
    print(f"Training {model_name}...")
    accuracy, report, cm, execution_time = evaluate_model(
        model, X_train, y_train, X_test, y_test
    )
    summary_lines = (
        f"\n{model_name} Accuracy: {accuracy:.4f}",
        f"Classification Report for {model_name}:\n{report}",
        f"Confusion Matrix for {model_name}:\n{cm}\n",
        f"Time for {model_name}:{execution_time:.2f} seconds\n",
        "\n------------------------------------------------------------------------------------------------------------------------------\n",
    )
    # One print per entry in the original layout: each piece ends with a newline.
    print(*summary_lines, sep="\n")

Training Decision Tree...

Decision Tree Accuracy: 0.9399
Classification Report for Decision Tree:
              precision    recall  f1-score   support

           0       0.92      0.91      0.92     18613
           1       0.95      0.95      0.95     32922

    accuracy                           0.94     51535
   macro avg       0.94      0.93      0.93     51535
weighted avg       0.94      0.94      0.94     51535

Confusion Matrix for Decision Tree:
[[17009  1604]
 [ 1491 31431]]

Time for Decision Tree:1.30 seconds


------------------------------------------------------------------------------------------------------------------------------

Training KNN...

KNN Accuracy: 0.9169
Classification Report for KNN:
              precision    recall  f1-score   support

           0       0.89      0.88      0.88     18613
           1       0.93      0.94      0.94     32922

    accuracy                           0.92     51535
   macro avg       0.91      0.91      0.91     51535

## ***HYPERPARAMETER TUNING FOR DT, RF, XGB***

In [None]:
# Candidate settings for the decision tree; every combination is cross-validated.
param_grid_dt = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# A fixed random_state makes each candidate's fit repeatable, so the selected
# hyperparameters do not change from one run of this cell to the next.
grid_search_dt = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid_dt,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search_dt.fit(X_train, y_train)

print("Best Hyperparameters for Decision Tree:", grid_search_dt.best_params_)

best_dt_model = grid_search_dt.best_estimator_

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best Hyperparameters for Decision Tree: {'criterion': 'gini', 'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [None]:
# Candidate settings for the random forest; every combination is cross-validated.
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# A fixed random_state pins the forest's bootstrap and feature sampling, so the
# cross-validation scores (and therefore the winning combination) are repeatable.
grid_search_rf = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid_rf,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search_rf.fit(X_train, y_train)

print("Best Hyperparameters for Random Forest:", grid_search_rf.best_params_)

best_rf_model = grid_search_rf.best_estimator_

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Hyperparameters for Random Forest: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}


In [None]:
# Candidate settings for the binary XGBoost classifier.
param_grid_xgb = dict(
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[100, 200, 300],
    max_depth=[3, 4, 5],
    objective=['binary:logistic'],
)

# Exhaustive 5-fold search over the grid, scored by accuracy, using all cores.
grid_search_xgb = GridSearchCV(
    XGBClassifier(),
    param_grid_xgb,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search_xgb.fit(X_train, y_train)

print(f"Best Hyperparameters for XGB Classifier: {grid_search_xgb.best_params_}")

# Estimator refit by the search with the winning combination.
best_xgb_model = grid_search_xgb.best_estimator_

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best Hyperparameters for XGB Classifier: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 300, 'objective': 'binary:logistic'}


In [None]:
# Re-evaluate the three tuned estimators on the held-out split.
# Each report is labelled with an explicit name rather than whatever value a
# loop variable from an earlier cell happens to hold.
tuned_models = {
    "Decision Tree": best_dt_model,
    "Random Forest": best_rf_model,
    "XGB Classifier": best_xgb_model,
}

for tuned_name, tuned_model in tuned_models.items():
    accuracy, report, cm, execution_time = evaluate_model(tuned_model, X_train, y_train, X_test, y_test)
    print(f"\n{tuned_name} Accuracy with Best Hyperparameters: {accuracy:.4f}")
    print(f"Classification Report for {tuned_name}:\n{report}")
    print(f"Confusion Matrix for {tuned_name}:\n{cm}\n")
    print(f"Time for {tuned_name}:{execution_time}\n")
    print("\n------------------------------------------------------------------------------------------------------------------------------\n")


Decision Tree Accuracy with Best Hyperparameters: 0.9410
Classification Report for Decision Tree:
              precision    recall  f1-score   support

           0       0.91      0.93      0.92     18613
           1       0.96      0.95      0.95     32922

    accuracy                           0.94     51535
   macro avg       0.93      0.94      0.94     51535
weighted avg       0.94      0.94      0.94     51535

Confusion Matrix for Decision Tree:
[[17370  1243]
 [ 1800 31122]]

Time for Decision Tree:1.061589241027832


------------------------------------------------------------------------------------------------------------------------------


Random Forest Accuracy with Best Hyperparameters: 0.9515
Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.94      0.93      0.93     18613
           1       0.96      0.96      0.96     32922

    accuracy                           0.95     51535
   macro avg      

# ***MULTI-CLASS CLASSIFICATION***

In [None]:
# Multi-class target: the 'attack_cat' column as a one-column frame.
y = data['attack_cat'].to_frame()

In [None]:
# Same 80/20 split settings as before, now paired with the multi-class target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

***NORMALIZING DATA***

In [None]:
# Fit the min-max scaler on the training rows only, then produce scaled copies
# of both partitions under separate names (the unscaled frames are kept).
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Train and score each baseline classifier on the multi-class target.
# The min-max-scaled arrays are passed here: feeding the unscaled frames would
# leave the normalization step unused and put distance- and margin-based models
# (KNN, LinearSVC) at the mercy of the raw feature magnitudes.
for model_name, model in models.items():
    print(f"Training {model_name}...")
    accuracy, report, cm, execution_time = evaluate_model(model, X_train_scaled, y_train, X_test_scaled, y_test)
    print(f"\n{model_name} Accuracy: {accuracy:.4f}")
    print(f"Classification Report for {model_name}:\n{report}")
    print(f"Confusion Matrix for {model_name}:\n{cm}\n")
    print(f"Time for {model_name}:{execution_time}\n")
    print("\n------------------------------------------------------------------------------------------------------------------------------\n")

Training Decision Tree...

Decision Tree Accuracy: 0.8044
Classification Report for Decision Tree:
              precision    recall  f1-score   support

           0       0.92      0.92      0.92     18613
           1       0.39      0.10      0.16       468
           2       0.47      0.14      0.22       519
           3       0.62      0.62      0.62      4831
           4       0.60      0.59      0.59       309
           5       0.89      0.74      0.81      2852
           6       0.62      0.76      0.68      9080
           7       0.30      0.24      0.26      3292
           8       0.40      0.35      0.38        34
           9       0.98      0.98      0.98     11537

    accuracy                           0.80     51535
   macro avg       0.62      0.54      0.56     51535
weighted avg       0.80      0.80      0.80     51535

Confusion Matrix for Decision Tree:
[[17070     2    61  1245    23    10   158    30     0    14]
 [    1    46     2    72     2     3   252

## ***HYPERPARAMETER TUNING FOR XGB***

In [None]:
# Search space for the multi-class XGBoost model. Every value is a list because
# the grid search expects a list of candidates per parameter, even when only
# one candidate is offered.
param_grid = dict(
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[100, 200, 300],
    max_depth=[3, 4, 5],
    objective=['multi:softmax'],
    num_class=[10],
)

In [None]:
# Exhaustive 5-fold search over `param_grid`, scored by accuracy, on all cores.
search_options = {
    'scoring': 'accuracy',
    'cv': 5,
    'verbose': 1,
    'n_jobs': -1,
}
grid_search = GridSearchCV(XGBClassifier(), param_grid, **search_options)

In [None]:
# Search on the min-max-scaled training features so the tuning step consumes
# the arrays the normalization cell produced instead of the unscaled frame.
grid_search.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [None]:
# Winning parameter combination found by the search.
best_params = grid_search.best_params_
print(f"Best Hyperparameters: {best_params}")

Best Hyperparameters: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 300, 'num_class': 10, 'objective': 'multi:softmax'}


In [None]:
# Evaluate the tuned XGBoost model on the held-out multi-class split.
best_model = grid_search.best_estimator_
# Explicit label for the report: relying on a loop variable left over from an
# earlier cell only works while that loop happens to end on this model.
tuned_model_name = "XGB Classifier"
# Use the min-max-scaled arrays produced by the normalization cell.
accuracy, report, cm, execution_time = evaluate_model(best_model, X_train_scaled, y_train, X_test_scaled, y_test)
print(f"\n{tuned_model_name} Accuracy with Best Hyperparameters: {accuracy:.4f}")
print(f"Classification Report for {tuned_model_name}:\n{report}")
print(f"Confusion Matrix for {tuned_model_name}:\n{cm}\n")
print(f"Time for {tuned_model_name}:{execution_time}\n")
print("\n------------------------------------------------------------------------------------------------------------------------------\n")


XGB Classifier Accuracy with Best Hyperparameters: 0.8294
Classification Report for XGB Classifier:
              precision    recall  f1-score   support

           0       0.91      0.94      0.93     18613
           1       0.78      0.08      0.15       468
           2       0.86      0.10      0.19       519
           3       0.70      0.61      0.65      4831
           4       0.64      0.68      0.66       309
           5       0.93      0.75      0.83      2852
           6       0.63      0.88      0.74      9080
           7       0.39      0.15      0.21      3292
           8       0.35      0.32      0.34        34
           9       1.00      0.98      0.99     11537

    accuracy                           0.83     51535
   macro avg       0.72      0.55      0.57     51535
weighted avg       0.83      0.83      0.81     51535

Confusion Matrix for XGB Classifier:
[[17547     0     5   900    22    11   113     8     0     7]
 [    4    39     0    52     3     2   