In [7]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [8]:
train_df = pd.read_csv('data/UNSW_NB15_training.csv')
test_df = pd.read_csv('data/UNSW_NB15_testing.csv')

In [20]:
# Assuming 'X' is your feature set and 'y' is your target
target = 'attack_cat'
features = [ "sbytes", "smean", "sttl", "dtcpb", "stcpb", "dbytes",
    "ct_srv_dst", "dload", "response_body_len", "ackdat", "dmean", "dur",
    "sload", "ct_srv_src", "synack", "dloss", "ct_dst_src_ltm", "sjit", "proto", "service", "state"
]

X_train = train_df[features]
y_train = train_df[target]

X_test = test_df[features]
y_test = test_df[target]

In [21]:
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = {i: class_weights[i] for i in range(len(class_weights))}

In [22]:
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
print("Categorical Features:", categorical_features)

Categorical Features: ['proto', 'service', 'state']


In [23]:
model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    loss_function='MultiClass',
    eval_metric='Accuracy',
    auto_class_weights='Balanced',
    verbose=100
)

# Train with categorical features specified
model.fit(X_train, y_train, cat_features=categorical_features, eval_set=(X_test, y_test), early_stopping_rounds=50)

0:	learn: 0.7075726	test: 0.7075726	best: 0.7075726 (0)	total: 68.3ms	remaining: 1m 8s
100:	learn: 0.8439860	test: 0.8437138	best: 0.8440787 (99)	total: 8.29s	remaining: 1m 13s
200:	learn: 0.8662011	test: 0.8656255	best: 0.8664475 (194)	total: 16.9s	remaining: 1m 7s
300:	learn: 0.9101623	test: 0.9087464	best: 0.9089543 (299)	total: 26.2s	remaining: 1m
400:	learn: 0.9298054	test: 0.9286929	best: 0.9286929 (400)	total: 35.4s	remaining: 52.9s
500:	learn: 0.9392847	test: 0.9379709	best: 0.9379709 (500)	total: 45.1s	remaining: 44.9s
600:	learn: 0.9464328	test: 0.9451520	best: 0.9451520 (600)	total: 54.1s	remaining: 35.9s
700:	learn: 0.9521830	test: 0.9506300	best: 0.9507359 (699)	total: 1m 3s	remaining: 27s
800:	learn: 0.9580766	test: 0.9561456	best: 0.9562826 (797)	total: 1m 12s	remaining: 18s
900:	learn: 0.9610696	test: 0.9595497	best: 0.9600641 (897)	total: 1m 21s	remaining: 8.95s
999:	learn: 0.9629616	test: 0.9613643	best: 0.9617443 (982)	total: 1m 30s	remaining: 0us

bestTest = 0.96174

<catboost.core.CatBoostClassifier at 0x247167f2fc0>

In [24]:
y_pred = model.predict(X_test)

In [25]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9409066666666667
                precision    recall  f1-score   support

      Backdoor       0.79      1.00      0.88        83
           DoS       0.88      0.91      0.89      1216
       Fuzzers       0.78      0.96      0.86       836
       Generic       0.88      1.00      0.94       289
        Normal       1.00      0.93      0.96      5348
Reconnaissance       0.94      0.97      0.96      1603

      accuracy                           0.94      9375
     macro avg       0.88      0.96      0.92      9375
  weighted avg       0.95      0.94      0.94      9375



In [26]:
feature_importance = model.get_feature_importance()
feature_names = X_train.columns

# Display top 10 important features
important_features = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
print(important_features.sort_values(by="Importance", ascending=False))

              Feature  Importance
0              sbytes   12.540742
1               smean    8.055733
9              ackdat    7.528744
2                sttl    7.140500
14             synack    6.917090
5              dbytes    6.140008
10              dmean    5.655818
4               stcpb    5.270881
3               dtcpb    5.109082
6          ct_srv_dst    5.013236
7               dload    4.677776
8   response_body_len    4.599335
11                dur    4.524358
12              sload    4.125449
13         ct_srv_src    3.969528
17               sjit    3.770154
16     ct_dst_src_ltm    2.370913
15              dloss    2.158056
20              state    0.432598
18              proto    0.000000
19            service    0.000000


In [27]:
import catboost
model.save_model("catboost_model.cbm")