In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [2]:
train_df = pd.read_csv('data/UNSW_NB15_training.csv')
test_df = pd.read_csv('data/Book1.csv')

In [16]:
# Assuming 'X' is your feature set and 'y' is your target
features = ['service', 'proto', 'smean', 'sbytes', 'ct_dst_src_ltm', 'sttl', 'ct_srv_dst', 'state', 'dbytes', 'dmean', 'ct_src_dport_ltm',
            'sload', 'ct_srv_src', 'ct_src_ltm', 'sjit', 'ct_dst_ltm', 'dload', 'dloss', 'sinpkt']
target = 'attack_cat'

X_train = train_df[features]
y_train = train_df[target]

X_test = test_df[features]
y_test = test_df[target]

In [17]:
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = {i: class_weights[i] for i in range(len(class_weights))}

In [18]:
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
print("Categorical Features:", categorical_features)

Categorical Features: ['service', 'proto', 'state']


In [19]:
model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    loss_function='MultiClass',
    eval_metric='Accuracy',
    auto_class_weights='Balanced',
    verbose=100
)

# Train with categorical features specified
model.fit(X_train, y_train, cat_features=categorical_features, eval_set=(X_test, y_test), early_stopping_rounds=50)

0:	learn: 0.4944498	test: 0.4950201	best: 0.4950201 (0)	total: 638ms	remaining: 10m 37s
100:	learn: 0.6856029	test: 0.6844288	best: 0.6844288 (100)	total: 56.2s	remaining: 8m 19s
200:	learn: 0.7094240	test: 0.7011685	best: 0.7012206 (199)	total: 1m 47s	remaining: 7m 5s
300:	learn: 0.7231362	test: 0.7094345	best: 0.7096435 (295)	total: 2m 42s	remaining: 6m 18s
400:	learn: 0.7355648	test: 0.7154136	best: 0.7154136 (400)	total: 4m 39s	remaining: 6m 57s
500:	learn: 0.7443970	test: 0.7183147	best: 0.7183550 (496)	total: 5m 41s	remaining: 5m 39s
600:	learn: 0.7527241	test: 0.7197328	best: 0.7197592 (599)	total: 6m 41s	remaining: 4m 26s
700:	learn: 0.7588628	test: 0.7214399	best: 0.7214614 (696)	total: 7m 43s	remaining: 3m 17s
800:	learn: 0.7643869	test: 0.7228046	best: 0.7229551 (795)	total: 8m 46s	remaining: 2m 10s
900:	learn: 0.7697580	test: 0.7240681	best: 0.7240681 (900)	total: 9m 47s	remaining: 1m 4s
999:	learn: 0.7731748	test: 0.7248666	best: 0.7249712 (995)	total: 10m 50s	remaining: 0

<catboost.core.CatBoostClassifier at 0x1c604425a30>

In [20]:
y_pred = model.predict(X_test)

In [21]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.7464369428713193
                precision    recall  f1-score   support

      Analysis       0.09      0.52      0.16      2000
      Backdoor       0.08      0.50      0.14      1746
           DoS       0.37      0.35      0.36     12264
      Exploits       0.89      0.46      0.61     33393
       Fuzzers       0.60      0.86      0.71     18184
       Generic       1.00      0.98      0.99     40000
        Normal       1.00      0.80      0.89     56000
Reconnaissance       0.83      0.81      0.82     10491
     Shellcode       0.36      0.98      0.52      1133
         Worms       0.12      1.00      0.22       130

      accuracy                           0.75    175341
     macro avg       0.53      0.72      0.54    175341
  weighted avg       0.86      0.75      0.78    175341



In [22]:
feature_importance = model.get_feature_importance()
feature_names = X_train.columns

# Display top 10 important features
important_features = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
print(important_features.sort_values(by="Importance", ascending=False))

             Feature  Importance
0            service   17.806392
1              proto   10.997594
5               sttl    9.749658
3             sbytes    9.307785
2              smean    8.892607
4     ct_dst_src_ltm    6.305317
6         ct_srv_dst    5.496520
7              state    4.425178
8             dbytes    4.300813
9              dmean    3.462033
10  ct_src_dport_ltm    3.322657
11             sload    3.163245
13        ct_src_ltm    2.781727
18            sinpkt    2.366579
14              sjit    2.001209
12        ct_srv_src    1.713987
15        ct_dst_ltm    1.624787
16             dload    1.318615
17             dloss    0.963296
