In [43]:
import optuna

from utils_kdd99 import *
print_version()


python:      3.10.11
sklearn:     1.2.2
tensorflow:  2.12.0
keras:       2.12.0
numpy:       1.23.5
pandas:      1.5.3


In [44]:
# データの読み込み
X, y = load_data(use_full_dataset=False, standard_scale=True, verbose=0, )
# 分類モデルによっては数値ラベルしか対応していないため、目的変数を分類クラスから数値へ変換
y = y.map(lambda x: attack_label_class[x]).map(lambda x: correspondences[x])
y.value_counts()


0    391458
1     97278
2      4107
3      1126
4        52
Name: true_label, dtype: int64

In [45]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=RANDOM_SEED, stratify=y)

In [46]:
k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

In [47]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [48]:
model_svm = SVC()
model_svm.fit(x_train, y_train)
y_pred = model_svm.predict(x_test)

In [49]:
print(classification_report(y_test, y_pred, target_names=correspondences.keys()))


              precision    recall  f1-score   support

         dos       1.00      1.00      1.00    129181
      normal       1.00      1.00      1.00     32102
       probe       0.99      0.98      0.99      1355
         r2l       0.95      0.90      0.92       372
         u2r       1.00      0.29      0.45        17

    accuracy                           1.00    163027
   macro avg       0.99      0.83      0.87    163027
weighted avg       1.00      1.00      1.00    163027



In [50]:
for key, confusion_matrix in zip(correspondences.keys(), multilabel_confusion_matrix(y_test, y_pred)):
    print(f"{key}    TP: {confusion_matrix[1][1]}, TN: {confusion_matrix[0][0]}, FP: {confusion_matrix[0][1]}, FN: {confusion_matrix[1][0]}")

dos    TP: 129164, TN: 33801, FP: 45, FN: 17
normal    TP: 32040, TN: 130841, FP: 84, FN: 62
probe    TP: 1328, TN: 161665, FP: 7, FN: 27
r2l    TP: 335, TN: 162636, FP: 19, FN: 37
u2r    TP: 5, TN: 163010, FP: 0, FN: 12


In [51]:
model_svm_balanced = SVC(class_weight='balanced')
model_svm_balanced.fit(x_train, y_train)
y_pred_balanced = model_svm_balanced.predict(x_test)


In [52]:
print(classification_report(y_test, y_pred_balanced, target_names=correspondences.keys()))


              precision    recall  f1-score   support

         dos       1.00      1.00      1.00    129181
      normal       1.00      0.99      0.99     32102
       probe       0.93      0.99      0.96      1355
         r2l       0.64      0.96      0.77       372
         u2r       0.06      0.47      0.11        17

    accuracy                           1.00    163027
   macro avg       0.73      0.88      0.77    163027
weighted avg       1.00      1.00      1.00    163027



In [53]:
for key, confusion_matrix in zip(correspondences.keys(), multilabel_confusion_matrix(y_test, y_pred_balanced)):
    print(f"{key}    TP: {confusion_matrix[1][1]}, TN: {confusion_matrix[0][0]}, FP: {confusion_matrix[0][1]}, FN: {confusion_matrix[1][0]}")


dos    TP: 129125, TN: 33829, FP: 17, FN: 56
normal    TP: 31707, TN: 130875, FP: 50, FN: 395
probe    TP: 1342, TN: 161575, FP: 97, FN: 13
r2l    TP: 357, TN: 162457, FP: 198, FN: 15
u2r    TP: 8, TN: 162884, FP: 126, FN: 9


In [54]:
ae_model = keras.Sequential([
    Dense(units=10, activation='relu', input_dim=38, name='encoder1'),
    Dense(units=5, activation='relu', name='encoder2'),
    Dense(units=10, activation='relu'),
    Dense(units=38, activation='relu'),
])
ae_model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])
ae_model.summary()


Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder1 (Dense)            (None, 10)                390       
                                                                 
 encoder2 (Dense)            (None, 5)                 55        
                                                                 
 dense_2 (Dense)             (None, 10)                60        
                                                                 
 dense_3 (Dense)             (None, 38)                418       
                                                                 
Total params: 923
Trainable params: 923
Non-trainable params: 0
_________________________________________________________________


In [55]:
# Dosのみのデータセットを作成
dos_x_train = x_train[y_train == correspondences['dos']]


In [56]:
ae_model.fit(dos_x_train, dos_x_train,
            epochs=5, # データセットを使って学習する回数
            batch_size=32,
            shuffle=True,
            verbose=1,
            use_multiprocessing=True
          )


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x166353640>

In [57]:
# エンコーダー部分を抜き取る
encoder = keras.Sequential([ae_model.get_layer('encoder1'),
                            ae_model.get_layer('encoder2')])
# DOSエンコーダを用いた特徴量に命名
dos_columns = list(map(lambda x: 'dos' + str(x), range(5)))
dos_columns


['dos0', 'dos1', 'dos2', 'dos3', 'dos4']

In [58]:
# 特徴量抽出し、マージする。
x_train_encoded = pd.DataFrame(data=encoder.predict(x_train), index=x_train.index, columns=dos_columns)
x_test_encoded = pd.DataFrame(data=encoder.predict(x_test), index=x_test.index, columns=dos_columns)
x_train_encoded = x_train.merge(x_train_encoded, right_index=True, left_index=True)
x_test_encoded = x_test.merge(x_test_encoded, right_index=True, left_index=True)
x_train_encoded.head()




Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,dos0,dos1,dos2,dos3,dos4
212221,-0.067792,-0.002017,-0.026287,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,-0.417192,-0.005679,...,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464,0.0,1.807431,0.286052,0.0,0.0
30903,-0.067792,-0.002774,0.472896,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,2.39698,-0.005679,...,0.553404,-0.464418,-0.463202,-0.25204,-0.249464,1.142389,1.244851,1.075876,0.842139,0.0
9739,-0.067792,-0.002017,-0.026287,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,-0.417192,-0.005679,...,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464,0.0,1.803711,0.283434,0.0,0.0
37540,-0.067792,-0.002776,-0.01412,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,2.39698,-0.005679,...,0.790749,-0.464418,-0.463202,-0.25204,-0.249464,1.189391,1.299393,1.144139,0.808007,0.0
418638,-0.067792,-0.002535,-0.026287,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,-0.417192,-0.005679,...,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464,0.0,1.804341,0.28382,0.0,0.0


In [59]:
model_svm = SVC()
model_svm.fit(x_train_encoded, y_train)
y_pred = model_svm.predict(x_test_encoded)


In [60]:
print(classification_report(y_test, y_pred, target_names=correspondences.keys()))


              precision    recall  f1-score   support

         dos       1.00      1.00      1.00    129181
      normal       1.00      1.00      1.00     32102
       probe       0.99      0.98      0.99      1355
         r2l       0.95      0.90      0.92       372
         u2r       1.00      0.29      0.45        17

    accuracy                           1.00    163027
   macro avg       0.99      0.83      0.87    163027
weighted avg       1.00      1.00      1.00    163027



In [61]:
for key, confusion_matrix in zip(correspondences.keys(), multilabel_confusion_matrix(y_test, y_pred)):
    print(f"{key}    TP: {confusion_matrix[1][1]}, TN: {confusion_matrix[0][0]}, FP: {confusion_matrix[0][1]}, FN: {confusion_matrix[1][0]}")

dos    TP: 129164, TN: 33798, FP: 48, FN: 17
normal    TP: 32040, TN: 130842, FP: 83, FN: 62
probe    TP: 1328, TN: 161664, FP: 8, FN: 27
r2l    TP: 334, TN: 162638, FP: 17, FN: 38
u2r    TP: 5, TN: 163010, FP: 0, FN: 12


In [62]:
model_svm_balanced = SVC(class_weight='balanced')
model_svm_balanced.fit(x_train_encoded, y_train)
y_pred_balanced = model_svm_balanced.predict(x_test_encoded)


In [63]:
print(classification_report(y_test, y_pred_balanced, target_names=correspondences.keys()))


              precision    recall  f1-score   support

         dos       1.00      1.00      1.00    129181
      normal       1.00      0.99      0.99     32102
       probe       0.93      0.99      0.96      1355
         r2l       0.62      0.96      0.75       372
         u2r       0.05      0.47      0.10        17

    accuracy                           1.00    163027
   macro avg       0.72      0.88      0.76    163027
weighted avg       1.00      1.00      1.00    163027



In [64]:
for key, confusion_matrix in zip(correspondences.keys(), multilabel_confusion_matrix(y_test, y_pred_balanced)):
    print(f"{key}    TP: {confusion_matrix[1][1]}, TN: {confusion_matrix[0][0]}, FP: {confusion_matrix[0][1]}, FN: {confusion_matrix[1][0]}")


dos    TP: 129108, TN: 33826, FP: 20, FN: 73
normal    TP: 31669, TN: 130867, FP: 58, FN: 433
probe    TP: 1343, TN: 161564, FP: 108, FN: 12
r2l    TP: 357, TN: 162438, FP: 217, FN: 15
u2r    TP: 8, TN: 162871, FP: 139, FN: 9
