
## ハイパーパラメータチューニング（optuna）を試す。

### 背景
- パラメータによって、`u2r`の学習率に大きく影響があると考えられるため。
### 目的
- 最適なパラメータを知ること
### 使用するデータセット
- KDD99 10%
### 手法
### 結果
### 考察

In [2]:
import lightgbm

from utils_kdd99 import *
print_version()

python:      3.10.5
sklearn:     1.2.2
tensorflow:  2.13.0-rc0
keras:       2.13.1rc0
numpy:       1.23.5
pandas:      1.5.3


In [4]:
# データの読み込み
X, y = load_data(use_full_dataset=False, standard_scale=True, verbose=0, )
# 分類モデルによっては数値ラベルしか対応していないため、目的変数を分類クラスから数値へ変換
y = y.map(lambda x: attack_label_class[x]).map(lambda x: correspondences[x])
y.value_counts()

0    391458
1     97278
2      4107
3      1126
4        52
Name: true_label, dtype: int64

In [6]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=RANDOM_SEED, stratify=y)

### Dosのみを学習したオートエンコーダの作成
- 隠れ層の次元数(38->10->5->10->38)
- 活性化関数：ReLU
- 最適化関数：adam
- 損失関数：平均二乗誤差
- エポック数：5
- バッチサイズ：32

In [10]:
ae_model = keras.Sequential([
    Dense(units=10, activation='relu', input_dim=38, name='encoder1'),
    Dense(units=5, activation='relu', name='encoder2'),
    Dense(units=10, activation='relu'),
    Dense(units=38, activation='relu'),
])
ae_model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])
ae_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder1 (Dense)            (None, 10)                390       
                                                                 
 encoder2 (Dense)            (None, 5)                 55        
                                                                 
 dense_2 (Dense)             (None, 10)                60        
                                                                 
 dense_3 (Dense)             (None, 38)                418       
                                                                 
Total params: 923 (3.61 KB)
Trainable params: 923 (3.61 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [11]:
# Dosのみのデータセットを作成
dos_x_train = x_train[y_train == correspondences['dos']]

In [12]:
ae_model.fit(dos_x_train, dos_x_train,
            epochs=5, # データセットを使って学習する回数
            batch_size=32,
            shuffle=True,
            verbose=1,
            use_multiprocessing=True
          )

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x10a049f1630>

In [14]:
# エンコーダー部分を抜き取る
encoder = keras.Sequential([ae_model.get_layer('encoder1'),
                            ae_model.get_layer('encoder2')])
# DOSエンコーダを用いた特徴量に命名
dos_columns = list(map(lambda x: 'dos' + str(x), range(5)))
dos_columns

['dos0', 'dos1', 'dos2', 'dos3', 'dos4']

In [15]:
# 特徴量抽出し、マージする。
x_train_encoded = pd.DataFrame(data=encoder.predict(x_train), index=x_train.index, columns=dos_columns)
x_test_encoded = pd.DataFrame(data=encoder.predict(x_test), index=x_test.index, columns=dos_columns)
x_train_encoded = x_train.merge(x_train_encoded, right_index=True, left_index=True)
x_test_encoded = x_test.merge(x_test_encoded, right_index=True, left_index=True)
x_train_encoded.head()



Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,dos0,dos1,dos2,dos3,dos4
212221,-0.067792,-0.002017,-0.026287,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,-0.417192,-0.005679,...,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464,0.971426,3.588529,0.0,1.988687,1.14327
30903,-0.067792,-0.002774,0.472896,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,2.39698,-0.005679,...,0.553404,-0.464418,-0.463202,-0.25204,-0.249464,1.204473,0.534857,1.009054,0.0,0.0
9739,-0.067792,-0.002017,-0.026287,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,-0.417192,-0.005679,...,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464,0.970986,3.58872,0.0,1.990658,1.145948
37540,-0.067792,-0.002776,-0.01412,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,2.39698,-0.005679,...,0.790749,-0.464418,-0.463202,-0.25204,-0.249464,0.947306,0.529504,0.977704,0.0,0.0
418638,-0.067792,-0.002535,-0.026287,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,-0.417192,-0.005679,...,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464,0.971012,3.588875,0.0,1.990835,1.146163


### optuna＋lightGBMを用いた学習
- 元の特徴量のみを使う。
- 

In [49]:
import optuna.integration.lightgbm as lgb
lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)
# LightGBM parameters
params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'num_class': 5,
        'metric': 'multi_error', # 評価指標 : 誤り率(= 1-正答率)  another multi_logloss
        'learning_rate': 0.1,
        'num_leaves': 23,
        'min_data_in_leaf': 1,
        'verbose': -1,
        'random_state': RANDOM_SEED, 
}

# モデルの学習
model = lgb.train(params, # パラメータ
                  lgb_train, # トレーニングデータの指定
                  valid_sets=[lgb_train], # 検証データの指定
                  callbacks=[lgb.early_stopping(50, verbose=False)],
               )


[I 2023-08-01 08:18:54,825] A new study created in memory with name: no-name-7e363df8-0665-45a9-aed3-8880889c0848
feature_fraction, val_score: inf:   0%|          | 0/7 [00:00<?, ?it/s]



feature_fraction, val_score: 0.000480:  14%|#4        | 1/7 [00:03<00:21,  3.59s/it][I 2023-08-01 08:18:58,417] Trial 0 finished with value: 0.00048037124540021875 and parameters: {'feature_fraction': 0.7}. Best is trial 0 with value: 0.00048037124540021875.
feature_fraction, val_score: 0.000396:  29%|##8       | 2/7 [00:06<00:16,  3.25s/it][I 2023-08-01 08:19:01,430] Trial 1 finished with value: 0.0003957775669649601 and parameters: {'feature_fraction': 1.0}. Best is trial 1 with value: 0.0003957775669649601.
feature_fraction, val_score: 0.000396:  43%|####2     | 3/7 [00:09<00:13,  3.30s/it][I 2023-08-01 08:19:04,789] Trial 2 finished with value: 0.0003957775669649601 and parameters: {'feature_fraction': 0.5}. Best is trial 1 with value: 0.0003957775669649601.
feature_fraction, val_score: 0.000396:  57%|#####7    | 4/7 [00:12<00:09,  3.18s/it][I 2023-08-01 08:19:07,780] Trial 3 finished with value: 0.0003957775669649601 and parameters: {'feature_fraction': 0.8}. Best is trial 1 with 

In [52]:
y_pred_prob = model.predict(x_test)
# テストデータの予測 (予測クラス(0 or 1 or...)を返す)
y_pred = np.argmax(y_pred_prob, axis=1) # 一番大きい予測確率のクラスを予測クラスに
y_pred = pd.Series(y_pred)
y_pred.value_counts()

0    129187
1     32118
2      1339
3       371
4        12
dtype: int64

In [53]:
print(classification_report(y_test, y_pred, target_names=correspondences.keys()))

              precision    recall  f1-score   support

         dos       1.00      1.00      1.00    129181
      normal       1.00      1.00      1.00     32102
       probe       1.00      0.99      0.99      1355
         r2l       0.96      0.96      0.96       372
         u2r       0.75      0.53      0.62        17

    accuracy                           1.00    163027
   macro avg       0.94      0.90      0.92    163027
weighted avg       1.00      1.00      1.00    163027


In [54]:
# 
model.params

{'task': 'train',
 'boosting_type': 'gbdt',
 'objective': 'multiclass',
 'num_class': 5,
 'metric': 'multi_error',
 'learning_rate': 0.1,
 'num_leaves': 69,
 'min_data_in_leaf': 1,
 'verbose': -1,
 'random_state': 2018,
 'feature_pre_filter': False,
 'lambda_l1': 8.236356257390715e-05,
 'lambda_l2': 0.08105453758970475,
 'feature_fraction': 0.9799999999999999,
 'bagging_fraction': 0.9554052830148927,
 'bagging_freq': 7,
 'min_child_samples': 20,
 'num_iterations': 1000,
 'early_stopping_round': None}

In [55]:
for key, confusion_matrix in zip(correspondences.keys(), multilabel_confusion_matrix(y_test, y_pred)):
    print(f"{key}    TP: {confusion_matrix[1][1]}, TN: {confusion_matrix[0][0]}, FP: {confusion_matrix[0][1]}, FN: {confusion_matrix[1][0]}")

dos    TP: 129176, TN: 33835, FP: 11, FN: 5
normal    TP: 32084, TN: 130891, FP: 34, FN: 18
probe    TP: 1339, TN: 161672, FP: 0, FN: 16
r2l    TP: 358, TN: 162642, FP: 13, FN: 14
u2r    TP: 9, TN: 163007, FP: 3, FN: 8


In [56]:
lgb_train_encoded = lgb.Dataset(x_train_encoded, y_train)
lgb_eval_encoded = lgb.Dataset(x_test_encoded, y_test, reference=lgb_train_encoded)

In [57]:
params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'num_class': 5,
        'metric': 'multi_error', # 評価指標 : 誤り率(= 1-正答率)  another multi_logloss
        'learning_rate': 0.1,
        'num_leaves': 23,
        'min_data_in_leaf': 1,
        'verbose': -1,
        'random_state': RANDOM_SEED, 
}

# モデルの学習
model_encoded = lgb.train(params, # パラメータ
                  lgb_train_encoded, # トレーニングデータの指定
                  valid_sets=[lgb_train_encoded], # 検証データの指定
                  callbacks=[lgb.early_stopping(50, verbose=False)],
               )

[I 2023-08-01 09:25:44,649] A new study created in memory with name: no-name-47a7342f-080f-4ab2-b674-a645e65eee27
feature_fraction, val_score: inf:   0%|          | 0/7 [00:00<?, ?it/s]



feature_fraction, val_score: 0.000447:  14%|#4        | 1/7 [00:06<00:41,  6.86s/it][I 2023-08-01 09:25:51,514] Trial 0 finished with value: 0.00044713801458636714 and parameters: {'feature_fraction': 0.4}. Best is trial 0 with value: 0.00044713801458636714.
feature_fraction, val_score: 0.000447:  29%|##8       | 2/7 [00:11<00:27,  5.57s/it][I 2023-08-01 09:25:56,176] Trial 1 finished with value: 0.00044713801458636714 and parameters: {'feature_fraction': 0.8}. Best is trial 0 with value: 0.00044713801458636714.
feature_fraction, val_score: 0.000447:  43%|####2     | 3/7 [00:16<00:21,  5.32s/it][I 2023-08-01 09:26:01,203] Trial 2 finished with value: 0.00044713801458636714 and parameters: {'feature_fraction': 0.8999999999999999}. Best is trial 0 with value: 0.00044713801458636714.
feature_fraction, val_score: 0.000287:  57%|#####7    | 4/7 [00:19<00:13,  4.53s/it][I 2023-08-01 09:26:04,529] Trial 3 finished with value: 0.00028701426611962757 and parameters: {'feature_fraction': 1.0}. B

In [60]:
y_pred_prob = model_encoded.predict(x_test_encoded)
# テストデータの予測 (予測クラス(0 or 1 or...)を返す)
y_pred_encoded = np.argmax(y_pred_prob, axis=1) # 一番大きい予測確率のクラスを予測クラスに
y_pred_encoded = pd.Series(y_pred_encoded)
y_pred_encoded.value_counts()

0    129186
1     32102
2      1354
3       370
4        15
dtype: int64

In [61]:
print(classification_report(y_test, y_pred_encoded, target_names=correspondences.keys()))

              precision    recall  f1-score   support

         dos       1.00      1.00      1.00    129181
      normal       1.00      1.00      1.00     32102
       probe       0.99      0.99      0.99      1355
         r2l       0.95      0.95      0.95       372
         u2r       0.47      0.41      0.44        17

    accuracy                           1.00    163027
   macro avg       0.88      0.87      0.88    163027
weighted avg       1.00      1.00      1.00    163027


In [62]:
model_encoded.params

{'task': 'train',
 'boosting_type': 'gbdt',
 'objective': 'multiclass',
 'num_class': 5,
 'metric': 'multi_error',
 'learning_rate': 0.1,
 'num_leaves': 91,
 'min_data_in_leaf': 1,
 'verbose': -1,
 'random_state': 2018,
 'feature_pre_filter': False,
 'lambda_l1': 4.278332719939832e-07,
 'lambda_l2': 0.014688909679206469,
 'feature_fraction': 0.9520000000000001,
 'bagging_fraction': 1.0,
 'bagging_freq': 0,
 'min_child_samples': 20,
 'num_iterations': 1000,
 'early_stopping_round': None}

In [63]:
for key, confusion_matrix in zip(correspondences.keys(), multilabel_confusion_matrix(y_test, y_pred)):
    print(f"{key}    TP: {confusion_matrix[1][1]}, TN: {confusion_matrix[0][0]}, FP: {confusion_matrix[0][1]}, FN: {confusion_matrix[1][0]}")

dos    TP: 129176, TN: 33835, FP: 11, FN: 5
normal    TP: 32084, TN: 130891, FP: 34, FN: 18
probe    TP: 1339, TN: 161672, FP: 0, FN: 16
r2l    TP: 358, TN: 162642, FP: 13, FN: 14
u2r    TP: 9, TN: 163007, FP: 3, FN: 8
