
## ハイパーパラメータチューニング（optuna）を試す。(再検証)

### 目的
- `2023-7-31`に行った実験の各クラスごとの混合行列も表示する。
### 使用するデータセット
- KDD99 10%
### 手法
### 結果
### 考察

In [1]:
import pandas as pd

from utils_kdd99 import *
print_version()

python:      3.10.5
sklearn:     1.2.2
tensorflow:  2.13.0-rc0
keras:       2.13.1rc0
numpy:       1.23.5
pandas:      1.5.3


In [2]:
# データの読み込み
X, y = load_data(use_full_dataset=False, standard_scale=True, verbose=0, )
# 分類モデルによっては数値ラベルしか対応していないため、目的変数を分類クラスから数値へ変換
y = y.map(lambda x: attack_label_class[x]).map(lambda x: correspondences[x])
y.value_counts()

0    391458
1     97278
2      4107
3      1126
4        52
Name: true_label, dtype: int64

In [3]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=RANDOM_SEED, stratify=y)

### Dosのみを学習したオートエンコーダの作成
- 隠れ層の次元数(38->10->5->10->38)
- 活性化関数：ReLU
- 最適化関数：adam
- 損失関数：平均二乗誤差
- エポック数：5
- バッチサイズ：32

In [4]:
ae_model = keras.Sequential([
    Dense(units=10, activation='relu', input_dim=38, name='encoder1'),
    Dense(units=5, activation='relu', name='encoder2'),
    Dense(units=10, activation='relu'),
    Dense(units=38, activation='relu'),
])
ae_model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])
ae_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder1 (Dense)            (None, 10)                390       
                                                                 
 encoder2 (Dense)            (None, 5)                 55        
                                                                 
 dense (Dense)               (None, 10)                60        
                                                                 
 dense_1 (Dense)             (None, 38)                418       
                                                                 
Total params: 923 (3.61 KB)
Trainable params: 923 (3.61 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [5]:
# Dosのみのデータセットを作成
dos_x_train = x_train[y_train == correspondences['dos']]

In [6]:
ae_model.fit(dos_x_train, dos_x_train,
            epochs=5, # データセットを使って学習する回数
            batch_size=32,
            shuffle=True,
            verbose=1,
            use_multiprocessing=True
          )

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x226807e0520>

In [7]:
# エンコーダー部分を抜き取る
encoder = keras.Sequential([ae_model.get_layer('encoder1'),
                            ae_model.get_layer('encoder2')])
# DOSエンコーダを用いた特徴量に命名
dos_columns = list(map(lambda x: 'dos' + str(x), range(5)))
dos_columns

['dos0', 'dos1', 'dos2', 'dos3', 'dos4']

In [8]:
# 特徴量抽出し、マージする。
x_train_encoded = pd.DataFrame(data=encoder.predict(x_train), index=x_train.index, columns=dos_columns)
x_test_encoded = pd.DataFrame(data=encoder.predict(x_test), index=x_test.index, columns=dos_columns)
x_train_encoded = x_train.merge(x_train_encoded, right_index=True, left_index=True)
x_test_encoded = x_test.merge(x_test_encoded, right_index=True, left_index=True)
x_train_encoded.head()



Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,dos0,dos1,dos2,dos3,dos4
212221,-0.067792,-0.002017,-0.026287,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,-0.417192,-0.005679,...,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464,0.0,2.195888,1.095266,0.448647,0.0
30903,-0.067792,-0.002774,0.472896,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,2.39698,-0.005679,...,0.553404,-0.464418,-0.463202,-0.25204,-0.249464,2.07961,0.410976,0.466956,0.835854,0.248984
9739,-0.067792,-0.002017,-0.026287,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,-0.417192,-0.005679,...,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464,0.0,2.198084,1.093214,0.446404,0.0
37540,-0.067792,-0.002776,-0.01412,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,2.39698,-0.005679,...,0.790749,-0.464418,-0.463202,-0.25204,-0.249464,2.003659,0.624644,0.726423,0.977872,0.048583
418638,-0.067792,-0.002535,-0.026287,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,-0.417192,-0.005679,...,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464,0.0,2.198297,1.093663,0.446661,0.0


### optuna＋lightGBMを用いた学習
- 元の特徴量のみを使う。
- 

In [9]:
import optuna.integration.lightgbm as lgb
lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)
# LightGBM parameters
params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'num_class': 5,
        'metric': 'multi_error', # 評価指標 : 誤り率(= 1-正答率)  another multi_logloss
        'learning_rate': 0.1,
        'num_leaves': 23,
        'min_data_in_leaf': 1,
        'verbose': -1,
        'random_state': RANDOM_SEED, 
}
import lightgbm
# モデルの学習
model:lightgbm.Booster = lgb.train(params, # パラメータ
                  lgb_train, # トレーニングデータの指定
                  valid_sets=[lgb_train], # 検証データの指定
                  callbacks=[lgb.early_stopping(50, verbose=False)],
               )


[I 2023-08-08 02:39:30,213] A new study created in memory with name: no-name-9166f064-6ea1-4f17-bd0e-af3871a023b8
feature_fraction, val_score: inf:   0%|          | 0/7 [00:00<?, ?it/s]



feature_fraction, val_score: 0.000571:  14%|#4        | 1/7 [00:05<00:34,  5.75s/it][I 2023-08-08 02:39:35,971] Trial 0 finished with value: 0.0005710073294379959 and parameters: {'feature_fraction': 0.4}. Best is trial 0 with value: 0.0005710073294379959.
feature_fraction, val_score: 0.000396:  29%|##8       | 2/7 [00:09<00:23,  4.75s/it][I 2023-08-08 02:39:40,024] Trial 1 finished with value: 0.0003957775669649601 and parameters: {'feature_fraction': 1.0}. Best is trial 1 with value: 0.0003957775669649601.
feature_fraction, val_score: 0.000396:  43%|####2     | 3/7 [00:13<00:17,  4.32s/it][I 2023-08-08 02:39:43,838] Trial 2 finished with value: 0.0003957775669649601 and parameters: {'feature_fraction': 0.7}. Best is trial 1 with value: 0.0003957775669649601.
feature_fraction, val_score: 0.000396:  57%|#####7    | 4/7 [00:17<00:12,  4.31s/it][I 2023-08-08 02:39:48,142] Trial 3 finished with value: 0.0003957775669649601 and parameters: {'feature_fraction': 0.6}. Best is trial 1 with va

In [10]:
y_pred_prob = model.predict(x_test)
# テストデータの予測 (予測クラス(0 or 1 or...)を返す)
y_pred = np.argmax(y_pred_prob, axis=1) # 一番大きい予測確率のクラスを予測クラスに
y_pred = pd.Series(y_pred)
y_pred.value_counts()

0    129191
1     32100
2      1346
3       369
4        21
dtype: int64

In [11]:
print(classification_report(y_test, y_pred, target_names=correspondences.keys()))

              precision    recall  f1-score   support

         dos       1.00      1.00      1.00    129181
      normal       1.00      1.00      1.00     32102
       probe       0.99      0.99      0.99      1355
         r2l       0.97      0.97      0.97       372
         u2r       0.62      0.76      0.68        17

    accuracy                           1.00    163027
   macro avg       0.92      0.94      0.93    163027
weighted avg       1.00      1.00      1.00    163027


In [13]:
print(multilabel_confusion_matrix(y_test, y_pred))

[[[ 33831     15]
  [     5 129176]]

 [[130895     30]
  [    32  32070]]

 [[161664      8]
  [    17   1338]]

 [[162645     10]
  [    13    359]]

 [[163002      8]
  [     4     13]]]


In [25]:
from sklearn.metrics import confusion_matrix
print(pd.DataFrame(confusion_matrix(y_test, y_pred), index=correspondences.keys(), columns=correspondences.keys()))
correspondences.keys()

           dos  normal  probe  r2l  u2r
dos     129176       4      0    1    0
normal      10   32070      8    9    5
probe        4      13   1338    0    0
r2l          1       9      0  359    3
u2r          0       4      0    0   13


dict_keys(['dos', 'normal', 'probe', 'r2l', 'u2r'])

In [26]:
# 
model.params

{'task': 'train',
 'boosting_type': 'gbdt',
 'objective': 'multiclass',
 'num_class': 5,
 'metric': 'multi_error',
 'learning_rate': 0.1,
 'num_leaves': 65,
 'min_data_in_leaf': 1,
 'verbose': -1,
 'random_state': 2018,
 'feature_pre_filter': False,
 'lambda_l1': 1.1738879225826304e-07,
 'lambda_l2': 0.01079234421922761,
 'feature_fraction': 0.948,
 'bagging_fraction': 1.0,
 'bagging_freq': 0,
 'min_child_samples': 20,
 'num_iterations': 1000,
 'early_stopping_round': None}

In [None]:
model

In [27]:
for key, confusion_matrix in zip(correspondences.keys(), multilabel_confusion_matrix(y_test, y_pred)):
    print(f"{key}    TP: {confusion_matrix[1][1]}, TN: {confusion_matrix[0][0]}, FP: {confusion_matrix[0][1]}, FN: {confusion_matrix[1][0]}")

dos    TP: 129176, TN: 33831, FP: 15, FN: 5
normal    TP: 32070, TN: 130895, FP: 30, FN: 32
probe    TP: 1338, TN: 161664, FP: 8, FN: 17
r2l    TP: 359, TN: 162645, FP: 10, FN: 13
u2r    TP: 13, TN: 163002, FP: 8, FN: 4


In [28]:
lgb_train_encoded = lgb.Dataset(x_train_encoded, y_train)
lgb_eval_encoded = lgb.Dataset(x_test_encoded, y_test, reference=lgb_train_encoded)

In [29]:
params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'num_class': 5,
        'metric': 'multi_error', # 評価指標 : 誤り率(= 1-正答率)  another multi_logloss
        'learning_rate': 0.1,
        'num_leaves': 23,
        'min_data_in_leaf': 1,
        'verbose': -1,
        'random_state': RANDOM_SEED, 
}

# モデルの学習
model_encoded = lgb.train(params, # パラメータ
                  lgb_train_encoded, # トレーニングデータの指定
                  valid_sets=[lgb_train_encoded], # 検証データの指定
                  callbacks=[lgb.early_stopping(50, verbose=False)],
               )

[I 2023-08-08 02:51:06,047] A new study created in memory with name: no-name-069ec59e-f54e-43d6-b98b-f7f3251b645e
feature_fraction, val_score: inf:   0%|          | 0/7 [00:00<?, ?it/s]



feature_fraction, val_score: 0.000622:  14%|#4        | 1/7 [00:05<00:31,  5.24s/it][I 2023-08-08 02:51:11,293] Trial 0 finished with value: 0.0006223677770594029 and parameters: {'feature_fraction': 0.8999999999999999}. Best is trial 0 with value: 0.0006223677770594029.
feature_fraction, val_score: 0.000622:  29%|##8       | 2/7 [00:11<00:28,  5.64s/it][I 2023-08-08 02:51:17,209] Trial 1 finished with value: 0.0006223677770594029 and parameters: {'feature_fraction': 0.6}. Best is trial 0 with value: 0.0006223677770594029.
feature_fraction, val_score: 0.000372:  43%|####2     | 3/7 [00:16<00:21,  5.44s/it][I 2023-08-08 02:51:22,411] Trial 2 finished with value: 0.00037160794455488616 and parameters: {'feature_fraction': 0.5}. Best is trial 2 with value: 0.00037160794455488616.
feature_fraction, val_score: 0.000372:  57%|#####7    | 4/7 [00:20<00:15,  5.07s/it][I 2023-08-08 02:51:26,907] Trial 3 finished with value: 0.00037160794455488616 and parameters: {'feature_fraction': 0.7}. Best 

In [30]:
y_pred_prob = model_encoded.predict(x_test_encoded)
# テストデータの予測 (予測クラス(0 or 1 or...)を返す)
y_pred_encoded = np.argmax(y_pred_prob, axis=1) # 一番大きい予測確率のクラスを予測クラスに
y_pred_encoded = pd.Series(y_pred_encoded)
y_pred_encoded.value_counts()

0    129182
1     32086
2      1342
3       381
4        36
dtype: int64

In [31]:
print(classification_report(y_test, y_pred_encoded, target_names=correspondences.keys()))

              precision    recall  f1-score   support

         dos       1.00      1.00      1.00    129181
      normal       1.00      1.00      1.00     32102
       probe       0.99      0.99      0.99      1355
         r2l       0.92      0.94      0.93       372
         u2r       0.25      0.53      0.34        17

    accuracy                           1.00    163027
   macro avg       0.83      0.89      0.85    163027
weighted avg       1.00      1.00      1.00    163027


In [37]:
from sklearn.metrics import confusion_matrix
print(pd.DataFrame(confusion_matrix(y_test, y_pred_encoded), index=correspondences.keys(), columns=correspondences.keys()))
correspondences.keys()

           dos  normal  probe  r2l  u2r
dos     129175       3      0    2    1
normal       5   32046      6   24   21
probe        0      15   1335    5    0
r2l          0      17      1  349    5
u2r          2       5      0    1    9


dict_keys(['dos', 'normal', 'probe', 'r2l', 'u2r'])

In [33]:
model_encoded.params

{'task': 'train',
 'boosting_type': 'gbdt',
 'objective': 'multiclass',
 'num_class': 5,
 'metric': 'multi_error',
 'learning_rate': 0.1,
 'num_leaves': 149,
 'min_data_in_leaf': 1,
 'verbose': -1,
 'random_state': 2018,
 'feature_pre_filter': False,
 'lambda_l1': 9.90562677232788e-07,
 'lambda_l2': 0.005704129889078178,
 'feature_fraction': 0.4,
 'bagging_fraction': 1.0,
 'bagging_freq': 0,
 'min_child_samples': 20,
 'num_iterations': 1000,
 'early_stopping_round': None}

In [34]:
for key, confusion_matrix in zip(correspondences.keys(), multilabel_confusion_matrix(y_test, y_pred)):
    print(f"{key}    TP: {confusion_matrix[1][1]}, TN: {confusion_matrix[0][0]}, FP: {confusion_matrix[0][1]}, FN: {confusion_matrix[1][0]}")

dos    TP: 129176, TN: 33831, FP: 15, FN: 5
normal    TP: 32070, TN: 130895, FP: 30, FN: 32
probe    TP: 1338, TN: 161664, FP: 8, FN: 17
r2l    TP: 359, TN: 162645, FP: 10, FN: 13
u2r    TP: 13, TN: 163002, FP: 8, FN: 4


In [35]:
import pickle
with open("models/lightgbm/lgb_param_tuned_booster.pkl", 'wb') as fp:
    pickle.dump(model.dump_model(), fp)

In [36]:
with open("models/lightgbm/lgb+ae_param_tuned_booster.pkl", 'wb') as fp:
    pickle.dump(model_encoded.dump_model(), fp)

In [38]:
model.save_model('models/lightgbm/lgb_tuned_booster.model')
model_encoded.save_model('models/lightgbm/lgb+ae_tuned_booster.model')

<lightgbm.basic.Booster at 0x226974ec5b0>