
## ハイパーパラメータチューニング（optuna）を試す。(再検証)

### 目的
- `2023-7-31`に行った実験の各クラスごとの混合行列も表示する。
### 使用するデータセット
- KDD99 10%
### 手法
### 結果
### 考察

In [1]:
import pandas as pd

from utils_kdd99 import *
print_version()

python:      3.10.11
sklearn:     1.2.2
tensorflow:  2.12.0
keras:       2.12.0
numpy:       1.23.5
pandas:      1.5.3


In [2]:
# データの読み込み
X, y = load_data(use_full_dataset=False, standard_scale=True, verbose=0, )
# 分類モデルによっては数値ラベルしか対応していないため、目的変数を分類クラスから数値へ変換
y = y.map(lambda x: attack_label_class[x]).map(lambda x: correspondences[x])
y.value_counts()

0    391458
1     97278
2      4107
3      1126
4        52
Name: true_label, dtype: int64

In [3]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=RANDOM_SEED, stratify=y)

### Dosのみを学習したオートエンコーダの作成
- 隠れ層の次元数(38->10->5->10->38)
- 活性化関数：ReLU
- 最適化関数：adam
- 損失関数：平均二乗誤差
- エポック数：5
- バッチサイズ：32

In [4]:
ae_model = keras.Sequential([
    Dense(units=10, activation='relu', input_dim=38, name='encoder1'),
    Dense(units=5, activation='relu', name='encoder2'),
    Dense(units=10, activation='relu'),
    Dense(units=38, activation='relu'),
])
ae_model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])
ae_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder1 (Dense)            (None, 10)                390       
                                                                 
 encoder2 (Dense)            (None, 5)                 55        
                                                                 
 dense (Dense)               (None, 10)                60        
                                                                 
 dense_1 (Dense)             (None, 38)                418       
                                                                 
Total params: 923
Trainable params: 923
Non-trainable params: 0
_________________________________________________________________


In [5]:
# Dosのみのデータセットを作成
dos_x_train = x_train[y_train == correspondences['dos']]

In [6]:
ae_model.fit(dos_x_train, dos_x_train,
            epochs=5, # データセットを使って学習する回数
            batch_size=32,
            shuffle=True,
            verbose=1,
            use_multiprocessing=True
          )

Epoch 1/5


2023-08-08 11:40:48.915995: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x17f6f7370>

In [7]:
# エンコーダー部分を抜き取る
encoder = keras.Sequential([ae_model.get_layer('encoder1'),
                            ae_model.get_layer('encoder2')])
# DOSエンコーダを用いた特徴量に命名
dos_columns = list(map(lambda x: 'dos' + str(x), range(5)))
dos_columns

['new_feature0',
 'new_feature1',
 'new_feature2',
 'new_feature3',
 'new_feature4']

In [8]:
# 特徴量抽出し、マージする。
x_train_encoded = pd.DataFrame(data=encoder.predict(x_train), index=x_train.index, columns=dos_columns)
x_test_encoded = pd.DataFrame(data=encoder.predict(x_test), index=x_test.index, columns=dos_columns)
x_train_encoded = x_train.merge(x_train_encoded, right_index=True, left_index=True)
x_test_encoded = x_test.merge(x_test_encoded, right_index=True, left_index=True)
x_train_encoded.head()



Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,new_feature0,new_feature1,new_feature2,new_feature3,new_feature4
212221,-0.067792,-0.002017,-0.026287,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,-0.417192,-0.005679,...,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464,0.692042,0.222451,1.115143,1.364652,0.056114
30903,-0.067792,-0.002774,0.472896,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,2.39698,-0.005679,...,0.553404,-0.464418,-0.463202,-0.25204,-0.249464,0.619383,1.740016,1.137264,2.078098,1.170766
9739,-0.067792,-0.002017,-0.026287,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,-0.417192,-0.005679,...,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464,0.689879,0.221864,1.116348,1.364235,0.054315
37540,-0.067792,-0.002776,-0.01412,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,2.39698,-0.005679,...,0.790749,-0.464418,-0.463202,-0.25204,-0.249464,0.557589,1.872177,1.244722,2.036932,1.210736
418638,-0.067792,-0.002535,-0.026287,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,-0.417192,-0.005679,...,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464,0.690494,0.222162,1.116061,1.364087,0.054615


### optuna＋lightGBMを用いた学習
- 元の特徴量のみを使う。
- 

In [9]:
import optuna.integration.lightgbm as lgb
lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)
# LightGBM parameters
params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'num_class': 5,
        'metric': 'multi_error', # 評価指標 : 誤り率(= 1-正答率)  another multi_logloss
        'learning_rate': 0.1,
        'num_leaves': 23,
        'min_data_in_leaf': 1,
        'verbose': -1,
        'random_state': RANDOM_SEED, 
}
import lightgbm
# モデルの学習
model:lightgbm.Booster = lgb.train(params, # パラメータ
                  lgb_train, # トレーニングデータの指定
                  valid_sets=[lgb_train], # 検証データの指定
                  callbacks=[lgb.early_stopping(50, verbose=False)],
               )


[I 2023-08-08 11:41:16,334] A new study created in memory with name: no-name-648e4cc0-ca0f-4766-a524-c39e9859e136
feature_fraction, val_score: inf:   0%|          | 0/7 [00:00<?, ?it/s]



feature_fraction, val_score: 0.000480:  14%|#4        | 1/7 [00:03<00:18,  3.13s/it][I 2023-08-08 11:41:19,532] Trial 0 finished with value: 0.00048037124540021875 and parameters: {'feature_fraction': 0.7}. Best is trial 0 with value: 0.00048037124540021875.
feature_fraction, val_score: 0.000480:  29%|##8       | 2/7 [00:07<00:18,  3.63s/it][I 2023-08-08 11:41:23,508] Trial 1 finished with value: 0.00048037124540021875 and parameters: {'feature_fraction': 0.4}. Best is trial 0 with value: 0.00048037124540021875.
feature_fraction, val_score: 0.000480:  43%|####2     | 3/7 [00:10<00:13,  3.35s/it][I 2023-08-08 11:41:26,532] Trial 2 finished with value: 0.00048037124540021875 and parameters: {'feature_fraction': 0.8}. Best is trial 0 with value: 0.00048037124540021875.
feature_fraction, val_score: 0.000480:  57%|#####7    | 4/7 [00:13<00:09,  3.16s/it][I 2023-08-08 11:41:29,401] Trial 3 finished with value: 0.00048037124540021875 and parameters: {'feature_fraction': 0.6}. Best is trial 0 

In [10]:
y_pred_prob = model.predict(x_test)
# テストデータの予測 (予測クラス(0 or 1 or...)を返す)
y_pred = np.argmax(y_pred_prob, axis=1) # 一番大きい予測確率のクラスを予測クラスに
y_pred = pd.Series(y_pred)
y_pred.value_counts()

0    129191
1     32127
2      1335
3       358
4        16
dtype: int64

In [11]:
print(classification_report(y_test, y_pred, target_names=correspondences.keys()))

              precision    recall  f1-score   support

         dos       1.00      1.00      1.00    129181
      normal       1.00      1.00      1.00     32102
       probe       1.00      0.98      0.99      1355
         r2l       0.98      0.94      0.96       372
         u2r       0.81      0.76      0.79        17

    accuracy                           1.00    163027
   macro avg       0.96      0.94      0.95    163027
weighted avg       1.00      1.00      1.00    163027


In [12]:
print(multilabel_confusion_matrix(y_test, y_pred))

[[[ 33831     15]
  [     5 129176]]

 [[130882     43]
  [    18  32084]]

 [[161671      1]
  [    21   1334]]

 [[162648      7]
  [    21    351]]

 [[163007      3]
  [     4     13]]]


In [13]:
from sklearn.metrics import confusion_matrix
print(pd.DataFrame(confusion_matrix(y_test, y_pred), index=correspondences.keys(), columns=correspondences.keys()))
correspondences.keys()

           dos  normal  probe  r2l  u2r
dos     129176       5      0    0    0
normal       8   32084      1    7    2
probe        6      15   1334    0    0
r2l          1      19      0  351    1
u2r          0       4      0    0   13


dict_keys(['dos', 'normal', 'probe', 'r2l', 'u2r'])

In [14]:
# 
model.params

{'task': 'train',
 'boosting_type': 'gbdt',
 'objective': 'multiclass',
 'num_class': 5,
 'metric': 'multi_error',
 'learning_rate': 0.1,
 'num_leaves': 64,
 'min_data_in_leaf': 1,
 'verbose': -1,
 'random_state': 2018,
 'feature_pre_filter': False,
 'lambda_l1': 0.27499882674690457,
 'lambda_l2': 0.14774039219275795,
 'feature_fraction': 0.8999999999999999,
 'bagging_fraction': 1.0,
 'bagging_freq': 0,
 'min_child_samples': 20,
 'num_iterations': 1000,
 'early_stopping_round': None}

In [15]:
model

<lightgbm.basic.Booster at 0x16bd66800>

In [16]:
for key, confusion_matrix in zip(correspondences.keys(), multilabel_confusion_matrix(y_test, y_pred)):
    print(f"{key}    TP: {confusion_matrix[1][1]}, TN: {confusion_matrix[0][0]}, FP: {confusion_matrix[0][1]}, FN: {confusion_matrix[1][0]}")

dos    TP: 129176, TN: 33831, FP: 15, FN: 5
normal    TP: 32084, TN: 130882, FP: 43, FN: 18
probe    TP: 1334, TN: 161671, FP: 1, FN: 21
r2l    TP: 351, TN: 162648, FP: 7, FN: 21
u2r    TP: 13, TN: 163007, FP: 3, FN: 4


In [17]:
lgb_train_encoded = lgb.Dataset(x_train_encoded, y_train)
lgb_eval_encoded = lgb.Dataset(x_test_encoded, y_test, reference=lgb_train_encoded)

In [18]:
params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'num_class': 5,
        'metric': 'multi_error', # 評価指標 : 誤り率(= 1-正答率)  another multi_logloss
        'learning_rate': 0.1,
        'num_leaves': 23,
        'min_data_in_leaf': 1,
        'verbose': -1,
        'random_state': RANDOM_SEED, 
}

# モデルの学習
model_encoded = lgb.train(params, # パラメータ
                  lgb_train_encoded, # トレーニングデータの指定
                  valid_sets=[lgb_train_encoded], # 検証データの指定
                  callbacks=[lgb.early_stopping(50, verbose=False)],
               )

[I 2023-08-08 11:47:12,338] A new study created in memory with name: no-name-10788ab1-0e23-411e-abf7-29e7b4f9dc1b
feature_fraction, val_score: inf:   0%|          | 0/7 [00:00<?, ?it/s]



feature_fraction, val_score: 0.000580:  14%|#4        | 1/7 [00:03<00:19,  3.33s/it][I 2023-08-08 11:47:15,669] Trial 0 finished with value: 0.0005800709378417736 and parameters: {'feature_fraction': 0.8999999999999999}. Best is trial 0 with value: 0.0005800709378417736.
feature_fraction, val_score: 0.000399:  29%|##8       | 2/7 [00:06<00:16,  3.38s/it][I 2023-08-08 11:47:19,093] Trial 1 finished with value: 0.0003987987697662193 and parameters: {'feature_fraction': 0.7}. Best is trial 1 with value: 0.0003987987697662193.
feature_fraction, val_score: 0.000399:  43%|####2     | 3/7 [00:10<00:13,  3.39s/it][I 2023-08-08 11:47:22,487] Trial 2 finished with value: 0.0003987987697662193 and parameters: {'feature_fraction': 0.6}. Best is trial 1 with value: 0.0003987987697662193.
feature_fraction, val_score: 0.000399:  57%|#####7    | 4/7 [00:14<00:10,  3.65s/it][I 2023-08-08 11:47:26,536] Trial 3 finished with value: 0.0003987987697662193 and parameters: {'feature_fraction': 0.5}. Best is 

In [None]:
y_pred_prob = model_encoded.predict(x_test_encoded)
# テストデータの予測 (予測クラス(0 or 1 or...)を返す)
y_pred_encoded = np.argmax(y_pred_prob, axis=1) # 一番大きい予測確率のクラスを予測クラスに
y_pred_encoded = pd.Series(y_pred_encoded)
y_pred_encoded.value_counts()

In [19]:
print(classification_report(y_test, y_pred_encoded, target_names=correspondences.keys()))

0    129182
1     32129
2      1346
3       358
4        12
dtype: int64

In [20]:
from sklearn.metrics import confusion_matrix
print(pd.DataFrame(confusion_matrix(y_test, y_pred_encoded), index=correspondences.keys(), columns=correspondences.keys()))
correspondences.keys()

              precision    recall  f1-score   support

         dos       1.00      1.00      1.00    129181
      normal       1.00      1.00      1.00     32102
       probe       1.00      0.99      0.99      1355
         r2l       0.99      0.95      0.97       372
         u2r       0.75      0.53      0.62        17

    accuracy                           1.00    163027
   macro avg       0.95      0.89      0.92    163027
weighted avg       1.00      1.00      1.00    163027


           dos  normal  probe  r2l  u2r
dos     129177       4      0    0    0
normal       4   32094      3    1    0
probe        0      11   1343    0    1
r2l          1      14      0  355    2
u2r          0       6      0    2    9


dict_keys(['dos', 'normal', 'probe', 'r2l', 'u2r'])

In [21]:
model_encoded.params

In [22]:
for key, confusion_matrix in zip(correspondences.keys(), multilabel_confusion_matrix(y_test, y_pred)):
    print(f"{key}    TP: {confusion_matrix[1][1]}, TN: {confusion_matrix[0][0]}, FP: {confusion_matrix[0][1]}, FN: {confusion_matrix[1][0]}")

{'task': 'train',
 'boosting_type': 'gbdt',
 'objective': 'multiclass',
 'num_class': 5,
 'metric': 'multi_error',
 'learning_rate': 0.1,
 'num_leaves': 105,
 'min_data_in_leaf': 1,
 'verbose': -1,
 'random_state': 2018,
 'feature_pre_filter': False,
 'lambda_l1': 1.3428666550913902e-08,
 'lambda_l2': 7.162419995940924,
 'feature_fraction': 0.7,
 'bagging_fraction': 1.0,
 'bagging_freq': 0,
 'min_child_samples': 20,
 'num_iterations': 1000,
 'early_stopping_round': None}

In [23]:
import pickle
with open("models/lightgbm/lgb_param_tuned_booster.pkl", 'wb') as fp:
    pickle.dump(model.dump_model(), fp)

dos    TP: 129176, TN: 33831, FP: 15, FN: 5
normal    TP: 32084, TN: 130882, FP: 43, FN: 18
probe    TP: 1334, TN: 161671, FP: 1, FN: 21
r2l    TP: 351, TN: 162648, FP: 7, FN: 21
u2r    TP: 13, TN: 163007, FP: 3, FN: 4


In [24]:
with open("models/lightgbm/lgb+ae_param_tuned_booster.pkl", 'wb') as fp:
    pickle.dump(model_encoded.dump_model(), fp)

In [25]:
model.save_model('models/lightgbm/lgb_tuned_booster.model')
model_encoded.save_model('models/lightgbm/lgb+ae_tuned_booster.model')

<lightgbm.basic.Booster at 0x16bd80310>